# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [283]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [284]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-04-10.csv")

In [285]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
12742,Ford Mondeo Vignale 2.0 Hybrid,"1 999 cm3 • 187 CP • GARANTIE 2 ANI, Hibird, A...",Km88 895 kmCombustibilHibridAnul producției2018,Tunari (Ilfov),18 990
27236,Opel Astra Sports Tourer 1.7 CDTI,1 686 cm3 • 110 CP,Km240 000 kmCombustibilDieselAnul producției2013,Lugoj (Timis),5 800
11224,Dacia Logan 1.2 16V Laureate,1 149 cm3 • 75 CP • Dacia Logan 2011. Euro5. U...,Km151 000 kmCombustibilBenzinaAnul producției2...,Bucuresti (Bucuresti),3 250
13423,Ford Transit Connect,1 560 cm3 • 95 CP • Ford Transit Connect,Km210 000 kmCombustibilDieselAnul producției2015,Bucuresti (Bucuresti),7 000
16784,Jaguar I-Pace SE,400 CP • primul proprietar,Km1 000 kmCombustibilElectricAnul producției2021,Bucuresti (Bucuresti),71 876
20573,Mercedes-Benz EQA 300 4Matic AMG Line,228 CP • EQA 300 AMG Line Premium Pamorama Dri...,Km6 000 kmCombustibilElectricAnul producției2023,Arad (Arad),59 381
7824,BMW X5 xDrive30d,2 993 cm3 • 245 CP,Km351 361 kmCombustibilDieselAnul producției2011,Bucuresti (Bucuresti),9 000
16006,Hyundai Tucson 2.0 CRDi 4WD Automatik Passion ...,1 995 cm3 • 136 CP • Hyundai Tucson 2.0 CRDi X...,Km117 000 kmCombustibilDieselAnul producției2016,Braila (Braila),15 790
2162,Audi Q5 2.0 40 TDI quattro S tronic Design,1 968 cm3 • 190 CP • Audi Q5 2.0 Quattro Desig...,Km199 000 kmCombustibilDieselAnul producției2019,Baia Mare (Maramures),25 990
42042,Volvo V60 Cross Country T5 AWD Geartronic Pro,1 969 cm3 • 250 CP • VOLVO V60 CROSS COUNTRY T...,Km207 400 kmCombustibilBenzinaAnul producției2...,Cluj-Napoca (Cluj),22 997


In [286]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,43247,43247,43247,43247,43247
unique,13692,35361,25933,919,6784
top,BMW Seria 3,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul producției2023,Bucuresti (Bucuresti),9 990
freq,271,279,196,10060,213


In [287]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43247 entries, 0 to 43246
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   43247 non-null  object
 1   second  43247 non-null  object
 2   third   43247 non-null  object
 3   fourth  43247 non-null  object
 4   fifth   43247 non-null  object
dtypes: object(5)
memory usage: 1.6+ MB


### 4.2. Data processing

In [288]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [289]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [290]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [291]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [292]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [293]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [294]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [295]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "producției", "start"])
df["year"] = df["year"].apply(remove_char, args=["producției"])

In [296]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [297]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [298]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [299]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [300]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [301]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [302]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-04-10.csv")

### 4.3. Final analysis

In [303]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
2310,Audi,A6,12,1968,204,2023,Diesel,Galati,Galati,69900
23733,Mercedes-Benz,A,170000,1461,109,2018,Diesel,Timis,Lugoj,15100
20919,Mercedes-Benz,C,3000,1999,204,2023,Hibrid,Arad,Arad,62951
15498,Ford,Puma,40707,999,155,2021,Benzina,Ilfov,Dudu,21500
1986,Audi,A4,238000,1896,130,2004,Diesel,Alba,Sebes,2700
30413,Renault,Megane,217000,1461,110,2016,Diesel,Arges,Pitesti,8900
17303,Kia,Optima,120000,1999,154,2019,Plug-in,Bucuresti,Bucuresti,21063
38454,Volkswagen,Tiguan,210000,1968,177,2013,Diesel,Arges,Pitesti,13200
22841,Mercedes-Benz,S,169000,2987,258,2015,Diesel,Constanta,Constanta,35000
33765,Skoda,Superb,209000,1798,180,2017,Benzina,Maramures,Baia,12600


In [304]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,43247,43247,43247.0,43247.0,43247.0,43247.0,43247,43247,43247,43247.0
unique,77,703,,,,,7,47,811,
top,BMW,Passat,,,,,Diesel,Bucuresti,Bucuresti,
freq,5610,1338,,,,,26795,10078,10060,
mean,,,146499.7,1926.863782,178.484774,2015.952135,,,,24857.06
std,,,96856.01,744.464626,106.412882,5.285086,,,,54728.67
min,,,0.0,0.0,0.0,1933.0,,,,10.0
25%,,,70000.0,1498.0,115.0,2013.0,,,,8250.0
50%,,,152000.0,1968.0,150.0,2017.0,,,,14990.0
75%,,,214429.5,1998.0,197.0,2020.0,,,,28500.0


In [305]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43247 entries, 0 to 43246
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  43247 non-null  object
 1   model         43247 non-null  object
 2   mileage       43247 non-null  int32 
 3   capacity      43247 non-null  int32 
 4   power         43247 non-null  int32 
 5   year          43247 non-null  int32 
 6   fuel          43247 non-null  object
 7   county        43247 non-null  object
 8   city          43247 non-null  object
 9   price         43247 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.5+ MB
