# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [111]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [112]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-01-10.csv")

In [113]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
2021,Audi Q5,1 968 cm3 • 191 CP • audi q5 q5 40 tdi quattro...,Km48 500 kmCombustibilDieselAnul fabricației2020,Otopeni (Bucuresti - Ilfov),51 849
33450,Volkswagen Passat,"1 968 cm3 • 150 CP • carte service, xenon, acc,",Km223 100 kmCombustibilDieselAnul fabricației2...,Suceava (Suceava),14 500
28562,Smart Forfour,1 499 cm3 • 95 CP • smart forfour,Km179 000 kmCombustibilDieselAnul fabricației2...,Bucuresti (Bucuresti - Ilfov),1 800
33670,Volkswagen Golf,"1 968 cm3 • 140 CP • Vand GOLF 5, 2.0 TDI, BKD",Km298 000 kmCombustibilDieselAnul fabricației2...,Iernut (Mures),3 600
3467,Bentley Bentayga,3 996 cm3 • 550 CP • Night Vision / ACC / HUD ...,Km4 500 kmCombustibilBenzinaAnul fabricației2023,Bucuresti (Bucuresti - Ilfov),344 900
2986,Audi A6 Avant 2.0 TDI Ultra S tronic,1 968 cm3 • 150 CP • Audi A6 2.0 TDI Ultra s t...,Km178 000 kmCombustibilDieselAnul fabricației2...,Filiasi (Dolj),15 999
2580,Audi A3,"999 cm3 • 110 CP • In stoc, livrare rapida",Km12 kmCombustibilBenzinaAnul fabricației2023,Galati (Galati),32 400
24323,Renault Kadjar TCe GPF Intens,1 332 cm3 • 140 CP • Renault Kadjar Equilibre ...,Km55 kmCombustibilBenzinaAnul fabricației2022,Otopeni (Bucuresti - Ilfov),25 349
8738,Dacia Logan MCV 1.2 GPL Laureate,1 149 cm3 • 75 CP • Dacia Logan Mcv 2014 1.2 G...,Km151 428 kmCombustibilBenzina + GPLAnul fabri...,Craiova (Dolj),5 590
33816,Volkswagen Passat,"1 968 cm3 • 140 CP • bord si volan din lemn, i...",Km302 000 kmCombustibilDieselAnul fabricației2...,Cluj-Napoca (Cluj),4 800


In [114]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,36292,36292,36292,36292,36292
unique,12208,29956,22705,771,6595
top,Volkswagen Passat,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul fabricației2023,Bucuresti (Bucuresti - Ilfov),12 990
freq,235,230,245,8446,183


In [115]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36292 entries, 0 to 36291
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   36292 non-null  object
 1   second  36292 non-null  object
 2   third   36292 non-null  object
 3   fourth  36292 non-null  object
 4   fifth   36292 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


### 4.2. Data processing

In [116]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "toyota": "land", "volvo": "xc"}

In [117]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [118]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [119]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [120]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [121]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [122]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [123]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "fabricației", "start"])
df["year"] = df["year"].apply(remove_char, args=["fabricației"])

In [124]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [125]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [126]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [127]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [128]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [129]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [130]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-01-10.csv")

### 4.3. Final analysis

In [131]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
21247,Nissan,Qashqai,52000,1332,160,2019,Benzina,Bucuresti,Bucuresti,20200
4099,BMW,Seria 6,219352,2993,313,2015,Diesel,Bucuresti,Voluntari,24999
22853,Peugeot,308,227311,1560,120,2015,Diesel,Bucuresti,Otopeni,7950
17618,Mercedes-Benz,ML,241200,2987,258,2014,Diesel,Bucuresti,Bucuresti,20490
20794,Nissan,LEAF,17895,0,150,2019,Electric,Bucuresti,Bucuresti,18900
20762,Nissan,X-Trail,185621,1618,163,2016,Benzina,Cluj,Cluj-Napoca,14350
1899,Audi,A4,312951,1968,190,2016,Diesel,Bucuresti,Otopeni,14990
3467,Bentley,Bentayga,4500,3996,550,2023,Benzina,Bucuresti,Bucuresti,344900
12210,Ford,Puma,35000,999,125,2021,Benzina,Bucuresti,Bucuresti,16950
5132,BMW,Seria 3,34527,1998,184,2020,Benzina,Bucuresti,Bucuresti,35450


In [132]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,36292,36292,36292.0,36292.0,36292.0,36292.0,36292,36292,36292,36292.0
unique,62,652,,,,,6,43,698,
top,BMW,Golf,,,,,Diesel,Bucuresti,Bucuresti,
freq,4640,1153,,,,,22545,12579,8446,
mean,,,140033.8,1927.118043,178.467293,2016.109969,,,,25946.095916
std,,,95974.23,751.149231,97.320633,5.092255,,,,33977.808656
min,,,0.0,0.0,0.0,1964.0,,,,100.0
25%,,,63244.75,1498.0,115.0,2013.0,,,,8700.0
50%,,,145000.0,1968.0,150.0,2017.0,,,,15500.0
75%,,,206864.0,1998.0,197.0,2020.0,,,,29893.75


In [133]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36292 entries, 0 to 36291
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  36292 non-null  object
 1   model         36292 non-null  object
 2   mileage       36292 non-null  int32 
 3   capacity      36292 non-null  int32 
 4   power         36292 non-null  int32 
 5   year          36292 non-null  int32 
 6   fuel          36292 non-null  object
 7   county        36292 non-null  object
 8   city          36292 non-null  object
 9   price         36292 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.1+ MB
