# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [48]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [49]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-02-10.csv")

In [50]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
35696,Volkswagen Passat GTE,"1 395 cm3 • 156 CP • Vw Passat GTE , Primul Pr...",Km120 000 kmCombustibilHibridAnul fabricației2...,Pantelimon (Bucuresti),23 400
14629,Hyundai i20 1.2 L 84CP 5DR Comfort,1 197 cm3 • 84 CP • DETALII LA NR.TEL. 0741333096,Km1 kmCombustibilBenzinaAnul fabricației2023,Constanta (Constanta),15 172
14637,Hyundai Tucson 2.0 CRDI 4WD 6AT Premium+ Desig...,1 995 cm3 • 185 CP • Revizii gratuita_garantie...,Km182 000 kmCombustibilDieselAnul fabricației2...,Cluj-Napoca (Cluj),16 998
27677,Renault Clio Estate 0.9 TCe Dynamique,898 cm3 • 90 CP,Km139 941 kmCombustibilBenzinaAnul fabricației...,Timisoara (Timis),5 990
36944,Volkswagen Passat Variant TDI,1 896 cm3 • 115 CP • Vand Passat 1.9 TDI an 2000,Km288 300 kmCombustibilDieselAnul fabricației2...,Bucuresti (Bucuresti),1 700
30553,Skoda Kodiaq,1 968 cm3 • 190 CP • Mașina funcționează perfe...,Km228 000 kmCombustibilDieselAnul fabricației2...,Popesti-Leordeni (Bucuresti),23 000
37955,Volvo XC 60 D5 AWD Inscription,1 969 cm3 • 235 CP • Volvo XC60 Inscription AWD,Km149 800 kmCombustibilDieselAnul fabricației2...,Alba Iulia (Alba),28 490
25540,Porsche Macan 3.0 PDK S,2 967 cm3 • 258 CP • 3.0diesel V6 4x4 / BOSE /...,Km226 059 kmCombustibilDieselAnul fabricației2...,Suceava (Suceava),29 999
17230,Mazda CX-5 CD150 4x2 AT Revolution,2 191 cm3 • 150 CP • Mazda CX5 2018 (Noiembrie...,Km148 000 kmCombustibilDieselAnul fabricației2...,Braila (Braila),18 200
25399,Peugeot 206 1.4HDi X-Line,1 398 cm3 • 68 CP • Masina in stare de funcion...,Km165 000 kmCombustibilDieselAnul fabricației2...,Chiajna (Bucuresti),1 850


In [51]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,38136,38136,38136,38136,38136
unique,12625,31470,23293,875,6846
top,Volkswagen Passat,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul fabricației2023,Bucuresti (Bucuresti),9 990
freq,228,229,284,8685,183


In [52]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38136 entries, 0 to 38135
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   38136 non-null  object
 1   second  38136 non-null  object
 2   third   38136 non-null  object
 3   fourth  38136 non-null  object
 4   fifth   38136 non-null  object
dtypes: object(5)
memory usage: 1.5+ MB


### 4.2. Data processing

In [53]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [54]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [55]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [56]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [57]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [58]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [59]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [60]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "fabricației", "start"])
df["year"] = df["year"].apply(remove_char, args=["fabricației"])

In [61]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [62]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [63]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [64]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [65]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [66]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [67]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-02-10.csv")

### 4.3. Final analysis

In [68]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
5521,BMW,iX1,50,0,313,2023,Electric,Brasov,Brasov,54899
27603,Renault,Megane,2500,1332,140,2022,Benzina,Brasov,Brasov,20250
36412,Volkswagen,Golf,223441,1968,150,2016,Diesel,Braila,Braila,11890
1565,Audi,A6,228522,1968,190,2016,Diesel,Bucuresti,Bucuresti,19499
17901,Mercedes-Benz,EQE,1,0,408,2023,Electric,Bucuresti,Bucuresti,119886
17599,Mercedes-Benz,A,181000,1461,109,2016,Diesel,Bucuresti,Tunari,14490
31376,Toyota,RAV4,60280,2487,185,2019,Hibrid,Bucuresti,Bucuresti,37438
21889,Mercedes-Benz,V,394000,2143,163,2015,Diesel,Bucuresti,Bucuresti,26800
10605,Dacia,Logan,189600,1461,90,2014,Diesel,Prahova,Tatarani,5490
16039,Land Rover,Range Rover,92299,2996,400,2019,Benzina,Bucuresti,Otopeni,83979


In [69]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,38136,38136,38136.0,38136.0,38136.0,38136.0,38136,38136,38136,38136.0
unique,75,678,,,,,7,47,756,
top,BMW,Golf,,,,,Diesel,Bucuresti,Bucuresti,
freq,4913,1166,,,,,23489,11942,8685,
mean,,,140924.3,1933.873453,180.285269,2016.200913,,,,26082.141048
std,,,96605.59,754.237161,97.909946,5.054626,,,,34132.171883
min,,,0.0,0.0,0.0,1947.0,,,,11.0
25%,,,63357.75,1498.0,115.0,2013.0,,,,8890.0
50%,,,146000.0,1968.0,150.0,2017.0,,,,15950.0
75%,,,209000.0,1999.0,200.0,2020.0,,,,29900.0


In [70]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38136 entries, 0 to 38135
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  38136 non-null  object
 1   model         38136 non-null  object
 2   mileage       38136 non-null  int32 
 3   capacity      38136 non-null  int32 
 4   power         38136 non-null  int32 
 5   year          38136 non-null  int32 
 6   fuel          38136 non-null  object
 7   county        38136 non-null  object
 8   city          38136 non-null  object
 9   price         38136 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.2+ MB
