# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [93]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [94]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-05-10.csv")

In [95]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
35237,Volkswagen Passat CC 2.0 TDI DSG BMT,1 968 cm3 • 184 CP • R Line / Trapa / Dynaudio...,Km186 000 kmCombustibilDieselAnul producției2016,Braila (Braila),15 400
10102,Dacia Sandero Stepway dCi 90 Prestige,"1 461 cm3 • 90 CP • model Pestige, 1.5 diesel,...",Km200 000 kmCombustibilDieselAnul producției2015,Cluj-Napoca (Cluj),6 900
15165,Land Rover Freelander 1.8i,1 796 cm3 • 117 CP • Land Rover Freelander 4X4...,Km163 835 kmCombustibilBenzinaAnul producției2...,Zalau (Salaj),3 999
3730,BMW X5 xDrive30d,2 993 cm3 • 245 CP • BMW xDrive30d Panoramic,Km230 000 kmCombustibilDieselAnul producției2011,Bucuresti (Bucuresti),12 850
25741,Renault Talisman ENERGY dCi 160 EDC INITIALE P...,1 598 cm3 • 160 CP • Renault Talisman 1.6Dci /...,Km178 000 kmCombustibilDieselAnul producției2017,Bucuresti (Bucuresti),15 999
18103,Mercedes-Benz C,2 143 cm3 • 136 CP • Autoturisme Mercedes Benz...,Km154 395 kmCombustibilDieselAnul producției2018,Magurele (Ilfov),22 950
5914,BMW Seria 5,2 993 cm3 • 235 CP • 530d/Trapa/Automata/Xenon...,Km390 000 kmCombustibilDieselAnul producției2007,Satu Mare (Satu Mare),6 199
12299,Ford Mondeo 1.6 TDCi Titanium,1 560 cm3 • 115 CP • 1.6 TDCI Titanium,Km161 500 kmCombustibilDieselAnul producției2012,Baia Mare (Maramures),7 000
24210,Peugeot 407 HDi 135 Automatik Sport,"1 997 cm3 • 136 CP • 2.0hdi ,16v,Automatik ,Pr...",Km189 219 kmCombustibilDieselAnul producției2005,Agigea (Constanta),2 899
16776,Mercedes-Benz AMG GT C Roadster,3 982 cm3 • 557 CP • !! Posibilitate LEASING /...,Km33 800 kmCombustibilBenzinaAnul producției2019,Domnesti (Ilfov),129 000


In [96]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,36793,36793,36793,36793,36793
unique,12516,30277,23064,825,6199
top,Volkswagen Passat,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul producției2023,Bucuresti (Bucuresti),9 990
freq,237,217,185,8491,199


In [97]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36793 entries, 0 to 36792
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   36793 non-null  object
 1   second  36793 non-null  object
 2   third   36793 non-null  object
 3   fourth  36793 non-null  object
 4   fifth   36793 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


### 4.2. Data processing

In [98]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [99]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [100]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [101]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [102]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [103]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [104]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [105]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "producției", "start"])
df["year"] = df["year"].apply(remove_char, args=["producției"])

In [106]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [107]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [108]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [109]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [110]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [111]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [112]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-05-10.csv")

### 4.3. Final analysis

In [113]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
31548,Volkswagen,Polo,239000,1896,64,2006,Diesel,Suceava,Paltinoasa,1990
31242,Volkswagen,e-Golf,141500,0,136,2019,Electric,Galati,Galati,17243
32374,Volkswagen,Passat,219500,1968,190,2016,Diesel,Bucuresti,Bucuresti,15300
36044,Volvo,XC 60,126461,1969,320,2021,Benzina,Constanta,Constanta,34990
21432,Mitsubishi,ASX,174500,1798,116,2015,Diesel,Galati,Galati,9590
17556,Mercedes-Benz,GLE,55102,2925,330,2020,Diesel,Ilfov,Otopeni,74990
3680,BMW,i3,39900,0,170,2021,Electric,Iasi,Iasi,24900
35399,Volkswagen,Jetta,193000,1968,110,2016,Diesel,Dambovita,Targoviste,8390
16661,Mercedes-Benz,S,75604,2925,286,2021,Diesel,Ilfov,Otopeni,87400
7368,BMW,Seria 5,170000,2993,258,2016,Diesel,Harghita,Gheorgheni,21500


In [114]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,36793,36793,36793.0,36793.0,36793.0,36793.0,36793,36793,36793,36793.0
unique,75,686,,,,,7,49,747,
top,BMW,Passat,,,,,Diesel,Bucuresti,Bucuresti,
freq,4784,1159,,,,,22364,8493,8491,
mean,,,143218.5,1918.329166,178.697606,2016.235398,,,,24890.939282
std,,,97393.44,761.177209,98.693399,5.201721,,,,32573.387427
min,,,0.0,0.0,0.0,1962.0,,,,1.0
25%,,,65867.0,1498.0,115.0,2013.0,,,,8500.0
50%,,,148000.0,1968.0,150.0,2017.0,,,,15500.0
75%,,,210848.0,1998.0,200.0,2020.0,,,,29000.0


In [115]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36793 entries, 0 to 36792
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  36793 non-null  object
 1   model         36793 non-null  object
 2   mileage       36793 non-null  int32 
 3   capacity      36793 non-null  int32 
 4   power         36793 non-null  int32 
 5   year          36793 non-null  int32 
 6   fuel          36793 non-null  object
 7   county        36793 non-null  object
 8   city          36793 non-null  object
 9   price         36793 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.1+ MB
