# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [215]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [216]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-08-10.csv")

In [217]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
26280,Skoda Fabia Combi 1.0 TSI Ambition,999 cm3 • 95 CP • Skoda Fabia / 1.0 TSI 95 CP ...,Km168 010 kmCombustibilBenzinaAnul producției2...,Otopeni (Ilfov),8 649
1873,Audi A5 Sportback 2.0 TDI,"1 968 cm3 • 170 CP • Rate Fixe, Garantie 12 Lu...",Km258 000 kmCombustibilDieselAnul producției2012,Arad (Arad),14 999
5951,BMW Seria 5 530e xDrive Aut. Luxury Line,1 998 cm3 • 184 CP • BMW 530e xDrive Luxury Line,Km31 999 kmCombustibilHibrid Plug-InAnul produ...,Baia Mare (Maramures),39 490
5877,BMW X1 sDrive18d xLine,1 995 cm3 • 143 CP • Primul proprietar,Km185 000 kmCombustibilDieselAnul producției2014,Dudestii Noi (Timis),7 999
31262,Volkswagen ARTEON,1 498 cm3 • 150 CP • Volkswagen Arteon R Line ...,Km203 154 kmCombustibilBenzinaAnul producției2...,Bucuresti (Bucuresti),16 800
12398,Hyundai i20,1 248 cm3 • 75 CP,Km83 000 kmCombustibilBenzinaAnul producției2018,Bucuresti (Bucuresti),8 950
11028,Ford Kuga 2.0 TDCi 4x4 Aut. Titanium,1 997 cm3 • 180 CP,Km212 000 kmCombustibilDieselAnul producției2016,Oradea (Bihor),11 490
9789,Fiat Qubo 1.3 Multijet 16V DPF Start&Stop Dynamic,1 248 cm3 • 95 CP • Fiat Qubo,Km160 000 kmCombustibilDieselAnul producției2014,Targu-Mures (Mures),5 969
27716,Suzuki Swift,1 197 cm3 • 83 CP • Swift Mild Hybrid 12V SPIR...,Km1 kmCombustibilBenzinaAnul producției2023,Bistrita (Bistrita-Nasaud),17 519
31828,Volkswagen Passat CC,"1 968 cm3 • 140 CP • HIGHLINE,2010,140CP,M6,Pi...",Km287 000 kmCombustibilDieselAnul producției2010,Zalau (Salaj),4 990


In [218]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,34130,34130,34130,34130,34130
unique,12239,28509,22043,806,6097
top,BMW Seria 3,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul producției2024,Bucuresti (Bucuresti),9 990
freq,199,177,268,8170,160


In [219]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34130 entries, 0 to 34129
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   34130 non-null  object
 1   second  34130 non-null  object
 2   third   34130 non-null  object
 3   fourth  34130 non-null  object
 4   fifth   34130 non-null  object
dtypes: object(5)
memory usage: 1.3+ MB


### 4.2. Data processing

In [220]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [221]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [222]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [223]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [224]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [225]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [226]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [227]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "producției", "start"])
df["year"] = df["year"].apply(remove_char, args=["producției"])

In [228]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [229]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [230]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [231]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [232]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [233]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [234]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-08-10.csv")

### 4.3. Final analysis

In [235]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
25134,Seat,Leon,44500,1498,150,2020,Hibrid,Gorj,Targu,23000
31752,Volkswagen,Passat,216000,1968,150,2020,Diesel,Valcea,Ramnicu,13700
3833,BMW,X6,900,2993,340,2024,Diesel,Ilfov,Bragadiru,99000
32103,Volkswagen,Passat,256535,1984,200,2010,Benzina,Arges,Pitesti,8900
5838,BMW,Seria 3,5000,1995,190,2023,Diesel,Satu,Satu,54353
25480,Skoda,Yeti,204440,1968,170,2014,Diesel,Suceava,Suceava,11850
13155,Jeep,Grand,105000,2987,250,2016,Diesel,Bucuresti,Bucuresti,21850
30797,Volkswagen,Tiguan,262000,1968,140,2012,Diesel,Bacau,Onesti,10999
33284,Volvo,XC 60,100000,1969,304,2020,Hibrid,Ilfov,Otopeni,35689
7094,BMW,X5,295430,2993,258,2014,Diesel,Prahova,Valenii,18000


In [236]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,34130,34130,34130.0,34130.0,34130.0,34130.0,34130,34130,34130,34130.0
unique,76,694,,,,,7,47,728,
top,BMW,Golf,,,,,Diesel,Bucuresti,Bucuresti,
freq,4214,1066,,,,,19800,8170,8170,
mean,,,139577.6,1914.543803,180.1477,2016.604659,,,,25660.526985
std,,,96783.01,776.706709,100.254463,5.280496,,,,34298.729639
min,,,0.0,0.0,0.0,1962.0,,,,100.0
25%,,,61500.0,1496.0,115.0,2013.0,,,,8700.0
50%,,,142178.0,1968.0,150.0,2018.0,,,,16100.0
75%,,,207000.0,1998.0,200.0,2020.0,,,,29990.0


In [237]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34130 entries, 0 to 34129
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  34130 non-null  object
 1   model         34130 non-null  object
 2   mileage       34130 non-null  int32 
 3   capacity      34130 non-null  int32 
 4   power         34130 non-null  int32 
 5   year          34130 non-null  int32 
 6   fuel          34130 non-null  object
 7   county        34130 non-null  object
 8   city          34130 non-null  object
 9   price         34130 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.0+ MB
