# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [1]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [2]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-03-10.csv")

In [3]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
5709,BMW Seria 3 318d DPF Touring Edition Exclusive,1 995 cm3 • 143 CP • 2009 Euro 5 2.0d,Km281 000 kmCombustibilDieselAnul producției2009,Turda (Cluj),3 990
1740,Audi A6 Allroad,2 967 cm3 • 218 CP,Km128 600 kmCombustibilDieselAnul producției2018,Bucuresti (Bucuresti),27 900
7818,BMW Seria 5 530d xDrive MHEV,2 993 cm3 • 286 CP,Km140 000 kmCombustibilDieselAnul producției2021,Braila (Braila),42 900
6098,BMW Seria 3 318i AT,"1 998 cm3 • 156 CP • Primul proprietar, mașină...",Km34 500 kmCombustibilBenzinaAnul producției2022,Bucuresti (Bucuresti),34 990
6153,BMW X3 xDrive20d Aut. Blue Performance,1 995 cm3 • 184 CP • BMW GX3 X Drive XLine 22....,Km100 800 kmCombustibilDieselAnul producției2018,Bucuresti (Bucuresti),27 300
32859,Skoda Octavia,"1 598 cm3 • 110 CP • Skoda Octavia III, 1598 c...",Km230 000 kmCombustibilDieselAnul producției2016,Bucuresti (Bucuresti),9 999
28537,Renault Koleos,"1 995 cm3 • 150 CP • Rate fixe sau Cash, Bose ...",Km182 800 kmCombustibilDieselAnul producției2012,Corunca (Mures),8 499
26166,Opel Astra GTC 1.4 Turbo,1 364 cm3 • 140 CP,Km132 000 kmCombustibilBenzinaAnul producției2...,Bucuresti (Bucuresti),6 499
23815,Mercedes-Benz CLA 250 4MATIC Coupe,1 991 cm3 • 224 CP • Mercedes CLA 250 4Matic 2...,Km13 000 kmCombustibilBenzinaAnul producției2022,Bucuresti (Bucuresti),43 990
14851,Ford Puma 1.0 EcoBoost mHEV Titanium,999 cm3 • 125 CP • FORD PUMA Titanium 1.0l Eco...,Km1 kmCombustibilBenzinaAnul producției2024,Pitesti (Arges),18 200


In [4]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,41258,41258,41258,41258,41258
unique,13283,33950,24847,943,6659
top,BMW Seria 3,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul producției2023,Bucuresti (Bucuresti),7 990
freq,282,235,238,9468,213


In [5]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41258 entries, 0 to 41257
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   41258 non-null  object
 1   second  41258 non-null  object
 2   third   41258 non-null  object
 3   fourth  41258 non-null  object
 4   fifth   41258 non-null  object
dtypes: object(5)
memory usage: 1.6+ MB


### 4.2. Data processing

In [6]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [7]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [8]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [9]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [10]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [11]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [12]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [13]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "producției", "start"])
df["year"] = df["year"].apply(remove_char, args=["producției"])

In [14]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [15]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [16]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [17]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [18]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [19]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [20]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-03-10.csv")

### 4.3. Final analysis

In [21]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
20810,Mercedes-Benz,C,2,1993,265,2023,Diesel,Maramures,Baia,67819
7332,BMW,X5,139906,2993,286,2011,Diesel,Arges,Pitesti,15500
29021,Renault,Captur,9,999,90,2022,Benzina,Ilfov,Chiajna,19990
22000,Mercedes-Benz,C,69082,1950,245,2020,Diesel,Bucuresti,Bucuresti,31490
2892,Audi,A5,215600,1968,177,2012,Diesel,Bucuresti,Bucuresti,10500
36562,Volkswagen,Passat,215500,1968,190,2019,Diesel,Ilfov,Otopeni,18897
36112,Volkswagen,Crafter,3,1968,140,2022,Diesel,Bistrita-nasaud,Bistrita,40755
19977,Mercedes-Benz,V,1,1950,237,2023,Diesel,Arad,Arad,97568
40739,Volvo,XC 90,230068,1969,235,2016,Diesel,Valcea,Ramnicu,26000
35931,Volkswagen,Golf,50000,999,115,2020,Benzina,Bucuresti,Bucuresti,15300


In [22]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,41258,41258,41258.0,41258.0,41258.0,41258.0,41258,41258,41258,41258.0
unique,72,684,,,,,7,48,825,
top,BMW,Golf,,,,,Diesel,Bucuresti,Bucuresti,
freq,5462,1307,,,,,25502,9553,9468,
mean,,,144479.2,1923.584032,178.424548,2016.03483,,,,24901.093121
std,,,95384.39,750.992002,96.378789,5.138862,,,,32595.252213
min,,,0.0,0.0,0.0,1967.0,,,,1.0
25%,,,67591.75,1498.0,115.0,2013.0,,,,8490.0
50%,,,150000.0,1968.0,150.0,2017.0,,,,15300.0
75%,,,212000.0,1998.0,197.0,2020.0,,,,28590.0


In [23]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41258 entries, 0 to 41257
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  41258 non-null  object
 1   model         41258 non-null  object
 2   mileage       41258 non-null  int32 
 3   capacity      41258 non-null  int32 
 4   power         41258 non-null  int32 
 5   year          41258 non-null  int32 
 6   fuel          41258 non-null  object
 7   county        41258 non-null  object
 8   city          41258 non-null  object
 9   price         41258 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.4+ MB
