# EvaCar

## 4. Data collection

### 4.1. Initial analysis

In [1]:
# Import necessary libraries, packages, and modules
import pandas as pd

In [2]:
# Load data and create dataframe
df = pd.read_csv(f"../datasets/RawDataDB-24-01-10.csv")

In [3]:
# Display a sample of data
df.sample(10)

Unnamed: 0,first,second,third,fourth,fifth
15830,Mercedes-Benz GLC Coupe 250 d 4Matic 9G-TRONIC,2 143 cm3 • 204 CP,Km199 784 kmCombustibilDieselAnul fabricației2...,Otopeni (Bucuresti - Ilfov),38 999
33695,Volkswagen Golf Plus 1.4 Comfortline,1 390 cm3 • 80 CP • Volkswagen Golf Plus 1.4 B...,Km175 794 kmCombustibilBenzinaAnul fabricației...,Tauteu (Bihor),3 949
16911,Mercedes-Benz C 200 4MATIC Aut.,1 497 cm3 • 184 CP,Km65 815 kmCombustibilHibridAnul fabricației2019,Bucuresti (Bucuresti - Ilfov),32 500
32768,Volkswagen ARTEON 2.0 TDI DSG 4Motion Elegance,1 968 cm3 • 190 CP • Primul propietar/ Garanti...,Km40 500 kmCombustibilDieselAnul fabricației2019,Voluntari (Bucuresti - Ilfov),30 000
5882,BMW Seria 3 320d Touring xDrive Aut.,1 995 cm3 • 190 CP,Km127 768 kmCombustibilDieselAnul fabricației2...,Bucuresti (Bucuresti - Ilfov),27 300
26971,Skoda Octavia 1.6 TDI Active,1 598 cm3 • 115 CP • mașină de familie înmatri...,Km224 800 kmCombustibilDieselAnul fabricației2...,Craiova (Dolj),6 450
18118,Mercedes-Benz A,1 332 cm3 • 163 CP • Mercedes-Benz A 200,Km18 000 kmCombustibilBenzinaAnul fabricației2...,Ramnicu Valcea (Valcea),44 327
17127,Mercedes-Benz GLC 220 d 4MATIC,1 950 cm3 • 194 CP • GLC 220d 4Matic - BH 15626,Km34 050 kmCombustibilDieselAnul fabricației2020,Oradea (Bihor),43 000
15217,Land Rover Range Rover Evoque,2 179 cm3 • 150 CP • Range Rover Evoque Pure E...,Km44 364 kmCombustibilDieselAnul fabricației2014,Bucuresti (Bucuresti - Ilfov),18 499
28362,Skoda Octavia Combi Diesel 1.6 TDI Ambition,1 598 cm3 • 116 CP • rar efectuat,Km84 705 kmCombustibilDieselAnul fabricației2018,Bucuresti (Bucuresti - Ilfov),13 290


In [4]:
# Describe 
df.describe(include="all")

Unnamed: 0,first,second,third,fourth,fifth
count,36292,36292,36292,36292,36292
unique,12208,29956,22705,771,6595
top,Volkswagen Passat,1 968 cm3 • 150 CP,Km1 kmCombustibilBenzinaAnul fabricației2023,Bucuresti (Bucuresti - Ilfov),12 990
freq,235,230,245,8446,183


In [5]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36292 entries, 0 to 36291
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   first   36292 non-null  object
 1   second  36292 non-null  object
 2   third   36292 non-null  object
 3   fourth  36292 non-null  object
 4   fifth   36292 non-null  object
dtypes: object(5)
memory usage: 1.4+ MB


### 4.2. Data processing

In [6]:
# Define variables for data processing
double_name_manufacturer = ["alfa", "aston", "land"]
double_name_model = {"bentley": "flying", "bmw": "seria", "land": "range", "lexus": "seria", "tesla": "model", "toyota": "land", "volvo": "xc"}

In [7]:
# Define function for data processing
def remove_char(raw_data: str, char_to_remove: str) -> str:
    """
    Remove specific character or string from given string.
    """
    return raw_data.replace(char_to_remove, "")


def extract_data(raw_data: str, spliter: str, first_word: int, last_word: int) -> str:
    """
    Extract part of the string based on spliter element, start and stop words position.
    """
    split_elements = raw_data.split(spliter)
    chosen_elements = split_elements[first_word - 1:last_word]
    return " ".join(chosen_elements)


def extract_specific_data(raw_data: str, spliter: str, check_char: str, check_type: str) -> str:
    """
    Extract specific part of the string based on spliter element, flag word and it position.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    for element in split_elements:
        if (check_type == "start") and (element.startswith(check_char)):
            return element
        elif (check_type == "end") and (element.endswith(check_char)):
            return element
    else:
        return "0"

        
def indentify_manufacturer(raw_data: str, spliter: str, excepted: list) -> int:
    """
    Identify how many words contains manufacturer name and return it index.
    """ 
    split_elements = raw_data.split(spliter)
    if split_elements[0].lower() in excepted:
        return 2
    else:
        return 1


def extract_manufacturer(raw_data: str, spliter: str, excepted: list) -> str:
    """
    Extract manufacturer name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_manufacturer(raw_data, spliter, excepted)
    chosen_elements = split_elements[:index]
    return " ".join(chosen_elements)


def indentify_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> tuple:
    """
    Identify car model start index and final index.
    """
    raw_data = raw_data.lower()
    split_elements = raw_data.split(spliter)
    manufact_key = split_elements[0].lower()
    if (manufact_key in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (2, 4)
        elif model_value not in split_elements:
            return (2, 3)
    elif (manufact_key not in except_manufact) and (manufact_key in except_model):
        model_value = except_model[manufact_key]
        if model_value in split_elements:
            return (1, 3) 
        elif model_value not in split_elements:
            return (1, 2)
    elif (manufact_key in except_manufact) and (manufact_key not in except_model):
        return (2, 3)
    else:
        return (1, 2)


def extract_model(raw_data: str, spliter: str, except_manufact: list, except_model: dict) -> str:
    """
    Extract model name based on spliter element and index.
    """
    split_elements = raw_data.split(spliter)
    index = indentify_model(raw_data, spliter, except_manufact, except_model)
    chosen_elements = split_elements[index[0]:index[1]]
    return " ".join(chosen_elements)

In [8]:
# Create Manufacturer column
df["manufacturer"] = df["first"].apply(extract_manufacturer, args=[" ", double_name_manufacturer])

In [9]:
# Create Model column
df["model"] = df["first"].apply(extract_model, args=[" ", double_name_manufacturer, double_name_model])

In [10]:
# Create Mileage column
df["mileage"] = df["third"].apply(extract_specific_data, args=[" km", "km", "start"])
df["mileage"] = df["mileage"].apply(remove_char, args=["km"])
df["mileage"] = df["mileage"].apply(remove_char, args=[" "])

In [11]:
# Create Capacity column
df["capacity"] = df["second"].apply(extract_specific_data, args=[" • ", "cm3", "end"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" cm3"])
df["capacity"] = df["capacity"].apply(remove_char, args=[" "])

In [12]:
# Create Power column
df["power"] = df["second"].apply(extract_specific_data, args=[" • ", "cp", "end"])
df["power"] = df["power"].apply(remove_char, args=[" cp"])
df["power"] = df["power"].apply(remove_char, args=[" "])

In [13]:
# Create Year column
df["year"] = df["third"].apply(extract_specific_data, args=[" ", "fabricației", "start"])
df["year"] = df["year"].apply(remove_char, args=["fabricației"])

In [14]:
# Create Fuel column
df["fuel"] = df["third"].apply(extract_specific_data, args=[" ", "anul", "end"])
df["fuel"] = df["fuel"].apply(remove_char, args=["combustibil"])
df["fuel"] = df["fuel"].apply(remove_char, args=["km"])
df["fuel"] = df["fuel"].apply(remove_char, args=["anul"])
df["fuel"] = df["fuel"].apply(lambda x: x.capitalize())

In [15]:
# Create County column
df["county"] = df["fourth"].apply(extract_specific_data, args=[" ", "(", "start"])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [16]:
# Create City column
df["city"] = df["fourth"].apply(extract_data, args=[" ", 1, 1])
df["county"] = df["county"].apply(remove_char, args=["("])
df["county"] = df["county"].apply(remove_char, args=[")"])
df["county"] = df["county"].apply(lambda x: x.capitalize())

In [17]:
# Create Price column
df["price"] = df["fifth"].apply(remove_char, args=[" "])

In [18]:
# Delete source columns
df.drop(columns=["first", "second", "third", "fourth", "fifth"], inplace=True)

In [19]:
# Change data types
data_types = {"mileage": int,
              "capacity": int,
              "power": int,
              "year": int,
              "price": int}

df = df.astype(data_types)

In [20]:
# Save processed dataframe to CSV
df.to_csv("../datasets/ProcessedDataDB-24-01-10.csv")

### 4.3. Final analysis

In [21]:
# Display a sample of data
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
31325,Volkswagen,Golf,317000,1390,80,2008,Benzina,Satu,Satu,3299
4421,BMW,Seria 7,210145,2993,313,2014,Diesel,Bucuresti,Bucuresti,19850
14602,Kia,Rio,5,1197,0,2022,Benzina,Timis,Timisoara,17938
23341,Peugeot,308,233280,1560,99,2016,Diesel,Timis,Timisoara,6000
7833,BMW,Seria 2,90000,1499,136,2020,Hibrid,Brasov,Brasov,22699
13328,Hyundai,i20,74000,1120,75,2016,Diesel,Iasi,Iasi,8200
22543,Opel,Corsa,144000,1248,90,2014,Diesel,Bucuresti,Bucuresti,4990
12379,Ford,Fiesta,65268,1498,75,2017,Diesel,Bucuresti,Voluntari,7350
6799,BMW,Seria 8,59000,2993,320,2021,Diesel,Sibiu,Sibiu,64999
502,Audi,A6,93548,1968,204,2019,Diesel,Cluj,Sannicoara,27490


In [22]:
# Describe 
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,capacity,power,year,fuel,county,city,price
count,36292,36292,36292.0,36292.0,36292.0,36292.0,36292,36292,36292,36292.0
unique,62,655,,,,,6,43,698,
top,BMW,Golf,,,,,Diesel,Bucuresti,Bucuresti,
freq,4640,1153,,,,,22545,12579,8446,
mean,,,140033.8,1927.118043,178.467293,2016.109969,,,,25946.095916
std,,,95974.23,751.149231,97.320633,5.092255,,,,33977.808656
min,,,0.0,0.0,0.0,1964.0,,,,100.0
25%,,,63244.75,1498.0,115.0,2013.0,,,,8700.0
50%,,,145000.0,1968.0,150.0,2017.0,,,,15500.0
75%,,,206864.0,1998.0,197.0,2020.0,,,,29893.75


In [23]:
# Display information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36292 entries, 0 to 36291
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   manufacturer  36292 non-null  object
 1   model         36292 non-null  object
 2   mileage       36292 non-null  int32 
 3   capacity      36292 non-null  int32 
 4   power         36292 non-null  int32 
 5   year          36292 non-null  int32 
 6   fuel          36292 non-null  object
 7   county        36292 non-null  object
 8   city          36292 non-null  object
 9   price         36292 non-null  int32 
dtypes: int32(5), object(5)
memory usage: 2.1+ MB
