# EvaCar

In [542]:
# Import necessary libraries, packages, and modules
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Business understanding

### 1.1. Initial analysis

In [543]:
# 1. Read the raw CSV export into the working dataframe
RAW_DATA_PATH = "../datasets/RawDataDB.csv"
df = pd.read_csv(RAW_DATA_PATH)

In [544]:
# 2. Display a random sample of 10 rows (no random_state is set, so the rows differ on every run)
df.sample(10)

Unnamed: 0,manufacturer,model,mileage,year,fuel,price,currency,city,county,status,short_description,long_description
1611,Audi,A4,237 000 km,2016,Diesel,15 150,EUR,Tulcea,(Tulcea),Reactualizat,2.0 TDI S tronic,1 968 cm3 • 190 CP • AUDI A4 B9 2.0TDI ULTRA S...
12201,Ford,Focus,136 681 km,2018,Diesel,9 850,EUR,Bucuresti,(Bucuresti,Publicat,[],1 499 cm3 • 95 CP
10323,Dacia,Logan,213 503 km,2012,Benzina,3 500,EUR,Pitesti,(Arges),Publicat,MCV 1.6 Laureate,1 598 cm3 • 87 CP • Logan MCV 1.6 MPI 87 CP 7 ...
14385,Jaguar,E-Pace,104 200 km,2018,Diesel,26 990,EUR,Sannicoara,(Cluj),Publicat,D180 AWD First Edition,1 999 cm3 • 180 CP
22838,Opel,Astra,63 800 km,2022,Diesel,13 999,EUR,Bucuresti,(Bucuresti,Reactualizat,1.5 D Start/Stop Sports Tourer Automatik Elegance,1 496 cm3 • 122 CP • Opel Astra 1.5Cdti // Vin...
15501,Land Rover,Range,138 739 km,2019,Diesel,72 590,EUR,Pitesti,(Arges),Publicat,Rover 3.0L SDV6 Autobiography,2 993 cm3 • 275 CP • Land Rover Range Rover SW...
34632,Volkswagen,Passat,155 000 km,2012,Diesel,13 990,EUR,Cleja,(Bacau),Publicat,CC 2.0 TDI BlueMotion Technology DSG,1 968 cm3 • 140 CP
36338,Volvo,S40 2.0,260 000 km,2006,Diesel,2 200,EUR,Bucuresti,(Bucuresti,Publicat,D Basic,1 997 cm3 • 136 CP • Volvo S40/2.0 Diesel/2200€
35771,Volvo,XC 60,259 000 km,2013,Diesel,14 990,EUR,Cleja,(Bacau),Publicat,D3 AWD Aut. R Design,2 400 cm3 • 163 CP
5484,BMW,M4 Competition,14 488 km,2022,Benzina,80 974,EUR,Bucuresti,(Bucuresti,Reactualizat,AT,2 993 cm3 • 510 CP • BMW M4 Competition 19''/2...


In [545]:
# 3. Summarise every column (include="all" also covers the object columns: count, unique, top, freq)
df.describe(include="all")

Unnamed: 0,manufacturer,model,mileage,year,fuel,price,currency,city,county,status,short_description,long_description
count,36534,36534,36534,36534,36534,36534,36534,36534,36533,36534,36506,36534
unique,64,1078,12842,1250,396,6862,2,686,126,2,9676,30012
top,Mercedes-Benz,Passat,Diesel,2019,Diesel,7 990,EUR,Bucuresti,(Bucuresti,Publicat,[],1 968 cm3 • 150 CP
freq,4857,1144,1064,4031,21444,178,36499,8696,12848,20101,9130,216


In [546]:
# 4. Display column names, non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36534 entries, 0 to 36533
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   manufacturer       36534 non-null  object
 1   model              36534 non-null  object
 2   mileage            36534 non-null  object
 3   year               36534 non-null  object
 4   fuel               36534 non-null  object
 5   price              36534 non-null  object
 6   currency           36534 non-null  object
 7   city               36534 non-null  object
 8   county             36533 non-null  object
 9   status             36534 non-null  object
 10  short_description  36506 non-null  object
 11  long_description   36534 non-null  object
dtypes: object(12)
memory usage: 3.3+ MB


### 1.2. Process data

In [547]:
# Extract data based on specific element
def extract_data(raw_data: str, splitter: str, specific_element: str) -> str:
    """
    Return the value part of the first segment of ``raw_data`` that ends with a marker.

    ``raw_data`` is split on ``splitter``. The first segment whose text ends with
    ``specific_element`` is selected, its last space-separated token (the marker
    itself) is discarded, and the remaining tokens are joined without spaces.

    Example:
        ``extract_data("1 968 cm3 • 190 CP", " • ", "cm3")`` returns ``"1968"``.

    Args:
        raw_data: Text to search, e.g. the content of one dataframe cell.
        splitter: Separator between the segments of ``raw_data``.
        specific_element: Suffix identifying the wanted segment (e.g. ``"cm3"``).

    Returns:
        The matched segment without its trailing token and without spaces, or
        ``""`` when no segment ends with ``specific_element`` or when
        ``raw_data`` is not a string.
    """
    # A missing cell is handed to .apply() as NaN (a float); treat it as "no match"
    if not isinstance(raw_data, str):
        return ""

    for segment in raw_data.split(splitter):
        # Return on the first hit. The previous version kept looping and reset the
        # result to "" for every later non-matching segment, so a match survived
        # only when it happened to be the very last segment.
        if segment.endswith(specific_element):
            return "".join(segment.split(" ")[:-1])

    return ""


# Remove specific parts
def remove_elements(raw_data: str, start_with: str, end_with: str) -> str:
    """
    Strip one leading ``start_with`` and one trailing ``end_with`` from ``raw_data``.

    Each marker is removed only when the string actually starts / ends with it;
    occurrences elsewhere in the string are left untouched.

    Example:
        ``remove_elements("(Arges)", "(", ")")`` returns ``"Arges"``.

    Args:
        raw_data: Text to clean.
        start_with: Prefix to remove if present (an empty string removes nothing).
        end_with: Suffix to remove if present (an empty string removes nothing).

    Returns:
        ``raw_data`` without the matching prefix and/or suffix.
    """
    # Strings are immutable, so the stripped value must be rebound. The previous
    # version called str.replace() and discarded the result, returning the input
    # unchanged (and replace() would also have hit occurrences in the middle).
    if start_with and raw_data.startswith(start_with):
        raw_data = raw_data[len(start_with):]
    # The emptiness guard matters here: raw_data[:-0] would yield "".
    if end_with and raw_data.endswith(end_with):
        raw_data = raw_data[:-len(end_with)]

    return raw_data


# Remove spaces
def remove_space(raw_data: str) -> str:
    """
    Delete every space character (" ") from the given string.

    Only the plain space is removed; other whitespace such as tabs or
    newlines is kept as it is.
    """
    return raw_data.replace(" ", "")

In [548]:
# Build the capacity column from the long_description segment ending in "cm3"
capacity = df["long_description"].apply(extract_data, args=(" • ", "cm3"))
df.insert(3, "capacity", capacity)

In [549]:
# Build the power column from the long_description segment ending in "CP"
power = df["long_description"].apply(extract_data, args=(" • ", "CP"))
df.insert(4, "power", power)

In [550]:
# Drop rows whose mileage cell holds one of the fuel labels instead of a distance
fuel_labels_in_mileage = [
    "Benzina",
    "Diesel",
    "Hibrid",
    "Electric",
    "Benzina + GPL",
    "Benzina + CNG",
]
misplaced_rows = df.index[df["mileage"].isin(fuel_labels_in_mileage)]
df = df.drop(index=misplaced_rows)

In [551]:
# Remove the free-text description columns; capacity and power were already
# derived from long_description
description_columns = ['short_description', 'long_description']
df = df.drop(columns=description_columns)

In [552]:
# Reduce mileage to its digits: drop the trailing "km" token and the inner spaces
# (cells that do not end in "km" become "")
mileage_digits = df["mileage"].apply(extract_data, args=(" • ", "km"))
df["mileage"] = mileage_digits

In [553]:
# Strip the spaces used as thousands separators in price (e.g. "15 150" -> "15150")
price_without_spaces = df["price"].apply(remove_space)
df["price"] = price_without_spaces

In [554]:
# Drop rows for which no mileage value could be extracted
rows_without_mileage = df.index[df["mileage"] == '']
df = df.drop(index=rows_without_mileage)

In [555]:
# Drop rows where capacity or power could not be extracted. Both columns are cast
# to int further down, and an empty string in either one makes astype(int) raise
# ValueError, so filtering on capacity alone is not enough.
df = df.drop(df[(df.capacity == '') | (df.power == '')].index)

In [None]:
# Inspect the dataframe after the row filters above and before the integer casts below
df

In [None]:
# Cast the cleaned digit-string columns to integers
integer_columns = ["mileage", "capacity", "power", "year", "price"]
data_types = {column: int for column in integer_columns}

df = df.astype(data_types)

### 1.3. Final analysis

In [None]:
# 1. Display a random sample of 10 rows of the processed dataframe (rows differ on every run)
df.sample(10)