In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Data Analysis

Firstly, we have to load the csv files that we created before.

In [2]:
cars_df = pd.read_csv("../data/cars_data.csv", sep=";")

Now, we can see what is their shape and then, showing a few rows in order to watch the appearance of each dataframe.

In [3]:
np.shape(cars_df)

(103600, 36)

In [4]:
cars_df.head()

Unnamed: 0,Title,Brand,City,Price,Year,Kms,Fuel,Type of Gears,Seller,Guarantee,...,Autonomy,CO2 Emissions,Output,Cubic Capacity,Cylinders,Max Par,Gears,Transmission,Traction,Url
0,Citroen Jumper 2.2hdi Cl 33 L2h2 150,Citroen,Barcelona,11.529€,07/2014,105.000 km,Diesel,Manual,Profesional,Sí,...,0.0,199 gr/m3,150 cv,2.198 cm3,4 en línea,350 Nm,6,Manual,Delantera,https://www.coches.com/coches-segunda-mano/oca...
1,Volkswagen Passat 2.0tdi Highline Bmt,Volkswagen,Madrid,11.900€,2014,144.000 km,Diesel,Manual,Profesional,Sí,...,0.0,119 gr/m3,140 cv,1.968 cm3,4 en línea,320 Nm,6,Manual,Delantera,https://www.coches.com/coches-segunda-mano/oca...
2,Mercedes Clase Clk Clk 200 K,Mercedes,Madrid,5.400€,2004,160.000 km,Gasolina,Manual,Profesional,Sí,...,0.0,0 gr/m3,163 cv,1.796 cm3,4 en línea,240 Nm,6,Manual,Trasera,https://www.coches.com/coches-segunda-mano/oca...
3,Seat León 2.0tdi Cr S&s Fr 150,Seat,Madrid,15.390€,2016,19.090 km,Diesel,Manual,Profesional,No,...,0.0,112 gr/m3,150 cv,1.968 cm3,4 en línea,340 Nm,6,Manual,Delantera,https://www.coches.com/coches-segunda-mano/oca...
4,Opel Zafira Tourer 2.0cdti Selective 130,Opel,Malaga,11.500€,2015,90.000 km,Diesel,Manual,Profesional,Sí,...,0.0,134 gr/m3,130 cv,1.956 cm3,4 en línea,300 Nm,6,Manual,Delantera,https://www.coches.com/coches-segunda-mano/oca...


The next step is cleaning the dataset and changing the types of each column.

# Data cleaning and transformation

The first step is looking for NA's.

In [5]:
cars_df.columns

Index(['Title', 'Brand', 'City', 'Price', 'Year', 'Kms', 'Fuel',
       'Type of Gears', 'Seller', 'Guarantee', 'Colour', 'Boot Capacity',
       'Length', 'Height', 'Width', 'Doors', 'Vacancies', 'Tank', 'Weight',
       'Max Weight', 'Car Body', 'Max Speed', 'Comb Fuel', 'Urban Use',
       'Extraurban Use', 'Aceleration', 'Autonomy', 'CO2 Emissions', 'Output',
       'Cubic Capacity', 'Cylinders', 'Max Par', 'Gears', 'Transmission',
       'Traction', 'Url'],
      dtype='object')

In [6]:
cars_df.apply(lambda x: x.isnull().sum())

Title                 0
Brand                 0
City                  0
Price                 0
Year                  0
Kms                   0
Fuel                  0
Type of Gears        11
Seller                0
Guarantee             0
Colour            13661
Boot Capacity         0
Length                0
Height                0
Width                 0
Doors                 0
Vacancies             0
Tank                  0
Weight                0
Max Weight            0
Car Body              0
Max Speed             0
Comb Fuel             0
Urban Use             0
Extraurban Use        0
Aceleration           0
Autonomy              2
CO2 Emissions         0
Output                0
Cubic Capacity        0
Cylinders             0
Max Par               0
Gears                 0
Transmission         11
Traction              0
Url                   0
dtype: int64

We can see that there are some NaN values in our DataFrame.
The next step is deciding what are we going to do with them.

The colour is not very important for our model, so we drop this column.

In [7]:
cars_df = cars_df.drop("Colour", axis=1)

In [8]:
cars_df.columns

Index(['Title', 'Brand', 'City', 'Price', 'Year', 'Kms', 'Fuel',
       'Type of Gears', 'Seller', 'Guarantee', 'Boot Capacity', 'Length',
       'Height', 'Width', 'Doors', 'Vacancies', 'Tank', 'Weight', 'Max Weight',
       'Car Body', 'Max Speed', 'Comb Fuel', 'Urban Use', 'Extraurban Use',
       'Aceleration', 'Autonomy', 'CO2 Emissions', 'Output', 'Cubic Capacity',
       'Cylinders', 'Max Par', 'Gears', 'Transmission', 'Traction', 'Url'],
      dtype='object')

Now we are going to watch what values contains the 'Autonomy' column.

In [9]:
cars_df["Autonomy"].head(10)

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
5    0.0
6    0.0
7    0.0
8    0.0
9    0.0
Name: Autonomy, dtype: float64

Because of all the values are equal to zero, we are going to drop this column too.

In [10]:
cars_df = cars_df.drop("Autonomy", axis=1)

The next step is deciding what are we going to do with the rest of NaN's. 

In [11]:
cars_df.apply(lambda x: x.isnull().sum())

Title              0
Brand              0
City               0
Price              0
Year               0
Kms                0
Fuel               0
Type of Gears     11
Seller             0
Guarantee          0
Boot Capacity      0
Length             0
Height             0
Width              0
Doors              0
Vacancies          0
Tank               0
Weight             0
Max Weight         0
Car Body           0
Max Speed          0
Comb Fuel          0
Urban Use          0
Extraurban Use     0
Aceleration        0
CO2 Emissions      0
Output             0
Cubic Capacity     0
Cylinders          0
Max Par            0
Gears              0
Transmission      11
Traction           0
Url                0
dtype: int64

It looks that the two columns which contain NaN's contains the same information. Firstly, we drop one of them and then, what we are going to do is removing the raws which contain the NaN's.

In [12]:
cars_df = cars_df.drop("Type of Gears", axis=1)

In [13]:
array_nans = cars_df["Transmission"].isnull()

for i,val in enumerate(array_nans):
    if val == True:
        cars_df = cars_df.drop(i)

In [14]:
cars_df.apply(lambda x: x.isnull().sum())

Title             0
Brand             0
City              0
Price             0
Year              0
Kms               0
Fuel              0
Seller            0
Guarantee         0
Boot Capacity     0
Length            0
Height            0
Width             0
Doors             0
Vacancies         0
Tank              0
Weight            0
Max Weight        0
Car Body          0
Max Speed         0
Comb Fuel         0
Urban Use         0
Extraurban Use    0
Aceleration       0
CO2 Emissions     0
Output            0
Cubic Capacity    0
Cylinders         0
Max Par           0
Gears             0
Transmission      0
Traction          0
Url               0
dtype: int64

We have drop all the NaN's!!

Now we are going to find duplicates and then delete them.

In [15]:
#Counting the number of duplicates
cars_df["is_duplicated"] = cars_df.duplicated()
number_of_duplicates = 0

for var in cars_df["is_duplicated"]:
    if var == True:
        number_of_duplicates += 1

print("There are %d duplicated advertisements" %number_of_duplicates)

There are 20628 duplicated advertisements


In [16]:
#Deleting the duplicates and the additional column created before
cars_df = cars_df[cars_df["is_duplicated"] == False]
cars_df = cars_df.drop("is_duplicated", axis=1)
np.shape(cars_df)

(82961, 33)

The next step is changing the types of some columns.

In [17]:
cars_df.columns

Index(['Title', 'Brand', 'City', 'Price', 'Year', 'Kms', 'Fuel', 'Seller',
       'Guarantee', 'Boot Capacity', 'Length', 'Height', 'Width', 'Doors',
       'Vacancies', 'Tank', 'Weight', 'Max Weight', 'Car Body', 'Max Speed',
       'Comb Fuel', 'Urban Use', 'Extraurban Use', 'Aceleration',
       'CO2 Emissions', 'Output', 'Cubic Capacity', 'Cylinders', 'Max Par',
       'Gears', 'Transmission', 'Traction', 'Url'],
      dtype='object')

We start watching what is the type of each of them.

In [18]:
for col in cars_df.columns:
    print(cars_df[col].describe(),"\n\n")

count                82961
unique               18300
top       Bmw Serie 1 116d
freq                   511
Name: Title, dtype: object 


count     82961
unique       71
top         Bmw
freq       7974
Name: Brand, dtype: object 


count       82961
unique         52
top       Madrid 
freq        19646
Name: City, dtype: object 


count      82961
unique      4106
top       9.900€
freq         920
Name: Price, dtype: object 


count       82961
unique        478
top       01/2017
freq         2108
Name: Year, dtype: object 


count     82961
unique    24267
top        0 km
freq       1305
Name: Kms, dtype: object 


count      82961
unique         4
top       Diesel
freq       60392
Name: Fuel, dtype: object 


count           82961
unique              2
top       Profesional
freq            63979
Name: Seller, dtype: object 


count     82961
unique        2
top          No
freq      48592
Name: Guarantee, dtype: object 


count     82961
unique      456
top         0 l
freq      1652

Once we have seen the previous information, we have to change the type of some columns and maybe create new columns and drop old columns.

We start creating a new columns called 'Price (€)' which contains integers, and then removing the old column 'Price'.

In [19]:
cars_df["Price (€)"] = cars_df["Price"].apply(lambda x: float(x.split("€")[0].replace(".", "")))
cars_df = cars_df.drop("Price", axis=1)
cars_df["Price (€)"].head()

0    11529.0
1    11900.0
2     5400.0
3    15390.0
4    11500.0
Name: Price (€), dtype: float64

Now we must be focused in the next column.

In [20]:
cars_df["Year"].head(10)

0    07/2014
1       2014
2       2004
3       2016
4       2015
5       2017
6       2018
7    05/2014
8    03/2012
9    06/2016
Name: Year, dtype: object

We can see that the format is not the same in the rows.

In [21]:
cars_df["Year"] = cars_df["Year"].apply(lambda x: x.split("/"))

In [22]:
years = []

for val in cars_df["Year"]:
    if len(val) == 1: 
        years.append(val[0])
    else:
        years.append(val[1])

In [23]:
cars_df["Year"] = pd.to_numeric(years)

Kms column

In [24]:
cars_df["Kms"].head()

0    105.000 km
1    144.000 km
2    160.000 km
3     19.090 km
4     90.000 km
Name: Kms, dtype: object

In [25]:
cars_df["Kms"] = cars_df["Kms"].apply(lambda x: x.split()[0])
cars_df["Kms"] = pd.to_numeric(cars_df["Kms"].apply(lambda x: x.replace(".", "")))

In [26]:
cars_df["Kms"].head(10)

0    105000
1    144000
2    160000
3     19090
4     90000
5     55000
6        25
7     30000
8    114950
9     23000
Name: Kms, dtype: int64

Boot Capacity column

In [27]:
cars_df["Boot Capacity"].head()

0      0 l
1    565 l
2      0 l
3    380 l
4    152 l
Name: Boot Capacity, dtype: object

In [28]:
cars_df["Boot Capacity (l)"] = pd.to_numeric(cars_df["Boot Capacity"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Boot Capacity", axis=1)

In [29]:
cars_df.columns

Index(['Title', 'Brand', 'City', 'Year', 'Kms', 'Fuel', 'Seller', 'Guarantee',
       'Length', 'Height', 'Width', 'Doors', 'Vacancies', 'Tank', 'Weight',
       'Max Weight', 'Car Body', 'Max Speed', 'Comb Fuel', 'Urban Use',
       'Extraurban Use', 'Aceleration', 'CO2 Emissions', 'Output',
       'Cubic Capacity', 'Cylinders', 'Max Par', 'Gears', 'Transmission',
       'Traction', 'Url', 'Price (€)', 'Boot Capacity (l)'],
      dtype='object')

Lenght column

In [30]:
cars_df["Length"].head()

0    541 cm
1    477 cm
2    464 cm
3    428 cm
4    466 cm
Name: Length, dtype: object

In [31]:
cars_df["Length (cm)"] = pd.to_numeric(cars_df["Length"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Length", axis=1)

Height column

In [32]:
cars_df["Height"].head()

0    252 cm
1    147 cm
2    141 cm
3    146 cm
4    169 cm
Name: Height, dtype: object

In [33]:
cars_df["Height (cm)"] = pd.to_numeric(cars_df["Height"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Height", axis=1)

Width column

In [34]:
cars_df["Width"].head()

0    205 cm
1    182 cm
2    174 cm
3    182 cm
4    188 cm
Name: Width, dtype: object

In [35]:
cars_df["Width (cm)"] = pd.to_numeric(cars_df["Width"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Width", axis=1)

In [36]:
cars_df.columns

Index(['Title', 'Brand', 'City', 'Year', 'Kms', 'Fuel', 'Seller', 'Guarantee',
       'Doors', 'Vacancies', 'Tank', 'Weight', 'Max Weight', 'Car Body',
       'Max Speed', 'Comb Fuel', 'Urban Use', 'Extraurban Use', 'Aceleration',
       'CO2 Emissions', 'Output', 'Cubic Capacity', 'Cylinders', 'Max Par',
       'Gears', 'Transmission', 'Traction', 'Url', 'Price (€)',
       'Boot Capacity (l)', 'Length (cm)', 'Height (cm)', 'Width (cm)'],
      dtype='object')

Tank column

In [37]:
cars_df["Tank"].head()

0    90 l
1    70 l
2    62 l
3    50 l
4    58 l
Name: Tank, dtype: object

I have created the next function because I could appreciate that the value of some fields was equal to 'l' and I could not parse the value to int.

In [38]:
def change_value(serie):
    result = []
    for val in serie:
        if len(val) == 1:
            result.append(0)
        else:
            result.append(pd.to_numeric(val[0]))
    return result

In [39]:
cars_df["Tank (l)"] = cars_df["Tank"].apply(lambda x: x.split())
cars_df["Tank (l)"] = change_value(cars_df["Tank (l)"])

cars_df = cars_df.drop("Tank", axis=1)

Weight column

In [40]:
cars_df["Weight (kg)"] = pd.to_numeric(cars_df["Weight"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Weight", axis=1)

Max Weight column

In [41]:
cars_df["Max Weight"].head()

0    3.300 kg
1    2.100 kg
2    1.960 kg
3    1.820 kg
4    2.390 kg
Name: Max Weight, dtype: object

In [42]:
cars_df["Max Weight (kg)"] = pd.to_numeric(cars_df["Max Weight"].apply(lambda x: x.split()[0]))
cars_df = cars_df.drop("Max Weight", axis=1)

In [43]:
cars_df["Max Weight (kg)"].head()

0    3.30
1    2.10
2    1.96
3    1.82
4    2.39
Name: Max Weight (kg), dtype: float64

Max Speed column

In [44]:
cars_df["Max Speed"].head()

0      0 km/h
1    211 km/h
2      0 km/h
3    215 km/h
4    191 km/h
Name: Max Speed, dtype: object

In [45]:
cars_df["Max Speed (km/h)"] = cars_df["Max Speed"].apply(lambda x: x.split())
cars_df["Max Speed (km/h)"] = change_value(cars_df["Max Speed (km/h)"])

cars_df = cars_df.drop("Max Speed", axis=1)

In [46]:
cars_df["Max Speed (km/h)"].head()

0      0
1    211
2      0
3    215
4    191
Name: Max Speed (km/h), dtype: int64

Comb Fuel column

In [47]:
cars_df["Comb Fuel"].head()

0     7,2 l
1     4,6 l
2    12,2 l
3     4,4 l
4     5,1 l
Name: Comb Fuel, dtype: object

In [48]:
cars_df["Comb Fuel (l)"] = cars_df["Comb Fuel"].apply(lambda x: x.replace(",",".").split())
cars_df["Comb Fuel (l)"] = change_value(cars_df["Comb Fuel (l)"])

cars_df = cars_df.drop("Comb Fuel", axis=1)

In [49]:
cars_df["Comb Fuel (l)"].head()

0     7.2
1     4.6
2    12.2
3     4.4
4     5.1
Name: Comb Fuel (l), dtype: float64

Urban Use column

In [50]:
cars_df["Urban Use"].head()

0     8,8 l
1     6,0 l
2    15,9 l
3     5,3 l
4     6,2 l
Name: Urban Use, dtype: object

In [51]:
cars_df["Urban Use (l)"] = cars_df["Urban Use"].apply(lambda x: x.replace(",",".").split())
cars_df["Urban Use (l)"] = change_value(cars_df["Urban Use (l)"])

cars_df = cars_df.drop("Urban Use", axis=1)

In [52]:
cars_df["Urban Use (l)"].head()

0     8.8
1     6.0
2    15.9
3     5.3
4     6.2
Name: Urban Use (l), dtype: float64

Extraurban Use column

In [53]:
cars_df["Extraurban Use"].head()

0    6,3 l
1    3,7 l
2    8,6 l
3    3,9 l
4    4,5 l
Name: Extraurban Use, dtype: object

In [54]:
cars_df["Extraurban Use (l)"] = cars_df["Extraurban Use"].apply(lambda x: x.replace(",",".").split())
cars_df["Extraurban Use (l)"] = change_value(cars_df["Extraurban Use (l)"])

cars_df = cars_df.drop("Extraurban Use", axis=1)

In [55]:
cars_df["Extraurban Use (l)"].head()

0    6.3
1    3.7
2    8.6
3    3.9
4    4.5
Name: Extraurban Use (l), dtype: float64

Aceleration column

In [56]:
cars_df["Aceleration"].head()

0     0,0 s
1     9,8 s
2     9,3 s
3     8,4 s
4    11,4 s
Name: Aceleration, dtype: object

In [57]:
cars_df["Aceleration 0-100 (s)"] = cars_df["Aceleration"].apply(lambda x: x.replace(",",".").split())
cars_df["Aceleration 0-100 (s)"] = change_value(cars_df["Aceleration 0-100 (s)"])

cars_df = cars_df.drop("Aceleration", axis=1)

In [58]:
cars_df["Aceleration 0-100 (s)"].head()

0     0.0
1     9.8
2     9.3
3     8.4
4    11.4
Name: Aceleration 0-100 (s), dtype: float64

CO2 Emissions column

In [59]:
cars_df["CO2 Emissions"].head()

0    199 gr/m3
1    119 gr/m3
2      0 gr/m3
3    112 gr/m3
4    134 gr/m3
Name: CO2 Emissions, dtype: object

In [60]:
cars_df["CO2 Emissions (gr/m3)"] = cars_df["CO2 Emissions"].apply(lambda x: x.split())
cars_df["CO2 Emissions (gr/m3)"] = change_value(cars_df["CO2 Emissions (gr/m3)"])

cars_df = cars_df.drop("CO2 Emissions", axis=1)

In [61]:
cars_df["CO2 Emissions (gr/m3)"].head()

0    199
1    119
2      0
3    112
4    134
Name: CO2 Emissions (gr/m3), dtype: int64

Output column

In [62]:
cars_df["Output"].head()

0    150 cv
1    140 cv
2    163 cv
3    150 cv
4    130 cv
Name: Output, dtype: object

In [63]:
cars_df["Output (cv)"] = pd.to_numeric(cars_df["Output"].apply(lambda x: x.split()[0]))

cars_df = cars_df.drop("Output", axis=1)

In [64]:
cars_df["Output (cv)"].head()

0    150
1    140
2    163
3    150
4    130
Name: Output (cv), dtype: int64

Cubic Capacity column

In [65]:
cars_df["Cubic Capacity"].head()

0    2.198 cm3
1    1.968 cm3
2    1.796 cm3
3    1.968 cm3
4    1.956 cm3
Name: Cubic Capacity, dtype: object

In [66]:
cars_df["Cubic Capacity (cm3)"] = pd.to_numeric(cars_df["Cubic Capacity"].apply(lambda x: x.replace(".", "").split()[0]))

cars_df = cars_df.drop("Cubic Capacity", axis=1)

In [67]:
cars_df["Cubic Capacity (cm3)"].head()

0    2198
1    1968
2    1796
3    1968
4    1956
Name: Cubic Capacity (cm3), dtype: int64

Max Par column

In [68]:
cars_df["Max Par"].head()

0    350 Nm
1    320 Nm
2    240 Nm
3    340 Nm
4    300 Nm
Name: Max Par, dtype: object

In [69]:
cars_df["Max Par (Nm)"] = pd.to_numeric(cars_df["Max Par"].apply(lambda x: x.split()[0]))

cars_df = cars_df.drop("Max Par", axis=1)

In [70]:
cars_df["Max Par (Nm)"].head()

0    350
1    320
2    240
3    340
4    300
Name: Max Par (Nm), dtype: int64

In [71]:
cars_df.describe()

Unnamed: 0,Year,Kms,Doors,Vacancies,Gears,Price (€),Boot Capacity (l),Length (cm),Height (cm),Width (cm),...,Max Weight (kg),Max Speed (km/h),Comb Fuel (l),Urban Use (l),Extraurban Use (l),Aceleration 0-100 (s),CO2 Emissions (gr/m3),Output (cv),Cubic Capacity (cm3),Max Par (Nm)
count,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,...,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0,82961.0
mean,2011.596075,117454.6,4.429648,4.888948,5.815914,17201.51,329.255229,431.954412,151.518485,179.222454,...,3.840019,169.157218,5.548082,6.926261,4.689823,9.988677,120.539133,140.570557,1895.721086,277.899447
std,5.729486,7456621.0,0.907018,0.851398,0.943719,24726.52,207.701891,75.237054,28.271092,13.414875,...,41.367538,72.009002,2.10784,3.106232,1.538672,3.527402,61.38473,71.620313,681.354202,121.641327
min,1952.0,0.0,0.0,1.0,1.0,500.0,0.0,0.0,0.0,0.0,...,1.015,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,0.0
25%,2008.0,19000.0,4.0,5.0,5.0,7490.0,190.0,419.0,144.0,175.0,...,1.755,168.0,4.3,5.1,3.8,8.6,103.0,100.0,1499.0,200.0
50%,2013.0,74369.0,5.0,5.0,6.0,12800.0,360.0,438.0,148.0,180.0,...,1.94,189.0,5.1,6.2,4.3,10.5,119.0,120.0,1798.0,270.0
75%,2016.0,140000.0,5.0,5.0,6.0,20300.0,480.0,466.0,163.0,184.0,...,2.15,208.0,6.3,7.9,5.3,12.0,148.0,150.0,1997.0,330.0
max,2018.0,2147484000.0,5.0,16.0,10.0,3259900.0,994.0,822.0,325.0,247.0,...,990.0,407.0,24.1,40.4,15.1,33.5,574.0,1001.0,8300.0,1250.0


Once we have modified the type of these columns, we add two new columns which contains the coordinates of every city.

In order to create those columns, we are going to use the geopy package. Probably we have to install this package before, so we must install it using the next command in our shell: 'pip install geopy'

In [72]:
from geopy.geocoders import Nominatim

In [73]:
#Now we create a DataFrame that contains the cities of our dataset which its coordinates.

locator = Nominatim()
cities = cars_df["City"].unique()

coordinates_per_city = []

for city in cities:
    location = locator.geocode([city])
    coordinates_per_city.append([city, location.latitude, location.longitude])
    
df_cities = pd.DataFrame(coordinates_per_city, columns=["City", "Latitude", "Longitude"])

In [74]:
df_cities.head()

Unnamed: 0,City,Latitude,Longitude
0,Barcelona,41.382894,2.177432
1,Madrid,40.416705,-3.703582
2,Malaga,36.721303,-4.421637
3,Castellon,40.251857,-0.061505
4,Murcia,37.992379,-1.130543


Finally, we merge this data frame with the other.

In [75]:
cars_with_coords_df = pd.merge(cars_df, df_cities, on = 'City')

In [76]:
cars_with_coords_df.head()

Unnamed: 0,Title,Brand,City,Year,Kms,Fuel,Seller,Guarantee,Doors,Vacancies,...,Comb Fuel (l),Urban Use (l),Extraurban Use (l),Aceleration 0-100 (s),CO2 Emissions (gr/m3),Output (cv),Cubic Capacity (cm3),Max Par (Nm),Latitude,Longitude
0,Citroen Jumper 2.2hdi Cl 33 L2h2 150,Citroen,Barcelona,2014,105000,Diesel,Profesional,Sí,4,9,...,7.2,8.8,6.3,0.0,199,150,2198,350,41.382894,2.177432
1,Nissan Juke 1.5dci Acenta 4x2,Nissan,Barcelona,2017,13059,Diesel,Profesional,Sí,5,5,...,4.0,4.6,3.7,11.2,104,110,1461,260,41.382894,2.177432
2,Citroen C4 Cactus 1.6 Bluehdi S&s Feel 100,Citroen,Barcelona,2015,13700,Diesel,Profesional,Sí,5,5,...,3.4,3.8,3.2,10.7,90,100,1560,254,41.382894,2.177432
3,Opel Vivaro Viv. Com6 2.0cdti 29 L2 Semi Acris...,Opel,Barcelona,2014,128000,Diesel,Profesional,Sí,4,6,...,7.9,9.8,7.0,0.0,202,114,1995,290,41.382894,2.177432
4,Peugeot Expert Combi 6 Mixto 2.0hdi,Peugeot,Barcelona,2006,215000,Diesel,Profesional,Sí,4,6,...,6.7,8.3,5.7,0.0,0,95,1997,215,41.382894,2.177432


The next step is adding two more columns to our data frame.

The first one contains the brands rating (score), and the other one contains the type rating (score).

We start adding the first column.

In [77]:
# Dataset loading
brands_rank = pd.read_csv("../data/brands_rank.csv", sep=";")

In [78]:
np.shape(brands_rank)

(54, 2)

In [79]:
brands_rank.head()

Unnamed: 0,brand,score
0,ferrari,1570
1,bmw,1365
2,mercedes-benz,1295
3,audi,1260
4,lamborghini,1190


In [80]:
brands_rank.tail()

Unnamed: 0,brand,score
49,vauxhall,95
50,loncin,55
51,austin rover,50
52,malaguti,20
53,lada,15


In [81]:
brands_rank.rename(columns={'brand': 'Brand', 'score': 'Score'}, inplace=True)

In [82]:
len(cars_with_coords_df["Brand"].unique())

71

In [83]:
len(brands_rank)

54

In [84]:
brands_rank["Brand"] = brands_rank["Brand"].apply(lambda b: b.capitalize())

In [85]:
brands_rank.head()

Unnamed: 0,Brand,Score
0,Ferrari,1570
1,Bmw,1365
2,Mercedes-benz,1295
3,Audi,1260
4,Lamborghini,1190


In [86]:
cars_with_coords_df = pd.merge(cars_with_coords_df, brands_rank, on="Brand", how="left")

In [87]:
cars_with_coords_df.head()

Unnamed: 0,Title,Brand,City,Year,Kms,Fuel,Seller,Guarantee,Doors,Vacancies,...,Urban Use (l),Extraurban Use (l),Aceleration 0-100 (s),CO2 Emissions (gr/m3),Output (cv),Cubic Capacity (cm3),Max Par (Nm),Latitude,Longitude,Score
0,Citroen Jumper 2.2hdi Cl 33 L2h2 150,Citroen,Barcelona,2014,105000,Diesel,Profesional,Sí,4,9,...,8.8,6.3,0.0,199,150,2198,350,41.382894,2.177432,
1,Nissan Juke 1.5dci Acenta 4x2,Nissan,Barcelona,2017,13059,Diesel,Profesional,Sí,5,5,...,4.6,3.7,11.2,104,110,1461,260,41.382894,2.177432,300.0
2,Citroen C4 Cactus 1.6 Bluehdi S&s Feel 100,Citroen,Barcelona,2015,13700,Diesel,Profesional,Sí,5,5,...,3.8,3.2,10.7,90,100,1560,254,41.382894,2.177432,
3,Opel Vivaro Viv. Com6 2.0cdti 29 L2 Semi Acris...,Opel,Barcelona,2014,128000,Diesel,Profesional,Sí,4,6,...,9.8,7.0,0.0,202,114,1995,290,41.382894,2.177432,400.0
4,Peugeot Expert Combi 6 Mixto 2.0hdi,Peugeot,Barcelona,2006,215000,Diesel,Profesional,Sí,4,6,...,8.3,5.7,0.0,0,95,1997,215,41.382894,2.177432,430.0


In [88]:
cars_with_coords_df.rename(columns={'Score': 'Brand Score'}, inplace=True)

In [89]:
cars_with_coords_df.columns

Index(['Title', 'Brand', 'City', 'Year', 'Kms', 'Fuel', 'Seller', 'Guarantee',
       'Doors', 'Vacancies', 'Car Body', 'Cylinders', 'Gears', 'Transmission',
       'Traction', 'Url', 'Price (€)', 'Boot Capacity (l)', 'Length (cm)',
       'Height (cm)', 'Width (cm)', 'Tank (l)', 'Weight (kg)',
       'Max Weight (kg)', 'Max Speed (km/h)', 'Comb Fuel (l)', 'Urban Use (l)',
       'Extraurban Use (l)', 'Aceleration 0-100 (s)', 'CO2 Emissions (gr/m3)',
       'Output (cv)', 'Cubic Capacity (cm3)', 'Max Par (Nm)', 'Latitude',
       'Longitude', 'Brand Score'],
      dtype='object')

Now, where appears a NaN value we put a zero.

In [96]:
new_column = []

for val in cars_with_coords_df["Brand Score"]:
    if type(val) == float:
        new_column.append(0)
    else:
        new_column.append(pd.to_numeric(val.replace(",","")))

cars_with_coords_df["Brand Score"] = new_column

In [97]:
cars_with_coords_df["Brand Score"]

0           0
1         300
2           0
3         400
4         430
5         400
6         300
7         400
8         400
9         565
10        400
11        330
12          0
13        630
14          0
15        565
16          0
17        430
18        345
19        630
20       1365
21          0
22        300
23        430
24       1365
25       1165
26       1365
27        300
28        430
29          0
         ... 
82931       0
82932       0
82933       0
82934       0
82935    1000
82936    1260
82937     220
82938     145
82939       0
82940     220
82941     890
82942       0
82943     295
82944    1260
82945    1365
82946    1365
82947     890
82948     840
82949     565
82950     430
82951     840
82952    1260
82953       0
82954     300
82955       0
82956     890
82957    1000
82958     565
82959     890
82960       0
Name: Brand Score, Length: 82961, dtype: int64

In [100]:
cars_with_coords_df["Car Body"].unique()

array(['Combi', 'Todo Terreno', 'Berlina', 'Furgon', 'Stationwagon',
       'Convertible', 'Coupe', 'Chasis', 'Monovolumen', 'Roadster',
       'Pick-Up Doble Cabina', 'Chasis Doble Cabina', 'Pick-Up', 'Targa',
       'Bus'], dtype=object)