In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found beneath the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Hi there! In this notebook I have made my first attempt at using lasso and ridge regression, alongside linear regression, in order to evaluate whether these methods could improve model performance. I have tried to omit some of the repetitive parts since I ended up building 8 different datasets, but the important parts are highlighted at the end.

# The goal here is to build a model that can use all given datasets together and still have a nice performance score

# Part 1: Data Wrangling

# A quick check shows that the files have different numbers of columns, and we need to select the common columns for all files 

In [None]:
import os
# Full path of every file under the Kaggle input directory, in the order
# os.walk yields them.
filename_list = [
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
]

In [None]:
# Read every discovered file as CSV and stack the frames row-wise into a
# single dataset (the original row labels of each file are kept).
data = pd.concat([pd.read_csv(csv_path) for csv_path in filename_list])

In [None]:
# Summarise dtype and non-null count for every column of the combined frame.
data.info()

# The huge amount of missing data in some columns has made it clear that using them would not be beneficial to the model, since there is no feasible way to fill in all of the missing data. Therefore, columns that are missing the vast majority of their values will be excluded from the model

For this dataset, the following will be excluded:
* engine size
* mileage2 
* fuel type2 
* engine size2 
* reference   

The "tax(£)" column will not be excluded as there is a similar "tax" column and I'll attempt to merge the two columns into one

In [None]:
# Discard the columns judged too sparse to be useful, then preview two rows.
sparse_columns = ["engine size", "mileage2", "fuel type2", "engine size2", "reference"]
data.drop(columns=sparse_columns, inplace=True)
data.head(2)

In [None]:
import math as m
# Coalesce the two tax columns into one: keep "tax" where it is present,
# otherwise fall back to "tax(£)"; rows missing both stay NaN.
# np.where works positionally, so the repeated row labels left over from
# concatenating the files cannot cause misalignment.
tax_final = np.where(data["tax"].isna(), data["tax(£)"], data["tax"])
# Drop both source columns first so the merged column is appended last.
data.drop(["tax", "tax(£)"], axis=1, inplace=True)
data["tax"] = tax_final

In [None]:
# Re-check dtypes and non-null counts now that the tax columns are merged.
data.info()

# One could try to fill in the missing values using proper interpolation techniques (for numerical values), but since the dataset is extensive compared to the number of features, the choice taken here will be to filter out entries that have any amount of null values

In [None]:
# Remove, in place, every row that has at least one missing value,
# then confirm the remaining row count and dtypes.
data.dropna(inplace=True)
data.info()

# Last but not least: price and mileage features should be numeric, and they are described as objects

In [None]:
# Preview the first rows to see how price and mileage are stored.
data.head()

In [None]:
# Convert price to float in one vectorized cast instead of a per-element loop.
# Raises ValueError if any entry cannot be parsed as a number.
data["price"] = data["price"].astype(float)

In [None]:
# Verify that price is now reported with a numeric dtype.
data.info()

In [None]:
# Convert mileage to float in one vectorized cast instead of a per-element loop.
# Raises ValueError if any entry cannot be parsed as a number.
data["mileage"] = data["mileage"].astype(float)

In [None]:
# Verify that mileage is now reported with a numeric dtype.
data.info()

# Part 2: EDA

In [None]:
# importing visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max).
data.describe()

First observation: There is an entry from year 2060, which must be removed

In [None]:
# Pairwise scatter plots and per-variable distributions for the numeric columns.
sns.pairplot(data)

In [None]:
# Scatter of price against tax with a fitted linear regression line.
sns.regplot(data=data, x="tax", y="price")

In [None]:
# Scatter of price against engine size with a fitted linear regression line.
sns.regplot(data=data, x="engineSize", y="price")

In [None]:
# Show the rows whose year is later than 2020.
data[data["year"]>2020]

In [None]:
# Number of distinct values in the model column.
data.model.nunique()

In [None]:
# Distinct transmission labels present in the data.
data.transmission.unique()

In [None]:
# Distinct fuel-type labels present in the data.
data.fuelType.unique()

In [None]:
# Keep only the rows whose year is 2020 or earlier.
data=data[data["year"]<=2020]

In [None]:
# Confirm the row count after filtering on year.
data.info()

# What can be observed from the EDA

* Mileage and mileage per gallon seem to have an inverse relation to price and I'll attempt a fit with the inverse value of both features in order to improve model performance.
* There is an entry with an inconsistent value: Since we are in year 2020, no year entry can exceed this value, but this happens with a Fiesta which is labeled as year 2060.
* There is a positive relation between price and year, but there seems to be no visible relation of engine size and tax to the price.
* There are 195 different car models in this dataset. Although it is possible to convert them to dummy variables, we will disregard this feature for regression. Two cars with the exact same specs are supposed to be sold at the same price and any difference coming from model could be attributed to the model error.
* The transmission feature can be reworked as both dummies and cardinals. It's plausible that automatic transmissions are superior to semi-auto and manual transmissions, and a hierarchy can be established in this feature. We will implement both alternatives to check if this improves model performance.
* The same could be said for fuel type, but since the hierarchy is not as clear and objective as in the transmission feature case, the dummy variable approach will be taken

In [None]:
# Reciprocals of mileage and mpg, kept as plain Python lists in row order.
i_mileage = [1 / miles for miles in data["mileage"]]
i_mpg = [1 / mpg_value for mpg_value in data["mpg"]]

In [None]:
# Remove the model column in place; it is not used as a regression feature.
data.drop("model", axis=1, inplace=True)

In [None]:
# Price plotted against the reciprocal of mileage.
plt.scatter(i_mileage, data["price"])

In [None]:
# Price plotted against the reciprocal of mpg.
plt.scatter(i_mpg, data["price"])

# Part 3: Model Selection

# For this regression we will use the following models:

* Linear regression 
* Lasso regression
* Ridge regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score

# From now on we will have the following names for datasets

# x y and z will have either 0 or 1 value corresponding to false or true

* x is corresponding to incorporating the inverse values of mileage
* y is corresponding to incorporating the inverse values of mpg
* z is corresponding to using cardinal values for the transmission feature, if it is 0 the dataset will use dummies

And the dataset will be named as dataxyz

In [None]:
# Column overview right before the modelling datasets are derived.
data.info()

In [None]:
# Replace the row labels with a fresh 0..n-1 range, discarding the old ones,
# so that the index-based joins below line up row for row.
data = data.reset_index(drop=True)
data

In [None]:
# One-hot encode fuel type; drop_first removes the first category, which
# becomes the implicit baseline.
dummies_ft = pd.get_dummies(data["fuelType"], drop_first=True)
# Re-label the "Other" dummy as "Other FT"; pop + assign moves it to the last column.
dummies_ft["Other FT"] = dummies_ft.pop("Other")
# Attach the dummy columns to the main frame by row index.
data = data.join(dummies_ft)

In [None]:
# The raw fuelType column is now represented by its dummy columns; remove it in place.
data.drop("fuelType", axis=1, inplace=True)

In [None]:
# Eight independent copies of the prepared frame, one per "xyz" variant.
(data000, data100, data010, data001,
 data101, data110, data011, data111) = (data.copy() for _ in range(8))

In [None]:
# Variants whose first digit is 1 use the reciprocal of mileage.
for variant in (data100, data101, data110, data111):
    variant["mileage"] = i_mileage

In [None]:
# Variants whose second digit is 1 use the reciprocal of mpg.
for variant in (data010, data011, data110, data111):
    variant["mpg"] = i_mpg

In [None]:
# Two alternative encodings of the transmission column.

# (a) One-hot: first category dropped as baseline; the "Other" dummy is
#     re-labelled "Other T" and ends up as the last column.
dummies_transmission = pd.get_dummies(data["transmission"], drop_first=True)
dummies_transmission["Other T"] = dummies_transmission.pop("Other")

# (b) Ordinal codes.
# Other gets value 3 because it is very likely that any other type of transmission is superior to the ones listed
cardinal_transmission = {'Manual': 0, 'Semi-Auto': 1, 'Automatic': 2, 'Other': 3}

# Variants ending in 0: swap the raw column for the dummy columns.
data000, data100, data010, data110 = (
    frame.drop(columns="transmission").join(dummies_transmission)
    for frame in (data000, data100, data010, data110)
)

# Variants ending in 1: replace each label by its ordinal code.
for frame in (data001, data101, data011, data111):
    frame["transmission"] = frame["transmission"].map(cardinal_transmission)



In [None]:
# Separate each variant into predictors (X) and target (y = price).
y000 = data000["price"]
X000 = data000.drop("price", axis=1)

y100 = data100["price"]
X100 = data100.drop("price", axis=1)

y010 = data010["price"]
X010 = data010.drop("price", axis=1)

y001 = data001["price"]
X001 = data001.drop("price", axis=1)

y101 = data101["price"]
X101 = data101.drop("price", axis=1)

# Built from data110. Previously this pair was taken from data000 by mistake,
# which made variant 110 an exact duplicate of variant 000.
y110 = data110["price"]
X110 = data110.drop("price", axis=1)

y011 = data011["price"]
X011 = data011.drop("price", axis=1)

y111 = data111["price"]
X111 = data111.drop("price", axis=1)

# One fixed seed for every split: all variants have the same number of rows,
# so they receive the same train/test row partition. Score differences then
# reflect the feature encoding rather than a different random split, and the
# notebook gives the same numbers on every run.
SPLIT_SEED = 42

X_train000, X_test000, y_train000, y_test000 = train_test_split(X000, y000, test_size=0.3, random_state=SPLIT_SEED)

X_train001, X_test001, y_train001, y_test001 = train_test_split(X001, y001, test_size=0.3, random_state=SPLIT_SEED)

X_train010, X_test010, y_train010, y_test010 = train_test_split(X010, y010, test_size=0.3, random_state=SPLIT_SEED)

X_train100, X_test100, y_train100, y_test100 = train_test_split(X100, y100, test_size=0.3, random_state=SPLIT_SEED)

X_train110, X_test110, y_train110, y_test110 = train_test_split(X110, y110, test_size=0.3, random_state=SPLIT_SEED)

X_train101, X_test101, y_train101, y_test101 = train_test_split(X101, y101, test_size=0.3, random_state=SPLIT_SEED)

X_train011, X_test011, y_train011, y_test011 = train_test_split(X011, y011, test_size=0.3, random_state=SPLIT_SEED)

X_train111, X_test111, y_train111, y_test111 = train_test_split(X111, y111, test_size=0.3, random_state=SPLIT_SEED)

# Model 1: Linear Regression

In [None]:
# Ordinary least squares on the 000 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train000, y_train000)
lr_result = lr.predict(X_test000)
print(explained_variance_score(y_test000, lr_result))

In [None]:
# Ordinary least squares on the 001 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train001, y_train001)
lr_result = lr.predict(X_test001)
print(explained_variance_score(y_test001, lr_result))

In [None]:
# Ordinary least squares on the 010 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train010, y_train010)
lr_result = lr.predict(X_test010)
print(explained_variance_score(y_test010, lr_result))

In [None]:
# Ordinary least squares on the 100 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train100, y_train100)
lr_result = lr.predict(X_test100)
print(explained_variance_score(y_test100, lr_result))

In [None]:
# Ordinary least squares on the 011 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train011, y_train011)
lr_result = lr.predict(X_test011)
print(explained_variance_score(y_test011, lr_result))

In [None]:
# Ordinary least squares on the 110 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train110, y_train110)
lr_result = lr.predict(X_test110)
print(explained_variance_score(y_test110, lr_result))

In [None]:
# Ordinary least squares on the 101 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train101, y_train101)
lr_result = lr.predict(X_test101)
print(explained_variance_score(y_test101, lr_result))

In [None]:
# Ordinary least squares on the 111 split; print explained variance on its test set.
lr = LinearRegression().fit(X_train111, y_train111)
lr_result = lr.predict(X_test111)
print(explained_variance_score(y_test111, lr_result))

# Model 2: Lasso Regression

# Lasso modeling involves choosing an alpha value, and since we already have 8 different datasets, I chose to optimize the one with the best R2 value from the linear model

In [None]:
# Lasso coefficient paths on the 010 training split: refit over a grid of
# alpha values and record the coefficients of every fit.
alphas = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alphas:
    lasso = Lasso(alpha=alpha)
    lasso.fit(X_train010, y_train010)
    values.append(list(lasso.coef_))
# Rows are alphas before the transpose and features after it, so each row
# of `values` becomes one feature's path across the alpha grid.
val_arr = np.transpose(np.array(values))
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for coef_path in values:
    plt.plot(alphas, coef_path)
plt.xscale("log")
plt.legend(X010.columns)
# Raw string with mathtext: in a normal string literal "\a" is the BEL
# control character, so "\alpha value" rendered as a corrupted label.
plt.xlabel(r"$\alpha$ value")
plt.ylabel("Coefficient value")

In [None]:
# Lasso (alpha=1) on the 010 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=1).fit(X_train010, y_train010)
lasso_result = lasso.predict(X_test010)
print(explained_variance_score(y_test010, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=10) on the 010 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train010, y_train010)
lasso_result = lasso.predict(X_test010)
print(explained_variance_score(y_test010, lasso_result))
print(lasso.coef_)

# Unfortunately, the lasso removed some of the dummy variables in the process, so the model performed worse since there was a lack of information none of the remaining features could explain. This pushed me to also try optimizing the other datasets

In [None]:
# Lasso coefficient paths on the 000 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train000, y_train000)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X000.columns)

In [None]:
# Lasso (alpha=10) on the 000 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train000, y_train000)
lasso_result = lasso.predict(X_test000)
print(explained_variance_score(y_test000, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 000 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train000, y_train000)
lasso_result = lasso.predict(X_test000)
print(explained_variance_score(y_test000, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 100 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train100, y_train100)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X100.columns)

In [None]:
# Lasso (alpha=10) on the 100 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train100, y_train100)
lasso_result = lasso.predict(X_test100)
print(explained_variance_score(y_test100, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 100 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train100, y_train100)
lasso_result = lasso.predict(X_test100)
print(explained_variance_score(y_test100, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 001 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train001, y_train001)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X001.columns)

In [None]:
# Lasso (alpha=10) on the 001 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train001, y_train001)
lasso_result = lasso.predict(X_test001)
print(explained_variance_score(y_test001, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 001 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train001, y_train001)
lasso_result = lasso.predict(X_test001)
print(explained_variance_score(y_test001, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 110 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train110, y_train110)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X110.columns)

In [None]:
# Lasso (alpha=10) on the 110 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train110, y_train110)
lasso_result = lasso.predict(X_test110)
print(explained_variance_score(y_test110, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 110 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train110, y_train110)
lasso_result = lasso.predict(X_test110)
print(explained_variance_score(y_test110, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 101 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train101, y_train101)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X101.columns)

In [None]:
# Lasso (alpha=10) on the 101 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train101, y_train101)
lasso_result = lasso.predict(X_test101)
print(explained_variance_score(y_test101, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 101 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train101, y_train101)
lasso_result = lasso.predict(X_test101)
print(explained_variance_score(y_test101, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 011 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train011, y_train011)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X011.columns)

In [None]:
# Lasso (alpha=10) on the 011 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train011, y_train011)
lasso_result = lasso.predict(X_test011)
print(explained_variance_score(y_test011, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=100) on the 011 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=100).fit(X_train011, y_train011)
lasso_result = lasso.predict(X_test011)
print(explained_variance_score(y_test011, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso coefficient paths on the 111 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    lasso = Lasso(alpha=alpha).fit(X_train111, y_train111)
    values.append(list(lasso.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X111.columns)

In [None]:
# Lasso (alpha=10) on the 111 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=10).fit(X_train111, y_train111)
lasso_result = lasso.predict(X_test111)
print(explained_variance_score(y_test111, lasso_result))
print(lasso.coef_)

In [None]:
# Lasso (alpha=1000) on the 111 split: test-set explained variance, then the coefficients.
lasso = Lasso(alpha=1000).fit(X_train111, y_train111)
lasso_result = lasso.predict(X_test111)
print(explained_variance_score(y_test111, lasso_result))
print(lasso.coef_)

# As seen above, all cases of variable selection ended up lowering model performance, so the lasso regression is not useful in this case.

# Model 3: Ridge Regression

In [None]:
# Ridge coefficient paths on the 010 training split, one line per feature.
alpha_grid = [0.1, 1, 10, 100, 1000, 10000]
values = []
for alpha in alpha_grid:
    ridge = Ridge(alpha=alpha).fit(X_train010, y_train010)
    values.append(list(ridge.coef_))
# Transpose so each row holds one feature's coefficients across the grid.
val_arr = np.array(values).T
values = val_arr.tolist()

plt.figure(figsize=(20, 12))
for feature_path in values:
    plt.plot(alpha_grid, feature_path)
plt.xscale("log")
plt.legend(X010.columns)

In [None]:
# Ridge (alpha=1) on the 010 split: test-set explained variance, then the coefficients.
ridge = Ridge(alpha=1).fit(X_train010, y_train010)
ridge_result = ridge.predict(X_test010)
print(explained_variance_score(y_test010, ridge_result))
print(ridge.coef_)

In [None]:
# Ridge (alpha=10) on the 010 split: test-set explained variance, then the coefficients.
ridge = Ridge(alpha=10).fit(X_train010, y_train010)
ridge_result = ridge.predict(X_test010)
print(explained_variance_score(y_test010, ridge_result))
print(ridge.coef_)

In [None]:
# Ridge (alpha=100) on the 010 split: test-set explained variance, then the coefficients.
ridge = Ridge(alpha=100).fit(X_train010, y_train010)
ridge_result = ridge.predict(X_test010)
print(explained_variance_score(y_test010, ridge_result))
print(ridge.coef_)

# Similarly to the lasso, ridge regression was also unable to improve performance

# Given that these methods are supposed to improve model performance, the hypothesis of the model feature not being important starts to lose credibility and we will attempt to enable this feature with reduced dimensionality

# Retry: Reworking model feature

# This will be done by adding the brand column, which will contain the model brand. It's a more comprehensive feature and will not increase dimensionality by a huge value. Only problem is: 3 datasets don't have brand name on the file name, and for simplicity, we will remove these from our final dataset

In [None]:
# Display the collected file paths so the ones to exclude can be identified by position.
filename_list
# entries 3,4 and 5 will be removed

In [None]:
# Keep positions 0-2 and 6 onwards, i.e. leave out positions 3, 4 and 5.
# NOTE(review): the positions come from the order os.walk produced, which may
# differ between environments; confirm the excluded entries are the intended files.
filename_list1=filename_list[0:3]+filename_list[6:]
filename_list1

In [None]:
# Load each remaining file and tag its rows with a brand taken from the file
# name (the text before the first "." of the base name), then stack all of
# the frames with a single concat. Concatenating once avoids re-copying the
# accumulated frame on every loop iteration.
brand_frames = []
for csv_path in filename_list1:
    brand_name = csv_path.split("/")[-1].split(".")[0]
    brand_frames.append(pd.read_csv(csv_path).assign(brand=brand_name))
data2 = pd.concat(brand_frames)
# coincidentally, a lot of the features that weren't present in most datasets belong to the data we just removed
# Keep only the rows whose year is 2020 or earlier.
data2 = data2[data2["year"] <= 2020]
data2.head()

In [None]:
import math as m
# Coalesce the two tax columns into one: keep "tax" where it is present,
# otherwise fall back to "tax(£)"; rows missing both stay NaN.
# np.where works positionally, so the repeated row labels left over from
# concatenating the files cannot cause misalignment.
tax_final = np.where(data2["tax"].isna(), data2["tax(£)"], data2["tax"])
# Drop both source columns first so the merged column is appended last.
data2.drop(["tax", "tax(£)"], axis=1, inplace=True)
data2["tax"] = tax_final

# Remove rows with any missing value, then the unused model column.
data2.dropna(inplace=True)
data2.drop("model", axis=1, inplace=True)

In [None]:
# Column overview of the brand-tagged dataset after cleaning.
data2.info()

# A quick check on the relation between brand and price

In [None]:
# One panel per brand (three panels per row), each showing that brand's price distribution.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the
# installed version still provides it, or switch to histplot/displot.
g = sns.FacetGrid(data2, col="brand",col_wrap=3)
g.map(sns.distplot,"price")

# Without any visible difference between average price, we're going to use the dummy variable approach, combined with the best performing model from the last attempt

In [None]:
# Dummy-encode transmission, brand and fuel type for the brand-tagged dataset.
# Fresh 0..n-1 row labels are needed so the index-based joins below line up.
data2 = data2.reset_index(drop=True)

# drop_first removes the first category of each feature (implicit baseline).
dummies_transmission = pd.get_dummies(data2["transmission"], drop_first=True)
# "Other" occurs in both transmission and fuel type; give each its own label.
dummies_transmission["Other T"] = dummies_transmission.pop("Other")
dummies_brand = pd.get_dummies(data2["brand"], drop_first=True)
dummies_ft = pd.get_dummies(data2["fuelType"], drop_first=True)
dummies_ft["Other FT"] = dummies_ft.pop("Other")

# Use the reciprocal of mpg, computed from data2's own column. The earlier
# i_mpg list was derived from a differently built frame and is only valid
# here if both frames happen to contain the same rows in the same order.
data2["mpg"] = 1 / data2["mpg"]

data2.drop(["transmission", "brand", "fuelType"], axis=1, inplace=True)
data2 = data2.join(dummies_transmission)
data2 = data2.join(dummies_brand)
data2 = data2.join(dummies_ft)
data2.head()

# Linear Regression Attempt

In [None]:
# Final column overview of the encoded dataset before fitting.
data2.info()

In [None]:
# Price is the target and every other column is a predictor; 30% of the rows
# are held out for testing.
X = data2.drop(columns="price")
y = data2["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [None]:
# Ordinary least squares on the brand-aware dataset; print test-set explained variance.
lr = LinearRegression().fit(X_train, y_train)
lr_res = lr.predict(X_test)
print(explained_variance_score(y_test, lr_res))

# Considering all brands over 90,000 entries, the best model could explain 75% of the variance. The model could be improved by making one model for each dataset, but this attempt had as its goal an all-rounder model.

* The implementation of the "model" dummy feature can possibly do more harm than good given there would be an extra 194 features for the model. I didn't want to try it, but any feedback on this implementation would be greatly appreciated.
* I had hopes that the shrinkage methods would remove the features that didn't seem to have much of a visible relation to the price feature, such as engineSize. Maybe a more iterative method such as stepwise selection could perform better in this scenario.
* The shrinkage methods seemed to perform worse when dealing with many dummy variables. Shrinkage methods remove or lessen the weight of some variables in the model and since dummy variables always go together (either all of them are in the model or none of them are) the variable selection process only diminished the model performance.
* The 1/mpg rework improved model performance, but the 1/mileage rework didn't have the same effect, but still presented a negative linear coefficient.
* The creation of the brand feature didn't have the impact I expected, probably because all of the price distributions divided by brand seemed very similar, as shown in this notebook