In [363]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV
from sklearn.metrics import mean_squared_error as mse
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector, SelectFromModel



In [364]:
#Import the data
df = pd.read_csv('data/vehicles.csv')
df.shape


(426880, 18)

In [365]:
#Start cleaning the data
df = df.drop(columns=['id', 'VIN'])
df.dropna(inplace=True)
df = df[df['price'].between(df['price'].quantile(0), df['price'].quantile(.99999))]
df.shape

(79194, 16)

In [366]:
#Create a development set and a training set
dev = df.sample(frac=0.1, random_state=42)
test = df.drop(dev.index)
df = dev

df.head()
test.shape

(71275, 16)

In [367]:
#Create goal
df_goal = df['price']
df = df.drop(columns=['price'])
print(df.shape)


(7919, 15)


In [368]:
#encode the data
encoder = LabelEncoder()
df['manufacturer'] = encoder.fit_transform(df['manufacturer'])
df['model'] = encoder.fit_transform(df['model'])
df['cylinders'] = encoder.fit_transform(df['cylinders'])
df['transmission'] = encoder.fit_transform(df['transmission'])
df['drive'] = encoder.fit_transform(df['drive'])
df['size'] = encoder.fit_transform(df['size'])
df['type'] = encoder.fit_transform(df['type'])
df['paint_color'] = encoder.fit_transform(df['paint_color'])
df['state'] = encoder.fit_transform(df['state'])
df['region'] = encoder.fit_transform(df['region'])

df = pd.get_dummies(df)
print(df.info())
df.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7919 entries, 325655 to 127766
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   region                   7919 non-null   int64  
 1   year                     7919 non-null   float64
 2   manufacturer             7919 non-null   int64  
 3   model                    7919 non-null   int64  
 4   cylinders                7919 non-null   int64  
 5   odometer                 7919 non-null   float64
 6   transmission             7919 non-null   int64  
 7   drive                    7919 non-null   int64  
 8   size                     7919 non-null   int64  
 9   type                     7919 non-null   int64  
 10  paint_color              7919 non-null   int64  
 11  state                    7919 non-null   int64  
 12  condition_excellent      7919 non-null   uint8  
 13  condition_fair           7919 non-null   uint8  
 14  condition_good   

Unnamed: 0,region,year,manufacturer,model,cylinders,odometer,transmission,drive,size,type,...,fuel_electric,fuel_gas,fuel_hybrid,fuel_other,title_status_clean,title_status_lien,title_status_missing,title_status_parts only,title_status_rebuilt,title_status_salvage
325655,282,2009.0,12,1063,5,63759.0,0,0,1,10,...,0,1,0,0,1,0,0,0,0,0
92288,70,2006.0,6,2099,6,260600.0,0,2,1,8,...,0,1,0,0,1,0,0,0,0,0
1483,27,2007.0,36,2560,3,51000.0,1,1,0,4,...,0,1,0,0,1,0,0,0,0,0
423929,206,2016.0,6,991,6,75734.0,0,2,1,10,...,0,1,0,0,1,0,0,0,0,0
263310,290,2020.0,36,2409,6,20000.0,0,0,1,10,...,0,1,0,0,0,1,0,0,0,0


In [369]:
#Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
df_scaled.head()

Unnamed: 0,region,year,manufacturer,model,cylinders,odometer,transmission,drive,size,type,...,fuel_electric,fuel_gas,fuel_hybrid,fuel_other,title_status_clean,title_status_lien,title_status_missing,title_status_parts only,title_status_rebuilt,title_status_salvage
0,0.871289,0.064203,-0.488875,-0.324963,0.310272,-0.280338,-0.308954,-1.020994,-0.311839,0.934695,...,-0.044995,0.301532,-0.11309,-0.029744,0.25062,-0.088832,-0.049041,-0.027536,-0.189673,-0.116478
1,-1.037643,-0.22991,-1.049231,1.075881,1.071439,0.661879,-0.308954,1.545752,-0.311839,0.458217,...,-0.044995,0.301532,-0.11309,-0.029744,0.25062,-0.088832,-0.049041,-0.027536,-0.189673,-0.116478
2,-1.424832,-0.131872,1.752547,1.69923,-1.212061,-0.341412,2.562647,0.262379,-1.80486,-0.49474,...,-0.044995,0.301532,-0.11309,-0.029744,0.25062,-0.088832,-0.049041,-0.027536,-0.189673,-0.116478
3,0.186955,0.750467,-1.049231,-0.422319,1.071439,-0.223018,-0.308954,1.545752,-0.311839,0.934695,...,-0.044995,0.301532,-0.11309,-0.029744,0.25062,-0.088832,-0.049041,-0.027536,-0.189673,-0.116478
4,0.943324,1.142618,1.752547,1.495053,1.071439,-0.489799,-0.308954,-1.020994,-0.311839,0.934695,...,-0.044995,0.301532,-0.11309,-0.029744,-3.990105,11.257256,-0.049041,-0.027536,-0.189673,-0.116478


In [370]:
#reduce the data's dimensionality
pca = PCA(n_components=3, svd_solver='full')
pca.fit(df_scaled)
pca_df = pca.transform(df_scaled)


In [371]:
#plot the data
plot = px.scatter_3d(pca_df, x=0, y=1, z=2, opacity=1, title='PCA of Car Data', color=df_goal, color_continuous_scale='Bluered')
plot

In [372]:
px.box(df_goal, title='PCA of Car Data')


In [373]:
df.head()

#Create a train and test set
X_train, X_test, y_train, y_test = train_test_split(df, df_goal, test_size=0.2, random_state=42)


In [374]:
#Create a linear regression pipeline
linear_pipeline = Pipeline([
  ('poly', PolynomialFeatures(degree=4, include_bias=False)),
  ('scaler', StandardScaler()),
  ('selectFromModel', SelectFromModel(Lasso())),
  ('linear', LinearRegression())
  ])

#Fit the pipeline
linear_pipeline.fit(X_train, y_train)
linear_pipeline.fit(X_test, y_test)

#Predict the data
linear_test_pred = linear_pipeline.predict(X_test)
linear_train_pred = linear_pipeline.predict(X_train)

#Print the MSE
print('Linear Regression')
print('Test MSE:', mse(y_test, linear_test_pred))
print('Train MSE:', mse(y_train, linear_train_pred))



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.184e+11, tolerance: 9.402e+07


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.863e+09, tolerance: 2.667e+07



Linear Regression
Test MSE: 79.23011363636364
Train MSE: 5.334855922516416e+16


In [375]:

#Graph the data
plot = px.scatter(y=y_test, x=X_test['year'], opacity=.5, title='Linear Regression of Car Data', color_discrete_sequence=['red'])
plot.add_scatter(y=linear_test_pred, x=X_test['year'], mode='markers', marker=dict(color='blue', size=5))
plot.show()
