# Analysing Car Dataset
I take an exploratory approach into the cars dataset.

I first do exploratory data analysis, followed by predictions of car prices.

# Import necessary packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import StandardScaler, OneHotEncoder, PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import Ridge, LinearRegression, Lasso, ElasticNet

# Read in file

In [None]:
# Location of the raw CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/cars-dataset-audi-bmw-ford-hyundai-skoda-vw/cars_dataset.csv'

df = pd.read_csv(DATA_PATH)

# Initial view on data
There are no nulls present, so I go straight into viewing features.

In [None]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

# Initial description
There are five numerical features of interest. I will delve into these first.

In [None]:
# Summary statistics for the five numerical columns.
df[['price','mileage','tax','mpg','engineSize']].describe()

# Defining feature space
I list out the numerical and categorical fields. Whilst year is a numerical attribute, it can also be treated categorically. I use it as such here, as an ordinal category rather than a continuous variable.

In [None]:
# Numerical columns explored in the plots below.
numeric_vars = ['price','mileage','tax','mpg','engineSize']
# Categorical columns; 'year' is deliberately treated as a category here.
cat_fields = ['model', 'transmission', 'fuelType', 'Make', 'year']

# Density plots
I view the distribution of the datapoints initially here.

In [None]:
# Set the theme before creating the figure: seaborn's axes style is only
# picked up by axes created after the theme is set.
sns.set_theme(style="darkgrid")

# One stacked panel per numerical variable (no hard-coded panel count).
fig, axs = plt.subplots(len(numeric_vars), figsize=(10, 20))
axs = np.atleast_1d(axs)  # a single panel comes back as a bare Axes

for panel, column in zip(axs, numeric_vars):
    sns.kdeplot(data=df, x=column, ax=panel, fill=True)

# Zero engine size?
I spot here that there are zero engineSize values... so how many are there?

In [None]:
# Share of rows whose engine size is zero, shown as a percentage string.
zero_engine_rows = (df['engineSize'] == 0).sum()
zero_engine_pct = np.round(100 * zero_engine_rows / len(df), 2)
str(zero_engine_pct) + " %"

# Drop Zero engineSize
Checking how many there are, we get 0.31% only. So I simply drop them.

In [None]:
# Keep only rows with a non-zero engine size. The explicit copy makes df an
# independent frame, so the in-place column assignments made later on do not
# trigger pandas' SettingWithCopyWarning.
df = df[df['engineSize'] != 0].copy()

# View categorical variables
I view the categorical features here. There are make imbalances, which could be important since intuitively different makes are priced differently.

Fuel type appears to interact strongly with the other features: different makes have different fuel-type distributions!

Also, the number of cars is increasing year on year. Do volumes affect the prices? Or is it merely the vehicle age?

In [None]:
fig, axs = plt.subplots(3, 2, figsize=(15, 15))

sns.set_theme(style="darkgrid")

# Bar orderings: most frequent category first.
make_order = df['Make'].value_counts().index
transmission_order = df['transmission'].value_counts().index
# The per-year panels only show cars from 2010 onwards.
recent_cars = df[df['year'] >= 2010]

# Row 1: makes overall, then split by fuel type.
sns.countplot(x="Make", data=df, order=make_order, ax=axs[0, 0])
sns.countplot(x="Make", hue='fuelType', data=df, order=make_order, ax=axs[0, 1])

# Row 2: transmissions split by fuel type, then by make.
sns.countplot(x="transmission", hue='fuelType', data=df,
              order=transmission_order, ax=axs[1, 0])
sns.countplot(x="transmission", hue='Make', data=df, ax=axs[1, 1])

# Row 3: yearly volumes per make and overall, with slanted year labels.
year_panels = [
    sns.countplot(x="year", hue='Make', data=recent_cars, ax=axs[2, 0]),
    sns.countplot(x="year", data=recent_cars, ax=axs[2, 1]),
]
for year_ax in year_panels:
    year_ax.set_xticklabels(year_ax.get_xticklabels(), rotation=40, ha="right")

# Viewing Models
There are many models per make. Some models occur far less frequently, so I clump these together.

In [None]:
makes = sorted(set(df['Make']))

# One panel per make, so the grid adapts to however many makes the data holds
# instead of assuming exactly seven.
fig, axs = plt.subplots(len(makes), figsize=(40, 80))
axs = np.atleast_1d(axs)  # a single make comes back as a bare Axes

for panel, make_ in zip(axs, makes):
    df_alt = df.loc[df['Make'] == make_, ['Make', 'model']]

    # Bars ordered from the most to the least frequent model of this make.
    ax = sns.countplot(x='model', data=df_alt, ax=panel,
                       order=df_alt['model'].value_counts().index)

    ax.set_title(make_, fontsize=30)
    # Font size and rotation are applied in one call rather than relabelling twice.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=16, rotation=90)

# Stripping whitespaces
I remove any leading or trailing whitespace from the values in the categorical variables.

In [None]:
# Trim leading/trailing whitespace in every object-dtype (string) column.
text_columns = df.select_dtypes(include=['object']).columns
for column in text_columns:
    df[column] = df[column].str.strip()

# Grabbing highest volume model
For each make, I isolate the most frequent model and clump all the rest into "Other".

In [None]:
# Number of cars for every (Make, model) pair. Counting per pair (rather than
# per model name alone) keeps same-named models of different makes apart.
counts = df.groupby(['Make', 'model']).size().reset_index(name='counts')

# After sorting by volume, the first row seen for each make is its best
# seller. This avoids groupby.apply, which newer pandas releases no longer
# hand the grouping column to (the 'Make' column would then be lost).
temp_df = counts.sort_values('counts', ascending=False).drop_duplicates('Make')

top_list_per_make = list(temp_df['model'])

# Every model that is not a best seller is relabelled as "Other".
indexlist = df.index[~df['model'].isin(top_list_per_make)]

df.loc[indexlist, 'model'] = 'Other'

# Viewing this
The clumping lowers cardinality for later steps. However this also excessively clusters the categories. Therefore this feature will be removed later.

I keep it in just for the following visuals.

In [None]:
makes = sorted(set(df['Make']))

# One panel per make, so the grid adapts to however many makes the data holds
# instead of assuming exactly seven.
fig, axs = plt.subplots(len(makes), figsize=(40, 80))
axs = np.atleast_1d(axs)  # a single make comes back as a bare Axes

for panel, make_ in zip(axs, makes):
    df_alt = df.loc[df['Make'] == make_, ['Make', 'model']]

    # Bars ordered from the most to the least frequent model label of this make.
    ax = sns.countplot(x='model', data=df_alt, ax=panel,
                       order=df_alt['model'].value_counts().index)

    ax.set_title(make_, fontsize=30)
    # Font size and rotation are applied in one call rather than relabelling twice.
    ax.set_xticklabels(ax.get_xticklabels(), fontsize=16, rotation=90)

# Cross-viewing numerical and categorical features
I view every combination of numerical and categorical feature to see if there are any other interesting patterns. I average the values per categorical feature to get an overall picture.



In [None]:
# Mean of every numerical variable within each category level: one small
# two-column frame per (numerical, categorical) pair.
df_pairs = []
for numericval in numeric_vars:
    for catval in cat_fields:
        averaged = (
            df[[catval, numericval]]
            .groupby(catval)
            .mean()
            .sort_values([catval])
            .reset_index()
            .rename(columns={numericval: "Average " + numericval})
        )
        df_pairs.append(averaged)

fig, axs = plt.subplots(5, 5, figsize=(20, 40))

# Grid column used by each categorical field.
grid_column = {'transmission': 0, 'fuelType': 1, 'Make': 2, 'year': 3, 'model': 4}
# Fields whose tick labels are rotated to stay readable.
rotated_fields = ('Make', 'year', 'model')
# Next free row in each grid column.
next_row = dict.fromkeys(grid_column, 0)

for df_out in df_pairs:
    category, average = df_out.columns[0], df_out.columns[1]
    if category not in grid_column:
        continue

    ax = sns.barplot(x=df_out[category], y=df_out[average],
                     ax=axs[next_row[category], grid_column[category]])
    next_row[category] += 1

    if category in rotated_fields:
        axlab = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    # Only the first grid column keeps its y-axis label.
    if category != 'transmission':
        ax.set_ylabel('')



There are some interesting things to notice here. The mileage drops in a roughly exponential manner after 2006, yet an increase happened before this. Perhaps legacy vehicles (pre 2000) are being kept as staple items?

In general, electric vehicles are good to have - they have better mileage cover than I expected and also smaller engines than Diesel vehicles on average. The mpg is considerably higher, and as a big bonus there is zero tax!

# Violin plots of each feature
Breaking these down, there is some range in the values, with fairly high density at lower values.

The huge range in values demonstrates potential outliers. However, I do not intend to remove these, as the extreme prices, for example, are potentially important in building a pricing model.

In [None]:
fig, axs = plt.subplots(5, 5, figsize=(20, 40))

# Grid column used by each categorical field; each numerical variable
# occupies one grid row.
grid_column = {'transmission': 0, 'fuelType': 1, 'Make': 2, 'year': 3, 'model': 4}
# Fields whose tick labels are rotated to stay readable.
rotated_fields = ('year', 'model')

for row, numericval in enumerate(numeric_vars):
    for catval in cat_fields:
        if catval not in grid_column:
            continue

        ax = sns.violinplot(x=catval, y=numericval, data=df,
                            ax=axs[row, grid_column[catval]])

        if catval in rotated_fields:
            axlab = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        # Only the first grid column keeps its y-axis label.
        if catval != 'transmission':
            ax.set_ylabel('')

# Age of vehicle calculation
I opt here to switch the year from categorical to a numerical variable. I do this by using it to calculate the age in years of a vehicle, which is important as it drives the price as seen in plots above.

Greater granularity on this quantity could potentially improve models.

In [None]:
# Year against which vehicle age is measured.
REFERENCE_YEAR = 2021

# Replace the registration year with the vehicle's age in years.
df['Age_of_vehicle'] = REFERENCE_YEAR - df['year']
df.drop(columns='year', inplace=True)

# The age is numerical, so it joins the numerical variable list.
numeric_vars.append('Age_of_vehicle')

# Collinearity matrix
The collinearity matrix demonstrates little collinearity between variables.

It also shows that the mileage is strongly related to the age (+0.75 coefficient), which suggests that as age increases, so does mileage. This is expected. It however also shows that the rise then drop in mileage we saw earlier is, on average, a drop. The rise is irrelevant.

In [None]:
# Compute the correlation matrix of the numerical variables
corr = df[numeric_vars].corr()

# Generate a mask for the upper triangle (it mirrors the lower triangle)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio. The colour scale
# spans the full correlation range [-1, 1]; the previous vmax=.3 drew every
# correlation above 0.3 in the same saturated colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

# Correlation map
Mapping the values in pairgrid shows the distribution of variables as visualised in the above correlation matrix. The price exponentially decays with mileage.

What can also be seen is the linear relationship between age of vehicle and mileage once again.

Perhaps mileage is not a needed feature, since it is encoded by the age. I keep both in anyway.

In [None]:
def CorMap(df):
    """Draw a pair grid of the numerical variables and return their correlations.

    Parameters
    ----------
    df : DataFrame holding every column named in the global ``numeric_vars``.

    Returns
    -------
    The correlation matrix (``DataFrame.corr()``) of those columns.
    """
    numeric_frame = df[numeric_vars]

    # Histograms on the diagonal, scatter plots in every other cell.
    grid = sns.PairGrid(numeric_frame)
    grid.map_diag(plt.hist)
    grid.map_offdiag(plt.scatter)

    return numeric_frame.corr()

plot = CorMap(df)

# Normalising datasets
There are skewed distributions in the data. So power transforms are needed to normalise the distribution. I view these transforms here, just for illustration.

In [None]:
# One row per numerical variable: raw distribution on the left, Yeo-Johnson
# transformed distribution on the right. The row count follows numeric_vars
# so the grid cannot fall out of step with the list; squeeze=False keeps
# axs two-dimensional whatever its length.
fig, axs = plt.subplots(len(numeric_vars), 2, figsize=(20, 40), squeeze=False)

for (raw_ax, transformed_ax), col in zip(axs, numeric_vars):
    # yeojohnson returns the transformed data and the fitted lambda (unused here).
    xt, _ = stats.yeojohnson(df[col])
    sns.histplot(df[col], ax=raw_ax).set_title(f"Original data for {col}")
    sns.histplot(xt, ax=transformed_ax).set_title(f"Transformed data for {col}")

# Preparing the model
As mentioned earlier, I now remove the model as I believe this variable will unnecessarily clutter the feature matrix in one-hot encoding (curse of dimensionality) for little gain. The Make granularity is likely sufficient.

In [None]:
# Features: every column except the target and the clumped 'model' column.
X = df.drop(['price', 'model'], axis=1)
# Target: the price.
y = df['price']

# Hold out 33% of the rows for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Selecting numerical and categorical feature names
Since year is no longer present, I can encode the variables directly by type.

In [None]:
# Column names grouped by dtype, taken from the training features.
NUMERIC_DTYPES = ['int64', 'float64']
CATEGORICAL_DTYPES = ['object', 'bool']

numerical_ix = X_train.select_dtypes(include=NUMERIC_DTYPES).columns
categorical_ix = X_train.select_dtypes(include=CATEGORICAL_DTYPES).columns

# Transforms to apply
I apply one-hot encoding to the categorical variables and Yeo-Johnson (with centre-scaling) to the numerical variables.

In [None]:
t = [
    # One-hot encode the categorical columns. handle_unknown='ignore' encodes
    # a category that was absent when fitting (e.g. a rare level missing from
    # a training fold) as all zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_ix),
    # Yeo-Johnson power transform for the numerical columns.
    ('num', PowerTransformer(method='yeo-johnson'), numerical_ix),
]

col_transform = ColumnTransformer(transformers=t)

# Generic pipeline function
This pipeline enables arbitrary regression models to be inserted.

In [None]:
# define the data preparation and modeling pipeline
def pipeline_model(model):
    """Return a Pipeline that applies ``col_transform`` and then ``model``.

    Parameters
    ----------
    model : the estimator placed in the final ``'model'`` step.

    Returns
    -------
    An unfitted ``Pipeline`` with the steps ``'prep'`` and ``'model'``.
    """
    steps = [('prep', col_transform), ('model', model)]
    return Pipeline(steps=steps)

# k-fold cross validation
This will enable me to choose which model performs best. I use k = 3, and choose a few different linear and non-linear models. The linear models will perform poorly, intuitively speaking.

I include them anyway, just to highlight how bad model choices lead to poor results.

Note also that I do not use GridSearchCV here to optimise hyperparameters for each model. It would be good practice to do this, but in the interest of time I forgo this step.

In [None]:
cv = KFold(n_splits=3, shuffle=True, random_state=1)

models = [RandomForestRegressor(),
          SVR(),
          Ridge(),
          LinearRegression(),
          Lasso(),
          ElasticNet()
         ]

# Mean 3-fold R^2 per model. Cross-validation runs on the training split
# only, so the held-out test rows play no part in choosing the model and the
# final test score stays an honest estimate.
score_list = []
for model_ in models:
    # Fit on log1p(price); predictions are mapped back with expm1 before scoring.
    model = TransformedTargetRegressor(regressor=pipeline_model(model_),
                                       func=np.log1p, inverse_func=np.expm1)

    scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=cv,
                             n_jobs=-1, verbose=2)

    score_list.append(np.mean(scores))
    

# Results
The best performer in the 3-fold tests is RFR, so I use this going ahead.

In [None]:
# Display names, listed in the same order as the cross-validated models.
model_name_list = [
    'Random Forest Regressor',
    'Support Vector Machine - Regressor',
    'Ridge Regressor',
    'Linear Regression',
    'Lasso Regression',
    'ElasticNet Regression',
]

# One row per model: its name and its mean cross-validated score.
results = pd.DataFrame({'Model type': model_name_list})
results['Mean Score (R^2)'] = score_list

results

# Fitting Random forest regressor
I build the random forest regressor pipeline, with the target (price) also transformed with log1p (and inverse transformed with expm1 on completion), as this will improve output. See the above Yeo-Johnson graphs to see why.

In principle, normal distributions are favoured by models.

In [None]:
# Preprocessing followed by a random forest.
forest_steps = [('prep', col_transform), ('model', RandomForestRegressor())]
pipeline = Pipeline(steps=forest_steps)

# Fit on log1p(price); predictions are mapped back with expm1.
model = TransformedTargetRegressor(
    regressor=pipeline,
    func=np.log1p,
    inverse_func=np.expm1,
)

model.fit(X_train, y_train)

# Predicting the output
I use the model to predict the output.

In [None]:
y_hat = model.predict(X_test)
# np.asarray works on a Series and on an array alike, so re-running this cell
# does not fail the way a second y_test.to_numpy() call would.
y_test = np.asarray(y_test)

# r2_score expects (y_true, y_pred). R^2 is not symmetric in its arguments,
# so the true prices must come first.
r2_score(y_test, y_hat)

# Viewing the prediction
Clearly the model is performing very well in predicting prices.

In [None]:
# Predicted against actual prices, with a fitted regression line in cyan.
dataset = pd.DataFrame({'prediction': list(y_hat), 'actual': list(y_test)})
regression_line_style = {'line_kws': {'color': 'cyan'}}
g = sns.jointplot(data=dataset, x="prediction", y="actual", kind='reg',
                  joint_kws=regression_line_style)

# Extracting feature names
There is no direct output of feature names, therefore it needs to be coerced out from the fit object.

In [None]:
# Fitted one-hot encoder inside the pipeline's 'prep' step.
encoder = model.regressor_.named_steps['prep'].named_transformers_['cat']

# get_feature_names() was removed from scikit-learn's encoders in favour of
# get_feature_names_out(); fall back to the old name on installs that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    cat_feats = list(encoder.get_feature_names_out())
else:
    cat_feats = list(encoder.get_feature_names())

numerical_feats = list(numerical_ix)
# Same order as the ColumnTransformer output: categorical block, then numerical.
full_feature_list = cat_feats + numerical_feats

# Extracting importances from model
This will get an idea of which features were important as per the random forest model.

In [None]:
# Feature importances of the fitted estimator in the pipeline's 'model' step.
fitted_forest = model.regressor_.named_steps['model']
importances = fitted_forest.feature_importances_

# Importance visual
Interestingly, the most important feature is whether a vehicle is manual or not. The three numerical features are all important also.

What is more interesting is that the model suggests other features are less relevant in pricing. For instance, the Make doesn't matter so much.

Delving deeper into this, it is likely down to similar types of vehicles (irrespective of Make/model) having similar prices. Essentially the four chosen quantities are a "catch-all", and it also tells us that the transmission type (Manual or not) itself carries the most weight in pricing. For those that are NOT Manual, it doesn't really distinguish much between them.

The correlation matrix only shows relationships between numerical variables, however a cross-categorical analysis is relevant which is done above. This identifies the importance of fuel type.

With this, we can see it is worth dropping all categorical features except transmission, and having a boolean variable ("Manual/Other") in its place.

In [None]:
# Pair each feature with its importance and rank from most to least important.
importance_df = pd.DataFrame({'Features': full_feature_list,
                              'Importance': list(importances)})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Bars in ranked order, with vertical labels.
ax = sns.barplot(data=importance_df, x="Features", y="Importance")
axlab = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

# Future work
1. Optimise hyper-parameters of comparative models.
2. Try xgboost also to compare to Random Forest Regressor.
3. See what removing features and creating Manual/Not-Manual grouping does to the $R^2$.