In [None]:
#importing libs
import numpy as np
import pandas as pd
# Show every column when displaying wide frames. The fully qualified key is
# used on purpose: the bare pattern 'MAX_COLUMNS' also matches
# 'styler.render.max_columns' on recent pandas and raises an OptionError.
pd.set_option('display.max_columns', None)

#Data viz
import matplotlib.pyplot as plt
import seaborn as sns

#Modelling and Testing
import statsmodels.api as sm
from scipy.stats import shapiro
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, SGDRegressor

#Evaluating the models
from sklearn.metrics import r2_score,mean_absolute_error, mean_squared_error


#Paths
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# 0.0 Problem Description
### Problem Statement

A Chinese automobile company Geely Auto aspires to enter the US market by setting up their manufacturing unit there and producing cars locally to give competition to their US and European counterparts.

They have contracted an automobile consulting company to understand the factors on which the pricing of cars depends. Specifically, they want to understand the factors affecting the pricing of cars in the American market, since those may be very different from the Chinese market. The company wants to know:

- Which variables are significant in predicting the price of a car.
- How well those variables describe the price of a car.

Based on various market surveys, the consulting firm has gathered a large data set of different types of cars across the American market.

### Business Goal

We are required to model the price of cars with the available independent variables. It will be used by the management to understand how exactly the prices vary with the independent variables. They can accordingly manipulate the design of the cars, the business strategy etc. to meet certain price levels. Further, the model will be a good way for management to understand the pricing dynamics of a new market.


# 0.1 Collect the Data

In [None]:
# Load the car-price data set, index the rows by car_ID and discard the
# CarName column, then preview the first rows.
data = (
    pd.read_csv('/kaggle/input/car-price-prediction/CarPrice_Assignment.csv')
    .set_index('car_ID')
    .drop(columns='CarName')
)
data.head()

In [None]:
# Summary statistics of the data set, used as a first sanity check of the
# value ranges before any transformation.
data.describe()

In [None]:
# Custom per-column overview: share of missing values, number of distinct
# values and dtype, one row per column of `data`.
column_overview = {
    'missing': data.isna().mean(),
    'unicos': data.nunique(),
    'tipos': data.dtypes,
}
pd.DataFrame(column_overview)

In [None]:
# 'symboling' was inferred as an integer column, but it is an ordinal
# categorical variable, so it is stored as object from here on.
data = data.astype({'symboling': 'object'})

There are no missing values. Hooray!

# 1.0 Univariate Analysis
We can see that our target variable is right-skewed; let's fix it by performing a log transformation.

In [None]:
# Split the frame into the target, the categorical features and the
# numerical features.
target = data[['price']]  # kept as a one-column DataFrame
categorical_vars = data.select_dtypes('object')
# 'price' is numeric too, so it is removed from the numerical feature set.
numerical_vars = data.select_dtypes(['int64', 'float64']).drop(columns='price')

In [None]:
# Target analysis on the raw price: shape statistics, distribution plot,
# Q-Q plot and a Shapiro normality test.

# Skewness and kurtosis
skew, kurt = target.skew(), target.kurt()
print(f'Skewness: {skew}')
print(f'Kurtosis: {kurt}')

# Distribution
f, ax = plt.subplots(figsize=(12, 6))
sns.distplot(target)

# Q-Q plot with a reference line
norm = sm.qqplot(target, line='s')

# Shapiro normality test, interpreted at the `alpha` significance level
alpha = 0.05
stats, p = shapiro(target)
print(f'Statistics p-value: {p}')
print('Target looks Gaussian (fail to reject H0)' if p > alpha
      else 'Target does not look Gaussian (reject H0)')

In [None]:
# Transform the target with a natural log and repeat the same diagnostics.
# The exact inverse of this transform is np.exp.
target_norm = np.log(data['price'])

# Skewness and kurtosis of the transformed target
skew, kurt = target_norm.skew(), target_norm.kurt()
print(f'Skewness: {skew}')
print(f'Kurtosis: {kurt}')

# Distribution
f, ax = plt.subplots(figsize=(12, 6))
sns.distplot(target_norm)

# Q-Q plot with a reference line
norm = sm.qqplot(target_norm, line='s')

# Shapiro normality test, interpreted at the `alpha` significance level
alpha = 0.05
stats, p = shapiro(target_norm)
print(f'Statistics p-value: {p}')
print('Target looks Gaussian (fail to reject H0)' if p > alpha
      else 'Target does not look Gaussian (reject H0)')

The target distribution is still not Gaussian, since it fails the Shapiro test, but it's way closer to a Gaussian distribution than before.

In [None]:
# Numerical variables univariate analysis
def univariate_analysis(df):
    """
    Plot the distribution of every column and run a Shapiro normality test on it.

    For each column a figure with its distribution is shown, followed by the
    printed p-value of the test and a verdict that names the column tested.

    df: DataFrame whose columns are the numerical variables to inspect.
    """
    alpha = 0.05  # significance level used to interpret the Shapiro p-value
    for col in df.columns.to_list():
        plt.figure(figsize=(8,5))
        plt.title(f'{col}\n distribuition',fontsize=16)
        sns.distplot(df[col])
        plt.xlabel(col,fontsize=14)
        plt.show()

        # Normality Shapiro test; only the p-value is needed here
        _, p = shapiro(df[col])
        print(f'Statistics p-value: {p}')
        # Report the verdict for the column being tested (the message used to
        # say "Target" for every feature).
        if p > alpha:
            print(f'{col} looks Gaussian (fail to reject H0)')
        else:
            print(f'{col} does not look Gaussian (reject H0)')

univariate_analysis(numerical_vars)

We can see that none of the numerical variables pass the Shapiro test, so we will perform some transformations on them in the preprocessing stage.

In [None]:
#Checking for outliers
def outliers_analysis(df):
    """
    Check for outliers visually by drawing one boxplot per column.

    df: DataFrame whose columns are the numerical variables to inspect.
    """
    for col in df.columns.to_list():
        plt.figure(figsize=(8,5))
        plt.title(f'{col}\n',fontsize=16)
        sns.boxplot(x=col, data=df)
        # Only the variable itself is plotted, so only the x axis gets a
        # label; the former 'Target' y label did not describe anything drawn.
        plt.xlabel(col,fontsize=14)
        plt.show()

outliers_analysis(numerical_vars)

Our dataset is not outlier-heavy, but there are some outliers, so we will use RobustScaler() to transform our numerical features at the preprocessing stage.

# 1.1 Bivariate Analysis

### Numerical variables.
We can see by plots below that most of the numerical variables hold a linear relationship with our target variable.

In [None]:
# Variables vs Target
def variables_vs_target(df,target):
    """
    Draw one scatterplot per column of `df` against the target.

    df: DataFrame with the variables to plot on the x axis.
    target: target values plotted on the y axis.
    """
    for feature in df.columns.to_list():
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.set_title(f'{feature} vs. \nTarget', fontsize=16)
        ax.scatter(x=df[feature], y=target, color='blue', edgecolor='k')
        ax.set_xlabel(feature, fontsize=14)
        ax.set_ylabel('Target', fontsize=14)
        plt.show()

variables_vs_target(numerical_vars,target=target)

### Categorical variables

Through the plots below, we can see some interesting patterns such as:

- Cars with diesel and turbo tend to be more expensive.
- Cars with rwd drivewheel tend to be more expensive than the others.
- Cars with the engine located at the rear tend to be more expensive.

Among other patterns.

In [None]:
# Work on a copy so the helper 'Target' column does not leak into
# categorical_vars.
categorical_boxplots = categorical_vars.copy()
categorical_boxplots['Target'] = target


def categorical_analysis(df):
    """
    Draw one boxplot of the target per categorical column.

    df: DataFrame holding the categorical variables plus a 'Target' column
        with the values shown on the y axis.
    """
    # 'Target' is the y axis of every plot; plotting it against itself would
    # only produce a meaningless figure with one box per distinct price.
    features = [col for col in df.columns.to_list() if col != 'Target']
    for col in features:
        plt.figure(figsize=(8,5))
        plt.title(f'{col}\n distribuition',fontsize=16)
        sns.boxplot(x=col, y='Target', data=df)
        plt.xlabel(col,fontsize=14)
        plt.ylabel('Target',fontsize=14)
        plt.show()

categorical_analysis(categorical_boxplots)

# 2.0 Preprocessing to modeling with scikitlearn

In [None]:
# Leave 'symboling' out of the one-hot encoded features, since it is an
# ordinal categorical. It is removed by name rather than by position so that
# re-running this cell does not drop a further column each time.
labels = [col for col in categorical_vars.columns.to_list() if col != 'symboling']
categorical_vars = categorical_vars[labels]

# Hold out 10% of the rows for testing; the target is the log price.
# NOTE(review): `data` still carries the raw 'price' column. It is not listed
# in either transformer below, so it should not reach the regressor - confirm.
X_treino, X_teste, y_treino, y_teste = train_test_split(data, target_norm, test_size=.10, shuffle=True, random_state=3)


# Categorical pipeline. handle_unknown='ignore' keeps predict() from failing
# when a rare category appears only in the hold-out rows.
categorical_pipeline = Pipeline([('ohe', OneHotEncoder(handle_unknown='ignore'))])

# Numerical pipeline
numerical_pipeline = Pipeline([('scaler', RobustScaler())])

# Column-wise preprocessing shared by the regressors fitted below
pipeline = ColumnTransformer([('cat',categorical_pipeline, categorical_vars.columns.to_list()),
                              ('num',numerical_pipeline, numerical_vars.columns.to_list())])

# 2.1 - SciKit - Multiple Linear Regression
**Multiple Linear Regression:**
Multiple regression is an extension of simple regression; we use it when we want to predict a continuous variable based on more than one predictor (simple regression uses a single predictor).

The formula of multiple Regression is

$$ y = b_1 \cdot variable_1 + b_2 \cdot variable_2 + \dots + b_n \cdot variable_n + a $$
  
   Where:

    b = Angular coefficient of each variable.
    a = Intercept of the population.
   
   
We can evaluate the performance of our model, basically, by calculating the distance of the prediction to the observed value in a validation dataset.

Below are the approaches that we will use to evaluate the models here.



Why R2 and RMSE  to evaluate the models?

**R-squared:**

The R-squared tells us the amount of variance our model is capturing from the response variable.

It goes by the formula below:

$$ R^2 = 1 - \frac{\sum (y - \hat{y})^2}{\sum (y - \bar{y})^2} $$

**RMSE:**

The RMSE is the Root Mean Squared Error: the square root of the average squared difference between the predictions and the observed values.
It works like a standard deviation of our prediction errors; the smaller its value the better. It is also sensitive to outliers and is expressed in the same units as the response variable.

It goes by the formula below:

$$ RMSE = \sqrt{\frac{\sum (\text{Predicted} - \text{Actual})^2}{N}} $$

In [None]:
# Preprocessing followed by an ordinary linear regression
reg_pipe = Pipeline([('ct',pipeline),
                     ('reg',LinearRegression())])

#Fitting
reg_pipe.fit(X_treino,y_treino)

#Predicting (predictions are on the log-price scale)
reg_pred = reg_pipe.predict(X_teste)

# Back to the price scale. The target was built with np.log, so np.exp is the
# matching inverse; np.expm1 would return price - 1.
y_teste_price = np.exp(y_teste)
reg_pred_price = np.exp(reg_pred)

#Evaluating
print(f'The R2: {r2_score(y_teste_price,reg_pred_price)}')
print(f'The RMSE: {np.sqrt(mean_squared_error(y_teste_price,reg_pred_price))}')
print('\n')
print(f'Intercept: {reg_pipe.named_steps.reg.intercept_}')
print(f'Coefs: {reg_pipe.named_steps.reg.coef_}')

From the results above we can see that our R2 is about 0.98, meaning that our model is able to explain 98% of the variance of the price (response variable).

Also, the RMSE is just about 1100 which is a good range given the range of our target. The Min price is 5118 and the Max price 45400.

# 2.2 - SciKit - Lasso
Lasso regression is a method of feature elimination and shrinkage to linear models. The goal here is to obtain a subset of features that minimizes the prediction errors. It reduces the complexity of the model preventing overfitting.

In [None]:
# Preprocessing followed by a Lasso whose alpha is picked by 5-fold CV
lasso_pipe = Pipeline([('ct',pipeline),
                         ('reg',LassoCV(alphas = (0.001, 0.1,1.0,5.0,10.0,50.0,100), cv=5))])

#Fitting
lasso_pipe.fit(X_treino,y_treino)

#Predicting (predictions are on the log-price scale)
lasso_pred = lasso_pipe.predict(X_teste)

# Back to the price scale. The target was built with np.log, so np.exp is the
# matching inverse; np.expm1 would return price - 1.
y_teste_price = np.exp(y_teste)
lasso_pred_price = np.exp(lasso_pred)

# Count the coefficients that survived the L1 penalty. len(coefs != 0) would
# return the length of the boolean mask (every coefficient), and the total is
# the number of model inputs, not the number of training rows.
lasso_coefs = lasso_pipe.named_steps.reg.coef_
n_selected = int((lasso_coefs != 0).sum())

#Evaluating
print(f'The R2: {r2_score(y_teste_price,lasso_pred_price)}')
print(f'The RMSE: {np.sqrt(mean_squared_error(y_teste_price,lasso_pred_price))}')
print('\n')
print(f'Best alpha: {lasso_pipe.named_steps.reg.alpha_}')
print(f'Non-zero coefs:{n_selected} from {lasso_coefs.size} variables')
print(f'Intercept: {lasso_pipe.named_steps.reg.intercept_}')
print(f'Coefs: {lasso_coefs}')

The lasso model keeps the good performance of the regular linear regression, but due to how its regularization term works some coefficients are penalized to zero performing a kind of feature selection. In our model, we can see that just 51 variables were relevant to it.

# 2.3 - SciKit - Ridge
Ridge is a regularization method whose main goal is to smooth the coefficients of variables that are related to each other (multicollinearity) by penalizing them through the term lambda. So, ridge regression shrinks the coefficients, which helps to reduce the model complexity and the effects of multicollinearity.

When lambda = 0, we have the regular linear regression cost function; that is, the lower lambda is, the closer the model is to a regular linear regression.

Ridge Cost Function:

$$\text{Cost} = \sum_{i=1}^N\Big\{ y_i - \sum_{j=0}^M \beta_j x_{ij}\Big\}^2 + \lambda \sum_{j=0}^M \beta_j^2$$

In [None]:
# Preprocessing followed by a Ridge whose alpha is picked by 5-fold CV
ridge_pipe = Pipeline([('ct',pipeline),
                         ('reg',RidgeCV(alphas=(0.1,1.0,5.0,10.0,50.0,100), cv=5))])


#Fitting
ridge_pipe.fit(X_treino,y_treino)

#Predicting (predictions are on the log-price scale)
ridge_pred = ridge_pipe.predict(X_teste)

# Back to the price scale. The target was built with np.log, so np.exp is the
# matching inverse; np.expm1 would return price - 1.
y_teste_price = np.exp(y_teste)
ridge_pred_price = np.exp(ridge_pred)

#Evaluating
print(f'The R2: {r2_score(y_teste_price,ridge_pred_price)}')
print(f'The RMSE: {np.sqrt(mean_squared_error(y_teste_price,ridge_pred_price))}')
print('\n')
print(f'Best alpha: {ridge_pipe.named_steps.reg.alpha_}')
print(f'Intercept: {ridge_pipe.named_steps.reg.intercept_}')
print(f'Coefs: {ridge_pipe.named_steps.reg.coef_}')

The ridge model performed a little worse than the other two.

If I had to choose a model here to go into production, I would go with Lasso because of its capability of feature selection.

# 3.0 Preprocessing to modeling with Statsmodel

Now let's perform Linear Regression, Lasso regression, and Ridge regression with Statsmodel.
It's a great library to work with in case you need a more statistical interpretation of the model.

In [None]:
# Leave 'symboling' out, since it is an ordinal categorical. It is removed by
# name: a positional [1:] slice would discard a real feature whenever
# 'symboling' has already been removed from categorical_vars.
labels = [col for col in categorical_vars.columns.to_list() if col != 'symboling']
categorical_vars_stats = categorical_vars[labels]

# One-hot encoding. Float dummies keep the design matrix purely numeric;
# boolean dummies mixed with float columns give statsmodels an object array.
categorical_vars_ohe = pd.get_dummies(categorical_vars_stats, dtype=float)

# Numerical transformation: log1p on the right-skewed columns (skew > 0.75).
# Skewness is measured on the untouched columns of `data`, so re-running this
# cell always yields the same result instead of transforming twice.
numerical_raw = data[numerical_vars.columns.to_list()]
skewed_vars = numerical_raw.skew()
skewed_labels = skewed_vars[skewed_vars > 0.75].index
numerical_vars = numerical_raw.copy()
numerical_vars[skewed_labels] = np.log1p(numerical_raw[skewed_labels])

# Putting together (both frames share the car_ID index)
dataset = categorical_vars_ohe.merge(numerical_vars, left_index=True, right_index=True)

# Splitting the data: hold out 10% of the rows; the target is the log price
X_treino, X_teste, y_treino, y_teste = train_test_split(dataset, target_norm, test_size=.10, shuffle=True, random_state=4)

# 3.1 - Statsmodel - Ordinary Least Squares

In [None]:
# Ordinary least squares of the log-price target on the training features.
# NOTE(review): no explicit constant column is added to X_treino; presumably
# the complete set of one-hot dummies already spans the intercept - confirm.
model = sm.OLS(y_treino,X_treino)
results = model.fit()
results.summary()

# Independence assumption

This assumption checks whether the residuals are scattered randomly around the horizontal zero line and do not form specific clusters.

We can see that for all our Xs variables, the residuals are distributed uniformly except the variable compression ratio that formed two clusters.

In [None]:
# Independence assumption: residuals plotted against every numerical feature,
# with a dashed reference line at zero.
numeric_columns = [
    'wheelbase', 'carlength', 'carwidth', 'carheight',
    'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio',
    'horsepower', 'peakrpm', 'citympg', 'highwaympg',
]
# Selecting through the frame first makes a missing column fail up front.
variables = list(X_treino[numeric_columns].columns)

for feature in variables:
    feature_values = X_treino[feature]
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.set_title('Independece Assumption', fontsize=16)
    ax.scatter(x=feature_values, y=results.resid, color='blue', edgecolor='k')
    ax.hlines(y=0, xmin=min(feature_values), xmax=max(feature_values),
              color='red', linestyle='--', lw=3)
    ax.set_xlabel(feature, fontsize=14)
    ax.set_ylabel('Residuals', fontsize=14)
    plt.show()

# Homoscedasticity

The assumption that a data set has constant variance is called homoscedasticity. Its opposite, where the variance is a function of the explanatory variables X, is called heteroscedasticity.

Below we can see the fitted response values (as per the model) vs the residuals; they are randomly distributed, satisfying the homoscedasticity assumption.

Why the Homoscedasticity assumption is important?

Because, if the residual errors are not identically distributed, we cannot use tests of significance or perform confidence interval checking. These tests assume that the residuals are independent and equally distributed.

In [None]:
# Homoscedasticity plot: residuals against fitted values with a zero line
fitted = results.fittedvalues
plt.figure(figsize=(8,5))
plt.title('Fitted vs Residuals',fontsize=16)
plt.scatter(x=fitted,y=results.resid,color='blue',edgecolor='k')
# The reference line spans the actual range of fitted values rather than
# starting at a hard-coded 8.5, so it stays correct if the model changes.
plt.hlines(y=0,xmin = min(fitted) , xmax = max(fitted),color='red',linestyle='--',lw=3)
plt.xlabel('Fitted Values',fontsize=14)
plt.ylabel('Residuals',fontsize=14)
plt.show()

# Normality of Resids

As we can see below, the residuals of our model pass the normality test, since the distribution is bell-shaped and the p-value of the Shapiro test is higher than 0.05.

What the normality test tells us is that most of the prediction errors from our model are zero or close to zero, and that large errors are much less frequent than small ones.

In [None]:
# Distribution of the OLS residuals
f, ax = plt.subplots(figsize=(12, 6))
sns.distplot(results.resid, ax=ax)

# Q-Q plot of the residuals with a reference line
norm = sm.qqplot(results.resid, line='s')

#### Shapiro Test

In [None]:
# Shapiro normality test on the OLS residuals, interpreted at the `alpha`
# significance level.
alpha = 0.05
stats, p = shapiro(results.resid)
print(f'Statistics p-value: {p}')
print('Resids looks Gaussian (fail to reject H0)' if p > alpha
      else 'Resids does not look Gaussian (reject H0)')