In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Import necessary libraries for the project

In [None]:
#matplotlib and seaborn are imported for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#splitting the dataset into train & test data
from sklearn.model_selection import train_test_split

#GridSearchCV is used for hyperparameter tuning in Lasso & Ridge
from sklearn.model_selection import GridSearchCV

#three linear models used in the project
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#StandardScaler for preprocessing the dataset
from sklearn.preprocessing import StandardScaler

#metrics to evaluate the linear regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error

#import warnings to ignore any warnings during execution
import warnings
warnings.filterwarnings('ignore')

**EXPLORATORY DATA ANALYSIS(EDA)**

load the csv file 

In [None]:
data=pd.read_csv("../input/car-price-dataset/CarPrice.csv")


display the first five entries of data

In [None]:
data.head()

The dataset contains 205 rows with 26 features

In [None]:
data.shape

None of the features in the dataset contain null values

In [None]:
data.isnull().sum()

**info()** prints, for every column (feature), its name, its count of non-null values and its datatype

In [None]:
data.info()

**describe()** summarises the features of the dataframe. By default it only describes the int and float features, but by specifying **include="all"** we get the description of all features irrespective of their datatypes

In [None]:
data.describe(include="all")

I have dropped two features, **car_ID** & **CarName**, because they do not affect the price of the car in the dataset.

In [None]:
#drop() removes rows (axis=0) by default; naming the labels through columns=
#is equivalent to passing them together with axis=1
df_car=data.drop(columns=["car_ID","CarName"])

We are now able to see the remaining columns in the dataset after dropping **car_ID** and **CarName**

In [None]:
df_car.columns

Display the first five entries of dataframe

In [None]:
df_car.head()

Split the dataset into **dependent(y)** & **independent(X)** variables,

Dependent variable is also called the target variable which is **price** of the car in our case

In [None]:
X=df_car.drop(columns=["price"])
y=df_car["price"]

Specify two empty lists **cat_col** & **num_col** to store **categorical** and **numerical** columns respectively

In [None]:
#Partition the feature columns of X by datatype.
#A column counts as categorical whenever pandas does not consider it numeric
#(object, string, category, ...). Testing only for dtype=="O" would file
#string- or category-typed columns under the numerical features, and they
#would then skip the one-hot encoding step.
cat_col=[]#will store categorical features
num_col=[]#will store numerical features

#iterating through all columns in X
for col in X.columns:
    if pd.api.types.is_numeric_dtype(X[col]):
        #int, float (and bool) columns are kept as numerical features
        num_col.append(col)
    else:
        #everything else has to be encoded before it can be fed to a model
        cat_col.append(col)

Create dataFrames **df_cat** & **df_num** to store the features with datatypes **object** and **numerical** respectively

Dividing the dataframe into separate **numerical** & **categorical** dataframes makes it easier to handle each kind of feature on its own.

In [None]:
#dataframe holding only the categorical features, in the order listed in cat_col
df_cat=df_car.reindex(columns=cat_col)

#dataframe holding only the numerical features, in the order listed in num_col
df_num=df_car.reindex(columns=num_col)

Display head of Dataframe with numerical features

In [None]:
df_num.head()

Display head of Dataframe with categorical features

In [None]:
df_cat.head()

Print the number of unique **labels** in each column of **df_cat**

In [None]:
#number of distinct labels per categorical column, reported one column per line
for cols,n_labels in df_cat.nunique().items():
    print(cols," contains :",n_labels," labels")

Since the dataset does not contain any null values, df_num & df_cat have no null values either. What we are interested in here is df_cat, whose columns have the object datatype.

To feed our data to the Machine Learning Models the data values must be converted into numerical values.



Perform **one-hot encoding** to the categorical features, using **pd.get_dummies()**



In [None]:
df_cat=pd.get_dummies(
    data=df_cat,
    drop_first=True
)

Display the head of **one-hot encoded** dataframe

In [None]:
df_cat.head()

We have now converted the categorical features into **numerical** values by performing **one-hot** encoding, so all the features in both **df_num** & **df_cat** are in numerical form and we **concatenate** them to get the final desired dataframe.

In [None]:
car_final=pd.concat(
    [df_num,df_cat,y],
    axis=1
)

Split the data into **dependent(y)** and **independent(X)** variables

In [None]:
X=car_final.drop("price",
                axis=1)
y=car_final["price"]

Split the data into **training** and **testing** data, with **test data** of size of **20%** of total dataset.

In [None]:
X_train,X_test,y_train,y_test=train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2
)

Perform preprocessing on the **X_train** and **X_test** using **StandardScaler()**

It will scale the data values in such a way that the **mean is zero** and a **variance of one**

In [None]:
scaler=StandardScaler()

X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled=scaler.transform(X_test)

Since I am comparing three linear models namely **LinearRegression() Ridge() & Lasso()**, I will have to write the same codes again and again for individual linear models so I have used **functions** that would perform the same job for all three regression models to **ease** and **shorten** my work

The function fits the given linear model on the training data and prints its **training** and **testing** **scores**



In [None]:
#the function takes model, train & test split as an argument
def fit_model_getScores(model,X_train,y_train,X_test,y_test):
    """Fit ``model`` on the training split and print its train/test scores.

    ``model`` is fitted in place through ``model.fit`` and then evaluated with
    ``model.score`` on the training split first and the test split second.
    Both scores are printed with two decimals; nothing is returned.
    """
    model.fit(X_train,y_train)

    #pair every output line with the score it reports (train first, then test)
    report=[
        ("Training Score:{:.2f}",model.score(X_train,y_train)),
        ("Testing Score:{:.2f}",model.score(X_test,y_test)),
    ]

    print("Scores of {}".format(model),"\n")
    for template,value in report:
        print(template.format(value))

The function given below prints the **metrics** used for **evaluating linear models**, namely **mse, mae, rmse and r2_score**

In [None]:
#function takes model,and test data split as an argument 
def get_metrics(model,X_test,y_test):
    """Print MSE, RMSE, MAE and r2_score of ``model`` on the test split.

    ``model`` must already be fitted: it is only asked to ``predict`` on
    ``X_test`` and the predictions are compared with ``y_test``.
    Nothing is returned; the four metrics are printed with two decimals.
    """
    #calculate the predicted value of y
    y_pred=model.predict(X_test)

    mse=mean_squared_error(y_test,y_pred)#mse
    r2__score=r2_score(y_test,y_pred)#r2_score
    mae=mean_absolute_error(y_test,y_pred)#mae
    #rmse is derived from the mse instead of calling
    #mean_squared_error(...,squared=False): that switch is deprecated and then
    #removed in newer scikit-learn releases. For a single target column the
    #square root of the mse is the same value.
    rmse=mse**0.5#rmse

    #print the metrics
    print("The Metrics for {}:".format(model))
    print("----------------------------")
    print("Mean Squared Error:{:.2f}".format(mse))
    print("Root Mean Squared Error:{:.2f}".format(rmse))
    print("Mean Absolute Error:{:.2f}".format(mae))
    print("r2_score:{:.2f}".format(r2__score))


This function reads the **coefficients** of a fitted **linear model** and returns them as a **series of coefficients** with the **independent features (columns) as the index**.

**Note**: In a linear model the **numbers(count) of coefficient** is always **equal** to **the number of independent features** present in the dataset 

In [None]:
#the function takes model and independent dataframe as an argument
def return_coef_series(model,X):
    """Label the fitted coefficients of ``model`` with their feature names.

    Reads ``model.coef_`` and returns it as a pandas Series whose index is
    ``X.columns``, so ``X`` must have one column per coefficient.
    """
    feature_names=X.columns
    return pd.Series(model.coef_,index=feature_names)
    

The **plot_coef**() will help **visualize** the **coefficient** of a particular **linear model**

In [None]:
#takes coefficient of linear_model as an argument
def plot_coef(model_coef):
    """Draw ``model_coef`` (a Series of coefficients) as a bar chart.

    A new 12x8 inch figure is opened for every call and the x tick labels
    (the index of ``model_coef``) are rotated by 90 degrees.
    """
    _,ax=plt.subplots(figsize=(12,8))
    model_coef.plot(kind="bar",ax=ax)
    plt.setp(ax.get_xticklabels(),rotation=90)

The function given below is used to perform **hyper-parameter tuning** for **Ridge()** and **Lasso()** regression.

**GridSearchCV** is used for the **hyper-parameter tuning** and returns the **best parameters for fitting the linear model.**

In [None]:
#takes model, user-defined hyper-parameters, train & test data splits as argument 
def gridSearch(model,params,X_train,y_train):
    """Grid-search ``params`` for ``model`` and return the best combination.

    Every parameter combination in ``params`` is evaluated with 5-fold
    cross-validation on the training data only; the winning combination is
    returned as the ``best_params_`` dict of the fitted GridSearchCV.
    """
    search=GridSearchCV(model,params,cv=5)
    search.fit(X_train,y_train)
    return search.best_params_

**BUILDING LINEAR MODELS**

**1-LinearRegression() Model**

**LinearRegression (aka ordinary least squares):** The simplest & most classic linear method for regression. It finds the parameters w & b that minimize the mean squared error between the predicted value and the true value.

y=wx + b

w->Weights associated with individual independent features(Slope of a line)

b->y intercept



In [None]:
#initializing the model
linear_model=LinearRegression()

Lets fit the LinearRegression and fetch **training** and **testing** scores

In [None]:
fit_model_getScores(linear_model,
                    X_train_scaled,y_train,
                    X_test_scaled,y_test
                   )

Get the **metrics** to evaluate LinearRegression

In [None]:
get_metrics(linear_model,
            X_test_scaled,y_test
           )

The function **return_coef_series** will return the series of **coefficient** along with **features** as its **index**.

In [None]:
linear_coef=return_coef_series(linear_model,X).sort_values()
linear_coef

Lets visualize the coefficient of LinearRegression() model

In [None]:
plot_coef(linear_coef)

**2-Ridge():** It is a linear model which uses **L2 regularization** technique.

**L2 Regularization:** Regularization techniques explicitly restrict a model to avoid overfitting.

**LinearRegression()** does not allow us to control its complexity so its very likely that it will **overfit** the models when the dataset is **relatively small**.

**l2 regularization** shrinks the coefficients of the independent features to as small a magnitude as possible, i.e. all entries of **w should be close to zero**

**Ridge** has an **alpha parameter** which makes a trade-off between the simplicity of the model and its performance on the training set, & hence tuning it will yield different model performance.

In [None]:
ridge_model=Ridge()

Let's perform hyperparameter tuning and fit the ridge model with the best parameters found by the GridSearchCV tuning method

GridSearchCV gave us **alpha=1** and **max_iter=1000** as the best parameters for the model

In [None]:
#candidate hyper-parameters for the grid search
#max_iter is an iteration count and has to be an integer: scikit-learn
#validates it as such, so float candidates like 1e3 make every fit of the
#grid search fail
params={
    "alpha":[1e-9,1e-6,1e-3,1,100,1000,10000],
    "max_iter":[10**3,10**4,10**5,10**6]#maximum number of iterations to run
}

ridge_best_params=gridSearch(ridge_model,params,X_train_scaled,y_train)
ridge_best_params

Fit ridge model with best parameters

In [None]:
ridge1_model=Ridge(**ridge_best_params)

Fit the ridge model and return the **test** and **train** scores

Fitting the model we get scores equivalent to the LinearRegression()

In [None]:
fit_model_getScores(ridge1_model,
                    X_train_scaled,y_train,
                    X_test_scaled,y_test
                   )

Get the metrics to evaluate ridge model

In [None]:
get_metrics(ridge1_model,
            X_test_scaled,y_test
           )

Get the coefficient series of the ridge model

From the values returned in the series, we can see that the **coefficients** have been **reduced** to fractions of their original values.

The **L2 regularization** technique in Ridge shrinks the coefficients of the features **as close to zero as possible.**


In [None]:
ridge_coef=return_coef_series(ridge1_model,X).sort_values()
ridge_coef

Visualize the coefficient series of ridge model

We can see that the upper & lower limits of the y-axis (the coefficient values) have shrunk.

In [None]:
plot_coef(ridge_coef)

**3-Lasso():** It is a linear model which uses **L1 regularization** technique.

**l1 regularization** also reduces the coefficient magnitudes; however, unlike Ridge, it **reduces the coefficients of some of the features to exactly zero**. Hence it **neglects** some of the features completely.

Hence it is also used for **automatic feature selection** as it ignores some of the features.

**Lasso** also has an **alpha parameter** which makes a trade-off between the simplicity of the model and its performance on the training set, & hence tuning it will yield different model performance.

In [None]:
lasso_model=Lasso()

We got **alpha=100** & **max_iter=1000** for the lasso model 



In [None]:
#candidate hyper-parameters for the grid search
#max_iter is an iteration count and has to be an integer: scikit-learn
#validates it as such, so float candidates like 1e3 make every fit of the
#grid search fail
params={
    "alpha":[1e-9,1e-6,1e-3,1,100,1000,10000],
    "max_iter":[10**3,10**4,10**5,10**6]#maximum number of iterations to run
}

lasso_best_params=gridSearch(lasso_model,params,X_train_scaled,y_train)
lasso_best_params

Lets try fitting the Lasso model using the parameters that have been returned from Hypertuning

In [None]:
lasso1_model=Lasso(**lasso_best_params)

The scores are **93% for training set** and **86% for testing set** which is better generalized model than the above two models i.e LinearRegression() & Ridge()

In [None]:
fit_model_getScores(lasso1_model,
                    X_train_scaled,y_train,
                    X_test_scaled,y_test
                   )

Print the metrics of lasso model

In [None]:
get_metrics(lasso1_model,
            X_test_scaled,y_test
           )

Lets see the magnitude of coefficients returned by the lasso model.

This is where it gets very interesting: we can see that there are many features whose coefficients are reduced to zero. It means that the Lasso model has completely ignored the features with coefficients equal to zero while fitting the model.

In [None]:
lasso_coef=return_coef_series(lasso1_model,X).sort_values()
lasso_coef

Lets create Dataframe that stores the features with its corresponding coefficient values

In [None]:
#one-column dataframe ("Coefficient") that keeps the feature names as its index
lasso_coef_df=lasso_coef.to_frame(name="Coefficient")
lasso_coef_df

Lets us see how many features have been used in the model and how many have been neglected by the Lasso model

The dataframes below shows the features as an indexes whose coefficient has been reduced to zero and are completely neglected by the Lasso model

In [None]:
#NOTE: despite its name, features_used holds the features that Lasso
#neglected, i.e. the rows whose coefficient is exactly zero
features_used=lasso_coef_df.loc[lasso_coef_df["Coefficient"].eq(0)]
features_used

The Lasso model has used 26 features out of 43 and has neglected 17 features

In [None]:
#features_used holds the zero-coefficient rows, so its length is the number
#of features the model neglected
total_features=X.shape[1]
neglected_features=features_used.shape[0]

print("Total Features:{}".format(total_features))
print("Features Neglected:{}".format(neglected_features))
print("Features Used:{}".format(total_features-neglected_features))

It is very clear from the plot that many of the features are neglected, their bars being reduced to 0 magnitude, and hence lasso is mostly used for automatic feature selection.

In [None]:
plot_coef(lasso_coef)

**Comparison between the three Linear Models:**

* From the plot below comparing the coefficients of the independent features, it is clear that the **LinearRegression()** model has mostly nonzero coefficients of large magnitude, and most of its values are out of y-lim; they are represented by blue square blocks.

* The **Ridge()** model, however, has coefficients whose magnitudes are smaller and close to zero, which are represented by orange **'^'**.

* Coming to the **Lasso()** model, most of its values lie on the horizontal line and a few lie very close to it, owing to their smaller magnitude; they are represented by green **'v'**

In [None]:
#specify the figure & size
fig=plt.figure(figsize=(12,7))

#plot the coefficient of individual linear models
plt.plot(linear_model.coef_,'s',label="Linear Regression")
plt.plot(ridge1_model.coef_,'^',label="Ridge")
plt.plot(lasso1_model.coef_,'v',label="Lasso")

#specify columns/features as the xticks
plt.xticks(range(X.shape[1]), X.columns,rotation=90)

#the length of horizontal line equals to the length of features
plt.hlines(0,0,X.shape[1])

#specify the x & y labels
plt.xlabel("Features")
plt.ylabel("Coefficient magnitude")

plt.legend()
plt.show()