In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print the paths one per line.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the data set from its CSV file and preview the first few rows
csv_path = '../input/vehicle-dataset-from-cardekho/car data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Print the distinct values taken by each of the categorical features
for categorical_col in ('Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'):
    print(df[categorical_col].unique())

In [None]:
# Build the modelling frame in one chain so that `final_dataset` is an
# independent DataFrame rather than a slice of `df`. Assigning a column to a
# slice (and dropping in place) raises SettingWithCopyWarning and makes the
# cell non-idempotent; `assign`/`drop` each return a new frame instead.
#
# - The car name is left out because it has no mathematical significance.
# - `Age` is the car's age relative to the reference year 2020.
# - `Year` is dropped afterwards since `Age` carries the same information.
final_dataset = (
    df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
        'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]
    .assign(Age=lambda frame: 2020 - frame['Year'])
    .drop(columns=['Year'])
)
final_dataset.head()

In [None]:
# Encoding the categorical features
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

# Every object-dtype column is overwritten with integer codes; the single
# encoder instance is re-fitted for each column in turn.
object_columns = [name for name in final_dataset.columns
                  if final_dataset[name].dtypes == 'O']
for name in object_columns:
    final_dataset[name] = lb.fit_transform(final_dataset[name])
final_dataset.head()

In [None]:
# Pairwise Pearson correlation between all columns
final_dataset.corr(method='pearson')

In [None]:
# Pairwise scatter plots of every column against every other column
import seaborn as sns
sns.pairplot(data=final_dataset)

In [None]:
# Correlation heatmap
import matplotlib.pyplot as plt
%matplotlib inline
# Large canvas so the annotated cells stay readable
plt.figure(figsize=(20, 20))
# Correlations rounded to two decimals and written into each cell
rounded_corr = final_dataset.corr().round(2)
sns.heatmap(data=rounded_corr, annot=True)

In [None]:
# Separating the dependent (target) and independent (predictor) features
y = final_dataset['Selling_Price']
X = final_dataset.drop(columns=['Selling_Price'])

In [None]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=355,
)

In [None]:
# RandomForest regressor
from sklearn.ensemble import RandomForestRegressor
# Forest with default parameters; `model1` is rebound to the randomized
# search object in the hyper-parameter tuning cell.
model1 = RandomForestRegressor()

In [None]:
#Hyper-parameters

# No. of trees in Random Forest
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# No. of features to consider at every split.
# 1.0 means "use all features", which is what 'auto' meant for forest
# regressors; the 'auto' string itself was removed in scikit-learn 1.3 and
# would make every candidate that samples it fail.
max_features = [1.0, 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(start=5, stop=30, num=6)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Hyper Parameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Search space: one list of candidate values per forest hyper-parameter
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

# 10 randomly sampled candidates, each evaluated with 5-fold cross-validation
# on negative mean squared error, using a single worker.
model1 = RandomizedSearchCV(
    estimator=RandomForestRegressor(),
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=1,
)
model1.fit(X_train, y_train)

In [None]:
# Show the winning parameter combination, then its cross-validated score
for search_result in (model1.best_params_, model1.best_score_):
    print(search_result)


In [None]:
# Refit a forest using the hyper-parameters selected by the randomized search
# (`model1.best_params_`) instead of hand-copied values, so this cell cannot
# drift out of sync with the search result. The fixed seed makes the fit, and
# therefore the reported metrics, repeatable across runs.
rf_model = RandomForestRegressor(random_state=42, **model1.best_params_)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

# Plotting the predictions against the true values
plt.scatter(y_test, y_pred)
plt.xlabel('Actual selling price')
plt.ylabel('Predicted selling price')
plt.show()

In [None]:
# Displaying the model metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score

# MSE is computed once and reused for the RMSE line
rf_mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', rf_mse)
print('RMSE:', np.sqrt(rf_mse))
print("Model Score:", rf_model.score(X_test, y_test))

In [None]:
# Applying Linear Regression
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

# Plain least-squares model fitted on the training split
regression = LinearRegression().fit(X_train, y_train)
regression

In [None]:
# Training & Testing data scores

def adjusted_r2(r2, n_samples, n_predictors):
    """Return the adjusted R² for a model with an intercept.

    Parameters
    ----------
    r2 : float
        Ordinary R² of the model on the data set being evaluated.
    n_samples : int
        Number of rows in that data set.
    n_predictors : int
        Number of predictor columns used by the model.

    Returns
    -------
    float
        1 - (1 - r2) * (n_samples - 1) / (n_samples - n_predictors - 1)
    """
    return 1 - (1 - r2) * (n_samples - 1) / (n_samples - n_predictors - 1)


# Both adjusted scores are derived from the *fitted* `regression` model.
# Fitting a separate statsmodels OLS (without an intercept column, and on the
# test set itself for the test score) measures a different model and is not
# comparable with `regression.score`.
n_predictors = X_train.shape[1]
train_r2 = regression.score(X_train, y_train)
test_r2 = regression.score(X_test, y_test)

print("R2 Score for Training dataset: ", train_r2)
print("Adjusted R2 Score for Training dataset: ", adjusted_r2(train_r2, len(X_train), n_predictors))

print("R2 score for Test dataset: ", test_r2)
print("Adjusted R2 Score for Testing dataset: ", adjusted_r2(test_r2, len(X_test), n_predictors))

We see that the adjusted R² score is slightly lower for the test data than for the training data. Let's check whether our model is overfitting the training data.

In [None]:
# Lasso Regularization
from sklearn.linear_model  import Lasso, LassoCV

# LassoCV picks the best alpha from its default alpha path using 5-fold
# cross-validation. `normalize` is deliberately not passed: the option was
# removed from scikit-learn's linear models in 1.2, and leaving it out keeps
# the selected alpha on the same feature scale as the plain Lasso fitted below.
lasscv = LassoCV(cv=5, max_iter=100000)
lasscv.fit(X_train, y_train)

# best alpha parameter
alpha = lasscv.alpha_
print('Best alpha:', alpha)

# Now that we have the best alpha, fit a Lasso model with it and score it on
# the test split to compare with the unregularized linear regression.
lasso_reg = Lasso(alpha=alpha)
lasso_reg.fit(X_train, y_train)
lasso_reg.score(X_test, y_test)

Our R² score on the test data (67.98%) is almost the same as it was before applying regularization, so it is fair to say that our OLS model did not overfit the data.

In [None]:
# Using Ridge regression model
# RidgeCV selects the best alpha from the supplied candidates using 5-fold
# cross-validation. The candidates are drawn uniformly from [0, 10) with a
# fixed seed so that every run searches the same values.
# `normalize` is not passed: it was removed from scikit-learn's linear models
# in 1.2, and leaving it out keeps the selected alpha on the same feature
# scale as the Ridge model fitted below.
from sklearn.linear_model  import Ridge,RidgeCV
alphas = np.random.default_rng(42).uniform(low=0, high=10, size=(50,))
ridgecv = RidgeCV(alphas=alphas, cv=5)
ridgecv.fit(X_train, y_train)

# Refit with the selected alpha and score on the test split
ridge_model = Ridge(alpha=ridgecv.alpha_)
ridge_model.fit(X_train, y_train)
ridge_model.score(X_test, y_test)

We got almost the same R² score using Ridge regression as well, so it is safe to say there is no overfitting.

In [None]:
# Elastic net
from sklearn.linear_model  import ElasticNet,ElasticNetCV

# 5-fold cross-validated search over the automatically generated alpha path
elasticCV = ElasticNetCV(alphas=None, cv=5)
elasticCV.fit(X_train, y_train)

# Refit with the selected alpha (equal L1/L2 mix) and score on the test split
elasticnet_reg = ElasticNet(alpha=elasticCV.alpha_, l1_ratio=0.5).fit(X_train, y_train)
elasticnet_reg.score(X_test, y_test)

Even after using different regularisation techniques, we get approximately the same R² score. Hence, we can conclude that our model is not overfitted.

In [None]:
# Scatter of the true test targets against the linear model's predictions;
# ideally the points should fall on a straight line
linear_test_pred = regression.predict(X_test)
plt.scatter(y_test, linear_test_pred)
plt.show()

In [None]:
# Displaying the model metrics
# Predictions and MSE are computed once and reused by the lines below
linreg_pred = regression.predict(X_test)
linreg_mse = metrics.mean_squared_error(y_test, linreg_pred)
print('MAE:', metrics.mean_absolute_error(y_test, linreg_pred))
print('MSE:', linreg_mse)
print('RMSE:', np.sqrt(linreg_mse))
print("Model Score:", metrics.r2_score(y_test, linreg_pred))


In [None]:
# Applying XGBoost
# Importing XG Boost libraries

import xgboost as xgb
from xgboost import XGBRegressor
# Regressor with default parameters; nothing is fitted in this cell, and
# `xg_model` is reassigned by the hyper-parameter tuning cell further down.
xg_model = XGBRegressor()


In [None]:
# Candidate learning rates
learning_rate = [1, 0.5, 0.1, 0.01, 0.001]
# No. of trees: 100, 200, ..., 1200
n_estimators = list(range(100, 1201, 100))
# Maximum number of levels in tree: 5, 10, ..., 30
max_depth = list(range(5, 31, 5))

In [None]:
# Hyper Parameter tuning

# Search space for the XGBoost regressor
random_grid_xg = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
)

# 10 randomly sampled candidates, 5-fold cross-validation, single worker.
# No `scoring` is given, so candidates are ranked by the estimator's own score.
xg_model = RandomizedSearchCV(
    XGBRegressor(objective='reg:squarederror'),
    param_distributions=random_grid_xg,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=1,
)
xg_model.fit(X_train, y_train)

In [None]:
# Show the winning parameter combination, then its cross-validated score
for search_result in (xg_model.best_params_, xg_model.best_score_):
    print(search_result)

In [None]:
# Final XGBoost regressor with explicitly set hyper-parameters
# NOTE(review): values look hand-copied from the search output -- confirm they
# still match `best_params_` when the search is re-run.
xg_model = XGBRegressor(n_estimators=900, max_depth=5, learning_rate=0.01)
xg_model.fit(X_train, y_train)
# Plotting the predictions against the true test values
xgb_test_pred = xg_model.predict(X_test)
plt.scatter(y_test, xgb_test_pred)

In [None]:
# Displaying the model metrics
# Predictions and MSE are computed once and reused by the lines below
xgb_pred = xg_model.predict(X_test)
xgb_mse = metrics.mean_squared_error(y_test, xgb_pred)
print('MAE:', metrics.mean_absolute_error(y_test, xgb_pred))
print('MSE:', xgb_mse)
print('RMSE:', np.sqrt(xgb_mse))
print("Model Score:", xg_model.score(X_test, y_test))


# **We can clearly see that XGBoost Regression gives us the best model score.**