In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import warnings
warnings.simplefilter(action='ignore')

In [None]:
# Location of the CarDekho CSV as mounted under /kaggle/input.
# NOTE(review): absolute path — presumably only resolves inside a Kaggle kernel; adjust when running elsewhere.
path = '/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv'
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.describe()

In [None]:
df.duplicated().any()

In [None]:
# Remove fully identical rows (the first occurrence is kept), then show the resulting shape.
df = df.drop_duplicates()
df.shape

In [None]:
df.info()

In [None]:
# 'torque' is not used in the rest of this notebook, so it is dropped here.
# errors='ignore' keeps the cell re-runnable once the column is already gone
# (the in-place drop raised KeyError on a second execution).
df = df.drop(columns=['torque'], errors='ignore')
df.head()

In [None]:
#Check missing values
df.isnull().any()

In [None]:
#Missing values in Percentage of the Total Sample
df.isnull().sum() / df.shape[0] * 100

In [None]:
# With more than 6000 samples and at most ~3% of values missing in any column,
# rows containing a missing value are simply discarded.
df = df.dropna(axis=0)
df.isnull().any()

In [None]:
df.shape

### **Removing Units**

In [None]:
# Strip the unit suffix so the columns can be handled as float columns
# Method 1
def remove_unit(df, colum_name):
    """Return the leading space-delimited token of every value in ``df[colum_name]``.

    Each value is passed through ``str`` first, so ``'1248 CC'`` yields ``'1248'``.
    The result is a plain list of strings in row order; ``df`` itself is not modified.
    """
    return [str(value).split(' ')[0] for value in df[colum_name]]


In [None]:
# For each measurement column: strip the unit suffix, then parse what is left as a number.
for unit_col in ('engine', 'mileage', 'max_power'):
    df[unit_col] = remove_unit(df, unit_col)
    df[unit_col] = pd.to_numeric(df[unit_col])
df.head()

In [None]:
# Positional lookup: rows were removed by drop_duplicates()/dropna() without resetting
# the index, so the label 0 is not guaranteed to exist any more.
type(df['engine'].iloc[0])

In [None]:
# Replace the model year with the car's age relative to 2021.
# pop() removes 'year' from the frame while handing its values to the subtraction.
df['age'] = 2021 - df.pop('year')

In [None]:
# Show the distinct levels of each categorical column.
for cat_col in ('fuel', 'seller_type', 'transmission', 'owner'):
    print(df[cat_col].unique())

In [None]:
# Ordinal encoding of the ownership history
owner_rank = {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3, 'Fourth & Above Owner': 4, 'Test Drive Car': 5}
# replace() leaves already-encoded integers untouched, so the cell stays re-runnable.
# The explicit cast guarantees an integer column: newer pandas versions deprecate the
# implicit downcast of replace()'s object result, and an object column would be one-hot
# encoded by get_dummies later on. The cast also raises if an unmapped label is still present.
df['owner'] = df['owner'].replace(owner_rank).astype('int64')
df.head()

In [None]:
df['seats'].unique()

In [None]:
# The seat count is treated as a category rather than a quantity, so it is stored as strings.
df['seats'] = df['seats'].astype(str)

# **EDA**

## **Univariate Analysis**

In [None]:
# One histogram per numeric column on a 3x2 grid.
# Each entry is (column, subplot title, trace name), listed in row-major grid order.
# The former seventh title "Number of Seats" was removed: the grid has six cells and
# no trace was ever added for 'seats'.
hist_specs = [
    ('selling_price', "Selling Price in Rupee", "Rupee"),
    ('km_driven', "Total KM Driven", "KM"),
    ('mileage', "Fuel Efficiency in KM per litre", "KM/L"),
    ('engine', "Engine CC", "CC"),
    ('max_power', "Brake Horse Power(BHP)", "BHP"),
    ('age', "Age of Car", "Years"),
]

fig = make_subplots(rows=3, cols=2, subplot_titles=[title for _, title, _ in hist_specs])

for position, (column, _, trace_name) in enumerate(hist_specs):
    # position 0..5 -> (row, col) = (1,1), (1,2), (2,1), (2,2), (3,1), (3,2)
    fig.add_trace(go.Histogram(x=df[column], name=trace_name), row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(height=1400, width=800, title_text="Distribution of numerical data")
fig.show()

In [None]:
# One horizontal box plot per numeric column on a 3x2 grid.
# Each entry is (column, subplot title, trace name), listed in row-major grid order.
# The former seventh title "Number of Seats" was removed: the grid has six cells and
# no trace was ever added for 'seats'.
box_specs = [
    ('selling_price', "Selling Price in Rupee", "Rupee"),
    ('km_driven', "Total KM Driven", "KM"),
    ('mileage', "Fuel Efficiency in KM per litre", "KM/L"),
    ('engine', "Engine CC", "CC"),
    ('max_power', "Brake Horse Power(BHP)", "BHP"),
    ('age', "Age of Car", "Years"),
]

fig = make_subplots(rows=3, cols=2, subplot_titles=[title for _, title, _ in box_specs])

for position, (column, _, trace_name) in enumerate(box_specs):
    # position 0..5 -> (row, col) = (1,1), (1,2), (2,1), (2,2), (3,1), (3,2)
    fig.add_trace(go.Box(x=df[column], name=trace_name), row=position // 2 + 1, col=position % 2 + 1)

fig.update_layout(height=1400, width=800, title_text="Distribution of numerical data")
fig.show()

In [None]:
# Frequency table (category, count) for each categorical column.
# rename_axis()/reset_index(name=...) names both output columns explicitly. The previous
# rename({'index': col, col: 'count'}) assumed reset_index() emits 'index' and '<col>';
# pandas >= 2.0 emits '<col>' and 'count' instead, which turned the rename into two
# columns both called 'count'.
count_fuel = df['fuel'].value_counts().rename_axis('fuel').reset_index(name='count')

count_seller = df['seller_type'].value_counts().rename_axis('seller_type').reset_index(name='count')

count_transmission = df['transmission'].value_counts().rename_axis('transmission').reset_index(name='count')

count_owner = df['owner'].value_counts().rename_axis('owner').reset_index(name='count')

count_seats = df['seats'].value_counts().rename_axis('seats').reset_index(name='count')

## **Bivariate/Multivariate Analysis**

In [None]:
# Correlation is only defined for numeric columns. Selecting them explicitly avoids the
# ValueError that pandas >= 2.0 raises when corr() meets string columns such as 'name'
# or 'fuel', and gives the same matrix older versions produced by silently skipping them.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap="RdBu")
plt.show()

In [None]:
# Grid of pairwise relationship plots for the columns of df.
sns.pairplot(df)

# **Feature Selection, Feature Engineering and Data Preparation for Modelling**

In [None]:
# Make a copy of the data for modelling
df_model = df.copy()

# Create the 'brand' column from the first word of 'name', then discard 'name'
df_model['brand'] = df_model['name'].str.split(' ').str.get(0)
df_model = df_model.drop(columns=['name'])

# Row filters. Every condition tests the raw (untransformed) values, so they are combined
# into one mask and applied once. The .copy() makes the result an independent frame, so the
# column assignments below no longer write into a slice of another frame
# (the source of SettingWithCopyWarning in the step-by-step version).
keep_rows = (
    (df_model['selling_price'] < 2500000)                      # outliers in the target
    & (df_model['km_driven'] < 300000)                         # outliers in 'km_driven'
    & ~df_model['fuel'].isin(['CNG', 'LPG'])                   # unwanted fuel types
    & (df_model['mileage'] > 5) & (df_model['mileage'] < 35)   # outliers in 'mileage'
    & (df_model['max_power'] < 300)                            # outliers in 'max_power'
)
df_model = df_model[keep_rows].copy()

# Log-transform the target ('selling_price') and the 'max_power' and 'age' features.
# NOTE(review): np.log(age) is -inf for a car whose year is 2021 (age 0) — confirm none exist.
for log_col in ['selling_price', 'max_power', 'age']:
    df_model[log_col] = np.log(df_model[log_col])

df_model.head()

In [None]:
print(df_model['brand'].unique())

In [None]:
# One-hot encode the non-numeric columns of df_model; drop_first=True omits the first
# level of each encoded column so the dummy columns are not perfectly collinear.
df_model = pd.get_dummies(data = df_model, drop_first=True)
df_model.head()

In [None]:
df_model.columns

In [None]:
# Separate the (log-transformed) target from the predictors.
y = df_model['selling_price']
X = df_model.drop(columns=['selling_price'])

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
for label, part in (("x train: ", X_train), ("x test: ", X_test), ("y train: ", y_train), ("y test: ", y_test)):
    print(label, part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
num_var = ['km_driven', 'mileage', 'engine', 'max_power', 'age']
# The split returns row subsets of X; take explicit copies so the column assignments
# below operate on independent frames and cannot trigger SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()
# Fit the scaler on the training rows only, then reuse its statistics for the test rows.
X_train[num_var] = scaler.fit_transform(X_train[num_var])
X_test[num_var] = scaler.transform(X_test[num_var])

In [None]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
# Recursive feature elimination driven by a 100-tree random forest, keeping 40 features.
select = RFE(RandomForestRegressor(n_estimators=100, random_state=42), n_features_to_select=40)
# The selector is fitted on the training data only.
select.fit(X_train, y_train)
# Reduced train/test matrices restricted to the selected features.
X_train_rfe= select.transform(X_train)
X_test_rfe= select.transform(X_test)

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Per-model score accumulators for the full-feature runs. car_price_prediction_model
# appends one value to each list per call, so re-run this cell to reset them before
# re-running the model cells.
r2_train_scores = []
r2_test_scores = []
cv_mean = []

def car_price_prediction_model(model):
    """Fit ``model`` on the full feature set, report its R2 scores and plot diagnostics.

    Reads the notebook globals ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Appends the rounded train R2, test R2 and mean 5-fold CV R2 to ``r2_train_scores``,
    ``r2_test_scores`` and ``cv_mean`` respectively, prints the scores, and shows a
    residual density plot next to a y_test-vs-prediction scatter plot. Returns nothing.
    """
    model.fit(X_train, y_train)

    # R2 score of training set
    y_train_pred = model.predict(X_train)
    r2_train = r2_score(y_train, y_train_pred)
    r2_train_scores.append(round(r2_train,2))

    # R2 score of test set
    y_test_pred = model.predict(X_test)
    r2_test = r2_score(y_test, y_test_pred)
    r2_test_scores.append(round(r2_test,2))

    # 5-fold CV score of training set. scoring='r2' is explicit because a search object
    # built with another ``scoring`` (e.g. 'neg_mean_squared_error') would otherwise report
    # that metric here, making the CV column incomparable across models.
    cv_training = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    cv_mean_training = cv_training.mean()
    cv_mean.append(round(cv_mean_training,2))

    # Printing each score
    print("Training set R2 scores: ",round(r2_train,2))
    print("Test set R2 scores: ",round(r2_test,2))
    print("Training cross validation score: ", cv_training)
    print("Training cross validation mean score: ",round(cv_mean_training,2))

    fig, ax = plt.subplots(1,2,figsize = (10,4))
    ax[0].set_title('Residual Plot of Train samples')
    # kdeplot draws the same density curve as the deprecated distplot(hist=False)
    sns.kdeplot(y_train - y_train_pred, ax = ax[0])
    # The curve shows residuals (y_train - prediction), not predictions
    ax[0].set_xlabel('residual')

    # y_test vs predicted y_test scatter plot
    ax[1].set_title('y_test vs y_pred_test')
    ax[1].scatter(x = y_test, y = y_test_pred)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test')

    plt.show()

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Per-model score accumulators for the RFE-reduced runs. car_price_prediction_model_rfe
# appends one value to each list per call, so re-run this cell to reset them before
# re-running the model cells.
r2_train_scores_rfe = []
r2_test_scores_rfe = []
cv_mean_rfe = []

def car_price_prediction_model_rfe(model):
    """Fit ``model`` on the RFE-selected features, report its R2 scores and plot diagnostics.

    Reads the notebook globals ``X_train_rfe``, ``X_test_rfe``, ``y_train`` and ``y_test``.
    Appends the rounded train R2, test R2 and mean 5-fold CV R2 to ``r2_train_scores_rfe``,
    ``r2_test_scores_rfe`` and ``cv_mean_rfe`` respectively, prints the scores, and shows a
    residual density plot next to a y_test-vs-prediction scatter plot. Returns nothing.
    Note that ``model`` is left fitted on the reduced feature set.
    """
    model.fit(X_train_rfe, y_train)

    # R2 score of RFE training set
    y_train_pred_rfe = model.predict(X_train_rfe)
    r2_train_rfe = r2_score(y_train, y_train_pred_rfe)
    r2_train_scores_rfe.append(round(r2_train_rfe,2))

    # R2 score of RFE test set
    y_test_pred_rfe = model.predict(X_test_rfe)
    r2_test_rfe = r2_score(y_test, y_test_pred_rfe)
    r2_test_scores_rfe.append(round(r2_test_rfe,2))

    # 5-fold CV score of RFE training set. scoring='r2' is explicit because a search object
    # built with another ``scoring`` (e.g. 'neg_mean_squared_error') would otherwise report
    # that metric here, making the CV column incomparable across models.
    cv_training_rfe = cross_val_score(model, X_train_rfe, y_train, cv=5, scoring='r2')
    cv_mean_training_rfe = cv_training_rfe.mean()
    cv_mean_rfe.append(round(cv_mean_training_rfe,2))

    # Printing each score
    print("Training set R2 scores: ",round(r2_train_rfe,2))
    print("Test set R2 scores: ",round(r2_test_rfe,2))
    print("Training cross validation score: ", cv_training_rfe)
    print("Training cross validation mean score: ",round(cv_mean_training_rfe,2))

    fig, ax = plt.subplots(1,2,figsize = (10,4))
    ax[0].set_title('Residual Plot of RFE-Train samples')
    # kdeplot draws the same density curve as the deprecated distplot(hist=False)
    sns.kdeplot(y_train - y_train_pred_rfe, ax = ax[0])
    ax[0].set_xlabel('residual')

    # y_test vs predicted y_test scatter plot
    ax[1].set_title('y_test vs y_pred_test_rfe')
    ax[1].scatter(x = y_test, y = y_test_pred_rfe)
    ax[1].set_xlabel('y_test')
    ax[1].set_ylabel('y_pred_test_rfe')

    plt.show()

# **Regression Modelling and Evaluation**

### **1. Linear Regression(Ordinary Least Square)**

In [None]:
from sklearn.linear_model import LinearRegression
lm = LinearRegression()
car_price_prediction_model(lm)

In [None]:
car_price_prediction_model_rfe(lm)

### **2. Linear Regression(Ridge)**

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

rg = Ridge()
# 14 candidate penalties, log-spaced between 1e-3 and 1e3
alpha = np.logspace(-3,3,num=14)
# The search samples a subset of the candidates (n_iter defaults to 10); random_state
# fixes which ones, so the reported scores are reproducible across runs.
rg_rs = RandomizedSearchCV(estimator=rg, param_distributions=dict(alpha=alpha), random_state=42)
car_price_prediction_model(rg_rs)

In [None]:
car_price_prediction_model_rfe(rg_rs)

### **3. Linear Regression(Lasso)**

In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV

ls = Lasso()
# 14 candidate penalties, log-spaced between 1e-3 and 1e3
alpha = np.logspace(-3,3,num=14)
# The search samples a subset of the candidates (n_iter defaults to 10); random_state
# fixes which ones, so the reported scores are reproducible across runs.
ls_rs = RandomizedSearchCV(estimator=ls, param_distributions=dict(alpha=alpha), random_state=42)
car_price_prediction_model(ls_rs)

In [None]:
car_price_prediction_model_rfe(ls_rs)

### **4. Extreme Gradient Boosting Regressor**

In [None]:
from xgboost import XGBRegressor
xg = XGBRegressor(verbosity= 0)

# Candidate values for each tuned hyperparameter
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
booster=['gbtree','gblinear']
learning_rate=[0.05,0.1,0.15,0.20]
min_child_weight=[1,2,3,4]
base_score=[0.25,0.5,0.75,1]

parameter_grid = {
    'n_estimators': n_estimators,
    'max_depth':max_depth,
    'learning_rate':learning_rate,
    'min_child_weight':min_child_weight,
    'booster':booster,
    'base_score':base_score
    }

# Only a random sample of the grid is evaluated (n_iter defaults to 10); random_state
# fixes which combinations are drawn, so the reported scores are reproducible across runs.
xg_rs = RandomizedSearchCV(estimator=xg, param_distributions=parameter_grid, random_state=42)

In [None]:
car_price_prediction_model(xg_rs)

In [None]:
car_price_prediction_model_rfe(xg_rs)

### **5. Random Forest Regressor**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seed the forest so that repeated runs grow the same trees
rf = RandomForestRegressor(random_state=42)

# Number of trees in Random forest
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of features to be considered at each split.
# 1.0 (all features) stands in for 'auto', which meant the same thing for this regressor
# but was removed in scikit-learn 1.3 and now fails parameter validation.
max_features=[1.0,'sqrt']

# Hyperparameters dict
param_grid = {"n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

rf_rs = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, scoring='neg_mean_squared_error', n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=1)

In [None]:
car_price_prediction_model(rf_rs)

In [None]:
car_price_prediction_model_rfe(rf_rs)

### **6. Gradient Boosting Regressor**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

# Seed the booster: with max_features='sqrt' the feature subsampling is random
gb = GradientBoostingRegressor(random_state=42)

# Rate at which correcting is being made
learning_rate = [0.001, 0.01, 0.1, 0.2]
# Number of trees in Gradient boosting
n_estimators=list(range(500,1000,100))
# Maximum number of levels in a tree
max_depth=list(range(4,9,4))
# Minimum number of samples required to split an internal node
min_samples_split=list(range(4,9,2))
# Minimum number of samples required to be at a leaf node.
min_samples_leaf=[1,2,5,7]
# Number of features to be considered at each split.
# None (all features) stands in for 'auto', which meant the same thing for this regressor
# but was removed in scikit-learn 1.3 and now fails parameter validation.
max_features=[None,'sqrt']

# Hyperparameters dict
param_grid = {"learning_rate":learning_rate,
              "n_estimators":n_estimators,
              "max_depth":max_depth,
              "min_samples_split":min_samples_split,
              "min_samples_leaf":min_samples_leaf,
              "max_features":max_features}

gb_rs = RandomizedSearchCV(estimator = gb, param_distributions = param_grid, scoring='neg_mean_squared_error', n_iter=10, cv=5, verbose=2, random_state=42, n_jobs=1)
# n_jobs=1 runs the search in a single process; raise it (or use -1 for all cores) to parallelise

In [None]:
car_price_prediction_model(gb_rs)

In [None]:
car_price_prediction_model_rfe(gb_rs)

# **Model Evaluation**

In [None]:
# Display names of the models, in the order they were trained above
algo = [
    "LinearRegression(OLS)",
    "LinearRegression(Ridge)",
    "LinearRegression(Lasso)",
    "ExtremeGradientBoostingRegressor",
    "RandomForestRegressor",
    "GradientBoostingRegressor",
]

# One row per model: scores collected by car_price_prediction_model on the full feature set
eval_columns = {
    'Model': algo,
    'R Squared(Train)': r2_train_scores,
    'R Squared(Test)': r2_test_scores,
    'CV score mean(Train)': cv_mean,
}
model_eval = pd.DataFrame(eval_columns)
display(model_eval)

In [None]:
# One row per model: scores collected by car_price_prediction_model_rfe on the RFE-selected features
eval_columns_rfe = {
    'Model': algo,
    'R Squared(Train)': r2_train_scores_rfe,
    'R Squared(Test)': r2_test_scores_rfe,
    'CV score mean(Train)': cv_mean_rfe,
}
model_eval_RFE = pd.DataFrame(eval_columns_rfe)
display(model_eval_RFE)

# **Conclusion**

**1. Gradient Boosting Regressor is the model I will choose since it has the highest CV score (91%), which means it generalizes better than the other models.**

**2. A linear model is also a good choice under computational constraints, since the non-linear models are considerably more expensive to train.**

**3. The automatic feature selection (RFE) did not significantly improve any of the models. Hence we do not need it unless computation time is a concern.**

In [None]:
# Refit on the full feature set: the earlier car_price_prediction_model_rfe(gb_rs) call
# left gb_rs fitted on the RFE-reduced matrix, which does not have the same columns as X_test.
gb_rs.fit(X_train, y_train)

In [None]:
predictions = gb_rs.predict(X_test)

In [None]:
predictions

In [None]:
# Distribution of the test-set residuals. histplot(kde=True, stat='density') is the
# documented replacement for the deprecated distplot (density histogram plus KDE curve).
sns.histplot(y_test - predictions, kde=True, stat='density')

In [None]:
plt.scatter(y_test, predictions)

In [None]:
# import pickle
# #open the file where you want to store the data
# file = open('gradient_boosting_regressor_model.pkl', 'wb')
# #dump information to the file
# pickle.dump(gb_rs, file)