In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from numpy import arange
from sklearn import metrics
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import VotingRegressor

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#loading the data
df = pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

In [None]:
#check first 10 rows
df.head(10)

In [None]:
#check bottom 5 rows
df.tail()

In [None]:
#check number of columns and rows
df.shape

In [None]:
#lets get info about the data
df.info()

In [None]:
#check for null values
df.isnull().sum()

In [None]:
#lets check some statistics
df.describe()

In [None]:
#lets check for datatypes
df.dtypes

In [None]:
# Names of the categorical columns (object dtype), kept in column order.
cat_vars = [name for name, kind in df.dtypes.items() if kind == 'O']
cat_vars

In [None]:
# Names of the numerical columns (anything that is not object dtype), kept in column order.
num_vars = [name for name, kind in df.dtypes.items() if kind != 'O']
num_vars

In [None]:
#length of numerical and categorical variables
print(len(num_vars))
print(len(cat_vars))

In [None]:
# Show the distinct labels of each categorical column, one column per paragraph.
labelled_columns = [
    ('unique cars:', 'Car_Name'),
    ('unique fuel type:', 'Fuel_Type'),
    ('seller type:', 'Seller_Type'),
    ('mode of transmission:', 'Transmission'),
]
for position, (label, column) in enumerate(labelled_columns):
    if position:
        # Blank separator line between paragraphs (none before the first one).
        print()
    print(label, df[column].unique())

In [None]:
#lets count the various car names
df['Car_Name'].value_counts()

In [None]:
#lets create mean car price per car name
def mean_car_price_per_carname(df, vr, target='Selling_Price'):
    """Summarise each label of ``vr`` by its share of rows and its mean ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``vr`` and ``target``.
    vr : str
        Name of the categorical column to summarise.
    target : str, default 'Selling_Price'
        Name of the numeric column whose per-label mean is reported.

    Returns
    -------
    pandas.DataFrame
        One row per label of ``vr`` with three columns: ``vr`` (the label),
        ``'per_carnames'`` (label count divided by ``len(df)``) and ``target``
        (mean of ``target`` for that label). Rows follow ``value_counts``
        order, i.e. most frequent label first.
    """
    # Share of all rows taken by each label. The denominator is len(df), so
    # rows where `vr` is missing still count towards the total.
    share = (df[vr].value_counts() / len(df)).reset_index()
    # reset_index() names its two columns differently across pandas versions;
    # renaming both positionally gives a stable schema.
    share.columns = [vr, 'per_carnames']

    # Mean target per label; the left merge keeps the frequency ordering.
    mean_target = df.groupby(vr)[target].mean().reset_index()
    return share.merge(mean_target, on=vr, how='left')

In [None]:
df_t = mean_car_price_per_carname(df, 'Car_Name')
df_t.head(20)

In [None]:
#let visualize it and see if the car names have impact on the selling price
def bar_plot(df,var):
    """Draw label frequency (bars) and mean selling price (line) on twin y-axes.

    ``df`` must contain the columns ``var``, ``"per_carnames"`` and
    ``"Selling_Price"``. Rows are plotted at their index positions and
    labelled with the values of ``var``.
    """
    positions = df.index
    fig, share_ax = plt.subplots(figsize=(30, 13))

    # Left axis: share of rows per label, with a red reference line at 0.04.
    share_ax.set_xticks(positions)
    share_ax.set_xticklabels(df[var], rotation=90)
    share_ax.bar(positions, df["per_carnames"], color='lightblue')
    share_ax.axhline(y=0.04, color='red')
    share_ax.set_xlabel(var)
    share_ax.set_ylabel('percentage of car names ')

    # Right axis: average price per label, sharing the same x positions.
    price_ax = share_ax.twinx()
    price_ax.plot(positions, df["Selling_Price"], color='red')
    price_ax.set_ylabel('Average car price per name')

    plt.show()

In [None]:
bar_plot(df_t,'Car_Name')#From the graph below the car names tend to have impact

In [None]:
#lets group the car names into rare cars and popular ones
def popular_car_grouping(df,vr,threshold=0.023256):
    """Label every row of ``df[vr]`` as ``'popular_cars'`` or ``'rare_cars'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``vr``.
    vr : str
        Name of the categorical column to group.
    threshold : float, default 0.023256
        Minimum share of rows (label count divided by ``len(df)``) a label
        needs to be considered popular. The comparison is inclusive.

    Returns
    -------
    pandas.Series
        Same index as ``df``; ``'popular_cars'`` where the row's label meets
        the threshold, ``'rare_cars'`` otherwise. Rows whose label is missing
        map to NaN because ``value_counts`` does not count them.
    """
    share = df[vr].value_counts() / len(df)
    # Build the set of popular labels once instead of re-filtering the whole
    # frequency table for every label inside the comprehension.
    popular = set(share.index[share >= threshold])
    group_car = {
        label: ('popular_cars' if label in popular else 'rare_cars')
        for label in share.index
    }
    return df[vr].map(group_car)

In [None]:
df['car_popularity']=popular_car_grouping(df,'Car_Name')

In [None]:
df['car_popularity'].unique()

In [None]:
#creating number of years from when the car was made
df['car_num_yrs'] = 2021-df['Year']

In [None]:
#to check if the new features are there
df.head()

In [None]:
#LETS CHECK THE RELATIONSHIP BETWEEN VARIOUS VARIABLES AND THE TARGET
#for numerical variables
# Every non-object column except the target itself.
num_vars = [
    name for name, kind in df.dtypes.items()
    if kind != 'O' and name != 'Selling_Price'
]
# One scatter figure per numeric feature: feature on x, selling price on y.
for feature in num_vars:
    _, ax = plt.subplots(figsize=(12, 5))
    ax.plot(df[feature], df["Selling_Price"], 'o', color='green')
    ax.set_ylabel('Selling Price')
    ax.set_xlabel(feature)

In [None]:
# Categorical features (object dtype) other than the high-cardinality Car_Name.
cat_vars = [var for var in df.columns if df[var].dtypes=='O' and var!="Car_Name"]
for c in cat_vars:
    # Aggregate first: passing the raw column to plt.bar draws one bar per row,
    # all stacked at the same x position, so only the tallest bar of each
    # category is visible. The per-category mean shows the actual relationship.
    mean_price = df.groupby(c)['Selling_Price'].mean()
    plt.figure(figsize=(12,5))
    plt.bar(mean_price.index, mean_price.values)
    plt.xlabel(c)
    plt.ylabel('Mean Selling Price')

In [None]:
#LETS CHECK FOR DISTRIBUTION OF VARIABLES
#For numerical variables
# Every non-object column except the target itself.
num_vars = [
    name for name, kind in df.dtypes.items()
    if kind != 'O' and name != 'Selling_Price'
]
# One histogram per numeric feature; 'Year' is deliberately left out.
for feature in num_vars:
    if feature == 'Year':
        continue
    _, ax = plt.subplots(figsize=(12, 5))
    ax.hist(df[feature], color='green')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')

In [None]:
#FEATURE ENGINEERING
#Handling outlier
# Pass the column by keyword: the meaning of the first positional argument of
# sns.boxplot differs between seaborn releases, so `x=` keeps the box
# horizontal regardless of the installed version.
sns.boxplot(x=df['Selling_Price'])
plt.show()

In [None]:
# Key quantiles and the mean of the target, used to judge where outliers start.
selling_price = df['Selling_Price']
quantile_labels = [
    ('The 95th quantile is:', 0.95),
    ('The 5th quantile is:', 0.05),
    ('The median is:', 0.50),
    ('The 75th quantile is:', 0.75),
]
for label, level in quantile_labels:
    print(label, selling_price.quantile(level))
print('The mean is:', selling_price.mean())

In [None]:
def outlier_boundary(vr, lower_q=0.02, upper_q=0.95):
    """Return the (lower, upper) quantile values used as outlier caps.

    Parameters
    ----------
    vr : pandas.Series
        Numeric values to inspect (any object with a ``quantile`` method).
    lower_q : float, default 0.02
        Quantile level of the lower cap.
    upper_q : float, default 0.95
        Quantile level of the upper cap.

    Returns
    -------
    tuple
        ``(vr.quantile(lower_q), vr.quantile(upper_q))``.

    Raises
    ------
    ValueError
        If the levels do not satisfy ``0 <= lower_q <= upper_q <= 1``.
    """
    if not 0 <= lower_q <= upper_q <= 1:
        raise ValueError(
            'quantile levels must satisfy 0 <= lower_q <= upper_q <= 1, '
            'got lower_q={!r}, upper_q={!r}'.format(lower_q, upper_q)
        )
    outlier_lower = vr.quantile(lower_q)
    outlier_upper = vr.quantile(upper_q)
    return outlier_lower, outlier_upper

In [None]:
#checking the top and bottom boundaries
last_p,top_p=outlier_boundary(df['Selling_Price'])
last_p,top_p

In [None]:
# Cap the target at the boundaries computed in the previous cell: values above
# top_p become top_p, values below last_p become last_p, the rest are unchanged.
# NOTE(review): last_p/top_p are reassigned later for Present_Price, so this
# cell must run right after the Selling_Price boundary cell.
df['Selling_Price'] = df['Selling_Price'].clip(lower=last_p, upper=top_p)

In [None]:
last_p,top_p=outlier_boundary(df['Present_Price'])
last_p,top_p

In [None]:
# Cap Present_Price at the boundaries computed in the previous cell: values
# above top_p become top_p, values below last_p become last_p.
df['Present_Price'] = df['Present_Price'].clip(lower=last_p, upper=top_p)

In [None]:
#Handling Rare Labels
#handling rare category that may be present in training but not in testing set(CNG is present only twice in the dataset)
df['Fuel_Type'] = df['Fuel_Type'].replace('CNG','Diesel')

In [None]:
df_t = df[['Present_Price', 'Kms_Driven','Fuel_Type', 'Seller_Type', 'Transmission', 'Owner', 'car_popularity','car_num_yrs','Selling_Price']]

In [None]:
#Categorical encoding
df_t = pd.get_dummies(df_t,drop_first=True) 

In [None]:
df_t

In [None]:
#FEATURE SELECTION
#using correlation matrix
# Pairwise correlations between all columns of the encoded frame.
mat_cor = df_t.corr()
high_cor_vr = mat_cor.index
# high_cor_vr spans every column of the matrix, so the matrix computed above
# is exactly what gets drawn.
fig, ax = plt.subplots(figsize=(18, 20))
sns.heatmap(mat_cor, annot=True, cmap='YlOrRd', ax=ax)

In [None]:
#creating x and y 
x = df_t[['Present_Price','car_num_yrs','Fuel_Type_Petrol', 'Seller_Type_Individual', 'Transmission_Manual','car_popularity_rare_cars']]
y = df_t['Selling_Price']

In [None]:
#splitting data into training and testing
# A fixed random_state makes the 80/20 split, and therefore every score
# reported below, reproducible across Restart & Run All.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
#Feature Scaling
scaler=StandardScaler()
scaler.fit(x_train)

In [None]:
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
#MODEL TRAINING
#LASSO REGRESSION
reg = Lasso()
#model evaluation: repeated k-fold cross-validation (11 folds, 4 repeats, fixed seed)
cv = RepeatedKFold(n_splits=11, n_repeats=4, random_state=20)
#Gridsearch for model tuning
# The grid starts at 0.01 rather than 0: alpha=0 turns Lasso into ordinary
# least squares, which scikit-learn advises against for this estimator (it
# warns and coordinate descent may fail to converge).
grid = {'alpha': arange(0.01, 1, 0.01)}
find_params = GridSearchCV(reg, grid, cv=cv, n_jobs=1)
#fit it to our data so that it learns the best parameters
lasso_model = find_params.fit(x_train_scaled,y_train)

In [None]:
#making predictions
lasso_pred =lasso_model.predict(x_test_scaled)

In [None]:
print('accuracy of lasso_regression:',lasso_model.score(x_test_scaled,y_test))

In [None]:
#KNN REGRESSION
#knn instance
knn_set = KNeighborsRegressor()

In [None]:
#hyperparameters
# leaf_size is intentionally not searched: it only tunes the speed and memory
# of the neighbour-search tree, not the predictions, so including it would
# multiply the number of fits without changing the selected model.
n_neighbors = list(range(1,11))
# p=1 is Manhattan distance, p=2 is Euclidean distance
p=[1,2]
#setting hyperparameters as dictionary
hyperparameters = dict(n_neighbors=n_neighbors, p=p)
#Use GridSearch with 10-fold cross-validation
knn_grid = GridSearchCV(knn_set, hyperparameters, cv=10)

In [None]:
#Fit the model
knn_reg = knn_grid.fit(x_train_scaled,y_train)
pred_knn=knn_reg.predict(x_test_scaled)

In [None]:
print('knn regression accuracy is:',knn_reg.score(x_test_scaled,y_test)) 

In [None]:
#RANDOM FOREST REGRESSION
#create instance of random forest (seeded so the fitted forest is reproducible)
rf_reg=RandomForestRegressor(random_state=42)
#Hyperparameters:
# Number of trees to try 
n_estimators = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200,1300]
#max features to consider 
# 1.0 (all features) replaces the former 'auto' option, which meant the same
# thing for regressors but was removed in scikit-learn 1.3 and now raises.
max_features = [1.0, 'sqrt']
# Maximum number of depth in trees
max_depth = [5, 10, 15, 20, 25, 30,35]
#min samples for splitting nodes
min_samples_split = [5, 10, 15, 20,25,60,100]
# Min samples for each leaf 
min_samples_leaf = [1, 2, 3,5, 12]

In [None]:
# Candidate values sampled by the randomized search, keyed by the
# RandomForestRegressor parameter each list applies to.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
random_grid

In [None]:
#search for optimal parameters using cv=6
rf_regr = RandomizedSearchCV(estimator = rf_reg, param_distributions = random_grid, n_iter = 12, cv = 6, verbose=2, random_state=4, n_jobs = 1)


In [None]:
#fit the train data
rf_regr.fit(x_train_scaled,y_train)

In [None]:
#predict the selling price in the test data
pred=rf_regr.predict(x_test_scaled)

In [None]:
#check how well model is performing
print('accuracy of random forest regressor:',rf_regr.score(x_test_scaled,y_test))

In [None]:
#ENSEMBLE:VOTINGREGRESSOR
# Combine the tuned models, not the search objects: VotingRegressor clones and
# refits whatever it is given, so passing the GridSearchCV/RandomizedSearchCV
# wrappers would repeat every hyperparameter search inside fit().
vote_reg = VotingRegressor([
    ('lasso', lasso_model.best_estimator_),
    ('knn', knn_reg.best_estimator_),
    ('rf', rf_regr.best_estimator_),
])
vote_reg.fit(x_train_scaled,y_train)
# Predict on the scaled test features; the ensemble was fitted on scaled data,
# so the unscaled x_test would give meaningless predictions.
pred = vote_reg.predict(x_test_scaled)

In [None]:
#check how well model is performing
print('accuracy of voting regressor:',vote_reg.score(x_test_scaled,y_test))

In [None]:
# Side-by-side test-set scores of the four fitted models.
# NOTE(review): with default scoring, .score() on scikit-learn regressors is
# the R^2 coefficient of determination rather than a classification accuracy.
model_scores = (
    ('accuracy of voting regressor:', vote_reg),
    ('accuracy of random forest regressor:', rf_regr),
    ('knn regression accuracy is:', knn_reg),
    ('accuracy of lasso_regression:', lasso_model),
)
for label, model in model_scores:
    print(label, model.score(x_test_scaled, y_test))