In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

import cufflinks as cf
from plotly.offline import download_plotlyjs,init_notebook_mode,plot,iplot
# Set up Plotly for offline rendering inside the notebook (Plotly documents
# connected=True as loading plotly.js from the CDN instead of embedding it).
init_notebook_mode(connected=True)
# Put cufflinks in offline mode so the `.iplot()` call used later renders locally.
cf.go_offline()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for input_dir, _subdirs, input_files in os.walk('/kaggle/input'):
    for input_path in (os.path.join(input_dir, file_name) for file_name in input_files):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the CarDekho used-car listings into a DataFrame
cars = pd.read_csv(filepath_or_buffer='/kaggle/input/vehicle-dataset-from-cardekho/Car details v3.csv')

In [None]:
# Preview the first five rows of the raw data
cars.head(n=5)

In [None]:
# Print the frame summary: column names, dtypes and non-null counts
cars.info()

In [None]:
# Count the missing values in each column
cars.isna().sum()

In [None]:
# Remove, in place, every row that has at least one missing value
cars.dropna(axis=0, how='any', inplace=True)

In [None]:
# Let's go ahead and check the names of a few randomly chosen cars.
# The position is drawn from the frame's actual length: a hard-coded upper bound
# would miss rows beyond it and raise IndexError on a shorter frame, and starting
# at 1 would never show the first row.
for _ in range(6):
    position = np.random.randint(0, len(cars))
    print(cars['name'].iloc[position])

In [None]:
# Car names mix many companies and models, so there is no fixed set of values.
# A free-text column like this is hard to use for training and would mostly add noise,
# which hurts model performance, so the column is dropped.
cars.drop(columns='name', inplace=True)

In [None]:
# Number of listings per manufacturing year
plt.figure(figsize=(15, 6))
sns.set_style(style='darkgrid')
sns.countplot(data=cars, x='year', palette='Spectral')

In [None]:
# The listings date from 2020, so the car's age is a more useful feature than its year.
# number_of_year = how many years old the car is.
cars['number_of_year'] = 2020 - cars['year']

# The raw 'year' column is no longer needed
cars.drop(columns='year', inplace=True)

In [None]:
# Number of listings per fuel type
sns.countplot(data=cars, x='fuel', palette='icefire')

**The majority of cars run on Diesel or Petrol.**

In [None]:
# Kilometres driven against selling price, both on log axes, coloured by fuel type
plt.figure(figsize=(8, 6))
plt.xscale('log')
plt.yscale('log')
sns.scatterplot(data=cars, x='km_driven', y='selling_price', hue='fuel')

**Diesel cars were sold at higher prices than Petrol cars.**

In [None]:
# Seller-type frequencies, followed by a blank spacer
print(cars['seller_type'].value_counts())
print('\n')

# Pie chart of the number of rows per seller type
cars.groupby('seller_type').size().plot.pie(y='seller')
plt.ylabel('seller')

**The majority of people didn't go to a dealer to sell their car.**

In [None]:
# Transmission frequencies, followed by a blank spacer
print(cars['transmission'].value_counts())
print('\n')

# Pie chart of the number of rows per transmission type
cars.groupby('transmission').size().plot.pie(y='type')
plt.ylabel('transmission')

**The majority of cars had a Manual transmission.**

In [None]:
# Let's check the owner column
print(cars['owner'].value_counts())
print('\n')

# Interactive Plotly histogram of the owner categories. It is drawn by cufflinks,
# so no matplotlib figure is opened here (one would only render as an empty plot).
cars['owner'].iplot(kind='hist')

In [None]:
# Transmission counts broken down by fuel type
sns.countplot(data=cars, x='transmission', hue='fuel')

In [None]:
# Look at the raw text in the mileage, engine and max_power columns (mileage first)
cars['mileage'].head(3)

In [None]:
# First three raw engine values
cars['engine'].head(3)

In [None]:
# First three raw max_power values
cars['max_power'].head(3)

In [None]:
# Let's remove the kmpl, CC and bhp from the values.We will create a new columns and delete these columns

# function to remove kmpl,CC and bhp 
def remove(car):
    """Strip the unit suffix from one row's mileage, engine and max_power text.

    Parameters
    ----------
    car : iterable of exactly three str
        The (mileage, engine, max_power) values of a single row, e.g.
        ('23.4 kmpl', '1248 CC', '74 bhp').

    Returns
    -------
    tuple of str
        For each value, the text before its first space, in the same order.
        The results are still strings; no numeric conversion happens here.
    """
    # Unpacking keeps the strict "exactly three fields" contract.
    mileage_text, engine_text, power_text = car
    return tuple(
        text.split(' ')[0]
        for text in (mileage_text, engine_text, power_text)
    )

# Let's apply the above function.
# We create three new columns -- mileage(kmpl), engine(cc), max_power(bhp) -- and remove the old ones.
spec_cols = ['mileage(kmpl)','engine(cc)','max_power(bhp)']

# remove() only strips the unit text, so its results are still strings.
# Convert them to real numbers here; errors='coerce' turns anything unparseable
# (for example an empty token left by a value that had a unit but no number) into NaN.
cars[spec_cols] = (
    cars[['mileage','engine','max_power']]
    .apply(remove,axis=1,result_type='expand')
    .apply(pd.to_numeric,errors='coerce')
)

# Rows whose specs could not be parsed cannot be scaled or modelled, so drop them.
cars.dropna(subset=spec_cols,inplace=True)

# removing the old columns
cars.drop(['mileage','engine','max_power'],axis = 1,inplace=True)

In [None]:
# The torque column is not used as a feature, so drop it too
cars.drop(columns='torque', inplace=True)

In [None]:
# Preview the frame after cleaning
cars.head(n=5)

In [None]:
# One-hot encode the categorical columns
cat_cols = ['fuel', 'seller_type', 'transmission', 'owner']

# drop_first=True omits the first level of each category
dummies = pd.get_dummies(data=cars[cat_cols], drop_first=True)

# Remove the original categorical columns ...
cars.drop(columns=cat_cols, inplace=True)

# ... and attach the encoded ones side by side
cars = pd.concat((cars, dummies), axis='columns')

In [None]:
# Separate the target (y = selling price) from the input features (X = everything else)
y = cars['selling_price']
X = cars.drop(columns='selling_price')

In [None]:
# Let's split the data into train_set and test_set
from sklearn.model_selection import train_test_split

# We will use 20% of the data as test data.
# random_state pins the shuffle so the split, and every metric computed below, is reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= 0.20,random_state=42)

In [None]:
# Now let's scale the numerical columns in the cars data set
num_cols =['number_of_year','km_driven','seats','mileage(kmpl)','engine(cc)','max_power(bhp)']

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Work on explicit copies: the frames returned by train_test_split are selections of X,
# and assigning columns into them can raise pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse those statistics for the test rows
# so no information from the test set leaks into the preprocessing.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

In [None]:
# Convert the train/test frames and series into plain NumPy arrays
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

# Let's create our Models

**Linear Regression**

In [None]:
from sklearn.linear_model import LinearRegression

# to evaluate the model
from sklearn import metrics

In [None]:
# Ordinary least-squares baseline, trained on the training split
linear = LinearRegression()

linear.fit(X=X_train, y=y_train)

In [None]:
# Predict selling prices for the held-out test rows
pred_linear = linear.predict(X=X_test)

Model Evaluation

In [None]:
# R^2 score
r2_linear = metrics.r2_score(y_true=y_test, y_pred=pred_linear)

# Mean squared error and its square root (RMSE)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=pred_linear)
rmse_linear = np.sqrt(mse)

# One print call; the '\n' items plus the separator reproduce the blank lines between results
print(
    f'MEAN SQUARED ERROR FOR LINEAR REGRESSION IS: {mse}',
    '\n',
    f'ROOT MEAN SQUARED ERROR FOR LINEAR REGRESSION IS: {rmse_linear}',
    '\n',
    f'R2 SCORE FOR LINEAR REGRESSION IS: {r2_linear}',
    sep='\n',
)

In [None]:
# Actual vs. predicted selling price for the linear model
sns.scatterplot(y=pred_linear, x=y_test)
plt.xlabel(xlabel='y_test')
plt.ylabel(ylabel='predicted value')

**If we check the scatter plot, the points do not fall along a straight line, which is not good for the model. It shows that the predicted values differ from the y_test values by a large amount, so Linear Regression does not work well here. Let's try an SVR model and check whether it does better than Linear Regression.**

**SVR**

In [None]:
from sklearn.svm import SVR

# To find the best parameter for SVR model
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter tuning: exhaustive search over C, gamma and kernel for the SVR
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'linear', 'poly'],
)

# Candidates are ranked by negative mean squared error
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    verbose=2,
)

In [None]:
# Run the grid search on the training split
grid.fit(X=X_train, y=y_train)

In [None]:
# Show the parameter combination the grid search selected
grid.best_params_

In [None]:
# Let's use the parameters found by the grid search to create our SVR model.
# They are read from grid.best_params_ rather than typed in by hand, so the model
# always matches the search that was actually run on this train/test split.
svr = SVR(**grid.best_params_)

svr.fit(X_train,y_train)

In [None]:
# Predict selling prices for the held-out test rows
pred_svr = svr.predict(X=X_test)

Model Evaluation

In [None]:
# R^2 score
r2_svr = metrics.r2_score(y_true=y_test, y_pred=pred_svr)

# Mean squared error and its square root (RMSE)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=pred_svr)
rmse_svr = np.sqrt(mse)

# One print call; the '\n' items plus the separator reproduce the blank lines between results
print(
    f'MEAN SQUARED ERROR FOR SVR IS: {mse}',
    '\n',
    f'ROOT MEAN SQUARED ERROR FOR SVR IS: {rmse_svr}',
    '\n',
    f'R2 SCORE FOR SVR IS: {r2_svr}',
    sep='\n',
)

In [None]:
# Actual vs. predicted selling price for the SVR model
sns.scatterplot(y=pred_svr, x=y_test)
plt.xlabel(xlabel='y_test')
plt.ylabel(ylabel='predicted value')

**Looking at the RMSE score and the scatter plot, we can conclude that the SVR model performed worse than Linear Regression.**

**RandomForest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# To find the best parameter
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Base estimator for the search; n_jobs=-1 uses all available cores
random = RandomForestRegressor(n_jobs=-1)

# Search space for the randomized hyper-parameter search.
# max_features: 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.
# For a regressor it meant "use all features", which 1.0 expresses on every version.
hyper_parameters = {
    'max_depth' : [int(x) for x in np.linspace(5,40,8)],
    'n_estimators' : [int(x) for x in np.linspace(100,1500,15)],
    'max_features': [1.0,'sqrt'],
    'min_samples_leaf': [1,2,5,10],
    'min_samples_split':[2,5,10,15,100]
}

# 10 sampled candidates, each scored with 5-fold CV on negative mean squared error
rf_random = RandomizedSearchCV(estimator=random,param_distributions=hyper_parameters,scoring='neg_mean_squared_error',n_iter = 10,cv=5,verbose=2,n_jobs=-1)

In [None]:
# Run the randomized search on the training split
rf_random.fit(X=X_train, y=y_train)

In [None]:
# Show the parameter combination the randomized search selected
rf_random.best_params_

In [None]:
# Let's use the parameters selected by the randomized search to train our model.
# Reading them from rf_random.best_params_ keeps the model in sync with the search
# that was actually run, and avoids hand-copied values such as max_features='auto',
# which scikit-learn 1.3+ rejects.
random = RandomForestRegressor(**rf_random.best_params_)

In [None]:
# Train the tuned random forest on the training split
random.fit(X=X_train, y=y_train)

In [None]:
# Predict selling prices for the held-out test rows
pred_random = random.predict(X=X_test)

Model Evaluation

In [None]:
# R^2 score
r2_random = metrics.r2_score(y_true=y_test, y_pred=pred_random)

# Mean squared error and its square root (RMSE)
mse = metrics.mean_squared_error(y_true=y_test, y_pred=pred_random)
rmse_random = np.sqrt(mse)

# One print call; the '\n' items plus the separator reproduce the blank lines between results
print(
    f'MEAN SQUARED ERROR FOR RANDOM FOREST REGRESSOR IS: {mse}',
    '\n',
    f'ROOT MEAN SQUARED ERROR FOR RANDOM FOREST REGRESSOR IS: {rmse_random}',
    '\n',
    f'R2 SCORE FOR RANDOM FOREST REGRESSOR IS: {r2_random}',
    sep='\n',
)

In [None]:
# Actual vs. predicted selling price for the random forest
sns.scatterplot(y=pred_random, x=y_test)
plt.xlabel(xlabel='y_test')
plt.ylabel(ylabel='predicted value')

**If we check the scatter plot, the points form something close to a straight line. This shows that our model's performance is good.**

**Random Forest performed very well compared to Linear Regression and SVR.**

# THANK YOU