## Car Price Prediction

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv
import matplotlib.pyplot as plt # data visualization
import seaborn as sns  ## data visualization
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

        

In [None]:
# Load the CarDekho vehicle CSV into the working DataFrame used by every later cell.
df=pd.read_csv('../input/vehicle-dataset-from-cardekho/car data.csv')

In [None]:
# Print the (rows, columns) shape, then preview the first rows of the raw data.
print(df.shape)
df.head()

In [None]:
# Print a concise summary of the DataFrame's columns.
df.info()

In [None]:
# Descriptive statistics, using describe()'s default column selection.
df.describe()

In [None]:
# Count the missing values in each column.
df.isna().sum()

## Data visualization

In [None]:
# Derive the vehicle's age in years from its 'Year' column, relative to the
# hard-coded reference year 2021.
df['Vehicle_age']=2021 - df['Year']

In [None]:
# 'Year' is now represented by 'Vehicle_age', so remove it in place.
df.drop(columns=['Year'], inplace=True)

In [None]:
# Frequency of each value for the categorical columns and the derived vehicle age,
# drawn side by side in one row.
count_columns = ['Seller_Type', 'Transmission', 'Fuel_Type', 'Owner', 'Vehicle_age']

plt.figure(figsize=(15,4))
for position, column in enumerate(count_columns, start=1):
    plt.subplot(1, len(count_columns), position)
    # Name the axis explicitly (x=..., data=df), matching the other plotting
    # cells in this notebook, instead of passing the column positionally.
    sns.countplot(x=column, data=df)

In [None]:
# Pairwise relationship plots across the DataFrame's columns.
sns.pairplot(df)

## Check for outliers

In [None]:
# Box plots of the numeric columns, used to eyeball outliers.
box_columns = ['Selling_Price', 'Present_Price', 'Kms_Driven', 'Vehicle_age']

plt.figure(figsize=(25,4))
for position, column in enumerate(box_columns, start=1):
    # Size the grid to the number of plots so no empty slot is left at the end.
    plt.subplot(1, len(box_columns), position)
    # Name the axis explicitly (x=..., data=df), matching the other plotting
    # cells in this notebook, instead of passing the column positionally.
    sns.boxplot(x=column, data=df)


## Handle the outliers

In [None]:
## handle the outlier 
def outlier_threshold(data1, feature):
    """Return the ``(lower, upper)`` outlier limits for one column.

    The limits sit 1.5 interquartile ranges below the 25th percentile and
    above the 75th percentile of ``data1[feature]``.

    Args:
        data1: DataFrame (or other mapping of column name to a Series).
        feature: Name of the column to compute the limits for.

    Returns:
        A ``(lower_limit, upper_limit)`` tuple.
    """
    first_quartile, third_quartile = (
        data1[feature].quantile(q) for q in (0.25, 0.75)
    )
    spread = third_quartile - first_quartile
    return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

def has_outlier(data1, feature):
    """Print whether a column contains values outside its outlier limits.

    Prints ``<feature> Yes`` when at least one value of ``data1[feature]``
    lies below the lower or above the upper limit returned by
    ``outlier_threshold``, otherwise ``<feature> No``.

    Args:
        data1: DataFrame holding the column.
        feature: Name of the column to check.
    """
    lower_limit, upper_limit = outlier_threshold(data1, feature)
    # Test the boolean mask itself. Calling .any() on the filtered rows would
    # instead test the truthiness of every cell in those rows.
    outside = (data1[feature] < lower_limit) | (data1[feature] > upper_limit)
    print(feature, 'Yes' if outside.any() else 'No')

def replace_with_threshold(data1, num_features):
    """Cap out-of-range values of the given columns at their outlier limits.

    For each column, values below the lower limit are raised to it and values
    above the upper limit are lowered to it. ``data1`` is modified in place.

    Args:
        data1: DataFrame whose columns are capped.
        num_features: Iterable of numeric column names to process.
    """
    for feature in num_features:
        lower_limit, upper_limit = outlier_threshold(data1, feature)
        # Work on the frame that was passed in, not the notebook-global ``df``.
        # clip() builds a new Series, so reassigning the column lets its dtype
        # widen when the limits are fractional and the column holds integers.
        data1[feature] = data1[feature].clip(lower=lower_limit, upper=upper_limit)


In [None]:
# Numeric columns screened for outliers; reused by the capping step below.
columns = [
    'Selling_Price',
    'Present_Price',
    'Kms_Driven',
    'Vehicle_age',
]
for numeric_column in columns:
    has_outlier(df, numeric_column)

In [None]:
# Cap the outliers of every column listed in `columns`.
replace_with_threshold(df, columns)

In [None]:
# Re-run the outlier check on the same columns after capping.
for numeric_column in columns:
    has_outlier(df, numeric_column)

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(12, 8))
# The text columns are still in df at this point (they are dropped or encoded
# in later cells), so restrict the correlation to numeric columns explicitly.
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')

In [None]:
# Selling price against fuel type and seller type: a bar plot followed by a
# strip plot for each category, four panels in one row.
plt.figure(figsize=(15,5))
panels = [
    (draw, category)
    for category in ('Fuel_Type', 'Seller_Type')
    for draw in (sns.barplot, sns.stripplot)
]
for slot, (draw, category) in enumerate(panels, start=1):
    plt.subplot(1, 4, slot)
    draw(x=category, y='Selling_Price', data=df)



In [None]:
# Selling price against owner count and vehicle age: a bar plot followed by a
# strip plot for each, four panels in one row.
plt.figure(figsize=(15,5))
panels = [
    (draw, category)
    for category in ('Owner', 'Vehicle_age')
    for draw in (sns.barplot, sns.stripplot)
]
for slot, (draw, category) in enumerate(panels, start=1):
    plt.subplot(1, 4, slot)
    draw(x=category, y='Selling_Price', data=df)

In [None]:
# Scatter plot of selling price against kilometres driven.
sns.scatterplot(x='Kms_Driven', y='Selling_Price', data=df)

In [None]:
## Remove the 'Car_Name' column in place.
df.drop(columns=['Car_Name'], inplace=True)

In [None]:
## Replace the categorical columns with dummy (indicator) columns.
## drop_first=True omits the first level of each categorical column.
df=pd.get_dummies(df, drop_first=True)

In [None]:
# Print the shape and preview the first rows after encoding.
print(df.shape)
df.head()

In [None]:
# Separate the target from the features by column name rather than by position,
# so the split does not silently break if the column order ever changes.
# NOTE(review): this assumes 'Selling_Price' is the first column, which is what
# the original positional split (iloc[:, 0]) relied on - confirm with df.head().
y = df['Selling_Price']
X = df.drop(columns=['Selling_Price'])

## Split the data into train and test dataset

In [None]:
# Hold out 30% of the rows for testing, with a fixed random_state for the split.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features for the scale-sensitive models trained below.
scaler=StandardScaler()
# Fit the scaler on the training split only ...
x_train_scale=scaler.fit_transform(x_train)
# ... and reuse those training statistics for the test split. Re-fitting on the
# test data would scale it differently from what the models were trained on
# and would leak test-set statistics into the evaluation.
x_test_scale=scaler.transform(x_test)


## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Fit a linear regression on the scaled training data and report the score
# returned by .score() on both splits.
lr = LinearRegression()
lr.fit(x_train_scale, y_train)
print(f'training set score: {lr.score(x_train_scale, y_train)}')
print(f"testing set score :{lr.score(x_test_scale, y_test)}")

In [None]:
# Predict on the scaled test split and overlay predictions on the actual prices.
y_pred = lr.predict(x_test_scale)
index = range(1, len(y_pred) + 1)

plt.figure(figsize=(20, 10))
for values, legend_label in ((y_pred, 'Predictes value'), (y_test, 'actual value')):
    plt.plot(index, values, label=legend_label)
plt.legend()
plt.xlabel('target indexes')
plt.ylabel('selling price')
plt.show()

## R² (the coefficient of determination) is used to evaluate the performance of a regression model

In [None]:
from sklearn.metrics import r2_score
# R2 of the linear-regression predictions on the test split.
r2_linear = r2_score(y_true=y_test, y_pred=y_pred)
r2_linear

## Support Vector Regressor

In [None]:
# Baseline support vector regressor with default hyper-parameters.
svr=SVR()
svr.fit(x_train_scale ,y_train)
print("training set score : {}".format(svr.score(x_train_scale, y_train)))
# This line scores the test split, so it is labelled as the testing score.
print("testing set score : {}".format(svr.score(x_test_scale, y_test)))
y_pred1=svr.predict(x_test_scale)

In [None]:
# Exhaustive 10-fold cross-validated search over C and gamma for the SVR.
svr1=SVR()
# 'degree' is left out of the grid: SVR() keeps its default kernel here, and
# per the scikit-learn docs 'degree' is only used by the 'poly' kernel, so
# searching it only multiplied the number of fits by seven.
paras={'C':[0.001, 0.1, 1, 2, 5 ,10, 100],
      'gamma':[0.001, 0.1, 1,2 ,5, 10, 100]}

# Search with the fresh, unfitted estimator created for this purpose (svr1)
# rather than the already-fitted baseline model.
gridsearch=GridSearchCV(estimator=svr1, param_grid=paras, cv=10, n_jobs=-1)
gridsearch.fit(x_train_scale,y_train)

In [None]:
# Show the parameter combination selected by the grid search.
gridsearch.best_params_

## Support Vector Regressor with best parameters

In [None]:
# Refit the SVR with fixed hyper-parameters and report the score on both splits.
svr = SVR(C=10, degree=1, gamma=0.1)
svr.fit(x_train_scale, y_train)
print(f"training set score : {svr.score(x_train_scale, y_train)}")
print(f"testing set score : {svr.score(x_test_scale, y_test)}")
y_pred2 = svr.predict(x_test_scale)

In [None]:
# Overlay the tuned SVR predictions on the actual test prices.
index = range(1, len(y_pred2) + 1)

plt.figure(figsize=(15, 8))
for values, legend_label in ((y_pred2, 'Predictes value'), (y_test, 'actual value')):
    plt.plot(index, values, label=legend_label)
plt.legend()
plt.xlabel('target indexes')
plt.ylabel('selling price')
plt.show()

In [None]:
# R2 of the tuned SVR predictions on the test split.
r2_svm = r2_score(y_true=y_test, y_pred=y_pred2)
r2_svm

## Random Forest Regressor 

In [None]:
# Random forest trained on the unscaled features; report the score on both splits.
random = RandomForestRegressor()
random.fit(x_train, y_train)
print(f"training set score :{random.score(x_train, y_train)}")
print(f"testing set score : {random.score(x_test, y_test)}")

In [None]:
# R2 of the random-forest predictions on the test split
# (y_pred1 is reused here and overwrites the baseline SVR predictions).
y_pred1 = random.predict(x_test)
r2_random = r2_score(y_true=y_test, y_pred=y_pred1)
r2_random

In [None]:

# One row per model with its test-set R2 score.
pd.DataFrame(
    [
        ('Linear Regression', r2_linear),
        ('Support vector machine', r2_svm),
        ('Random Forest Regressor', r2_random),
    ],
    columns=['Model', 'R2 Score'],
)

When the R² score is close to 1, the model fits the data well; the further it falls below 1, the larger the model's error. Here the support vector machine gives the best R² score, so it is the best of the three models.