<a href="https://colab.research.google.com/github/singammanasvi9440/Project/blob/main/Used_car_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## About data

**Dataset description:**  
Name: The brand and model of the car.   
Location : The location in which the car is being sold or is available for purchase.   
Year : The year or edition of the model.
Kilometers_Driven : The total kilometres driven in the car by the previous owner(s) in KM.   
Fuel_Type : The type of fuel used by the car. (Petrol, Diesel, Electric, CNG, LPG). 
Transmission : The type of transmission used by the car. (Automatic / Manual). 
Mileage : The standard mileage offered by the car company in kmpl or km/kg.

Engine : The displacement volume of the engine in CC.


# Importing dependencies

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_log_error,r2_score,mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
import matplotlib
import pickle



# Data overview

In [None]:
# Read the training and test splits from CSV files in the working directory.
train_data, test_data = map(pd.read_csv, ('train-data.csv', 'test-data.csv'))

In [None]:
# Sample data: show the first rows of the training frame to check how the
# columns were parsed.
train_data.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [None]:
# Collect the distinct values of each low-cardinality column for inspection.
Location, Year, Fuel_types, transmission, owner_type = (
    train_data[column].unique()
    for column in ('Location', 'Year', 'Fuel_Type', 'Transmission', 'Owner_Type')
)

In [None]:
# Print each array of distinct values in turn.
for distinct_values in (Location, Year, Fuel_types, transmission, owner_type):
    print(distinct_values)

['Mumbai' 'Pune' 'Chennai' 'Coimbatore' 'Hyderabad' 'Jaipur' 'Kochi'
 'Kolkata' 'Delhi' 'Bangalore' 'Ahmedabad']
[2010 2015 2011 2012 2013 2016 2018 2014 2017 2007 2009 2008 2019 2006
 2005 2004 2002 2000 2003 1999 2001 1998]
['CNG' 'Diesel' 'Petrol' 'LPG' 'Electric']
['Manual' 'Automatic']
['First' 'Second' 'Fourth & Above' 'Third']


In [None]:
# Overview of the training data: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


# Data cleaning

In [None]:
# Discard the 'Unnamed: 0' row-number column, the mostly-null New_Price column
# and the free-text Name column, then keep only rows with no missing values.
train_data = (
    train_data
    .drop(columns=['Unnamed: 0', 'New_Price', 'Name'])
    .dropna()
)

In [None]:
# Count the missing values left in each column; every count should be zero
# once the incomplete rows have been dropped.
train_data.isnull().sum()

Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [None]:
# Remove the unit text from the Mileage, Engine and Power columns so that they
# can later be parsed as numbers.
#
# Every step is assigned back to the frame explicitly: `astype` returns a new
# Series, and calling `Series.replace(..., inplace=True)` on `train_data[col]`
# is chained assignment, which does not update the frame under pandas
# copy-on-write.
UNIT_PATTERNS = {
    'Mileage': r'km/kg|kmpl',
    'Engine': r'CC',
    'Power': r'bhp',
}

for column, unit_pattern in UNIT_PATTERNS.items():
    train_data[column] = (
        train_data[column]
        .astype(str)
        .str.replace(unit_pattern, '', regex=True)  # drop the unit suffix
        .str.replace(' ', '', regex=False)          # strip the extra spaces
    )


In [None]:
# Keep only the rows whose Power text is not the literal string 'null'.
has_power = train_data['Power'] != 'null'
data = train_data.loc[has_power]

In [None]:
# List the distinct Power strings that remain after filtering out 'null'.
data['Power'].unique()

array(['58.16', '126.2', '88.7', '88.76', '140.8', '55.2', '63.1',
       '171.5', '103.6', '74', '103.25', '116.3', '187.7', '115',
       '175.56', '98.6', '83.8', '167.62', '190', '88.5', '177.01', '80',
       '67.1', '102', '108.45', '138.1', '184', '179.5', '103.5', '64',
       '82', '254.8', '73.9', '46.3', '37.5', '77', '82.9', '149.92',
       '138.03', '112.2', '163.7', '71', '105', '174.33', '75', '103.2',
       '53.3', '78.9', '147.6', '147.8', '68', '186', '170', '69', '140',
       '78', '194', '500', '108.5', '86.8', '187.74', '132', '86.7',
       '73.94', '117.3', '218', '168.5', '89.84', '110', '90', '82.85',
       '67', '241.4', '35', '270.9', '126.32', '73', '130', '100.6',
       '150', '75.94', '215', '107.3', '37.48', '120', '178', '152',
       '91.1', '85.80', '362.07', '121.3', '143', '81.80', '171', '76.8',
       '103.52', '444', '362.9', '67.06', '120.7', '258', '81.86', '112',
       '88.73', '57.6', '157.75', '102.5', '201.1', '83.1', '68.05',
       '

In [None]:
#converting the columns back to int and float
data['Mileage'].astype('float32')
data['Engine'].astype('int64')
data['Power'].astype('float32')

0        58.160000
1       126.199997
2        88.699997
3        88.760002
4       140.800003
           ...    
6014     74.000000
6015     71.000000
6016    112.000000
6017     67.099998
6018     57.599998
Name: Power, Length: 5872, dtype: float32


🚙
The New_Price column is almost entirely null, while the other columns have only a negligible number of null values.

In [None]:
#missing values in test data
test_data.isnull().sum()
#dropping missing values and unnecessary columns
test_data.drop(['Unnamed: 0','New_Price'],axis=1,inplace=True)
test_data.dropna(inplace=True)

In [None]:
# Report how many rows each frame holds at this point.
for label, frame in (('length of train data', train_data),
                     ('length of test data', test_data)):
    print(label, len(frame))

length of train data 5975
length of test data 1223


# Exploratory Data Analysis

In [None]:

# Pairwise scatter plots of distance driven, fuel type, mileage and power.
df = data
pair_plot_columns = ["Kilometers_Driven", "Fuel_Type", "Mileage", "Power"]
fig = px.scatter_matrix(df, dimensions=pair_plot_columns)
fig.show()

In [None]:

# Kilometres driven for each ownership category.
df = data
fig = px.scatter(
    data_frame=df,
    x="Owner_Type",
    y="Kilometers_Driven",
)
fig.show()

Index(['Name', 'Location', 'Year', 'Kilometers_Driven', 'Fuel_Type',
       'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats',
       'Price'],
      dtype='object')

In [None]:

# Mileage against kilometres driven, one animation frame per model year,
# with marker size proportional to price.
df = data
fig = px.scatter(
    data_frame=df,
    x="Mileage",
    y="Kilometers_Driven",
    size="Price",
    animation_frame="Year",
)
fig.show()

In [None]:
# Mileage against kilometres driven, one animation frame per location
# (markers grouped by year), with marker size proportional to price.
df = data
fig = px.scatter(
    data_frame=df,
    x="Mileage",
    y="Kilometers_Driven",
    size="Price",
    animation_frame="Location",
    animation_group="Year",
)
fig.show()

In [None]:

# Area chart of price against seat count, coloured by ownership category.
fig = px.area(
    data_frame=df,
    x="Seats",
    y="Price",
    color='Owner_Type',
)
fig.show()

In [None]:

# Notched box plot of the price distribution for each ownership category.
fig = px.box(
    data_frame=df,
    x="Owner_Type",
    y="Price",
    notched=True,
)
fig.show()

In [None]:
# Histogram showing how the price values are distributed.
price_columns = ["Price"]
fig = px.histogram(df, x=price_columns)
fig.show()

# Data preprocessing

## Label encoding

In [None]:
# Check the dtypes of the cleaned frame before encoding.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5872 entries, 0 to 6018
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Location           5872 non-null   object 
 1   Year               5872 non-null   int64  
 2   Kilometers_Driven  5872 non-null   int64  
 3   Fuel_Type          5872 non-null   object 
 4   Transmission       5872 non-null   object 
 5   Owner_Type         5872 non-null   object 
 6   Mileage            5872 non-null   object 
 7   Engine             5872 non-null   object 
 8   Power              5872 non-null   object 
 9   Seats              5872 non-null   float64
 10  Price              5872 non-null   float64
dtypes: float64(2), int64(2), object(7)
memory usage: 550.5+ KB


In [None]:
# label encoding the categorical data
le= LabelEncoder()
data = data.apply(le.fit_transform)

In [None]:
# Summary statistics of every column after encoding.
data.describe()

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
count,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0,5872.0
mean,5.614101,15.477691,1571.752214,1.896628,0.71015,0.367847,219.981778,50.686308,203.867677,2.285933,469.834639
std,2.950323,3.164568,871.204718,1.007911,0.453731,0.806785,109.519568,40.594291,119.523027,0.797385,348.733056
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,14.0,821.75,1.0,0.0,0.0,136.0,13.0,80.0,2.0,183.75
50%,6.0,16.0,1600.5,1.0,1.0,0.0,235.0,37.5,247.0,2.0,374.0
75%,8.0,18.0,2334.25,3.0,1.0,0.0,310.0,72.0,311.0,2.0,690.0
max,10.0,21.0,3037.0,3.0,1.0,3.0,428.0,138.0,369.0,7.0,1363.0


## Data normalisation

In [None]:
# seperating features and targets
# Separate the features from the target.
X = data.drop(columns='Price')  # features
y = data['Price']               # target

# Hold out 30% of the rows for evaluation. Fixing random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

In [None]:
#normalising the train and test data
MM_sc = MinMaxScaler()
X_train = MM_sc.fit_transform(X_train)
X_test = MM_sc.transform(X_test)

## Machine learning models

### Linear regression

In [None]:
#fitting model
LR=LinearRegression()
LR.fit(X_train,y_train)
y_pred=LR.predict(X_test)

In [None]:
# 10-fold cross-validation of the linear model on the full feature frame X;
# each score is a negated mean squared error.
cv_options = dict(cv=10, scoring='neg_mean_squared_error')
scores = cross_val_score(LR, X, y, **cv_options)
scores

array([-35911.47862054, -30356.10290952, -30094.69880675, -30082.35509071,
       -29638.61469309, -29437.11911647, -29329.27155561, -31165.45536816,
       -31677.39977753, -35687.31556595])

In [None]:
#displaying
y_pred

array([459.34535871, 217.37319511, 372.64176882, ..., 876.80175296,
       235.53879993, 246.66381972])

In [None]:
#function for evaluation of model
def acc_metrics(y_test,y_pred):
    results=[]
    results.append(mean_squared_error(y_test, y_pred))
    results.append(np.sqrt(results[0]))
    results.append(r2_score(y_test,y_pred))
    results.append(round(r2_score(y_test,y_pred)*100,4))
    return (results)
     


In [None]:

  
# Table collecting the metrics of every model, one column per model.
# acc_metrics computes the plain mean squared error and its square root (not
# the log-error variants), so the rows are labelled MSE and RMSE.
# 'Accuracy(%)' holds the R2 score expressed as a percentage.
Model_scores = pd.DataFrame(index=['MSE', 'RMSE', 'R2 Score', 'Accuracy(%)'])

lr_results = acc_metrics(y_test, y_pred)
print('Coefficients: \n', LR.coef_)
print("MSE : {}".format(lr_results[0]))
print("RMSE : {}".format(lr_results[1]))
print("R2 Score : {} or {}%".format(lr_results[2], lr_results[3]))
Model_scores['Linear Regression'] = lr_results


Coefficients: 
 [ -75.28880407  870.95805783 -117.43128944 -374.25430192 -312.97988228
  -24.16702809 -444.85980003   70.20051809  -72.83865501 -152.98948794]
MSLE : 31176.21805600681
Root MSLE : 176.5678851207286
R2 Score : 0.7447011813383412 or 74.4701%


### Ridge regression

In [None]:
#model object and fitting model
RR=Ridge(alpha=20.336,solver='auto')
RR.fit(X_train,y_train)
y_pred=RR.predict(X_test)

In [None]:
#model scores

# Model scores for ridge regression.
# Note: `ridge_model` holds the list of metrics, not the estimator (that is RR).
# acc_metrics computes the plain mean squared error and its square root, so
# the printed labels are MSE and RMSE rather than the log-error variants.
ridge_model = acc_metrics(y_test, y_pred)
print("MSE : {}".format(ridge_model[0]))
print("RMSE : {}".format(ridge_model[1]))
print("R2 Score : {} or {}%".format(ridge_model[2], ridge_model[3]))
Model_scores['Ridge Regression'] = ridge_model

MSLE : 32337.49818480267
Root MSLE : 179.82630003645926
R2 Score : 0.7351915787148209 or 73.5192%


### KNN

In [None]:
#model implementation
KNN=KNeighborsRegressor(n_neighbors=5) 
KNN.fit(X_train,y_train)
y_pred=KNN.predict(X_test)

In [None]:
# Model scores for KNN.
# acc_metrics computes the plain mean squared error and its square root, so
# the printed labels are MSE and RMSE rather than the log-error variants.
knn = acc_metrics(y_test, y_pred)
print("MSE : {}".format(knn[0]))
print("RMSE : {}".format(knn[1]))
print("R2 Score : {} or {}%".format(knn[2], knn[3]))
Model_scores['KNN'] = knn

MSLE : 16256.62751418842
Root MSLE : 127.50148043920282
R2 Score : 0.8668761620688218 or 86.6876%


In [None]:
#final results
Model_scores

Unnamed: 0,Linear Regression,Ridge Regression,KNN
MSLE,31176.218056,32337.498185,16256.627514
Root MSLE,176.567885,179.8263,127.50148
R2 Score,0.744701,0.735192,0.866876
Accuracy(%),74.4701,73.5192,86.6876


### Model saving

In [None]:
#saving the linear regression model
with open('lr_model','wb') as f:
  pickle.dump(LR,f)

#saving the Ridge regression model
with open('ridge_model','wb') as f:
  pickle.dump(ridge_model,f)

#saving the KNN regression model
with open('knn_model','wb') as f:
  pickle.dump(KNN,f)

### Saved Model loading

In [None]:
# Reload the saved models from the same relative paths they were written to,
# so loading works from any working directory and not only from /content.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open('lr_model', 'rb') as f:
    lr_load_model = pickle.load(f)

with open('ridge_model', 'rb') as f:
    ridge_load_model = pickle.load(f)

with open('knn_model', 'rb') as f:
    knn_load_model = pickle.load(f)

In [None]:
#sample input
sample_input1 = np.array([[9,2010,72000,0,3,1,26.6,998,58.16,5.0]])
result = knn_load_model.predict(sample_input1)

In [None]:
# Display the prediction for the first sample input.
result

array([669.4])

In [None]:
# Second sample input, one row in the feature order of X
# (Location, Year, Kilometers_Driven, Fuel_Type, Transmission, Owner_Type,
#  Mileage, Engine, Power, Seats).
# NOTE(review): Transmission has only two classes, so its encoded value should
# be 0 or 1; the 2 used here looks out of range — confirm the intended codes.
sample_input2 = np.array([[8, 2015, 41000, 1, 2, 2, 19.67, 1582, 126.2, 7.0]])

# Apply the scaler fitted on the training split before predicting, because the
# models were trained on MinMax-scaled features.
sample_features2 = pd.DataFrame(sample_input2, columns=X.columns)
result = knn_load_model.predict(MM_sc.transform(sample_features2))
result

array([796.4])