In [None]:


import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
import seaborn as sns
import matplotlib.pyplot as plt


import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset locations used below are visible.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))


# importing unclean data set and performing EDA

In [None]:
# Load the raw, uncleaned listings file and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/used-car-dataset-ford-and-mercedes/unclean cclass.csv'
)
df.head()

# Checking for duplicate rows and removing them

In [None]:
# Report how many fully duplicated rows exist before removing them.
# (A bare expression in the middle of a cell is never displayed, so print it.)
n_duplicates = df.duplicated().sum()
print(f'duplicate rows: {n_duplicates}')

# Drop the duplicates and rebuild a contiguous 0..n-1 index; the cleaning
# steps below address rows by position through that index.
df = df.drop_duplicates().reset_index(drop=True)

# Information about each column

In [None]:
# Summarise every column: non-null count and dtype.
df.info()

# Some of the features have null values, so they are dealt with below.
# Some of the columns have the wrong data type, so they are converted to the correct dtype.




# year 

In [None]:
# Impute missing years with the most frequent year (the mode): year can be
# treated as categorical here, which makes the mode the more relevant choice.
# The mode is computed from the data instead of being hard-coded.
year_mode = df['year'].mode().iloc[0]
df['year'] = df['year'].fillna(year_mode)

# Histogram with one bin per year. Style and font scale are set in a single
# call because sns.set(font_scale=...) on its own resets the style to the default.
sns.set(style='whitegrid', font_scale=1)
sns.histplot(x=df['year'], color='blue', binwidth=1, kde=True)

# Observation: the year data is negatively skewed, i.e. most of the cars for sale are recent.
# binwidth=1 gives one bar per year; drop the argument to fall back to seaborn's default binning.


# price 

In [None]:
# Convert the raw price strings (currency symbol plus thousands separators,
# e.g. '£12,345') to floats in one vectorised pass. The previous row-by-row
# chained assignment (df['price'][i] = ...) is not guaranteed to write back to
# df, and the result of astype('float') was discarded.
price_text = (
    df['price']
    .astype(str)
    .str.replace('£', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
# Missing or unparseable entries become NaN and are imputed just below.
df['price'] = pd.to_numeric(price_text, errors='coerce').astype('float')

# Replace missing prices with the mean of the observed prices. Series.mean()
# ignores NaN, so rows without a price no longer drag the average down
# (the old sum / total-row-count did).
df['price'] = df['price'].fillna(df['price'].mean())

# Rename the column so the unit is explicit.
df = df.rename(columns={'price': 'price in £'})

# Histogram of prices. Style and font scale are set together because
# sns.set(font_scale=...) alone resets the style to the default.
sns.set(style='whitegrid', font_scale=1)
sns.histplot(x=df['price in £'], color='blue', bins=50)

In [None]:
# Same price histogram at a finer resolution (200 bins) with a KDE curve overlaid.
sns.set_style('whitegrid')
sns.histplot(data=df['price in £'], bins=200, kde=True, color='blue')
# Observation: the data is positively skewed.


# Transmission 

In [None]:
# The transmission column has a missing value; since it is categorical, fill
# it with the column's mode. Assigning the result back (instead of a chained
# inplace fillna on a column selection) guarantees that df is actually updated.
transmission_mode = df['transmission'].mode().iloc[0]
df['transmission'] = df['transmission'].fillna(transmission_mode)

# Number of cars per transmission type. The column is passed as x= explicitly
# because the first positional argument of countplot is `data` in current seaborn.
sns.set_style('white')
sns.countplot(x=df['transmission'], palette='turbo_r')

# mileage 

In [None]:
# Mileage arrives as text with thousands separators and 'Unknown'
# placeholders. Strip the separators and convert in one vectorised pass;
# 'Unknown' and missing entries become NaN. (The previous row-by-row chained
# assignment is not guaranteed to modify df.)
mileage_text = (
    df['mileage']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['mileage'] = pd.to_numeric(mileage_text, errors='coerce').astype('float')

# Replace missing values with the median, assigning back rather than using a
# chained inplace fillna.
df['mileage'] = df['mileage'].fillna(df['mileage'].median())


In [None]:
# Distribution of the cleaned mileage values.
sns.set_style('whitegrid')
sns.histplot(data=df['mileage'], color='blue', bins=50)

# Observation: the data is positively skewed.

# fuel type 

In [None]:
# A fuel type given as a number represents the gasoline/ethanol mixture,
# e.g. 30 = 30% ethanol + 70% gasoline.

# More than 50% of the values in this column are missing, so missing values
# get their own category 'U' instead of being imputed.
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was
# removed in NumPy 2.0 and raises AttributeError there.
df['fuel type'] = df['fuel type'].fillna('U')

In [None]:
# Count plot of the fuel-type categories on a wide canvas with large fonts.
# Style and font scale go in one call: sns.set(font_scale=...) on its own
# resets the 'whitegrid' style back to seaborn's default.
sns.set(style='whitegrid', font_scale=3)
plt.figure(figsize=(40, 15))
# x= is explicit because countplot's first positional argument is `data`
# in current seaborn.
sns.countplot(x=df['fuel type'])


#  engine size

In [None]:
# Leave the 'engine size' column alone for now, as its values are ambiguous.

In [None]:
# mileage2 should be numeric: strip the thousands separators and convert in
# one vectorised pass; 'Unknown' and missing entries become NaN. (The previous
# row-by-row chained assignment is not guaranteed to modify df.)
mileage2_text = (
    df['mileage2']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['mileage2'] = pd.to_numeric(mileage2_text, errors='coerce').astype('float')

# Fill NaN with the median, since the variation in the data is very large.
# The median is taken from mileage2 itself; it was previously taken from the
# unrelated 'mileage' column.
df['mileage2'] = df['mileage2'].fillna(df['mileage2'].median())

In [None]:
# Histogram of log(mileage2) with unit-wide bins.
# Style and font scale are set in one call because sns.set(font_scale=...)
# alone resets the 'whitegrid' style to seaborn's default.
sns.set(style='whitegrid', font_scale=1)
sns.histplot(x=np.log(df['mileage2']), color='red', binwidth=1)

# Observation: the data is highly (positively) skewed.

# fuel type2

In [None]:
# Fill missing fuel types with the mode; 'Diesel' is the most frequently
# occurring value in this column.
df['fuel type2'] = df['fuel type2'].fillna('Diesel')


In [None]:
# Number of cars per fuel type. x= is explicit because countplot's first
# positional argument is `data` in current seaborn.
sns.set_style('white')
sns.countplot(x=df['fuel type2'], palette="PuRd_r")

# engine size2

In [None]:
# Some values are in litres and some in cc, so everything is converted to litres.
# Parse to float first: 'Unknown' and missing entries become NaN. (The previous
# loop guarded against the string 'Nan', which never matches str(np.nan) == 'nan',
# and wrote through chained indexing, which is not guaranteed to update df.)
engine_size = pd.to_numeric(df['engine size2'], errors='coerce').astype('float')

# Values above 1000 are taken to be in cc and divided by 1000 to get litres.
engine_size = engine_size.mask(engine_size > 1000, engine_size / 1000)

# Replace missing values with the mean of the converted column.
df['engine size2'] = engine_size.fillna(engine_size.mean())





In [None]:
# Distribution of engine size (in litres after the conversion above).
sns.set_style('whitegrid')
sns.histplot(data=df['engine size2'], color='blue', bins=10)
# Observation: the data is skewed.

# reference

In [None]:
# The 'reference' column is not relevant for the model, so it is dropped.
df = df.drop('reference', axis=1)

# some more insights from data

In [None]:
# Cars per year, split by transmission type.
# Style and font scale are set together: sns.set(font_scale=...) on its own
# resets the 'whitegrid' style to seaborn's default.
sns.set(style='whitegrid', font_scale=3)
plt.figure(figsize=(50, 30))
# Keyword arguments are used because countplot's first positional argument is
# `data` in current seaborn.
sns.countplot(data=df, x='year', hue='transmission', palette='tab20')

# Observation: since 1991 the share (in %) of cars with manual and automatic
# transmission has been continuously decreasing, while the share (in %) of
# cars with semi-automatic transmission has been continuously increasing.

In [None]:
# Cars per year, split by fuel type.
# Style and font scale are set together: sns.set(font_scale=...) on its own
# resets the 'white' style to seaborn's default.
sns.set(style='white', font_scale=3)
plt.figure(figsize=(50, 30))
# Keyword arguments are used because countplot's first positional argument is
# `data` in current seaborn.
sns.countplot(data=df, x='year', hue='fuel type2', palette='turbo_r')

# Observation: the number of hybrid cars is very small compared to diesel and petrol.
# The majority of the available cars are diesel and petrol cars.

In [None]:
# Mean price per fuel type, in a fixed display order. groupby().mean()
# replaces three copy-pasted sum/count computations; a category missing from
# the data simply yields NaN.
fuel_order = ['Hybrid', 'Diesel', 'Petrol']
mean_price_by_fuel = df.groupby('fuel type2')['price in £'].mean().reindex(fuel_order)
data = pd.DataFrame({'avg price': mean_price_by_fuel.to_numpy(), 'type of car': fuel_order})

# Style and font scale are set together because sns.set(font_scale=...)
# alone resets the style to seaborn's default.
sns.set(style='white', font_scale=1)
sns.barplot(x=data['type of car'], y=data['avg price'], palette='turbo_r')


# Observation: the average price of petrol cars is higher than that of diesel and hybrid cars.




In [None]:
# Mean price per transmission type, in a fixed display order. groupby().mean()
# replaces three copy-pasted sum/count computations; a category missing from
# the data simply yields NaN.
transmission_order = ['Semi-Auto', 'Automatic', 'Manual']
mean_price_by_transmission = (
    df.groupby('transmission')['price in £'].mean().reindex(transmission_order)
)
data = pd.DataFrame({
    'avg price': mean_price_by_transmission.to_numpy(),
    'type of transmission': transmission_order,
})

# Style and font scale are set together because sns.set(font_scale=...)
# alone resets the style to seaborn's default.
sns.set(style='whitegrid', font_scale=1)
sns.barplot(x=data['type of transmission'], y=data['avg price'], palette='terrain_r')


# Observation: the average price of semi-automatic cars is higher than that of
# automatic and manual cars.


In [None]:
# Pairwise relationships between the numeric columns.
# pairplot is a figure-level function that creates its own figure, so the
# preceding plt.figure(figsize=...) only produced an extra empty canvas and
# had no effect on the plot size; it has been removed.
sns.set_style('whitegrid')
sns.pairplot(df)

# scatter plot between target and features

In [None]:
# Price against registration year.
fig, ax = plt.subplots()
ax.scatter(df['year'], df['price in £'], color='red')
ax.set_xlabel('year')
ax.set_ylabel('price in £')
plt.show()

# Observation: the newer the car, the higher the price, which is what one would
# expect. Year alone does not decide the price, though; other factors influence
# it too. Newer cars have a higher average selling price.


In [None]:
# Price against mileage.
fig, ax = plt.subplots()
ax.scatter(df['mileage'], df['price in £'], color='orange')
ax.set_xlabel('mileage')
ax.set_ylabel('price in £')
plt.show()


# Observation: the higher the mileage, the lower the price of the car, but
# mileage alone cannot decide the price. Cars with lower mileage have a higher
# average selling price.

In [None]:
# Price against mileage2.
fig, ax = plt.subplots()
ax.scatter(df['mileage2'], df['price in £'], color='green')
ax.set_xlabel('mileage2')
ax.set_ylabel('price in £')
plt.show()

In [None]:
# Price against engine size.
fig, ax = plt.subplots()
ax.scatter(df['engine size2'], df['price in £'], color='cyan')
ax.set_xlabel('engine size2')
ax.set_ylabel('price in £')
plt.show()

# Observation: cars with a large engine size have a higher average price, but
# the price also depends on other factors.

# box plot 

In [None]:
# Box plot of price. Style and font scale are set in one call because
# sns.set(font_scale=...) alone resets the 'white' style; x= is explicit
# because boxplot's first positional argument is `data` in current seaborn.
sns.set(style='white', font_scale=1)
sns.boxplot(x=df['price in £'], color='red')

In [None]:
# Box plot of year. Style and font scale are set in one call because
# sns.set(font_scale=...) alone resets the 'white' style; x= is explicit
# because boxplot's first positional argument is `data` in current seaborn.
sns.set(style='white', font_scale=1)
sns.boxplot(x=df['year'], color='yellow')

In [None]:
# Box plot of mileage. Style and font scale are set in one call because
# sns.set(font_scale=...) alone resets the 'white' style; x= is explicit
# because boxplot's first positional argument is `data` in current seaborn.
sns.set(style='white', font_scale=1)
sns.boxplot(x=df['mileage'], color='orange')

In [None]:
# Box plot of engine size. Style and font scale are set in one call because
# sns.set(font_scale=...) alone resets the 'white' style; x= is explicit
# because boxplot's first positional argument is `data` in current seaborn.
sns.set(style='white', font_scale=1)
sns.boxplot(x=df['engine size2'], color='green')

# model building

In [None]:
# The 'model' column has only one category and the 'engine size' column has
# ambiguous units, so both are dropped for now.
df = df.drop(['engine size', 'model'], axis=1)


In [None]:
#importing libraries
from sklearn.preprocessing import StandardScaler  #for scaling the data
from sklearn.model_selection import train_test_split,GridSearchCV



In [None]:
# One-hot encode the categorical features.
df = pd.get_dummies(data=df)


# x and y from data

In [None]:
# Separate the target from the features; x is the plain array of feature values.
target_col = 'price in £'
y = df[target_col]
df = df.drop(target_col, axis=1)
x = df.to_numpy()

In [None]:
# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable.
x_tr, x_te, y_tr, y_te = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardise the features so that features with larger values do not
# dominate. The scaler is fitted on the training data only and then applied
# to both splits.
sc = StandardScaler()
sc.fit(x_tr)
x_tr = sc.transform(x_tr)
x_te = sc.transform(x_te)

# Decision tree model

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Seeded so that the random splitter and feature sub-sampling are reproducible.
dtr = DecisionTreeRegressor(random_state=0)

# Hyper-parameter grid. max_features=None ("use all features") replaces the
# old 'auto' option, which meant the same thing for regressors and was removed
# in scikit-learn 1.3.
params = {
    'max_depth': [5, 10, 15, 20],
    'max_features': [None, 'sqrt', 'log2'],
    'splitter': ['best', 'random'],
}

# 10-fold cross-validated exhaustive search over the grid.
gcv = GridSearchCV(dtr, params, verbose=5, cv=10)
gcv.fit(x_tr, y_tr)

In [None]:
# Parameter combination that scored best in the decision-tree grid search above.
gcv.best_params_

In [None]:
# Train the model with the chosen hyper-parameters and evaluate it on the test set.
# max_features=None ("use all features") replaces the old 'auto' value, which
# meant the same thing for regressors and was removed in scikit-learn 1.3.
# random_state is fixed so the fitted tree is reproducible.
dtr = DecisionTreeRegressor(max_depth=10, max_features=None, splitter='best', random_state=0)
dtr.fit(x_tr, y_tr)

# Predict the results for the test set.
y_pr = dtr.predict(x_te)

In [None]:
# Metrics used to judge the fit: RMSE (printed) and R² (displayed as the cell output).
from sklearn.metrics import mean_squared_error as mse, r2_score

error = mse(y_true=y_te, y_pred=y_pr)
rmse = error ** 0.5
print(rmse)
r2_score(y_true=y_te, y_pred=y_pr)



# Random forest model

In [None]:
from sklearn.ensemble import RandomForestRegressor

rfr = RandomForestRegressor(random_state=0)

# Hyper-parameter grid. max_features=None ("use all features") replaces the
# old 'auto' option, which meant the same thing for regressors and was removed
# in scikit-learn 1.3.
params = {
    'n_estimators': [100, 200],
    'max_depth': [8, 15, 18],
    'max_features': [None, 'sqrt', 'log2'],
}

# 10-fold cross-validated exhaustive search over the grid.
gcv = GridSearchCV(rfr, params, verbose=5, cv=10)
gcv.fit(x_tr, y_tr)

In [None]:
# Parameter combination that scored best in the random-forest grid search above.
gcv.best_params_

In [None]:
# Train the random forest with the chosen hyper-parameters.
# max_features=None ("use all features") replaces the old 'auto' value, which
# meant the same thing for regressors and was removed in scikit-learn 1.3.
rfr = RandomForestRegressor(max_depth=15, max_features=None, n_estimators=100, random_state=0)
rfr.fit(x_tr, y_tr)

# Predict for the test set.
y_pr = rfr.predict(x_te)


# Accuracy of the model: RMSE (printed) and R² (displayed as the cell output).
error = mse(y_te, y_pr)
rmse = error ** .5
print(rmse)
r2_score(y_te, y_pr)

In [None]:
import pickle

In [None]:
# Persist the fitted random forest with pickle. The file is opened in a
# context manager so the handle is closed (and the data flushed) as soon as
# the dump finishes; previously the handle from open() was never closed.
filename = "rfr{'max_depth': 15, 'max_features': 'auto', 'n_estimators': 100}.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(rfr, model_file)

# Splitting the data into training and test sets, applying feature scaling, and training the models

In [None]:
# (rows, columns) of the one-hot encoded feature frame; the target column was dropped earlier.
df.shape

In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression

In [None]:
# Re-split with a 30% test set (fixed seed), then standardise the features
# using statistics fitted on the training portion only.
x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.3, random_state=0)
sc = StandardScaler()
sc.fit(x_tr)
x_tr = sc.transform(x_tr)
x_te = sc.transform(x_te)

In [None]:
# Collects one (best grid-search parameters, test R²) tuple per tuned model below.
best_params=[]


# Random forest model

In [None]:
# Wider random-forest grid search on the 70/30 split (default cross-validation folds).
# max_features=None ("use all features") replaces the old 'auto' option, which
# meant the same thing for regressors and was removed in scikit-learn 1.3.
random = RandomForestRegressor(random_state=0)
params = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [8, 12, 14],
    'max_features': [None, 'sqrt', 'log2'],
}
gcv = GridSearchCV(random, params, verbose=10)
gcv.fit(x_tr, y_tr)


In [None]:
# Parameter combination that scored best in the random-forest grid search above.
gcv.best_params_

In [None]:
# Refit the random forest with the chosen hyper-parameters and compute the test RMSE.
# max_features=None ("use all features") replaces the old 'auto' value, which
# meant the same thing for regressors and was removed in scikit-learn 1.3.
random = RandomForestRegressor(random_state=0, max_depth=14, n_estimators=1000, max_features=None)
random.fit(x_tr, y_tr)
y_pr = random.predict(x_te)
error = mse(y_te, y_pr)
rmse = error ** .5
rmse

In [None]:
# R² of the current predictions y_pr against the held-out targets y_te.
r=r2_score(y_te,y_pr) 
r

In [None]:
# Record the grid search's best parameters together with the test R² computed above.
best_params.append((gcv.best_params_,r))

In [None]:
# Show the (parameters, R²) tuples collected so far.
best_params

# XGBoost model: hyperparameter tuning

In [None]:

# Exhaustive 3-fold grid search over the XGBoost hyper-parameters.
reg = XGBRegressor()

params = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [4, 6, 3, 8, 10],
    'learning_rate': [.07, .08, .03, .05],
}

gcv = GridSearchCV(estimator=reg, param_grid=params, verbose=10, cv=3)
gcv.fit(x_tr, y_tr)

                        

In [None]:
# Parameter combination that scored best in the XGBoost grid search above.
gcv.best_params_



In [None]:
# Refit XGBoost with the chosen hyper-parameters and predict the test set.
reg = XGBRegressor(
    n_estimators=1000,
    learning_rate=.08,
    max_depth=3,
)
reg.fit(x_tr, y_tr)
y_pr = reg.predict(x_te)

In [None]:
# Test-set RMSE of the XGBoost predictions, then record the grid search's best
# parameters together with the test R².
error = mse(y_true=y_te, y_pred=y_pr)
rmse = error ** 0.5
xgb_record = (gcv.best_params_, r2_score(y_te, y_pr))
best_params.append(xgb_record)

In [None]:

# R² of the current predictions y_pr against the held-out targets y_te.
r2_score(y_te,y_pr) 

In [None]:
# Display the most recently computed RMSE.
rmse

In [None]:
# Check the dtype of the mileage2 feature column.
df['mileage2'].dtype

In [None]:
# Side-by-side table of actual and predicted prices. The frame is built
# column-wise from a dict: passing the tuple (y_te, y_pr) makes each sequence
# a row, which does not match the two column names and raises a ValueError.
df = pd.DataFrame({'y_test': y_te, 'y_pr': y_pr})
df

In [None]:
# Actual test targets next to the model's predictions, one column each.
d = dict(y_te=y_te, y_pr=y_pr)
df = pd.DataFrame(data=d)
df