In [1]:
# Importing libraries
import numpy as np
import pandas as pd
import warnings
# Suppress every warning for the rest of the session. This is a blanket filter,
# so warnings raised by any library used in this notebook are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Load cleaned_car_data.csv into a DataFrame and print its column / dtype /
# non-null summary.
ds = pd.read_csv("cleaned_car_data.csv")
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 651 entries, 0 to 650
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  651 non-null    int64  
 1   index       651 non-null    int64  
 2   name        651 non-null    object 
 3   company     651 non-null    object 
 4   year        651 non-null    int64  
 5   Price       651 non-null    float64
 6   kms_driven  651 non-null    int64  
 7   fuel_type   651 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 40.8+ KB


In [3]:
# Split the data into independent variables (X) and the dependent variable (y).
X = ds[["name", "company", "year", "kms_driven", "fuel_type"]]

# Select the target as a 1-D Series rather than a one-column DataFrame.
# scikit-learn regressors expect a 1-D y; RandomForestRegressor raises a
# DataConversionWarning when it is handed a column vector instead.
y = ds["Price"]

In [4]:
# Imagine you’re designing a survey about car brands, models, and fuel types.
# Before you can create the survey form, you first need to list all the available options.
# Similarly, fitting a OneHotEncoder makes it learn all the available options (categories) in each categorical column of the dataset.

In [5]:
from sklearn.preprocessing import OneHotEncoder  # converts categorical text data to numbers

# Create the encoder and fit it in one step: fit() scans the three text columns
# and records every unique value (category) found in each of them.
ohe = OneHotEncoder().fit(X[["name", "company", "fuel_type"]])
ohe

In [6]:
# ohe.categories_ is an attribute that exists on the OneHotEncoder once it has
# been fitted. It holds one array per encoded column, listing the unique
# categories (values) the encoder found in that column, i.e. exactly what the
# encoder learned from the data.
ohe.categories_

[array(['3 Series 320d Sedan', '3 Series 320i', '5 Series 520d Sedan',
        '5 Series 530i', '7 Series 740Li Sedan', 'A3 Cabriolet 40 TFSI',
        'A4 1.8 TFSI Multitronic Premium Plus',
        'A4 2.0 TDI 177bhp Premium', 'A6 2.0 TDI Premium', 'Accent',
        'Accent Executive Edition', 'Accent GLE', 'Accent GLX', 'Accord',
        'Amaze', 'Amaze 1.2 S i VTEC', 'Amaze 1.5 E i DTEC',
        'Amaze 1.5 S i DTEC', 'Amaze 1.5 SX i DTEC', 'Aria Pleasure 4X2',
        'Beat', 'Beat Diesel', 'Beat LS Diesel', 'Beat LS Petrol',
        'Beat LT Diesel', 'Beat LT Opt Diesel', 'Beat LT Petrol',
        'Beat PS Diesel', 'Benz A Class A 180 Sport Petrol',
        'Benz B Class B180 Sports', 'Benz C Class 200 CDI Classic',
        'Benz C Class 200 K MT', 'Benz C Class C 220 CDI Avantgarde',
        'Benz GLA Class 200 CDI Sport', 'Bolero DI', 'Bolero DI BSII',
        'Bolero SLE BS IV', 'Bolt XM Petrol', 'Brio', 'Brio V MT',
        'Brio VX AT', 'City', 'City 1.5 E MT', 'City 1.5 EXi

In [7]:
from sklearn.compose import make_column_transformer

# Apply one-hot encoding only to the three text columns, reusing the category
# lists learned by `ohe`; all remaining columns are passed through untouched.
ct = make_column_transformer(
    (OneHotEncoder(categories=ohe.categories_), ["name", "company", "fuel_type"]),
    remainder="passthrough",
    force_int_remainder_cols=False,
)

In [8]:
# Display the column transformer so its configuration can be inspected.
ct

In [9]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Candidate regressors; the tree-based ones are seeded so their fits are repeatable.
reg = LinearRegression()
regDec = DecisionTreeRegressor(random_state=0)
regRFR = RandomForestRegressor(n_estimators=10, random_state=0)

# Each pipeline runs the shared column transformer `ct`, then its regressor.
pipeLinear = make_pipeline(ct, reg)
pipeDec = make_pipeline(ct, regDec)
pipeRFR = make_pipeline(ct, regRFR)

# Label -> pipeline, listed in the order rows are appended to `scores`.
candidates = {
    "Linear": pipeLinear,
    "Decision": pipeDec,
    "Random Forest": pipeRFR,
}

# One (label, split seed, R2, RMSE) tuple per model per split.
scores = []

for seed in range(101):
    # An integer test_size is an absolute row count: 10 rows are held out per split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=10, random_state=seed
    )

    for label, pipe in candidates.items():
        pipe.fit(X_train, y_train)
        predicted = pipe.predict(X_test)
        r2 = r2_score(y_test, predicted)
        rmse = np.sqrt(mean_squared_error(y_test, predicted))
        scores.append((label, seed, r2, rmse))

In [12]:
# Collect the per-model, per-split metrics into a DataFrame.
scoreDf = pd.DataFrame(scores, columns=["Algo", "Iteration", "R2 Score", "RMSE Score"])

# Rank the runs from highest to lowest R2 score.
resultDf = scoreDf.sort_values("R2 Score", ascending=False)
resultDf

Unnamed: 0,Algo,Iteration,R2 Score,RMSE Score
29,Random Forest,9,0.987572,4.527732e+04
260,Random Forest,86,0.985421,3.622212e+04
259,Decision,86,0.981122,4.121793e+04
28,Decision,9,0.975988,6.293352e+04
115,Decision,38,0.971653,8.096429e+04
...,...,...,...,...
130,Decision,43,-12.236222,5.989462e+05
105,Linear,35,-15.386453,8.108764e+05
265,Decision,88,-15.739381,7.754005e+05
69,Linear,23,-20.642511,8.074750e+05


In [13]:
# Rebuild the split with random_state 9 (the top-ranked Random Forest run in the
# table above) and refit the random-forest pipeline on it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10, random_state=9
)
pipeRFR.fit(X_train, y_train)


In [40]:
# Ask the user for the car's attributes. Text answers are stripped of surrounding
# whitespace: the encoder in the pipeline was built with a fixed category list,
# so a value with a stray space would not match any known category.
company = input("enter input name of company:").strip()
name = input("enter car name:").strip()
year = int(input("enter year:"))
kms_driven = int(input("enter kms driven:"))
fuel_type = input("enter fuel type:").strip()

# Build a single-row frame with the same column names and order as the training
# features X.
columns = ["name", "company", "year", "kms_driven", "fuel_type"]
myinput = pd.DataFrame(columns=columns, data=[[name, company, year, kms_driven, fuel_type]])

# Predict the price with the fitted random-forest pipeline.
result = pipeRFR.predict(myinput)
print("You Should Buy It For ~ Price:", result)

enter input name of comapny: Ford
enter car name: Figo
enter year: 2012
enter kms driven: 41000
enter fuel type: Diesel


You Should Buy It For ~ Price: [198999.9]


In [41]:
import pickle as pkl

# Serialize the fitted random-forest pipeline to car_price_model.pkl.
with open("car_price_model.pkl", "wb") as model_file:
    pkl.dump(pipeRFR, model_file)