# Used Car Price Prediction

Auto Scout data which using for this project, scraped from the on-line car trading company in 2019, contains many features of 9 different car models. In this case, the model will estimate the prices of cars using regression algorithms.

In [1]:
import pandas as pd
import numpy as np

In [2]:
df  = pd.read_csv("car_dataset.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
df.Owner.unique()

array([0, 1, 3])

In [4]:
df.Seller_Type.unique()

array(['Dealer', 'Individual'], dtype=object)

In [5]:
df.Transmission.unique()

array(['Manual', 'Automatic'], dtype=object)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [7]:
df.isnull().sum()

Car_Name         0
Year             0
Selling_Price    0
Present_Price    0
Kms_Driven       0
Fuel_Type        0
Seller_Type      0
Transmission     0
Owner            0
dtype: int64

In [8]:
df = df.drop("Car_Name", axis=1)
df["Year"] = df["Year"].max()-df["Year"]
df = pd.get_dummies(df,columns=["Fuel_Type",	"Seller_Type",	"Transmission"], drop_first=True)
df

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,4,3.35,5.59,27000,0,0,1,0,1
1,5,4.75,9.54,43000,0,1,0,0,1
2,1,7.25,9.85,6900,0,0,1,0,1
3,7,2.85,4.15,5200,0,0,1,0,1
4,4,4.60,6.87,42450,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...
296,2,9.50,11.60,33988,0,1,0,0,1
297,3,4.00,5.90,60000,0,0,1,0,1
298,9,3.35,11.00,87934,0,0,1,0,1
299,1,11.50,12.50,9000,0,1,0,0,1


In [9]:
X = df.drop("Selling_Price",axis=1)
y = df["Selling_Price"]

In [10]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(X)

In [11]:
X=scaler.transform(X)

In [12]:
from xgboost import XGBRegressor
xgb_model = XGBRegressor()
xgb_model.fit(X,y)
xgb_model.score(X,y)

0.9999845130058066

In [13]:
y_pred= xgb_model.predict(X)

In [14]:
df["y_pred"] = y_pred

In [15]:
from sklearn.metrics import r2_score, mean_absolute_error

"r2",r2_score(y,y_pred), "rmse", mean_absolute_error(y,y_pred)**.5

('r2', 0.9999845130058066, 'rmse', 0.11869281091496868)

In [16]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X,y)
lr_model.score(X,y)

0.8825741581640659

The best model is XGBoost.

### Save the model and scaler

In [17]:
list(df.drop(["Selling_Price","y_pred"],axis=1).columns)

['Year',
 'Present_Price',
 'Kms_Driven',
 'Owner',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Seller_Type_Individual',
 'Transmission_Manual']

In [None]:
!pip list

In [18]:
#inputs
model = xgb_model
scaler = scaler
selected_features = list(df.drop(["Selling_Price","y_pred"],axis=1).columns)

import joblib

# save the model
joblib.dump(model,open("xgb_model.joblib","wb"))

# save our scaler
joblib.dump(scaler,open("scaler.joblib","wb"))

# save column names (selected features)
joblib.dump(selected_features, open("features_list.joblib","wb"))

In [19]:
# #Save the model using pickle
# import pickle
# # save the model to disk
# pickle.dump(model, open(model_file_path, 'wb'))

# #Load the model
# model = pickle.load(open(model_file_path, 'rb'))

# #Saving a Keras model
# # Calling `save('my_model')` creates a SavedModel folder `my_model`.
# model.save("my_model")

---

---

## Real-time Prediction

### Load feature names

In [20]:
columns = joblib.load("features_list.joblib")
columns

['Year',
 'Present_Price',
 'Kms_Driven',
 'Owner',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Seller_Type_Individual',
 'Transmission_Manual']

### Input new data

In [28]:
sample_one = [{
"Year":2014,
"Selling_Price":3.35,
"Kms_Driven":27000,
"Fuel_Type":"Petrol",
"Seller_Type":"Dealer",
"Transmission":"Manual",
"Owner":0
    }]

In [29]:
df_s = pd.DataFrame(sample_one)
df_s

Unnamed: 0,Year,Selling_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,27000,Petrol,Dealer,Manual,0


In [27]:
columns

['Year',
 'Present_Price',
 'Kms_Driven',
 'Owner',
 'Fuel_Type_Diesel',
 'Fuel_Type_Petrol',
 'Seller_Type_Individual',
 'Transmission_Manual']

In [30]:
df_s["Year"] = 2023-df_s["Year"]
df_s = pd.get_dummies(df_s).reindex(columns=columns, fill_value=0)
df_s

Unnamed: 0,Year,Present_Price,Kms_Driven,Owner,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Individual,Transmission_Manual
0,9,0,27000,0,0,1,0,1


### Load Model and its data scaler

In [24]:
scaler = joblib.load(open("scaler.joblib","rb"))
model = joblib.load(open("xgb_model.joblib","rb"))
df_s = scaler.transform(df_s)


### Predict

In [25]:
pred_price = round(model.predict(df_s)[0] * 10_000)
print(f"Your car's price: ${pred_price}")

Your car's price: $20207
