# Used Car Price Prediction

## 1) Problem statement.
This dataset comprises used cars sold on cardekho.com in India, along with important features of those cars.
The goal is to predict the price of a car from its input features.
The predictions can be used to suggest a selling price to new sellers based on market conditions.

## 2) Data Collection.
The dataset was collected by scraping the CarDekho website.
The data consists of 13 columns and 15411 rows.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Read the CarDekho CSV, using its first column as the row index,
# then preview the first rows.
df = pd.read_csv("cardekho_imputated.csv", index_col=[0])
df.head()

Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

### Handling Missing Values

- Handling Missing Values
- Handling Duplicates
- Check Data Type
- Understand Data

In [3]:
# Count missing entries per column (`isna` is the pandas alias of `isnull`).
df.isna().sum()

car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

#### Dropping Redundant Columns

In [4]:
# Remove `car_name` and `brand`. In the preview above, `car_name` is simply
# `brand` followed by `model`, so `model` is kept as the identifying feature.
columns_to_drop = ['car_name', 'brand']

# `columns=` already names the axis, so the redundant `axis=1` is removed.
# `errors="ignore"` keeps the cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
df = df.drop(columns=columns_to_drop, errors="ignore")
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


#### Getting All Different Types of Features

In [5]:
# Partition the columns by dtype: object columns are treated as categorical,
# everything else as numeric.
numeric_features = []
categorical_features = []
for column in df.columns:
    if df[column].dtype == 'O':
        categorical_features.append(column)
    else:
        numeric_features.append(column)
print(f"Number of Numeric Features: {len(numeric_features)}")
print(f"Number of Categorical Features: {len(categorical_features)}")

# A numeric column with at most 25 distinct values is counted as discrete;
# the remaining numeric columns are counted as continuous.
discrete_features = []
continuous_features = []
for column in numeric_features:
    distinct_count = len(df[column].unique())
    if distinct_count <= 25:
        discrete_features.append(column)
    else:
        continuous_features.append(column)
print(f"Number of Discrete Features: {len(discrete_features)}")
print(f"Number of Continuous Features: {len(continuous_features)}")

Number of Numeric Features: 7
Number of Categorical Features: 4
Number of Discrete Features: 2
Number of Continuous Features: 5


## Train Test Split

In [6]:
# `selling_price` is the regression target; every other column is a predictor.
y = df['selling_price']
X = df.drop(columns='selling_price')

### Feature Encoding and Scaling

In [7]:
# Number of distinct car models, i.e. the cardinality of the `model` column.
df["model"].nunique()

120

In [8]:
# Replace each car-model name with an integer code. The fitted encoder is
# kept in `label_encoder` so the mapping stays available afterwards.
label_encoder = LabelEncoder().fit(X['model'])
X['model'] = label_encoder.transform(X['model'])

In [9]:
# Route columns to a transformer by type: every non-object column of X is
# standardised, and the three low-cardinality categoricals are one-hot encoded.
# Once `model` has been label-encoded it is numeric and is therefore scaled too.
numeric_features = X.select_dtypes(exclude="object").columns
one_hot_columns = ['seller_type', 'fuel_type', 'transmission_type']

numeric_transformer = StandardScaler()
# drop='first' removes one indicator per feature to avoid redundant columns.
# handle_unknown='ignore' stops `transform` from raising on a category that
# was not present when the encoder was fitted (e.g. a rare fuel type in new
# data); such rows are encoded as all zeros for that feature.
one_hot_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", one_hot_transformer, one_hot_columns),
        ("StandardScaler", numeric_transformer, numeric_features)
    ],
    # Any column not listed above is forwarded unchanged.
    remainder="passthrough"
)

In [10]:
X = preprocessor.fit_transform(X)

### Train Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Model Training

In [12]:
def evaluate_model(true_values, predicted_values):
    """Compute regression metrics for a set of predictions.

    Parameters
    ----------
    true_values : array-like
        Ground-truth target values.
    predicted_values : array-like
        Model predictions aligned with ``true_values``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error and R^2 score, in that order.
    """
    mae = mean_absolute_error(true_values, predicted_values)
    # Compute the MSE once and derive the RMSE from it (it was previously
    # calculated twice, with the first result left unused).
    mse = mean_squared_error(true_values, predicted_values)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true_values, predicted_values)
    return mae, rmse, r2_square

In [13]:
# Baseline comparison: fit each regressor with default hyperparameters and
# report train/test RMSE, MAE and R^2.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    # The tree-based estimators are stochastic; a fixed seed makes the
    # reported metrics reproducible under Restart & Run All.
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
}

# Iterate over (name, estimator) pairs directly instead of rebuilding
# key/value lists and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.2f}")
    print(f"- Mean Absolute Error: {model_train_mae:.2f}")
    print(f"- R2 Score: {model_train_r2:.2f}")

    print('----------------------------------')

    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.2f}")
    print(f"- Mean Absolute Error: {model_test_mae:.2f}")
    print(f"- R2 Score: {model_test_r2:.2f}")

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 553855.67
- Mean Absolute Error: 268101.61
- R2 Score: 0.62
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502543.59
- Mean Absolute Error: 279618.58
- R2 Score: 0.66


Lasso
Model performance for Training set
- Root Mean Squared Error: 553855.67
- Mean Absolute Error: 268099.22
- R2 Score: 0.62
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502542.67
- Mean Absolute Error: 279614.75
- R2 Score: 0.66


Ridge
Model performance for Training set
- Root Mean Squared Error: 553856.32
- Mean Absolute Error: 268059.80
- R2 Score: 0.62
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 502533.82
- Mean Absolute Error: 279557.22
- R2 Score: 0.66


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 325873.01
- Mean Absolute Error: 91425.47
- R2 Score: 0

## Hyperparameter Tuning

In [14]:
# Candidate neighbour counts for KNeighborsRegressor.
knn_params = {
    "n_neighbors": [2, 3, 10, 20, 40, 50]
}

# Candidate settings for RandomForestRegressor.
random_forest_params = {
    "max_depth": [5, 8, 15, None, 10],
    # "auto" was deprecated in scikit-learn 1.1 and removed in 1.3, where it
    # raises an invalid-parameter error. For regressors it meant "use all
    # features", which is spelled 1.0 and works on old and new versions alike.
    "max_features": [5, 7, 1.0, 8],
    "min_samples_split": [2, 8, 15, 20],
    "n_estimators": [100, 200, 500, 1000]}

In [15]:
# (label, estimator, search space) triples consumed by the randomized search.
randomized_cv_models = [
    ('KNN', KNeighborsRegressor(), knn_params),
    # Seed the forest so cross-validation scores, and therefore the selected
    # hyperparameters, are reproducible between runs.
    ("RF", RandomForestRegressor(random_state=42), random_forest_params)
]

In [16]:
# Best hyperparameters found for each model, keyed by the model's label.
model_param = {}

for name, model, params in randomized_cv_models:
    randomized_model = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        # When a search space has fewer than 100 combinations (KNN has 6),
        # scikit-learn evaluates every combination instead.
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        # Fix the candidate sampling so the search is reproducible.
        random_state=42,
    )
    randomized_model.fit(X_train, y_train)
    model_param[name] = randomized_model.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
---------------- Best Params for KNN -------------------
{'n_neighbors': 10}
---------------- Best Params for RF -------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': 5, 'max_depth': 15}


In [18]:
# Refit the two tuned models with the hyperparameters reported by the
# randomized search above, then report train/test RMSE, MAE and R^2.
models = {
    "K-Neighbors Regressor": KNeighborsRegressor(n_neighbors=10),
    "Random Forest Regressor": RandomForestRegressor(
        n_estimators=1000, min_samples_split=2,
        max_features=5, max_depth=15,
        # Fixed seed so the reported metrics are reproducible on re-run.
        random_state=42,
    ),
}

# Iterate over (name, estimator) pairs directly instead of rebuilding
# key/value lists and indexing into them on every pass.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)

    print('Model performance for Training set')
    print(f"- Root Mean Squared Error: {model_train_rmse:.2f}")
    print(f"- Mean Absolute Error: {model_train_mae:.2f}")
    print(f"- R2 Score: {model_train_r2:.2f}")

    print('----------------------------------')

    print('Model performance for Test set')
    print(f"- Root Mean Squared Error: {model_test_rmse:.2f}")
    print(f"- Mean Absolute Error: {model_test_mae:.2f}")
    print(f"- R2 Score: {model_test_r2:.2f}")

    print('='*35)
    print('\n')

K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 363460.77
- Mean Absolute Error: 103472.05
- R2 Score: 0.84
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 263888.06
- Mean Absolute Error: 117496.21
- R2 Score: 0.91


Random Forest Regressor
Model performance for Training set
- Root Mean Squared Error: 141618.61
- Mean Absolute Error: 55937.14
- R2 Score: 0.98
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 206454.42
- Mean Absolute Error: 97135.18
- R2 Score: 0.94


