<a href="https://colab.research.google.com/github/sinhvienfpt/UsedCarPricePrediction/blob/develop/OldCarPrice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Raw-GitHub address of the CSV that the data-loading cell passes to pd.read_csv.
url = (
    "https://raw.githubusercontent.com/"
    "sinhvienfpt/UsedCarPricePrediction/main/"
    "assets/data/cleaned.csv"
)

In [17]:
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
# Grid CV
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

pd.options.mode.chained_assignment = None #Ignore pandas warning

In [18]:
import warnings
warnings.filterwarnings('ignore')

# Modeling

In [19]:
# Read the CSV at `url` into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,Name,Location,Kilometers_Driven,Owner_Type,Mileage,Engine,Power,Seats,Price,Brand,...,Mitsubishi,Nissan,Porsche,Renault,Skoda,Smart,Tata,Toyota,Volkswagen,Volvo
0,Maruti Wagon R LXI CNG,Mumbai,72000,3,21.01,998.0,58.16,5.0,1.75,Maruti,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Hyundai Creta 1.6 CRDi SX Option,Pune,41000,3,19.67,1582.0,126.2,5.0,12.5,Hyundai,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Honda Jazz V,Chennai,46000,3,18.2,1199.0,88.7,5.0,4.5,Honda,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Maruti Ertiga VDI,Chennai,87000,3,20.77,1248.0,88.76,7.0,6.0,Maruti,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,40670,2,15.2,1968.0,140.8,5.0,17.74,Audi,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Data splitting

In [20]:
# Drop every row that has a missing value in any column. `df` is rebound to
# the filtered copy rather than mutated with inplace=True, so the frame object
# returned by read_csv is left untouched.
df = df.dropna()

In [21]:
# Columns removed from the feature matrix: four descriptive columns plus the
# target itself.
non_feature_columns = ['Name', 'Location', 'Mileage', 'Brand', 'Price']
X = df.drop(columns=non_feature_columns)
y = df["Price"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Evaluating regression models (with default parameters)

Test run with **Linear Regression**

In [22]:
# Smoke test: fit a plain linear regression on the training split and report
# its error metrics on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Linear Regression")
for label, metric in (
    ("MSE : ", mean_squared_error),
    ("MAE : ", mean_absolute_error),
    ("R2 : ", r2_score),
):
    print(label, metric(y_test, y_pred))
print()

Linear Regression
MSE :  2.9755850126631076
MAE :  1.2288477038168486
R2 :  0.8106083193317789



In [23]:
# Candidate regressors, otherwise left at their library-default hyperparameters.
# random_state is pinned (same seed as the train/test split) on every estimator
# that accepts one, so the comparison table is reproducible from run to run.
models = {
    'Linear Regression' : LinearRegression(),
    'Random Forest' : RandomForestRegressor(random_state=42),
    'Decision Tree' : DecisionTreeRegressor(random_state=42),
    'KNN' : KNeighborsRegressor(),
    'Gradient Boosting' : GradientBoostingRegressor(random_state=42),
    'XGBoost' : XGBRegressor(random_state=42),
    'LightGBM' : LGBMRegressor(force_row_wise=True, random_state=42)
}

# Column-oriented accumulator for the scores: one list entry per evaluated model.
result = {
    'Model' : [],
    'MSE' : [],
    'MAE' : [],
    'R2' : []
}

In [24]:
# Start from empty columns so that re-running this cell replaces the previous
# scores instead of appending a second copy of every model.
for scores in result.values():
    scores.clear()

# Fit each model on the training split and score it on the held-out test split.
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    result['Model'].append(model_name)
    result['MSE'].append(mean_squared_error(y_test, y_pred))
    result['MAE'].append(mean_absolute_error(y_test, y_pred))
    result['R2'].append(r2_score(y_test, y_pred))


[LightGBM] [Info] Total Bins 653
[LightGBM] [Info] Number of data points in the train set: 3953, number of used features: 37
[LightGBM] [Info] Start training from score 6.154430


In [25]:
# Tabulate the collected scores and show them ranked best-first by R2.
result_df = pd.DataFrame(data=result)
result_df.sort_values('R2', ascending=False)

Unnamed: 0,Model,MSE,MAE,R2
5,XGBoost,1.83033,0.825613,0.883502
6,LightGBM,1.840353,0.844524,0.882864
1,Random Forest,1.992109,0.880771,0.873205
4,Gradient Boosting,2.02423,0.952584,0.871161
0,Linear Regression,2.975585,1.228848,0.810608
2,Decision Tree,3.183561,1.089055,0.797371
3,KNN,9.915601,2.28655,0.368886


## Hyperparameter tuning with GridSearchCV

In [26]:
# Models and their corresponding hyperparameter grids, stored as
# (estimator, param_grid) pairs. An empty grid means the estimator is only
# evaluated with its defaults.
# random_state is pinned (same seed as the train/test split) on every estimator
# that accepts one, so the selected parameters and scores are reproducible.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Random Forest': (RandomForestRegressor(random_state=42), {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'Decision Tree': (DecisionTreeRegressor(random_state=42), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }),
    'KNN': (KNeighborsRegressor(), {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance']
    }),
    'Gradient Boosting': (GradientBoostingRegressor(random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7]
    }),
    'XGBoost': (XGBRegressor(random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7]
    }),
    'LightGBM': (LGBMRegressor(force_row_wise=True, random_state=42), {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.3],
        'max_depth': [3, 5, 7]
    })
}

In [27]:
# Column-oriented accumulator for the grid-search scores. Each key gets its
# own fresh list; one entry is appended per tuned model.
result = {
    column: []
    for column in ('Model', 'Best Parameters', 'MSE', 'MAE', 'R2')
}

In [28]:
# Start from empty columns so that re-running this cell replaces the previous
# scores instead of appending a second copy of every model.
for scores in result.values():
    scores.clear()

for model_name, (model, params) in models.items():
    # 5-fold cross-validated exhaustive search over this model's grid, scored
    # by negative MSE and run on all available cores.
    grid_search = GridSearchCV(model, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Score the best configuration found on the held-out test split.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    result['Model'].append(model_name)
    result['Best Parameters'].append(grid_search.best_params_)
    result['MSE'].append(mean_squared_error(y_test, y_pred))
    result['MAE'].append(mean_absolute_error(y_test, y_pred))
    result['R2'].append(r2_score(y_test, y_pred))

result_df = pd.DataFrame(result)
print(result_df)

[LightGBM] [Info] Total Bins 653
[LightGBM] [Info] Number of data points in the train set: 3953, number of used features: 37
[LightGBM] [Info] Start training from score 6.154430
               Model                                    Best Parameters  \
0  Linear Regression                                                 {}   
1      Random Forest  {'max_depth': 20, 'min_samples_split': 2, 'n_e...   
2      Decision Tree         {'max_depth': 10, 'min_samples_split': 10}   
3                KNN          {'n_neighbors': 9, 'weights': 'distance'}   
4  Gradient Boosting  {'learning_rate': 0.3, 'max_depth': 3, 'n_esti...   
5            XGBoost  {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   
6           LightGBM  {'learning_rate': 0.1, 'max_depth': 5, 'n_esti...   

        MSE       MAE        R2  
0  2.975585  1.228848  0.810608  
1  1.955539  0.876227  0.875533  
2  2.930324  1.111069  0.813489  
3  9.097375  2.165012  0.420965  
4  1.751426  0.813103  0.888524  
5  1.637998  0.79

In [29]:
# Rank the tuned models from highest to lowest R2 and write the table to CSV.
# `result_df` is rebound to the sorted copy instead of being sorted with
# inplace=True.
result_df = result_df.sort_values(by='R2', ascending=False)
result_df.to_csv('Models_with_Best_Param.csv', index=False)

In [30]:
# Display the tuned-model results table.
result_df

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
5,XGBoost,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",1.637998,0.79416,0.895744
4,Gradient Boosting,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",1.751426,0.813103,0.888524
6,LightGBM,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",1.871093,0.831715,0.880908
1,Random Forest,"{'max_depth': 20, 'min_samples_split': 2, 'n_e...",1.955539,0.876227,0.875533
2,Decision Tree,"{'max_depth': 10, 'min_samples_split': 10}",2.930324,1.111069,0.813489
0,Linear Regression,{},2.975585,1.228848,0.810608
3,KNN,"{'n_neighbors': 9, 'weights': 'distance'}",9.097375,2.165012,0.420965


## Conclusion
As we can see, XGBoost achieves the highest R2 (~0.896) with its best parameters `{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300}`.

I'll use it as the final model.