In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings('ignore')

import os
print(os.listdir("../input"))

In [None]:
df = pd.read_csv("../input/craigslist-carstrucks-data/vehicles.csv")

In [None]:
df.columns

In [None]:
# Columns excluded from the analysis.
drop_columns = [
    'id', 'url', 'region', 'region_url', 'VIN',
    'image_url', 'description', 'lat', 'long', 'posting_date',
]
df = df.drop(columns=drop_columns)

In [None]:
from pandas_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_notebook_iframe()

In [None]:
# set the threshold for price
# transform year to age
# drop model column
# evaluate other categorical in fuel column
# set the threshold for odometer and seek the connection with age column
# pick just clean title status
# drop county column
# drop state
# drop duplicates
# missing value analysis

In [None]:
# set the threshold for price

In [None]:
df[df['price'] == df['price'].min()]

In [None]:
df.sort_values(by='price', ascending=False).head()

In [None]:
# Keep listings priced from 1,000 up to 50,000 (both bounds included).
df = df[df['price'].between(1000, 50000)]

In [None]:
# transform year to age

In [None]:
# Vehicle age in years, measured against the hard-coded reference year 2021.
# `assign` returns a new frame, so the column is never written into a frame
# that was produced by boolean filtering (avoids pandas' SettingWithCopy
# ambiguity). The new column is appended last, exactly as before.
df = df.assign(age=2021 - df['year'])

In [None]:
df.drop(['year'], axis=1, inplace=True)

In [None]:
# drop model, county and state column
df.drop(columns=['model', 'county', 'state'], inplace=True)

In [None]:
# pick just clean title status:
# keep only the rows whose title_status is 'clean', then drop the column,
# which is constant after the filter and carries no information for the model.
df = df[df['title_status'] == 'clean'].drop(columns=['title_status'])

In [None]:
df['fuel'].value_counts()

In [None]:
# set the threshold for odometer and seek the connection with age column
df.sort_values(by='odometer').head()

In [None]:
# give the minimum threshold for odometer and age, which make sense for used car
# I give an age threshold of 1 years old at minimum age
df = df[df['age'] >= 1]

In [None]:
# Minimum odometer reading for a used car.
# Americans drive an average of 14,300 miles per year, according to the Federal Highway Administration. (https://www.thezebra.com/resources/driving/average-miles-driven-per-year/)
# The cut-off applied here is 14,000; rows with a missing odometer are removed as well.
df = df.loc[df['odometer'].ge(14000)]

In [None]:
df.drop(['paint_color'], axis=1, inplace=True)

In [None]:
df = df.drop_duplicates()

In [None]:
df.head()

In [None]:
sns.boxplot(data=df, y='odometer')

In [None]:
df[df['odometer'] == df['odometer'].max()]

In [None]:
# At 14,300 miles per year, 100 years of driving is 1,430,000 miles, a 7-digit number,
# so the cap is set to the largest 7-digit value: 9,999,999.
df = df.loc[df['odometer'].le(9999999)]

In [None]:
sns.boxplot(data=df, y='odometer')

In [None]:
# Replace a missing manufacturer with the placeholder category 'other'.
# The result is assigned back explicitly: an in-place fillna on `df['manufacturer']`
# acts on a temporary Series and does not update `df` under pandas Copy-on-Write.
df = df.assign(manufacturer=df['manufacturer'].fillna('other'))

In [None]:
df = df.dropna()

In [None]:
from pandas_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_notebook_iframe()

In [None]:
df.head()

### Prepare the dataset

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical columns that are converted to integer codes.
cat_columns = ['manufacturer', 'condition', 'cylinders', 'fuel', 'transmission', 'drive', 'size', 'type']

# One fitted encoder per column, stored in `le` under the column name.
le = {}

for col in cat_columns:
    # Columns that are no longer present in the frame are skipped.
    if col not in df.columns:
        continue
    le[col] = LabelEncoder()
    # Values are cast to str before encoding; fitting and transforming use the same data.
    df[col] = le[col].fit_transform(list(df[col].astype(str).values))

I used a label encoder rather than dummy (one-hot) variables to keep the model simple, with as few features as possible, because 8 of the 10 columns in the data are categorical. Dummy variables could also be used with the same result, but they would increase the computational cost.

In [None]:
df.head()

In [None]:
y = df['price']
X = df.drop(columns='price')

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; X is rebound to the array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# done later in the notebook - confirm this is intended.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Compare the regression models

I compare several regression algorithms to get a basic idea of how each model behaves with its default parameters. From this comparison I choose the top 3 algorithms.

In [None]:
def model_selection(X_train, y_train, X_test, y_test, models):
    """Fit every model on the training split and compare them on the test split.

    Each estimator in ``models`` is fitted in place on ``(X_train, y_train)``
    and scored on ``(X_test, y_test)`` with R2 and RMSE. Both metrics are
    plotted per model (R2 on top, RMSE below) and returned as a table.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target used for scoring.
    models : sequence of unfitted estimators exposing ``fit`` and ``predict``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``models`` (the fitted estimator
        objects), ``R2`` and ``RMSE``.
    """
    from sklearn.metrics import mean_squared_error, r2_score

    R2_result = []
    MSE_result = []
    str_models = []

    for model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        R2_result.append(r2_score(y_test, y_pred))
        MSE_result.append(mean_squared_error(y_test, y_pred))
        str_models.append(str(model))

    RMSE_result = np.sqrt(MSE_result)

    # Both panels are drawn against integer positions with explicit ticks, so:
    #  * the R2 panel and the RMSE panel share the same labelled x axis, and
    #  * two models with an identical repr do not collapse onto one category.
    positions = np.arange(len(str_models))
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 10), sharex=True)

    ax1.plot(positions, R2_result)
    ax1.set_ylabel('R2_score')

    ax2.plot(positions, RMSE_result)
    ax2.set_ylabel('RMSE_result')
    ax2.set_xticks(positions)
    ax2.set_xticklabels(str_models, rotation=90)
    plt.tight_layout()

    return pd.DataFrame({'models': models, 'R2': R2_result, 'RMSE': RMSE_result})

https://github.com/codebasics/py/blob/master/ML/15_gridsearch/15_grid_search.ipynb
https://github.com/justmarkham/scikit-learn-videos/blob/master/08_grid_search.ipynb

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [None]:
models = [LinearRegression(), Ridge(), Lasso(), BayesianRidge(), SVR(), KNeighborsRegressor(),
          DecisionTreeRegressor(), BaggingRegressor(), RandomForestRegressor(), AdaBoostRegressor(), GradientBoostingRegressor(),
          HistGradientBoostingRegressor(), MLPRegressor(), XGBRegressor()]

model_selection(X_train, y_train, X_test, y_test, models)   

### Cross-validation for the best models from the model comparison

In [None]:
def acc_CV(model, X, y):
    """Print the mean and standard deviation of a 10-fold cross-validation score.

    No ``scoring`` argument is passed to ``cross_val_score``, so the estimator's
    own default ``score`` method is used. The printed label says "akurasi"
    (accuracy), but for scikit-learn regressors the default score is R2.
    Both numbers are printed multiplied by 100.

    Parameters
    ----------
    model : estimator to evaluate (it is cloned by ``cross_val_score``).
    X, y : features and target used for the cross-validation.

    Returns
    -------
    None
        The result is only printed.
    """
    from sklearn.model_selection import cross_val_score

    accuracies = cross_val_score(estimator=model, X=X, y=y, cv=10)
    print('akurasi  {:.2f}% +/- {:.2f}%' .format(accuracies.mean()*100, accuracies.std()*100))

In [None]:
rf = RandomForestRegressor()
HGB = HistGradientBoostingRegressor()
XGB = XGBRegressor()

In [None]:
acc_CV(rf, X_train, y_train)

In [None]:
acc_CV(HGB, X_train, y_train)

In [None]:
acc_CV(XGB, X_train, y_train)

## Random Forest model

### Tuning using GridSearchCV per parameters


In [None]:
def tuning_param(X, y, model, parameters):
    """Grid-search each parameter grid separately and plot R2 against the values.

    For every grid in ``parameters`` a 3-fold ``GridSearchCV`` (scoring ``'r2'``)
    is fitted on ``(X, y)``. One panel is drawn per tuned parameter:

    * single-parameter grid: mean test R2 of every candidate value;
    * multi-parameter grid: for each value of the parameter, the best mean
      test R2 over all candidates that use that value.

    Parameters
    ----------
    X, y : features and target used for the search.
    model : estimator to tune (cloned by ``GridSearchCV``).
    parameters : list of dicts, each mapping parameter name -> candidate values.

    Returns
    -------
    pandas.DataFrame
        One row per grid with the columns ``parameter`` (name of the tuned
        parameter, comma-joined when a grid has several), ``best_R2`` and
        ``best_value`` (the ``best_params_`` dict of that search).
        An empty frame is returned when ``parameters`` is empty.
    """
    from sklearn.model_selection import GridSearchCV

    result_columns = ['parameter', 'best_R2', 'best_value']
    if len(parameters) == 0:
        return pd.DataFrame(columns=result_columns)

    # One panel per tuned parameter. squeeze=False always yields an array of
    # axes, so the single-panel case needs no special handling.
    n_panels = max(1, sum(len(grid) for grid in parameters))
    fig, axs = plt.subplots(n_panels, squeeze=False)
    axs = axs.ravel()

    scores = []
    k = 0
    for parameter in parameters:

        clf = GridSearchCV(estimator=model, param_grid=parameter, cv=3, scoring='r2', n_jobs=-1)
        clf.fit(X, y)

        grid_mean_scores = np.asarray(clf.cv_results_['mean_test_score'], dtype=float)
        candidates = clf.cv_results_['params']

        for name_param, val_param in parameter.items():

            if len(parameter) == 1:
                # Candidates are evaluated in the order of the supplied values.
                curve = grid_mean_scores
            else:
                # Several parameters vary at once: reduce to one score per
                # value of this parameter so x and y have matching lengths.
                curve = []
                for value in val_param:
                    matching = [score for cand, score in zip(candidates, grid_mean_scores)
                                if cand[name_param] == value]
                    finite = [score for score in matching if not np.isnan(score)]
                    curve.append(max(finite) if finite else np.nan)

            axs[k].plot(val_param, curve)
            axs[k].set_xlabel(name_param)
            axs[k].set_ylabel('R2')
            k += 1

        scores.append({'parameter': ', '.join(str(name) for name in parameter),
                       'best_R2': clf.best_score_,
                       'best_value': clf.best_params_})

    plt.tight_layout()
    return pd.DataFrame(scores, columns=result_columns)

The function above can evaluate several parameter grids in a single call, but for computational reasons I tune one parameter per cell here.

In [None]:
tuning_param(X_train, y_train, rf, [{'n_estimators': np.arange(100, 1000, 50)}])

In [None]:
# The grid starts at 2: an integer min_samples_split must be at least 2, so the
# candidate 1 could never be fitted and only produced a failed (NaN) point.
tuning_param(X_train, y_train, rf, [{'min_samples_split':np.arange(2, 25, 1)}])

In [None]:
tuning_param(X_train, y_train, rf, [{'min_samples_leaf':np.arange(1, 10, 1)}])

In [None]:
tuning_param(X_train, y_train, rf, [{'max_depth':np.arange(10,250, 10)}])

In [None]:
tuning_param(X_train, y_train, rf, [{'max_features':['auto', 'sqrt']}])

From the `tuning_param` function we can see the effect of each parameter change on the R2 performance, and the resulting values can be used as the value ranges for hyperparameter tuning with RandomizedSearchCV.

### Hyperparameter tuning using RandomizedSearchCV

In [None]:
def model_randomCV(X, y, model, parameters, random_state=42):
    """Run a 5-fold randomized hyperparameter search and report the outcome.

    Parameters
    ----------
    X, y : features and target used for the search.
    model : estimator to tune (cloned by ``RandomizedSearchCV``).
    parameters : dict mapping parameter name -> candidate values or distribution.
    random_state : seed for sampling the candidate settings, so that repeated
        runs evaluate the same candidates. Pass ``None`` for a fresh random
        draw on every call.

    Returns
    -------
    pandas.DataFrame
        ``cv_results_`` of the search, sorted by ``rank_test_score``
        (best candidate first).

    The best parameters, best score and best estimator are also printed.
    """
    from sklearn.model_selection import RandomizedSearchCV

    randCV = RandomizedSearchCV(estimator=model, param_distributions=parameters,
                                n_jobs=-1, cv=5, random_state=random_state)

    randCV.fit(X, y)

    print('best_parameters: ' + str(randCV.best_params_))
    print('best_score: ' + str(randCV.best_score_))
    print('best_estimator: ' + str(randCV.best_estimator_))

    return pd.DataFrame(randCV.cv_results_).sort_values(by='rank_test_score')

In [None]:
parameters = {'n_estimators': np.arange(150, 260, 10), 'max_features':['auto', 'sqrt'], 'max_depth':np.arange(100,210, 10), 
             'min_samples_split':np.arange(8, 15, 1), 'min_samples_leaf':np.arange(1,6,1)}

model_randomCV(X_train, y_train, rf, parameters)

The performance with hyperparameter tuning increased from 0.775 to 0.781. Even though it is not a big change, it is still progress. Before using it in the final model, however, we should check the model for underfitting and overfitting, to be confident that it works on unknown data / unseen samples.

### underfit and overfit check

1. Overfitting is when the model’s error on the training set (i.e. during training) is very low but then, the model’s error on the test set (i.e. unseen samples) is large!

2. Underfitting is when the model’s error on both the training and test sets (i.e. during training and testing) is very high.

https://towardsdatascience.com/is-your-model-overfitting-or-maybe-underfitting-an-example-using-a-neural-network-in-python-4faf155398d2

In [None]:
def fit_check(model, kfolds, X_data=None, y_data=None):
    """Plot training vs. testing RMSE per fold to spot under- or overfitting.

    The data is split into ``kfolds`` consecutive folds (no shuffling). For
    every fold the model is refitted on the training part, and the RMSE on
    the training part and on the held-out part are recorded and plotted.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``; it is refitted in place.
    kfolds : number of folds.
    X_data, y_data : optional features and target. When omitted, the
        notebook-level ``X`` and ``y`` are used, as in the original version.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error

    # KFold yields positional indices. Converting to NumPy arrays makes the
    # indexing positional; `y` is a pandas Series whose index labels are no
    # longer 0..n-1 after the row filtering, so `y[train_index]` would be a
    # label lookup and fail.
    features = np.asarray(X if X_data is None else X_data)
    target = np.asarray(y if y_data is None else y_data)

    kf = KFold(n_splits=kfolds)
    list_training_error = []
    list_testing_error = []
    for train_index, test_index in kf.split(features):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = target[train_index], target[test_index]
        model.fit(X_train, y_train)
        y_train_data_pred = model.predict(X_train)
        y_test_data_pred = model.predict(X_test)
        list_training_error.append(np.sqrt(mean_squared_error(y_train, y_train_data_pred)))
        list_testing_error.append(np.sqrt(mean_squared_error(y_test, y_test_data_pred)))

    # The original assigned `figsize=(5,5)` to an unused local; apply it to the figure.
    plt.figure(figsize=(5, 5))
    fold_numbers = range(1, kf.get_n_splits() + 1)
    plt.plot(fold_numbers, np.array(list_training_error).ravel(), 'o-', label = 'training')
    plt.plot(fold_numbers, np.array(list_testing_error).ravel(), 'o-', label = 'testing')
    plt.xlabel('number of fold')
    plt.ylabel('RMSE')
    plt.title('RMSE across folds')
    plt.legend()
    plt.tight_layout()
    plt.show()

Here in the Kaggle notebook, my `fit_check` raised an error. I don't know why, because it works very well in Jupyter Notebook, and I was too lazy to fix it here, so I skip this step on Kaggle. If you want to see my complete work, you can visit my GitHub at https://github.com/RodzanIskandar/used_car_price_prediction

## HGB model

### Tuning parameters

In [None]:
tuning_param(X_train, y_train, HGB, [{'learning_rate':np.arange(0.1, 1, 0.1)}])

In [None]:
tuning_param(X_train, y_train, HGB, [{'max_leaf_nodes':np.arange(50, 150, 10)}])

In [None]:
tuning_param(X_train, y_train, HGB, [{'max_iter': np.arange(100, 600, 10)}])

In [None]:
# The grid starts at 10: max_depth=0 is not a valid tree depth, so that
# candidate could never be fitted and only produced a failed (NaN) point.
tuning_param(X_train, y_train, HGB, [{'max_depth': np.arange(10, 500, 10)}])

In [None]:
# The grid starts at 10: a leaf needs at least one sample, so min_samples_leaf=0
# could never be fitted and only produced a failed (NaN) point.
tuning_param(X_train, y_train, HGB, [{'min_samples_leaf': np.arange(10, 100, 10)}])

In [None]:
tuning_param(X_train, y_train, HGB, [{'l2_regularization': np.arange(0, 100, 10)}])

### Hyperparameter using RandomizedsearchCV 

In [None]:
# Search space for the randomized search on the HGB model.
# min_samples_leaf starts at 10: a leaf needs at least one sample, so a sampled
# value of 0 would make that candidate fail and waste one of the search iterations.
parameters = {'learning_rate':np.arange(0.1, 0.4, 0.1), 'max_leaf_nodes':np.arange(100, 150, 10), 'max_iter': np.arange(200, 400, 10), 
             'max_depth': np.arange(50, 150, 10), 'min_samples_leaf':np.arange(10,50,10)}

model_randomCV(X_train, y_train, HGB, parameters)

The result from the HGB model is a kind of trade-off: the HGB with hyperparameter tuning got the better R2 but the worse overfitting, and vice versa for the default HGB.

## XGB Model

### Tuning parameters

In [None]:
tuning_param(X_train, y_train, XGB, [{'n_estimators': np.arange(50, 250, 10)}])

In [None]:
tuning_param(X_train, y_train, XGB, [{'max_depth': np.arange(1, 11, 1)}])

In [None]:
tuning_param(X_train, y_train, XGB, [{'eta': np.arange(0.1, 1, 0.1)}])

In [None]:
tuning_param(X_train, y_train, XGB, [{'eta': np.arange(0.1, 0.3, 0.01)}])

In [None]:
tuning_param(X_train, y_train, XGB, [{'subsample': np.arange(0.1, 1, 0.1)}])

In [None]:
tuning_param(X_train, y_train, XGB, [{'colsample_bytree': np.arange(0.1, 1, 0.1)}])

In [None]:
parameters = {'n_estimators': np.arange(125, 175, 5), 'max_depth': np.arange(4, 9, 1), 'eta': np.arange(0.1, 0.4, 0.1), 
             'subsample': [1], 'colsample_bytree': [1]}

model_randomCV(X_train, y_train, XGB, parameters)

### Final predicted model

In [None]:
HGB.fit(X_train, y_train)

In [None]:
def predicted_price(manufacturer, condition, cylinders, fuel, odometer, transmission, drive, size, type, age):
    """Predict the price of a single vehicle with the fitted HGB model.

    Categorical arguments are given as their original string labels and are
    converted with the per-column encoders stored in ``le``; ``odometer`` and
    ``age`` are passed as numbers. The assembled row is scaled with ``scaler``
    before prediction.

    Returns
    -------
    The output of ``HGB.predict`` for the single assembled row.

    Raises
    ------
    ValueError
        Raised by the label encoder when a categorical label was not seen
        while the encoder was fitted.
    """
    # Feature order used by the original implementation (positions 0..9).
    # presumably this matches the column order of the training matrix - verify
    # against the frame that `scaler` was fitted on.
    feature_values = [
        ('manufacturer', manufacturer),
        ('condition', condition),
        ('cylinders', cylinders),
        ('fuel', fuel),
        ('odometer', odometer),
        ('transmission', transmission),
        ('drive', drive),
        ('size', size),
        ('type', type),
        ('age', age),
    ]
    numeric_features = ('odometer', 'age')

    x = np.zeros(len(feature_values))
    for position, (name, value) in enumerate(feature_values):
        if name in numeric_features:
            x[position] = value
        else:
            # transform() returns a length-1 array; take the scalar explicitly
            # instead of relying on NumPy's deprecated array-to-scalar conversion.
            x[position] = le[name].transform([value])[0]

    x = scaler.transform([x])

    return HGB.predict(x)

In [None]:
predicted_price('toyota', 'excellent', '4 cylinders', 'gas', 1000000.0 , 'automatic', 'rwd', 'mid-size', 'sedan', 20)

In [None]:
predicted_price('toyota', 'excellent', '4 cylinders', 'gas', 1000000.0 , 'automatic', 'rwd', 'mid-size', 'sedan', 1)

In [None]:
predicted_price('ford', 'excellent', '4 cylinders', 'gas', 1000000.0 , 'automatic', 'rwd', 'mid-size', 'sedan', 1)