In [25]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, mean_absolute_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [26]:
df = pd.read_csv("final_scout_20200925.csv")

In [27]:
X = df.drop("price", axis=1)
y = df["price"]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [5]:
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    mse = mean_squared_error(actual, pred)
    score = r2_score(actual, pred)
    return print("r2_score:", score, "\n","mae:", mae, "\n","mse:",mse, "\n","rmse:",rmse)

### Random Forest

In [6]:
rf_model = RandomForestRegressor().fit(X_train,y_train)

In [7]:
y_pred = rf_model.predict(X_test)

In [9]:
eval_metrics(y_test, y_pred)

r2_score: 0.9563214447685126 
 mae: 859.4685949744631 
 mse: 2357225.9457382727 
 rmse: 1535.3260063381565


### RF Tunning

In [10]:
rf = RandomForestRegressor()

In [46]:
rf_params = {"n_estimators":[50,100,300],
             "max_depth":[10,20,30],
             "max_features":[30,50,70],
             "min_samples_split":[2,4,6]    
}

In [47]:
# rf_cv_model = GridSearchCV(rf, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed: 33.2min finished


In [48]:
rf_cv_model.best_params_

{'max_depth': 30,
 'max_features': 50,
 'min_samples_split': 2,
 'n_estimators': 300}

In [8]:
rf_tuned = RandomForestRegressor(max_depth = 33,
                                  max_features = 50,
                                  min_samples_split = 2,
                                  n_estimators = 500).fit(X_train, y_train)

In [9]:
y_pred = rf_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9637963748514383 
 mae: 818.1396401831217 
 mse: 1953822.0547288493 
 rmse: 1397.7918495716197


In [6]:
rf_tuned = RandomForestRegressor(max_depth = 33,
                                  max_features = 50,
                                  min_samples_split = 2,
                                  n_estimators = 500).fit(X_train, y_train)

In [7]:
y_pred = rf_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9638859258624611 
 mae: 817.6308165355388 
 mse: 1948989.2033322882 
 rmse: 1396.062034199157


### Gradient Boosting

In [23]:
gb = GradientBoostingRegressor(max_depth=6, 
                               n_estimators=300,
                               random_state=4)

In [24]:
gb.fit(X_train,y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=6,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=300,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=4, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
y_pred = gb.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9626290948340375 
 mae: 881.7777242405136 
 mse: 2016817.3330382279 
 rmse: 1420.146940650237


### XGBOOST

In [20]:
xgb_model = XGBRegressor().fit(X_train, y_train)

In [21]:
y_pred = xgb_model.predict(X_test)

In [22]:
eval_metrics(y_test, y_pred)

r2_score: 0.9580704691892622 
 mae: 931.8713301437814 
 mse: 2262835.3295086254 
 rmse: 1504.2723588195806


### XGBoost Tunning

In [11]:
xgb = XGBRegressor()

In [12]:
xgb_params = {"n_estimators": [50, 100, 300],
             "subsample":[0.5,0.8,1],
             "max_depth":[10,20,30],
             "learning_rate":[0.1,0.01,0.3]}

In [None]:
xgb_cv_model = GridSearchCV(xgb, xgb_params, cv = 3, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
xgb_cv_model.best_params_

In [15]:
xgb_tuned = XGBRegressor(learning_rate= 0.1,
                         max_depth= 30,
                         n_estimators= 300,
                         subsample= 1).fit(X_train, y_train)

In [16]:
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9617005104054279 
 mae: 836.3680253863297 
 mse: 2066930.7879436458 
 rmse: 1437.6824364036886


In [8]:
xgb_tuned = XGBRegressor(learning_rate= 0.1,
                         max_depth= 33,
                         n_estimators= 1000,
                         subsample= 0.8).fit(X_train, y_train)

In [9]:
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

r2_score: 0.9636398264370835 
 mae: 798.6047132409971 
 mse: 1962270.5938832627 
 rmse: 1400.8106916651025


### Recursive Feature Elimiation (RFE)

*Feature selection refers to techniques that select a subset of the most relevant features (columns) for a dataset. Fewer features can allow machine learning algorithms to run more efficiently (less space or time complexity) and be more effective. Some machine learning algorithms can be misled by irrelevant input features, resulting in worse predictive performance.*

In [11]:
# check scikit-learn version
import sklearn
from sklearn.feature_selection import RFE
print(sklearn.__version__)

0.23.2


*The RFE method is available via the RFE class in scikit-learn.*

*RFE is a transform. To use it, first the class is configured with the chosen algorithm specified via the “estimator” argument and the number of features to select via the “n_features_to_select” argument.*

In [13]:
# define the method
rfe = RFE(estimator=XGBRegressor(), n_features_to_select=10)
# fit the model
rfe.fit(X, y)

RFE(estimator=XGBRegressor(base_score=None, booster=None,
                           colsample_bylevel=None, colsample_bynode=None,
                           colsample_bytree=None, gamma=None, gpu_id=None,
                           importance_type='gain', interaction_constraints=None,
                           learning_rate=None, max_delta_step=None,
                           max_depth=None, min_child_weight=None, missing=nan,
                           monotone_constraints=None, n_estimators=100,
                           n_jobs=None, num_parallel_tree=None,
                           random_state=None, reg_alpha=None, reg_lambda=None,
                           scale_pos_weight=None, subsample=None,
                           tree_method=None, validate_parameters=None,
                           verbosity=None),
    n_features_to_select=10)

In [14]:
X.shape

(15915, 140)

In [23]:
X_df =pd.DataFrame(X) 
dset = pd.DataFrame()
dset['attr'] = X_df.columns
dset['importance'] = rfe.ranking_[:10]
dset = dset.sort_values(by='importance')

In [24]:
dset.head(20)

Unnamed: 0,attr,importance
3,3,1
5,5,1
1,1,3
0,0,7
8,8,18
9,9,20
6,6,30
4,4,45
7,7,53
2,2,81


In [15]:
# transform the data
X = rfe.transform(X)
# y = rfe.transform(y)

In [16]:
X.shape

(15915, 10)

In [17]:
rfe.ranking_

array([  7,   3,  81,   1,  45,   1,  30,  53,  18,  20, 117, 103,  21,
        10,  37, 105, 125,  15,  92,  27,  66,   4,  23,  49,  68, 123,
       100,  83,  90,  28,  80,  29,  38,   5,  22,  26,  65,  55,  72,
        70,  34, 107, 109, 112,  14, 101,  97, 126,  11,  84,  52,  62,
        60,  99,  89,  78, 118,  50,  48, 111,  67, 122, 128,  54,  58,
       120, 130,  57,  32,  64,  61,  82, 108,  87, 116,  43,  74,  85,
        35,  76,  91,  95,  56, 106,  94,  98,  93, 102, 115,  46,  63,
        51,  88,   9,  36, 113,  86,  13,  44,  69,  42,  73,  77,   8,
         1,  75,   1,   1,   1,   1,   1,  24,   1,  96,  47, 124, 119,
        39,  79, 114,  12,  71, 129,   6, 110,  17,  41,  59,  19, 104,
        16,  40,  33, 131,  31,   1,   2,  25, 127, 121])

In [18]:
print("Optimal number of features: {}".format(rfe.n_features_))

Optimal number of features: 10


In [19]:
rfe.estimator_.feature_importances_

array([0.15044728, 0.11374374, 0.06124549, 0.13714488, 0.05301202,
       0.03813469, 0.03263375, 0.06343748, 0.31886244, 0.0313382 ],
      dtype=float32)

In [29]:
# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
eval_metrics(y_test, y_pred)

#### 2.Method (without designated number of features)

In [31]:
# define the method
rfe = RFE(estimator=XGBRegressor())

In [32]:
# fit the model in 10 min
rfe.fit(X_train, y_train)

RFE(estimator=XGBRegressor(base_score=None, booster=None,
                           colsample_bylevel=None, colsample_bynode=None,
                           colsample_bytree=None, gamma=None, gpu_id=None,
                           importance_type='gain', interaction_constraints=None,
                           learning_rate=None, max_delta_step=None,
                           max_depth=None, min_child_weight=None, missing=nan,
                           monotone_constraints=None, n_estimators=100,
                           n_jobs=None, num_parallel_tree=None,
                           random_state=None, reg_alpha=None, reg_lambda=None,
                           scale_pos_weight=None, subsample=None,
                           tree_method=None, validate_parameters=None,
                           verbosity=None))

In [33]:
print("Num Features: %d" % rfe.n_features_)

Num Features: 70


In [34]:
print("Selected Features: %s" % rfe.support_)

Selected Features: [ True  True False  True  True  True  True  True  True  True False False
  True  True  True False False False False  True  True  True  True  True
 False False False False False  True False  True  True  True  True  True
  True False False False  True  True False False  True False False False
  True False False False False  True False False False  True  True False
  True False False  True False False False  True  True  True False False
 False False False False False False  True  True False False False False
 False  True False False False  True False  True  True  True  True False
  True  True  True  True False False False  True  True False  True  True
  True  True  True  True  True False False False False  True False False
  True False False  True False  True  True False  True False  True  True
  True False  True  True  True  True False False]


In [35]:
print("Feature Ranking: %s" % rfe.ranking_)

Feature Ranking: [ 1  1 16  1  1  1  1  1  1  1 45  7  1  1  1 30 63 19 23  1  1  1  1  1
 17 61 26 28 48  1 27  1  1  1  1  1  1 29 21 44  1  1 32 38  1 51 58 71
  1 52 10 13 22  1 42 47 50  1  1 55  1 64 66  1  2 43 67  1  1  1 36 53
 56  6 49  9 18 25  1  1 46 39  5 41 33  1 24  8 40  1 20  1  1  1  1 35
  1  1  1  1  4 11 12  1  1 68  1  1  1  1  1  1  1  3 31 62 57  1 15 54
  1 14 65  1 37  1  1 34  1 60  1  1  1 69  1  1  1  1 70 59]


In [36]:
len(rfe.ranking_)

140

In [37]:
X_df =pd.DataFrame(X) 
dset = pd.DataFrame()
dset['attr'] = X_df.columns
dset['importance'] = rfe.ranking_
dset = dset.sort_values(by='importance')

In [39]:
dset

Unnamed: 0,attr,importance
0,km,1
98,ss_Rear airbag,1
97,ss_Power steering,1
96,ss_Passenger-side airbag,1
94,ss_Lane departure warning system,1
...,...,...
66,ex_Sliding door,67
105,make_model_Audi A2,68
133,Upholstery_type_Part/Full Leather,69
138,Drive_chain_front,70


In [40]:
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

In [41]:
# Fitting our baseline model with the transformed data
xgb_tuned = XGBRegressor(n_estimators=1000,
                         subsample=0.8,
                         max_depth=33,
                         learning_rate=0.1).fit(X_train_rfe, y_train)
y_pred = xgb_tuned.predict(X_test_rfe)
eval_metrics(y_test, y_pred)

r2_score: 0.9636679661784083 
 mae: 819.145074296065 
 mse: 1960751.9601279604 
 rmse: 1400.268531435296
