## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import vaex as vx
import matplotlib.pyplot as plt
import math
import sklearn
from vaex import ml
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble._bagging import BaggingClassifier
from sklearn.tree._classes import DecisionTreeClassifier
from sklearn.ensemble._forest import ExtraTreesClassifier
from sklearn.ensemble._forest import RandomForestClassifier

plt.style.use('fivethirtyeight') # For better style
random_state = 17

### Load training set

In [2]:
%%time
final_train_data = vx.open("final-train-dataset.hdf5")

CPU times: user 6.67 ms, sys: 1e+03 ns, total: 6.67 ms
Wall time: 8.99 ms


In [3]:
final_train_data.head(5)

#,scaled_site_name,scaled_posa_continent,scaled_user_location_country,scaled_user_location_region,scaled_user_location_city,scaled_user_id,scaled_is_package,scaled_channel,scaled_srch_children_cnt,scaled_srch_destination_id,scaled_srch_destination_type_id,scaled_is_booking,scaled_cnt,scaled_hotel_continent,scaled_hotel_country,scaled_hotel_market,scaled_review_cluster,scaled_days,scaled_ev_month,scaled_ev_hour,hotel_cluster
0,2.26975,-2.24274,-0.287829,1.83644,-0.483952,-1.03992,1.75354,0.841894,0.912957,-0.495056,-0.736419,-0.29565,0.422329,0.519276,-0.521026,-0.779365,0.0450958,2.82355,1.17975,0.95445,26
1,2.26975,-2.24274,-0.287829,1.83644,-0.483952,-1.03992,1.75354,0.841894,0.912957,-0.495056,-0.736419,-0.29565,1.24154,0.519276,-0.521026,-0.779365,0.0450958,2.82355,1.17975,1.13103,26
2,2.26975,-2.24274,-0.287829,1.83644,-0.483952,-1.03992,1.75354,0.841894,0.912957,-0.495056,-0.736419,-0.29565,1.24154,0.519276,-0.521026,-0.779365,0.0450958,2.82355,1.17975,1.13103,0
3,2.26975,-2.24274,-0.287829,1.83644,-0.483952,-1.03992,1.75354,0.841894,0.912957,-0.495056,-0.736419,-0.29565,-0.396877,0.519276,-0.521026,-0.779365,0.0450958,2.82355,1.17975,1.13103,90
4,2.26975,-2.24274,-0.287829,1.83644,-0.483952,-1.03992,1.75354,-1.5801,0.912957,-0.495056,-0.736419,-0.29565,-0.396877,0.519276,-0.521026,-0.779365,0.0450958,2.82355,1.17975,1.30761,26


## Split dataset for Grid Search

In [4]:
target_variable = "hotel_cluster"
features = [col for col in final_train_data.column_names if col != target_variable]
first_sample, rest_data = final_train_data.ml.train_test_split(test_size=0.9996)
first_sample.shape
#train_data_X = final_train_data[features].values
#train_data_Y = final_train_data[target_variable].values

(10422, 21)

In [5]:
part_train_X = first_sample[features].values
part_train_Y = first_sample[target_variable].values
                                                                          

### Let's take some records to find best model

In [6]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=random_state)

clfs = [
    BaggingClassifier(n_jobs=15, random_state=random_state, warm_start=True),
    DecisionTreeClassifier(random_state=random_state),
    ExtraTreesClassifier(n_jobs=15, random_state=random_state, warm_start=True),
    RandomForestClassifier(n_jobs=15, random_state=random_state, warm_start=True),
]
params = [
    {"n_estimators":[10, 50, 100], },
    {"criterion":["gini", "entropy"], "splitter":["best", "random"], "max_depth":[10, 100, None] },
    {"n_estimators":[100, 200, 300], "criterion":["gini", "entropy"], "max_depth":[10, 100, None]},
    {"n_estimators":[100, 200, 300], "criterion":["gini", "entropy"],},

]
scores = []
for i in tqdm(range(len(clfs))):
    try:
        clf = clfs[i]
        print("Evaluating Model", clf)
        grid_clf = GridSearchCV(clf,param_grid=params[i], cv=cv, verbose=1, n_jobs=15)
        #for 
        grid_clf.fit(part_train_X, part_train_Y)
        scores.append((clf, grid_clf.best_score_, grid_clf.best_params_))
    except Exception as e:
        print("Unable to execute ", clf, e)

  0%|          | 0/4 [00:00<?, ?it/s]Evaluating Model BaggingClassifier(n_jobs=15, random_state=17, warm_start=True)
Fitting 30 folds for each of 3 candidates, totalling 90 fits
[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    2.0s
[Parallel(n_jobs=15)]: Done  90 out of  90 | elapsed:   22.4s finished
 25%|██▌       | 1/4 [00:23<01:11, 23.71s/it][Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:    0.2s
Evaluating Model DecisionTreeClassifier(random_state=17)
Fitting 30 folds for each of 12 candidates, totalling 360 fits
[Parallel(n_jobs=15)]: Done 331 out of 360 | elapsed:    1.8s remaining:    0.2s
[Parallel(n_jobs=15)]: Done 360 out of 360 | elapsed:    2.1s finished
 50%|█████     | 2/4 [00:25<00:22, 11.05s/it][Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
Evaluating Model ExtraTreesClassif

In [7]:
for name, score, params in scores:
    nm = name.__class__.__name__
    print("%s:\t%0.2f \t %s" % (nm.rjust(30, ' '), score, params))

             BaggingClassifier:	0.30 	 {'n_estimators': 100}
        DecisionTreeClassifier:	0.28 	 {'criterion': 'entropy', 'max_depth': 100, 'splitter': 'best'}
          ExtraTreesClassifier:	0.30 	 {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}
        RandomForestClassifier:	0.30 	 {'criterion': 'entropy', 'n_estimators': 200}
