In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import vaex as vx
import matplotlib.pyplot as plt
import math
import sklearn
import pickle
from vaex import ml
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble._bagging import BaggingClassifier
from sklearn.tree._classes import DecisionTreeClassifier
from sklearn.ensemble._forest import ExtraTreesClassifier
from sklearn.ensemble._forest import RandomForestClassifier

# Global matplotlib theme for any figure drawn in this notebook.
plt.style.use('fivethirtyeight') # For better style
# Seed handed to the classifier's `random_state` so that runs are repeatable.
random_state = 17

In [2]:
%%time
# Open the prepared training set and keep only its first 15,000,000 rows.
final_train_data = vx.open("final-train-dataset.hdf5")[0:15_000_000]

CPU times: user 7.69 ms, sys: 0 ns, total: 7.69 ms
Wall time: 7.34 ms


In [3]:
# Label column the classifier is trained to predict.
target_variable = "hotel_cluster"

# Every other column of the training frame is used as a model input.
features = [
    name
    for name in final_train_data.column_names
    if name != target_variable
]

### Train the model on all loaded training data

In [4]:
def fit_data(model, df, batch_size=100_000):
    """Fit ``model`` on ``df`` incrementally, one contiguous row batch at a time.

    The frame is cut into consecutive slices ``df[start:end]`` of at most
    ``batch_size`` rows, and each slice is passed to ``model.fit(df=...)``.
    After every fit the wrapped estimator's ``n_estimators`` is increased by
    one, so the following fit call has a larger ensemble size to grow into.
    NOTE(review): this only adds trees if the wrapped estimator was created
    with ``warm_start=True`` -- confirm at the call site.

    Parameters
    ----------
    model : object
        Predictor exposing ``fit(df=...)`` and a ``.model`` attribute that
        has an integer ``n_estimators``.
    df : object
        Frame exposing ``count()`` and row slicing via ``df[start:end]``.
    batch_size : int, default 100_000
        Maximum number of rows per fit call.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    count = int(df.count())
    n_batches = math.ceil(count / batch_size)
    for batch_no in tqdm(range(n_batches)):
        start_idx = batch_no * batch_size
        end_idx = min(start_idx + batch_size, count)
        model.fit(df=df[start_idx:end_idx])
        # Grow the estimator wrapped by the *argument*, not a notebook-level
        # global, so the function works for whichever predictor is passed in.
        model.model.n_estimators += 1


In [5]:
from vaex.ml.sklearn import Predictor
# Forest configured with warm_start so that repeated fit calls can extend the
# ensemble; fit_data raises n_estimators by one after each batch.
model = RandomForestClassifier(
    n_estimators=200,
    criterion="entropy",
    warm_start=True,
    n_jobs=15,
    random_state=random_state,
)

# Wrap the scikit-learn estimator so it can be fitted on and applied to vaex frames.
vaex_model = Predictor(
    model=model,
    features=features,
    target=target_variable,
    prediction_name="predicted_hotel_cluster",
)

# Train batch by batch over the loaded training frame.
fit_data(vaex_model, final_train_data)


100%|██████████| 150/150 [02:45<00:00,  1.10s/it]


In [6]:
# Destination table that carries a `review_cluster` label per destination id.
dest_data = vx.open("destination-data-with-clusters.hdf5")

# Only the join key and the cluster label are needed by the pipeline below.
review_columns = ("srch_destination_id", "review_cluster")
review_df = dest_data[review_columns]

In [7]:
# Preview the first five rows of the destination table.
dest_data.head(5)

#,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,d38,d39,d40,d41,d42,d43,d44,d45,d46,d47,d48,d49,d50,d51,d52,d53,d54,d55,d56,d57,d58,d59,d60,d61,d62,d63,d64,d65,d66,d67,d68,d69,d70,d71,d72,d73,d74,d75,d76,d77,d78,d79,d80,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90,d91,d92,d93,d94,d95,d96,d97,d98,d99,d100,d101,d102,d103,d104,d105,d106,d107,d108,d109,d110,d111,d112,d113,d114,d115,d116,d117,d118,d119,d120,d121,d122,d123,d124,d125,d126,d127,d128,d129,d130,d131,d132,d133,d134,d135,d136,d137,d138,d139,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149,review_cluster
0,0,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-1.89763,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,-2.19866,24
1,1,-2.18169,-2.18169,-2.18169,-2.08256,-2.18169,-2.16503,-2.18169,-2.18169,-2.0316,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.04379,-2.18169,-2.18169,-2.18169,-2.16503,-2.18169,-2.13351,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.13351,-2.18169,-2.18169,-2.18169,-2.18169,-2.08256,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.16503,-2.16503,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.08256,-2.18169,-2.13351,-2.18169,-2.18169,-2.18169,-2.18169,-2.08256,-2.18169,-2.18169,-2.18169,-2.18169,-2.16503,-2.18169,-2.18169,-2.18169,-2.18169,-2.14898,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.13351,-2.18169,-2.18169,-2.18169,-2.18169,-2.18169,-2.13351,-2.18169,-2.18169,-2.18169,-2.14898,-2.18169,-2.18169,-2.18169,-2.18169,-2.14898,-2.18169,-2.18169,-2.16503,-2.18169,-2.16503,-2.18169,-2.18169,-2.16503,-2.18169,-2.18169,-2.18169,-2.18169,38
2,2,-2.18349,-2.22416,-2.22416,-2.18956,-2.10582,-2.07541,-2.22416,-2.11848,-2.14039,-2.22416,-2.20985,-2.22416,-2.11072,-2.18601,-2.22416,-2.12447,-2.22416,-2.15647,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.15332,-2.18601,-2.22416,-2.22416,-2.22416,-2.12824,-2.15831,-2.05972,-1.91027,-2.22416,-2.12305,-2.22416,-2.15405,-2.19733,-1.94546,-2.22416,-2.22416,-2.19733,-2.16689,-2.19201,-1.90378,-2.22416,-2.18956,-2.22416,-2.0831,-2.22416,-2.0701,-2.15774,-2.19638,-2.06981,-2.22416,-2.22416,-2.07295,-2.22416,-2.22416,-2.22416,-2.22416,-2.14039,-2.22416,-2.22416,-2.09655,-2.22416,-2.18753,-2.22416,-2.22416,-2.09867,-2.01637,-2.22416,-2.12612,-2.18956,-2.00832,-2.08583,-2.22416,-2.22416,-2.22416,-2.21836,-2.21934,-2.22416,-2.22416,-2.16049,-2.22416,-2.22416,-2.0768,-2.22416,-1.97508,-2.16615,-2.22416,-2.22416,-2.22416,-2.22416,-2.22416,-2.16375,-2.22416,-2.07236,-2.10028,-2.22416,-2.20353,-2.22416,-2.22416,-2.09436,-2.22416,-2.14039,-2.22416,-2.16472,-2.22416,-2.155,-2.19733,-2.22416,-2.19733,-2.22416,-2.22416,-2.19733,-2.18701,-2.02626,-2.22416,-2.22416,-2.22416,-2.22382,-2.22416,-2.04928,-2.18956,-2.11848,-2.14556,-2.11781,-2.22416,-2.18018,-2.22416,-2.22416,-2.21457,-2.18601,-2.19157,-2.22416,-2.22416,-2.19638,-2.22416,-2.19201,-2.22416,-2.22416,-2.22416,-2.22416,-2.05755,21
3,3,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.11548,-2.17741,-2.17741,-2.17741,-2.17741,-2.16108,-2.17741,-2.17741,-2.17741,-2.17741,-2.13016,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.10129,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.14534,-2.17741,-2.17741,-2.17741,-2.14534,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.10129,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.16108,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.11548,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.16108,-2.17741,-2.17741,-2.16108,-2.13016,-2.17741,-2.17741,-2.17741,-2.17741,-2.11548,-2.17741,-2.16108,-2.16108,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,-2.17741,7
4,4,-2.18956,-2.18778,-2.19401,-2.17115,-2.1523,-2.05662,-2.19401,-2.19401,-2.14591,-2.19401,-2.08909,-2.19401,-2.15521,-2.071,-2.19401,-2.07496,-2.18553,-2.19401,-2.18956,-2.19178,-2.18503,-2.15022,-2.19401,-2.18923,-2.19401,-2.19163,-2.14603,-2.19401,-2.13026,-2.17781,-2.19401,-2.15965,-2.19401,-2.1707,-2.19401,-2.19401,-1.94652,-2.19178,-2.19401,-2.17222,-2.1889,-2.19401,-2.10932,-2.18116,-2.1756,-2.19401,-2.18068,-2.19401,-2.09243,-2.19401,-2.17254,-2.19401,-2.14542,-2.19401,-2.1157,-2.12079,-2.18804,-2.14921,-2.19401,-2.19401,-2.19178,-2.19401,-2.18503,-2.19401,-2.19401,-2.16987,-2.19401,-2.19401,-2.16799,-2.19401,-2.15842,-2.19401,-2.19178,-2.17454,-2.18071,-2.17794,-2.1889,-2.16549,-2.19178,-2.14596,-2.19401,-2.18736,-2.16987,-2.19172,-2.19111,-2.19401,-2.19401,-2.17613,-2.19401,-2.19401,-2.19401,-2.19401,-1.96429,-2.17679,-2.19178,-2.19178,-2.18956,-2.19401,-2.19401,-2.19401,-2.16589,-2.19401,-2.14599,-2.19401,-2.19401,-2.19178,-2.19401,-2.09742,-2.19401,-2.18003,-2.19401,-2.19401,-2.19401,-2.19401,-2.19178,-2.19401,-2.19401,-2.19401,-2.19401,-2.19401,-2.15274,-2.13918,-2.19401,-2.1823,-2.19401,-2.19401,-2.19401,-2.15716,-2.19401,-2.17445,-2.13937,-2.0993,-2.19401,-2.19401,-2.19401,-2.19401,-2.15147,-2.19401,-2.16324,-2.18736,-2.19401,-2.19178,-2.19401,-2.19401,-2.18516,-2.19401,-2.19401,-2.19401,-2.18804,29


In [8]:
# Open the held-out split and keep only its first 30,000 rows for evaluation.
test_data = vx.open("test-data-split.hdf5")[0:30_000]

In [9]:
# Preview the first five raw rows of the evaluation sample.
test_data.head(5)

#,date_time,site_name,posa_continent,user_location_country,user_location_region,user_location_city,orig_destination_distance,user_id,is_mobile,is_package,channel,srch_ci,srch_co,srch_adults_cnt,srch_children_cnt,srch_rm_cnt,srch_destination_id,srch_destination_type_id,is_booking,cnt,hotel_continent,hotel_country,hotel_market,hotel_cluster
0,2014-08-11 07:46:59.000000000,2,3,66,348,48862,2234.26,12,0,1,9,2014-08-27,2014-08-31,2,0,1,8250,1,0,3,2,50,628,1
1,2014-08-11 08:22:12.000000000,2,3,66,348,48862,2234.26,12,0,1,9,2014-08-29,2014-09-02,2,0,1,8250,1,1,1,2,50,628,1
2,2014-08-11 08:24:33.000000000,2,3,66,348,48862,2234.26,12,0,0,9,2014-08-29,2014-09-02,2,0,1,8250,1,0,1,2,50,628,1
3,2014-08-09 18:05:16.000000000,2,3,66,442,35390,913.193,93,0,0,3,2014-11-23,2014-11-28,2,0,1,14984,1,0,1,2,50,1457,80
4,2014-08-09 18:08:18.000000000,2,3,66,442,35390,913.626,93,0,0,3,2014-11-23,2014-11-28,2,0,1,14984,1,0,1,2,50,1457,21


In [10]:
def pipeline_for_transformation(input_df):
    """Turn a raw search-event frame into a frame with standardized model columns.

    Steps, in order:

    1. Cast ``srch_ci`` / ``srch_co`` to ``datetime64[D]``.
    2. Left-join the notebook-level ``review_df`` on ``srch_destination_id``
       and fill a missing ``review_cluster`` with 99.
    3. Keep rows whose check-in and check-out are both before 2020-01-01.
    4. Drop ``orig_destination_distance``.
    5. Add ``days`` (check-out minus check-in, in days) and keep rows with
       ``0 < days < 15``.
    6. Add ``ev_day``, ``ev_month`` and ``ev_hour`` derived from ``date_time``.
    7. Fit a ``StandardScaler`` on the remaining columns and return the
       transformed frame, whose standardized columns carry the ``scaled_``
       prefix.

    The work is done on a copy of ``input_df``: the caller's frame is not
    modified, so the function can be called again on the same input.

    Relies on the notebook globals ``review_df`` and ``target_variable``.

    NOTE(review): the scaler is fitted on the frame passed in, not reused from
    training -- confirm this matches how the training features were scaled.

    Parameters
    ----------
    input_df : vaex DataFrame
        Raw events; must provide the columns referenced in the steps above.

    Returns
    -------
    vaex DataFrame
        Filtered frame with the engineered and ``scaled_``-prefixed columns.
    """
    print("Transforming.....")

    # Copy first: the column assignments and the in-place join below would
    # otherwise alter the caller's frame and make a second call collide on
    # the already-joined `review_cluster` column.
    input_df = input_df.copy()

    input_df["srch_ci"] = input_df.srch_ci.astype("datetime64[D]")
    input_df["srch_co"] = input_df.srch_co.astype("datetime64[D]")

    # Attach the destination's review cluster; 99 marks "no cluster known".
    input_df = input_df.join(review_df, how="left", on="srch_destination_id", rprefix="", inplace=True)
    input_df.fillna(99, column_names=["review_cluster"], inplace=True)
    input_df["review_cluster"] = input_df.review_cluster.fillmissing(99)

    # Discard rows with check-in / check-out dates on or after 2020-01-01.
    input_df = input_df[input_df.srch_ci<np.datetime64('2020-01-01')]
    input_df = input_df[input_df.srch_co<np.datetime64('2020-01-01')]

    input_df.drop(columns = ["orig_destination_distance"], inplace=True)

    # Length of stay in days, restricted to stays of 1 to 14 days.
    input_df["days"] = input_df["srch_co"] - input_df["srch_ci"]
    input_df["days"] = input_df["days"] / np.timedelta64(1, 'D')
    input_df = input_df[(input_df.days>0) & (input_df.days<15)]

    # Calendar parts of the event timestamp.
    input_df["ev_day"] = input_df.date_time.dt.day
    input_df["ev_month"] = input_df.date_time.dt.month
    input_df["ev_hour"] = input_df.date_time.dt.hour

    # Columns that are not standardized: raw dates, the target, and a few
    # explicitly excluded inputs. Names starting with "__" are skipped too.
    columns_to_ignore = ['date_time','srch_ci', 'srch_co', target_variable, "srch_rm_cnt", "is_mobile", "srch_adults_cnt", "ev_day"]
    columns_for_model = [col for col in input_df.column_names if col not in columns_to_ignore and not col.startswith("__") ]

    scaler = ml.StandardScaler(features=columns_for_model, prefix='scaled_')
    scaler.fit(input_df)
    scaled_df = scaler.transform(input_df)
    print("Transformation complete")
    return scaled_df
# Run the transformation on the evaluation sample; the result holds the `scaled_` columns.
scaled_df = pipeline_for_transformation(test_data)

Transforming.....
Transformation complete


In [11]:
# Standardized columns plus the true label, materialized in pandas for scoring.
# NOTE(review): this rebinds the notebook-level name `features`, which an
# earlier cell used for the training inputs, and now includes the target;
# re-running the model-building cell after this one would pick up this list.
features = [
    name for name in scaled_df.column_names if name.startswith("scaled_")
] + ["hotel_cluster"]
df = scaled_df[features].to_pandas_df()

In [12]:
# Predict a hotel cluster for every row of the transformed evaluation frame.
predictions = vaex_model.predict(scaled_df)

In [13]:
# Share of evaluation rows whose predicted cluster equals the true `hotel_cluster`.
score = sklearn.metrics.accuracy_score(
    y_true=df["hotel_cluster"],
    y_pred=predictions,
    normalize=True,
)
score

0.12014681617671819