# 1. Info

This notebook is to select the best model. The best model will be pick after tuning the parameters of each model.

To understand why some features are used and not all, go to the notebook "01_feature_importance.ipynb". Before running this notebook, you should run the notebook "00_eda.ipynb".

# 2. Select the model

## 2.1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

## 2.2. Read the data

In [2]:
data = pd.read_csv('../data/Hotel_Cancelations.csv')

## 2.3. Prepare the data

In [3]:
most_meaningful_features = ['lead_time','arrival_year','avg_price_per_room','no_of_special_requests','market_segment_type']

In [4]:
df = data[most_meaningful_features + ['booking_status']].copy()

## 2.4. split the data

In [5]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.booking_status.values
y_val = df_val.booking_status.values
y_test = df_test.booking_status.values

del df_train['booking_status']
del df_val['booking_status']
del df_test['booking_status']

## 2.5. Vectorize the data

In [6]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[most_meaningful_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[most_meaningful_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test[most_meaningful_features].to_dict(orient='records')
X_test = dv.transform(test_dict)

## 2.6. Train the models

In [7]:
def find_best_threshold(model, x_data, y_data):
    # make predictions
    y_pred = model.predict_proba(x_data)[:, 1]

    # test different threshols
    thresholds = np.linspace(0, 1, 21)
    scores = []
    thr = []
    for t in thresholds:
        score = accuracy_score(y_data, y_pred >= t)
        scores.append(score)
        thr.append(t)

    # return only the best three scores
    return pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}). sort_values(by=['accuracy_score'], ascending=False).head(3)

### 2.6.1 Logistic Regression

In [None]:
scores = pd.DataFrame(data={'threshold':[], 'accuracy_score':[], 'C':[], 'solver':[],'data':[]})

C_list = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10]
solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
max_iter_list = [100, 300, 500, 1000]
for max_iter in max_iter_list:

    print(max_iter)
    
    for c in C_list:
        for solver in solver_list:
            # create the model with the current parameters
            lr_model = LogisticRegression(C=c, solver=solver)
            lr_model.fit(X_train, y_train)
            
            # find the bes threshold in the validation on test data and save it into a single data frame
            val_df = find_best_threshold(lr_model, X_val, y_val)
            val_df['C'] = c
            val_df['solver'] = solver
            val_df['max_iter'] = max_iter
            val_df['data'] = 'val'

            scores = pd.concat([scores, val_df])

            test_df = find_best_threshold(lr_model, X_test, y_test)
            test_df['C'] = c
            test_df['solver'] = solver
            test_df['max_iter'] = max_iter
            test_df['data'] = 'test'

            scores = pd.concat([scores, test_df])     

            del val_df
            del test_df  

In [9]:
scores.shape

(960, 6)

In [10]:
scores.query('data == "val"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,C,solver,data,max_iter
10,0.5,0.802895,5.0,newton-cg,val,1000.0
10,0.5,0.802895,5.0,newton-cg,val,100.0
10,0.5,0.802895,5.0,newton-cg,val,500.0
10,0.5,0.802895,5.0,newton-cg,val,300.0
10,0.5,0.802757,0.5,newton-cg,val,1000.0


In [11]:
scores.query('data == "test"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,C,solver,data,max_iter
10,0.5,0.790489,5.0,newton-cg,test,300.0
10,0.5,0.790489,5.0,newton-cg,test,1000.0
10,0.5,0.790489,5.0,newton-cg,test,500.0
10,0.5,0.790489,5.0,newton-cg,test,100.0
10,0.5,0.790351,10.0,newton-cg,test,500.0


The difference between the validation and test data is really low, which could let us think the model is performing consistently. Also, the parameters at the top are being repeated which shows great consistency.

### 2.6.2. Decision Tree Classifier

In [12]:
max_depth_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 15, 17, 20, 50, None]
min_samples_leaf_list = [1, 3, 4, 5, 6, 7, 10, 15, 20, 50, 100]

first_run = True

for max_depth in max_depth_list:
    for min_samples_leaf in min_samples_leaf_list:

        # create the model with the current parameters
        dt_model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        dt_model.fit(X_train, y_train)
        
        # find the bes threshold in the validation on test data and save it into a single data frame
        val_df = find_best_threshold(dt_model, X_val, y_val)
        val_df['max_depth'] = max_depth
        val_df['min_samples_leaf'] = min_samples_leaf
        val_df['data'] = 'val'

        if first_run:
            scores_dt = val_df.copy()
            first_run = False
        else:
            scores_dt = pd.concat([scores_dt, val_df])

        test_df = find_best_threshold(dt_model, X_test, y_test)
        test_df['max_depth'] = max_depth
        test_df['min_samples_leaf'] = min_samples_leaf
        test_df['data'] = 'test'

        scores_dt = pd.concat([scores_dt, test_df])  

        del val_df
        del test_df

In [13]:
scores_dt.shape

(1056, 5)

In [14]:
scores_dt.query('data == "val"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,max_depth,min_samples_leaf,data
12,0.6,0.859683,20.0,4,val
9,0.45,0.85858,17.0,5,val
10,0.5,0.858442,13.0,5,val
12,0.6,0.858305,,6,val
12,0.6,0.858305,13.0,4,val


In [15]:
scores_dt.query('data == "test"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,max_depth,min_samples_leaf,data
10,0.5,0.861199,13,5,test
11,0.55,0.861199,13,1,test
11,0.55,0.861061,20,1,test
12,0.6,0.860924,13,7,test
10,0.5,0.860924,13,3,test


The test data performed better than the validation data, but the key is that both scores were close which means consistency in the model performance. When looking for consistency there are two sets of parameters that repeat inside the top 5 so the best one will be picked.

### 2.6.4. Xgbost

In [16]:
def model_accuracy(model, x_data, y_data):
    y_pred = model.predict(x_data)
    thresholds = np.linspace(0, 1, 21)
    scores = []
    thr = []
    for t in thresholds:
        score = accuracy_score(y_data, y_pred >= t)
        scores.append(score)
        thr.append(t)
    return pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}).sort_values(by=['accuracy_score'], ascending=False).head(5)

In [17]:
features = dv.feature_names_
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=features)

In [18]:
historical_accuracy_df_val = pd.DataFrame({'':[]})

eta_list = np.linspace(0.1, 1.5, 15)
min_child_list = np.linspace(1, 10, 10)
max_depth = [3, 4, 5, 6, 7, 8, 9, 10, 15]

for eta in eta_list:
    for min_child in min_child_list:
        for max_depth in max_depth_list:
            xgb_params = {
                'eta': eta, 
                'max_depth': max_depth,
                'min_child_weight': min_child,
                
                'objective': 'binary:logistic',
                'nthread': 8,
                
                'seed': 1,
                'verbosity': 1,
            }
            model = xgb.train(xgb_params, dtrain, num_boost_round=100)

            accuracy_df = model_accuracy(model, dval, y_val) 
            accuracy_df['eta'] = eta
            accuracy_df['min_child'] = min_child
            accuracy_df['max_depth'] = max_depth
            accuracy_df['data'] = 'val'
            historical_accuracy_df_val = accuracy_df if historical_accuracy_df_val.shape[0] == 0 else pd.concat([historical_accuracy_df_val,accuracy_df])

In [33]:
historical_accuracy_df_val.sort_values(by=['accuracy_score'], ascending=False).head(3)

Unnamed: 0,threshold,accuracy_score,eta,min_child,max_depth,data
11,0.55,0.876775,0.1,1.0,9,val
11,0.55,0.87581,0.2,2.0,8,val
9,0.45,0.875672,0.1,1.0,9,val


In [20]:
historical_accuracy_df = pd.DataFrame({'':[]})

eta_list = np.linspace(0.1, 1.5, 15)
min_child_list = np.linspace(1, 10, 10)
max_depth = [3, 4, 5, 6, 7, 8, 9, 10, 15]

for eta in eta_list:
    for min_child in min_child_list:
        for max_depth in max_depth_list:
            xgb_params = {
                'eta': eta, 
                'max_depth': max_depth,
                'min_child_weight': min_child,
                
                'objective': 'binary:logistic',
                'nthread': 8,
                
                'seed': 1,
                'verbosity': 1,
            }
            model = xgb.train(xgb_params, dtrain, num_boost_round=100)

            accuracy_df = model_accuracy(model, dtest, y_test) 
            accuracy_df['eta'] = eta
            accuracy_df['min_child'] = min_child
            accuracy_df['max_depth'] = max_depth
            accuracy_df['data'] = 'test'
            historical_accuracy_df = accuracy_df if historical_accuracy_df.shape[0] == 0 else pd.concat([historical_accuracy_df,accuracy_df])

In [32]:
historical_accuracy_df.sort_values(by=['accuracy_score'], ascending=False).head(3)

Unnamed: 0,threshold,accuracy_score,eta,min_child,max_depth,data
10,0.5,0.876223,0.3,2.0,9,test
10,0.5,0.876223,0.1,1.0,13,test
11,0.55,0.875948,0.1,1.0,13,test


In [23]:
final = pd.concat([historical_accuracy_df, historical_accuracy_df_val])

In [35]:
final.groupby(['threshold','eta','min_child','max_depth'])['accuracy_score'].mean().reset_index().sort_values(by=['accuracy_score'], ascending=False).head(3)

Unnamed: 0,threshold,eta,min_child,max_depth,accuracy_score
2489,0.5,0.1,1.0,13,0.875121
4477,0.55,0.1,1.0,9,0.874983
4479,0.55,0.1,1.0,13,0.874776


The Xgboost is the model that showed the best performance across both, validation and test data.

The best parameter combination for the xgboost is:
* eta = 0.1
* min_child = 1.0 
* max_depth = 13
* threshold = 0.50

End of the notebook