# 1. Info

This notebook is to select the best model. The best model will be pick after tuning the parameters of each model.

To understand why some features are used and not all, go to the notebook "01_feature_importance.ipynb". Before running this notebook, you should run the notebook "00_eda.ipynb".

# 2. Select the model

## 2.1. Import libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

## 2.2. Read the data

In [2]:
data = pd.read_csv('../data/Hotel_Cancelations.csv')

## 2.3. Prepare the data

In [3]:
most_meaningful_features = ['lead_time','arrival_year','avg_price_per_room','no_of_special_requests','market_segment_type']

In [4]:
df = data[most_meaningful_features + ['booking_status']].copy()

## 2.4. split the data

In [5]:
df_full_train, df_test = train_test_split(data, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.booking_status.values
y_val = df_val.booking_status.values
y_test = df_test.booking_status.values

del df_train['booking_status']
del df_val['booking_status']
del df_test['booking_status']

## 2.5. Vectorize the data

In [6]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[most_meaningful_features].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[most_meaningful_features].to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test[most_meaningful_features].to_dict(orient='records')
X_test = dv.transform(test_dict)

## 2.6. Train the models

In [7]:
def find_best_threshold(model, data):
    # make predictions
    y_pred = model.predict_proba(data)[:, 1]

    # test different threshols
    thresholds = np.linspace(0, 1, 21)
    scores = []
    thr = []
    for t in thresholds:
        score = accuracy_score(y_val, y_pred >= t)
        scores.append(score)
        thr.append(t)

    # return only the best three scores
    return pd.DataFrame(data={'threshold':thr,'accuracy_score':scores}). sort_values(by=['accuracy_score'], ascending=False).head(3)

### 2.6.1 Logistic Regression

In [8]:
scores = pd.DataFrame(data={'threshold':[], 'accuracy_score':[], 'C':[], 'solver':[],'data':[]})

C_list = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 5, 10]
solver_list = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
for c in C_list:
    for solver in solver_list:
        # create the model with the current parameters
        lr_model = LogisticRegression(C=c, solver=solver)
        lr_model.fit(X_train, y_train)
        
        # find the bes threshold in the validation on test data and save it into a single data frame
        val_df = find_best_threshold(lr_model, X_val)
        val_df['C'] = c
        val_df['solver'] = solver
        val_df['data'] = 'val'

        scores = pd.concat([scores, val_df])

        test_df = find_best_threshold(lr_model, X_test)
        test_df['C'] = c
        test_df['solver'] = solver
        test_df['data'] = 'test'

        scores = pd.concat([scores, test_df])     

        del val_df
        del test_df  



In [9]:
scores.shape

(240, 5)

In [10]:
scores.query('data == "val"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,C,solver,data
10,0.5,0.802895,5.0,newton-cg,val
10,0.5,0.802757,0.5,newton-cg,val
10,0.5,0.802757,1.0,newton-cg,val
10,0.5,0.802619,10.0,newton-cg,val
10,0.5,0.802619,0.1,newton-cg,val


In [11]:
scores.query('data == "test"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,C,solver,data
20,1.0,0.675258,0.0001,newton-cg,test
20,1.0,0.675258,0.01,newton-cg,test
20,1.0,0.675258,0.01,liblinear,test
20,1.0,0.675258,0.01,sag,test
20,1.0,0.675258,0.01,saga,test


The difference between the validation and test data could be explained with overfitting.

### 2.6.2. Decision Tree Classifier

In [None]:
scores_dt = pd.DataFrame(data={'threshold':[], 'accuracy_score':[], 'max_depth':[], 'min_samples_leaf':[],'data':[]})

max_depth_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 13, 15, 17, 20, None]
min_samples_leaf_list = [1, 3, 5, 10, 15, 20, 50, 100]

for max_depth in max_depth_list:
    for min_samples_leaf in min_samples_leaf_list:

        # create the model with the current parameters
        dt_model = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf)
        dt_model.fit(X_train, y_train)
        
        # find the bes threshold in the validation on test data and save it into a single data frame
        val_df = find_best_threshold(dt_model, X_val)
        val_df['max_depth'] = max_depth
        val_df['min_samples_leaf'] = min_samples_leaf
        val_df['data'] = 'val'

        scores_dt = pd.concat([scores_dt, val_df])

        test_df = find_best_threshold(dt_model, X_test)
        test_df['max_depth'] = max_depth
        test_df['min_samples_leaf'] = min_samples_leaf
        test_df['data'] = 'test'

        scores_dt = pd.concat([scores_dt, test_df])  

        del val_df
        del test_df

In [20]:
scores_dt.shape

(720, 5)

In [21]:
scores_dt.query('data == "val"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,max_depth,min_samples_leaf,data
9,0.45,0.859132,17.0,5.0,val
10,0.5,0.85858,13.0,5.0,val
10,0.5,0.858029,17.0,5.0,val
11,0.55,0.857615,10.0,1.0,val
11,0.55,0.857615,13.0,10.0,val


In [22]:
scores_dt.query('data == "test"').sort_values(by=['accuracy_score'], ascending=False).head()

Unnamed: 0,threshold,accuracy_score,max_depth,min_samples_leaf,data
20,1.0,0.67581,6.0,10.0,test
20,1.0,0.67581,5.0,10.0,test
20,1.0,0.67581,5.0,15.0,test
20,1.0,0.675258,3.0,5.0,test
20,1.0,0.675258,6.0,20.0,test
