In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
#!pip install pycountry-convert
#!pip install pandas-profiling
import warnings
warnings.filterwarnings('ignore') #Used only for maintaining a clean notebook. Not a best practice.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Notebook display settings: raise the row/column display limits to 9999 and
# render every float with exactly three decimal places.
pd.options.display.max_rows = 9999
pd.options.display.max_columns = 9999
pd.options.display.float_format = lambda value: f'{value:.3f}'
import pandas_profiling

import matplotlib.pyplot as plt
import seaborn as sns


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# Collect the path of every file below the Kaggle input directory and load the
# first one into `df`.
import os
# os.walk makes no promise about the order in which entries are returned, so the
# paths are sorted to guarantee that `files[0]` is the same file on every run.
files = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
# Fail with an explicit message instead of an opaque IndexError on files[0].
if not files:
    raise FileNotFoundError("No input files found under '/kaggle/input'")
df = pd.read_csv(files[0])
# Any results you write to the current directory are saved as output.

In [None]:
# Summary statistics of the loaded frame, as a first look at value ranges and scale.
df.describe()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

In [None]:
# Inspect the string-valued (object dtype) columns.
df.select_dtypes(include='object')

In [None]:
# Inspect the integer-valued columns.
df.select_dtypes(include='int')

In [None]:
# Inspect the float-valued columns.
df.select_dtypes(include='float')

In [None]:
# Rank the numeric features by the strength of their correlation with the
# cancellation flag, keeping the direction of each relationship in a `sign` column.
#
# The frame is restricted to numeric/bool columns explicitly: recent pandas
# versions raise on object columns in DataFrame.corr() instead of silently
# skipping them. The correlation matrix is also computed once rather than
# three times.
target_corr = df.select_dtypes(include=['number', 'bool']).corr()['is_canceled']
canceled_corr = target_corr.abs()
# np.sign yields -1 / 0 / +1 directly; corr / abs(corr) would give NaN for a zero correlation.
relation = pd.Series(np.sign(target_corr), name='sign')
pd.concat([canceled_corr, relation], axis=1).sort_values(by='is_canceled', ascending=False)

In [None]:
# Share of rows in each class of the target (count of each value divided by the row count).
df['is_canceled'].value_counts().div(len(df))

In [None]:
# Discard the rows that have no `country` value.
# `df` is rebound to the filtered copy instead of using inplace=True, so the
# originally loaded frame object is never mutated behind the reader's back.
df = df.dropna(subset=['country'])

# 1 Cleaning, Wrangling, & Feature Engineering

In [None]:
# Target vector: the cancellation flag.
y = df['is_canceled']

# Feature matrix: remove the target and the excluded columns, then derive the
# engineered columns in one chained expression.
X = (
    df
    .drop(columns=[
        'is_canceled',
        'reservation_status',
        'reservation_status_date',
        'arrival_date_year',
        'lead_time',
    ])
    .assign(
        # 1 when the requested room type was the one granted to the customer
        fulfilled_room_request=lambda frame: frame['assigned_room_type'].eq(frame['reserved_room_type']).astype(int),
        # 1 when a company is recorded for the booking, 0 otherwise
        company=lambda frame: frame['company'].notna().astype(int),
        # 1 when the booking was made through an agent, 0 otherwise
        agent=lambda frame: frame['agent'].notna().astype(int),
        # missing child counts are treated as zero children
        children=lambda frame: frame['children'].fillna(0),
    )
)

In [None]:
# Names of the categorical (object dtype) feature columns, to be one-hot encoded.
cat_cols = list(X.select_dtypes(include='object').columns)

In [None]:
# Names of the numeric feature columns: float columns first, then integer columns.
num_cols = [
    *X.select_dtypes(include='float').columns,
    *X.select_dtypes(include='int').columns,
]

# 2 Base Model & Comparison

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.metrics import recall_score

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing shared by every candidate model: standardise the numeric columns
# and one-hot encode the categorical ones (categories not seen during fit are
# ignored rather than raising).
onehot = OneHotEncoder(sparse=False,handle_unknown='ignore')
standard = StandardScaler()
col_transformer = ColumnTransformer(transformers=[('scaler',standard,num_cols),('onehot',onehot,cat_cols)])

# Candidate models, each wrapped with the same preprocessing step.
log = Pipeline(steps=[('transform',col_transformer),('model',LogisticRegression(random_state=11))])
rf = Pipeline(steps=[('transform',col_transformer),('model',RandomForestClassifier(random_state=43))])
# LinearSVC's solver uses a random number generator; seed it so scores are repeatable.
sv = Pipeline(steps=[('transform',col_transformer),('model',LinearSVC(random_state=7))])
# The strategy is pinned explicitly: the library default changed from 'stratified'
# to 'prior' between scikit-learn releases, and 'prior' is a constant prediction
# for which random_state has no effect.
base = Pipeline(steps=[('transform',col_transformer),('model',DummyClassifier(strategy='stratified',random_state=19))])

# Hold out 10% of the rows for the final check. The split is seeded so that the
# cross-validation and tuning results below are reproducible, and stratified so
# that train and test keep the same cancellation rate.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1,random_state=42,stratify=y)


In [None]:
# Mean 3-fold cross-validated recall of the logistic-regression pipeline.
cross_val_score(log, X_train, y_train, cv=3, scoring='recall').mean()

In [None]:
# Mean 3-fold cross-validated recall of the random-forest pipeline.
cross_val_score(rf, X_train, y_train, cv=3, scoring='recall').mean()

In [None]:
# Mean 3-fold cross-validated recall of the linear SVM pipeline.
cross_val_score(sv, X_train, y_train, cv=3, scoring='recall').mean()

In [None]:
# Mean 3-fold cross-validated recall of the dummy baseline, for comparison with the real models.
cross_val_score(base, X_train, y_train, cv=3, scoring='recall').mean()

# 3 Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Random-forest search space: three depth limits x nine forest sizes (200, 400, ..., 1800 trees).
# The `model__` prefix routes each parameter to the pipeline step named 'model'.
rf_params = dict(
    model__max_depth=[20, 100, None],
    model__n_estimators=list(range(200, 2000, 200)),
)

# Logistic-regression search space: ten values of C, log-spaced from 1e-4 to 1e4.
log_params = dict(model__C=np.logspace(-4, 4, 10))

# Exhaustive 3-fold grid searches over the two parameter grids, both selecting on recall.
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_params,
    cv=3,
    scoring='recall',
    verbose=True,
)
log_grid = GridSearchCV(
    estimator=log,
    param_grid=log_params,
    cv=3,
    scoring='recall',
    verbose=True,
)

In [None]:
# Run the random-forest search on the training split only.
# 3 depth settings x 9 forest sizes x 3 folds = 81 cross-validation fits, so this cell is slow.
rf_grid.fit(X_train,y_train)

In [None]:
# Run the logistic-regression search on the training split only
# (10 values of C x 3 folds = 30 cross-validation fits).
log_grid.fit(X_train,y_train)

In [None]:
# Show the random-forest pipeline configuration that the search selected on recall.
rf_grid.best_estimator_

In [None]:
# Show the logistic-regression pipeline configuration that the search selected on recall.
log_grid.best_estimator_

# 4 Final Validation

In [None]:
# Final logistic-regression pipeline. C is hard-coded to one of the values in the
# search grid (np.logspace(-4, 4, 10)[7]).
# NOTE(review): presumably the winner of an earlier search run; re-check against
# log_grid.best_params_ whenever the search is re-run.
# random_state=11 matches the `log` pipeline that was tuned, so the final model is
# configured the same way as the one that was searched.
best_log = Pipeline(steps=[('transformer',col_transformer),('model',LogisticRegression(C=166.81005372000558,random_state=11))])

In [None]:
# Final random-forest pipeline. max_depth and n_estimators are hard-coded to values
# taken from the search grid.
# NOTE(review): presumably the winner of an earlier search run; re-check against
# rf_grid.best_params_ whenever the search is re-run.
# random_state=43 matches the `rf` pipeline that was tuned; without a seed the
# forest, and therefore the reported train/test scores, change on every run.
best_rf = Pipeline(steps=[('transformer',col_transformer),('model',RandomForestClassifier(max_depth=100,n_estimators=400,random_state=43))])

In [None]:
# Fit the final logistic-regression pipeline on the training split, then compare
# recall on the training rows with recall on the held-out rows.
best_log.fit(X_train, y_train)
log_train_preds, log_test_preds = (best_log.predict(split) for split in (X_train, X_test))
print('Train Score: ', recall_score(y_true=y_train, y_pred=log_train_preds))
print('Test Score: ', recall_score(y_true=y_test, y_pred=log_test_preds))

In [None]:
# Fit the final random-forest pipeline on the training split, then compare
# recall on the training rows with recall on the held-out rows.
best_rf.fit(X_train, y_train)
rf_train_preds, rf_test_preds = (best_rf.predict(split) for split in (X_train, X_test))
print('Train Score: ', recall_score(y_true=y_train, y_pred=rf_train_preds))
print('Test Score: ', recall_score(y_true=y_test, y_pred=rf_test_preds))

> RandomForest overfits the training data but still achieves the higher test recall, while LogisticRegression scores consistently on both the training and the test split. Given its speed and stability, we will stick with LogisticRegression.