In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file under the Kaggle input directory, then list them one per line
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# Load the hotel bookings CSV from the read-only Kaggle input directory
DATA_PATH = '../input/hotel-booking-demand/hotel_bookings.csv'
df = pd.read_csv(DATA_PATH)

## EDA

In [None]:
# Preview the first five rows of the loaded table
df.head(n=5)

In [None]:
# Summarise each column's dtype and non-null count; in notebook order this runs
# before the one-hot encoding cell, so the categorical columns still show as object dtype
df.info()

In [None]:
#get dummy variables for object columns
# NOTE: df is overwritten here, so re-running this cell without reloading the CSV
# fails because the listed columns have already been replaced by their dummies.
cols = [
    'hotel',
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'customer_type',
]
df = pd.get_dummies(data=df, columns=cols, dtype=np.int64)

In [None]:
#see the balance of target classes
# Row count per value of the is_canceled target
df['is_canceled'].value_counts()

In [None]:
# Summary statistics, transposed so each column of df becomes one row
df.describe().transpose()

In [None]:
#Set up a correlation table for correlations greater than absolute value of .1
# Correlate each numeric column with the target directly, selected by name:
#  - the string columns still present in df are excluded up front, because
#    DataFrame.corr() raises on them in pandas >= 2.0 (numeric_only defaults to False);
#  - only one column of the full column-by-column matrix was ever used, so computing
#    it across all the dummy columns was wasted work;
#  - the target is picked by name instead of relying on it being df's first column.
# The result has one row per numeric column (including is_canceled itself, at 1.0).
corr_df = (
    df.select_dtypes(include='number')
    .corrwith(df['is_canceled'])
    .rename_axis('Column')
    .reset_index(name='Correlation')
)
corr_df[corr_df['Correlation'].abs() > .1].sort_values(by='Correlation')

In [None]:
#Create set of predictor variables and target along with test train split
# Predictors are the columns whose |correlation| with the target exceeds .1.
# corr_df also contains the target's own row (correlation 1.0), so it is dropped below.
selected = corr_df[corr_df['Correlation'].abs() > .1].sort_values(by='Correlation')
X_cols = selected.Column.unique()
# drop() returns a new frame; the previous inplace drop mutated a frame sliced
# from df and triggered SettingWithCopyWarning.
X = df[X_cols].drop(columns='is_canceled')
y = df['is_canceled']
# NOTE(review): train_size=0.3 fits on 30% of the rows and evaluates on the other 70%.
# Confirm this is intended and that test_size=0.3 was not meant instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=42)

In [None]:
#Fit a logistic regression model
# fit() returns the estimator itself, so construction and training are chained
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Mean accuracy on the held-out rows
lr.score(X_test, y_test)

In [None]:
#Create coefficients table
# Pair each predictor name with its fitted weight (first and only row of lr.coef_ for a binary model)
coef_df = pd.DataFrame({
    'Column': X_train.columns,
    'Coefficient': lr.coef_[0],
})
coef_df.sort_values(by='Coefficient')

In [None]:
#Map model predictions onto test data along with certainty levels
# Build test_df as a NEW frame. Previously test_df was an alias of X_test, so the
# four columns added below were written into X_test itself; any later
# lr.predict(X_test) / lr.score(X_test, ...) (including re-running this cell)
# then failed because X_test no longer matched the features the model was fit on.
real = y_test
predict = lr.predict(X_test)
# predict_proba returns one column per class, ordered as in lr.classes_
# (assumes is_canceled is coded 0/1, so column 0 is "not canceled" — TODO confirm)
proba = lr.predict_proba(X_test)
not_canceled = proba[:, 0]
canceled = proba[:, 1]
test_df = X_test.assign(
    is_canceled_real=real,  # aligned to X_test on the row index
    is_canceled_predict=predict,
    probability_not_canceled=not_canceled,
    probability_canceled=canceled,
)

In [None]:
#Merge test dataframe with original dataframe
# The last four columns of test_df are the real/predicted labels and the two
# probabilities; the default inner join on the row index keeps only test-set rows.
prediction_cols = test_df.iloc[:, -4:]
predicted_df = df.merge(prediction_cols, left_index=True, right_index=True)
predicted_df.head()