# **This Notebook deals with Data cleaning, Feature Engineering, EDA, Feature selection, model selection, model tuning and prediction**

In [None]:
# Import basic libraries. Other libraries will be added as and when required
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_columns", None)
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Read and inspect the dataset.

In [None]:
# Load the hotel bookings CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/hotel-booking-demand/hotel_bookings.csv")
# Show the first rows as the cell output.
df.head()

**On going through this dataset we can do the following feature engineering**
* Drop arrival_date_year
* Drop arrival_date_week_number
* Make a new column for total stays in nights
* Drop Null Values
* It is obvious that babies and children are just guests they won't pay or cancel the booking so we can either make a column for total guests or simply drop them all keeping adults in a column named paying_guests
* Identify the `Undefined` categorical values: if they are meaningless, drop them; otherwise replace them with a relevant category after looking up the terminology used in hotel data


In [None]:
# Remove the arrival year column from the working frame.
df.drop(columns=["arrival_date_year"], inplace=True)

In [None]:
# Remove the arrival week-number column from the working frame.
df.drop(columns=["arrival_date_week_number"], inplace=True)

In [None]:
# Total stay length = weekend nights + week nights.
# (The previous version added the weekend column to itself, so week nights were never counted.)
df["stays_in_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]

In [None]:
df.isnull().sum()

In [None]:
# Assign the filled column back: an in-place fillna on `df.children` is chained assignment,
# which is deprecated in pandas 2.x and has no effect on `df` under copy-on-write.
df["children"] = df["children"].fillna(0)

In [None]:
# Assign the filled column back: an in-place fillna on `df.country` is chained assignment,
# which is deprecated in pandas 2.x and has no effect on `df` under copy-on-write.
df["country"] = df["country"].fillna("Unknown")

In [None]:
# Assign the filled column back: an in-place fillna on `df.agent` is chained assignment,
# which is deprecated in pandas 2.x and has no effect on `df` under copy-on-write.
df["agent"] = df["agent"].fillna(0)

In [None]:
# Assign the filled column back: an in-place fillna on `df.company` is chained assignment,
# which is deprecated in pandas 2.x and has no effect on `df` under copy-on-write.
df["company"] = df["company"].fillna(0)

In [None]:
df.isnull().sum()

In [None]:
# Remove the babies column from the working frame.
df.drop(columns=["babies"], inplace=True)

In [None]:
# Copy the adults count into a new column named paying_guests.
df["paying_guests"] = df["adults"] 

In [None]:
# Remove the adults and children columns from the working frame.
df.drop(columns=["adults", "children"], inplace=True)

In [None]:
df.meal.unique()

In [None]:
df.meal.value_counts()

In [None]:
# Recode the "Undefined" meal value as "SC". The result is assigned back because an in-place
# replace on `df.meal` is chained assignment (deprecated in pandas 2.x, no-op under copy-on-write).
df["meal"] = df["meal"].replace({"Undefined": "SC"})

In [None]:
df.market_segment.value_counts()

In [None]:
# Remove the rows whose market_segment is the placeholder value "Undefined".
undefined_segment_rows = df.index[df["market_segment"] == "Undefined"]
df.drop(index=undefined_segment_rows, inplace=True)

In [None]:
df.distribution_channel.value_counts()

In [None]:
# Remove the rows whose distribution_channel is the placeholder value "Undefined".
undefined_channel_rows = df.index[df["distribution_channel"] == "Undefined"]
df.drop(index=undefined_channel_rows, inplace=True)

**Removing Outliers**
`adr` stands for average daily rate. Its descriptive statistics show a negative minimum value, which is most likely an error (or at least unjustified), so we drop it. A distribution plot shows that the maximum value is also an outlier, so we drop it as well.
Form a new column `price` by multiplying `adr` by the number of paying guests.

In [None]:
df.adr.describe()

In [None]:
# A negative average daily rate is not a valid price, so drop every row with adr < 0
# instead of matching one specific float value with ==.
df.drop(df[df["adr"] < 0].index, inplace=True)

In [None]:
# Remove the row(s) whose adr equals the extreme value 5400.
extreme_adr_rows = df.index[df["adr"] == 5400]
df.drop(index=extreme_adr_rows, inplace=True)

In [None]:
df.adr.describe()

In [None]:
df.reservation_status.value_counts()

In [None]:
df.is_canceled.value_counts()

It is intriguing that `reservation_status` and `is_canceled` carry the same information, so we can drop either of the two columns.

In [None]:
# Remove the reservation_status column from the working frame.
df.drop(columns=["reservation_status"], inplace=True)

In [None]:
# Remove the reservation_status_date column from the working frame.
df.drop(columns=["reservation_status_date"], inplace=True)

In [None]:
# price = average daily rate multiplied by the number of paying guests.
df["price"] = df["adr"] * df["paying_guests"]

In [None]:
# Remove the adr column from the working frame.
df.drop(columns=["adr"], inplace=True)

In [None]:
df.info()

In [None]:
df.describe()

**Heatmap for Correlation**

In [None]:
# Correlate the numeric columns only: the frame still contains text columns at this point
# (they are encoded in a later cell), and DataFrame.corr raises on non-numeric data in
# pandas >= 2.0 instead of silently skipping it.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(corr,
           xticklabels = corr.columns,
           yticklabels = corr.columns)

**Label encode the categorical columns**

In [None]:
# Turn every text column into integer codes with one LabelEncoder per column.
from sklearn import preprocessing
from collections import defaultdict

# Encoders are created lazily, keyed by column name, the first time a column is seen.
d = defaultdict(preprocessing.LabelEncoder)

# Fit each encoder on its column; missing values are encoded as the literal 'NA'.
text_columns = df.select_dtypes(include=['object']).fillna('NA')
fit = text_columns.apply(lambda column: d[column.name].fit_transform(column))

# Overwrite each encoded column in df with its integer codes.
for column_name in list(d):
    df[column_name] = d[column_name].transform(df[column_name].fillna('NA'))

In [None]:
# Predictors are every column except the target; the target is is_canceled.
feature_names = df.columns.difference(['is_canceled'])
features = df[feature_names]
labels = df['is_canceled']

# Feature Selection
**Weight of Evidence and Information Value**
(reference from Sundar Balkrishnan's github repository)

In [None]:
import pandas as pd
import numpy as np
import pandas.core.algorithms as algos
from pandas import Series
# Import the public scipy.stats module; `scipy.stats.stats` is a deprecated private namespace.
from scipy import stats
import re
import traceback
import string

max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):
    """Bin a numeric feature into quantile buckets and compute WOE / IV per bucket.

    Starting from ``n`` quantile buckets, the bucket count is reduced one at a time until
    the Spearman correlation between the per-bucket means of X and Y reaches +/-1 (or
    becomes NaN). If only a single bucket remains, ``force_bin`` quantile edges are used
    with ``pd.cut`` instead. Rows where X is missing are reported as one extra row whose
    MIN_VALUE / MAX_VALUE are NaN.

    Parameters
    ----------
    Y : array-like
        Event indicator aligned with X; it is summed to count events.
    X : array-like
        Numeric feature values; may contain missing values.
    n : int
        Initial number of quantile buckets to try.

    Returns
    -------
    pandas.DataFrame
        One row per bucket with columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT,
        EVENT_RATE, NONEVENT, NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV.
        Infinite values are replaced by 0 and IV holds the same total on every row.

    Raises
    ------
    ValueError
        If no quantile bucketing could be built at all (the previous implementation
        looped forever in that situation).
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]

    r = 0
    d2 = None
    # Stop once n drops below 1: no bucketing exists for fewer than one bucket.
    while np.abs(r) < 1 and n >= 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            # observed=False is spelled out to keep the historical grouping behaviour.
            d2 = d1.groupby('Bucket', as_index=True, observed=False)
            r, _p = stats.spearmanr(d2["X"].mean(), d2["Y"].mean())
        except Exception:
            # Best effort: e.g. qcut fails on duplicate bin edges; retry with fewer buckets.
            pass
        n = n - 1

    if d2 is None:
        raise ValueError("mono_bin could not build any quantile bucket for X")

    if len(d2) == 1:
        n = force_bin
        # np.quantile replaces the private pandas.core.algorithms.quantile helper.
        bins = np.quantile(notmiss.X.to_numpy(dtype=float), np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)})
        d2 = d1.groupby('Bucket', as_index=True, observed=False)

    counts = d2["Y"].count()
    events = d2["Y"].sum()
    d3 = pd.DataFrame({
        "MIN_VALUE": d2["X"].min(),
        "MAX_VALUE": d2["X"].max(),
        "COUNT": counts,
        "EVENT": events,
        "NONEVENT": counts - events,
    }).reset_index(drop=True)

    if len(justmiss.index) > 0:
        # Extra row describing the observations where X is missing.
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_event,
            "NONEVENT": miss_count - miss_event,
        }, index=[0])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # Buckets with zero events or zero non-events give +/-inf; treat those as 0.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()

    return(d3)

def char_bin(Y, X):
    """Compute WOE / IV for a categorical feature, one row per distinct value.

    Each distinct non-missing value of X forms its own bin (MIN_VALUE == MAX_VALUE == the
    value). Rows where X is missing are reported as one extra row whose MIN_VALUE /
    MAX_VALUE are NaN.

    Parameters
    ----------
    Y : array-like
        Event indicator aligned with X; it is summed to count events.
    X : array-like
        Feature values treated as categories; may contain missing values.

    Returns
    -------
    pandas.DataFrame
        Columns VAR_NAME, MIN_VALUE, MAX_VALUE, COUNT, EVENT, EVENT_RATE, NONEVENT,
        NON_EVENT_RATE, DIST_EVENT, DIST_NON_EVENT, WOE, IV. Infinite values are replaced
        by 0 and IV holds the same total on every row.
    """
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    by_value = notmiss.groupby('X',as_index=True)["Y"]

    counts = by_value.count()
    events = by_value.sum()
    d3 = pd.DataFrame({
        "COUNT": counts,
        "MIN_VALUE": counts.index,
        "MAX_VALUE": counts.index,
        "EVENT": events,
        "NONEVENT": counts - events,
    })

    if len(justmiss.index) > 0:
        # Extra row describing the observations where X is missing.
        miss_count = justmiss.Y.count()
        miss_event = justmiss.Y.sum()
        d4 = pd.DataFrame({
            "MIN_VALUE": np.nan,
            "MAX_VALUE": np.nan,
            "COUNT": miss_count,
            "EVENT": miss_event,
            "NONEVENT": miss_count - miss_event,
        }, index=[0])
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        d3 = pd.concat([d3, d4], ignore_index=True)

    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    # Sum the numeric columns directly rather than summing the whole frame, which would
    # also try to add up the text values held in MIN_VALUE / MAX_VALUE.
    d3["DIST_EVENT"] = d3.EVENT/d3.EVENT.sum()
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.NONEVENT.sum()
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]
    # Values with zero events or zero non-events give +/-inf; treat those as 0.
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)

    return(d3)

def data_vars(df1, target):
    """Compute the WOE / IV table for every column of ``df1`` against ``target``.

    Numeric columns with more than two distinct values are binned with ``mono_bin``;
    all other columns are handled by ``char_bin``.

    The column to skip is taken from ``target.name`` (exact match). The previous
    implementation recovered the name by regex-parsing the caller's source line from the
    call stack and skipped columns by substring match, which breaks for multi-line calls
    and for columns whose name is contained in the target's name. If ``target`` has no
    ``name`` attribute, no column is skipped.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Candidate feature columns (it may also contain the target column).
    target : pandas.Series or array-like
        Event indicator aligned with the rows of ``df1``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``iv_df`` holds the per-bin rows of every feature; ``iv`` has one row per feature
        with columns VAR_NAME and IV.

    Raises
    ------
    ValueError
        If ``df1`` contains no column other than the target.
    """
    target_name = getattr(target, "name", None)

    per_feature = []
    for col in df1.columns:
        if target_name is not None and col == target_name:
            continue
        values = df1[col]
        is_continuous = pd.api.types.is_numeric_dtype(values) and len(values.unique()) > 2
        conv = mono_bin(target, values) if is_continuous else char_bin(target, values)
        conv["VAR_NAME"] = col
        per_feature.append(conv)

    if not per_feature:
        raise ValueError("data_vars found no feature column to evaluate")

    # Concatenate once; DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    iv_df = pd.concat(per_feature, ignore_index=True)

    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [None]:
# Build the per-bin WOE table (final_iv) and the per-feature IV summary (IV) for all predictors.
final_iv, IV = data_vars(df[df.columns.difference(["is_canceled"])],df.is_canceled)

In [None]:
final_iv

In [None]:
# Rename the feature-name column to 'index', then show features by descending IV.
IV = IV.rename(columns={'VAR_NAME': 'index'})
IV.sort_values(by='IV', ascending=False)

In [None]:
# Columns to transform to WOE: every column except the target.
transform_vars_list = df.columns.difference(['is_canceled'])
transform_prefix = 'new_' # leave this value blank if you need replace the original column values

In [None]:
transform_vars_list

In [None]:
# Replace every value of each feature by the WOE of the bin it falls into and store the
# result in a column named transform_prefix + <feature>.
# This is a direct lookup instead of building Python source text and calling eval() on it
# for every row behind a bare `except`.
for var in transform_vars_list:
    small_df = final_iv[final_iv['VAR_NAME'] == var]
    # Bin upper bound -> WOE. The bin for missing values has a NaN bound and cannot be
    # matched by value, so it is left out (such rows receive 0, like any unmatched value).
    woe_by_bound = {
        bound: woe
        for bound, woe in zip(small_df.MAX_VALUE, small_df.WOE)
        if pd.notna(bound)
    }
    if not woe_by_bound:
        continue

    if pd.api.types.is_numeric_dtype(df[var]):
        # Numeric feature: take the WOE of the first bin whose upper bound is >= x,
        # or 0 when x lies above every bound (or is NaN).
        bounds = sorted(woe_by_bound)
        woes = np.array([woe_by_bound[bound] for bound in bounds] + [0.0])
        positions = np.searchsorted(
            np.asarray(bounds, dtype=float),
            df[var].to_numpy(dtype=float),
            side='left',
        )
        df[transform_prefix + var] = woes[positions]
    else:
        # Text feature: exact match on the category label, 0 when there is no match.
        woe_by_label = {str(bound): woe for bound, woe in woe_by_bound.items()}
        df[transform_prefix + var] = df[var].map(woe_by_label).fillna(0)

In [None]:
df.head()

**Random Forest Classifier for feature selection**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fix the seed so the importance ranking is reproducible across runs.
clf = RandomForestClassifier(random_state=42)

clf.fit(features,labels)

preds = clf.predict(features)

from sklearn.metrics import accuracy_score
# Accuracy on the data the forest was fitted on (not a held-out estimate).
# accuracy_score takes (y_true, y_pred).
accuracy = accuracy_score(labels, preds)
print(accuracy)

In [None]:
from pandas import DataFrame
# One row per feature holding its random-forest importance in column "RF".
VI = DataFrame({"RF": clf.feature_importances_}, index=features.columns)

In [None]:
# Move the feature names into an 'index' column, then show features by descending importance.
VI = VI.reset_index()
VI.sort_values(by='RF', ascending=False)

**Recursive Feature Elimination for feature selection**

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with a logistic regression as the ranking estimator,
# configured to keep 20 features.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select = 20)
fit = rfe.fit(features, labels)


In [None]:
from pandas import DataFrame
# One row per feature; column "RFE" is True when RFE kept the feature.
Selected = DataFrame({"RFE": rfe.support_}, index=features.columns).reset_index()

In [None]:
Selected[Selected["RFE"] == True]

**Extra Trees Classifier for feature selection**

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fix the seed so the importance ranking is reproducible across runs.
model = ExtraTreesClassifier(random_state=42)
model.fit(features, labels)

print(model.feature_importances_)

In [None]:
from pandas import DataFrame
# One row per feature holding its extra-trees importance in column "Extratrees".
FI = DataFrame({"Extratrees": model.feature_importances_}, index=features.columns)

In [None]:
FI = FI.reset_index()

In [None]:
FI.sort_values(["Extratrees"], ascending = False)

**Chi2 Test**

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score every feature with the chi-squared statistic (k=5 best are flagged).
# The features are passed as absolute values, presumably because chi2 rejects negative input.
model = SelectKBest(score_func=chi2, k=5)
fit = model.fit(features.abs(), labels)

In [None]:
from pandas import DataFrame
# Note: this changes the float display format globally for the rest of the notebook.
pd.options.display.float_format = '{:.2f}'.format
# One row per feature holding its chi-squared score in column "Chi_Square".
chi_sq = DataFrame(fit.scores_, columns = ["Chi_Square"], index=features.columns)


In [None]:
chi_sq = chi_sq.reset_index()

In [None]:
chi_sq.sort_values('Chi_Square',ascending=0)

**L1 for Feature Selection**

In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
# Fit an L1-penalised linear SVM and wrap the already-fitted model in a selector.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(features, labels)
model = SelectFromModel(lsvc,prefit=True)

In [None]:
from pandas import DataFrame
# One row per feature; column "L1" is True when the L1 selector kept the feature.
l1 = DataFrame({"L1": model.get_support()}, index=features.columns).reset_index()

In [None]:
l1[l1['L1'] == True]

**Combine all**

In [None]:
from functools import reduce
dfs = [IV, VI, Selected, FI, chi_sq, l1]
# Inner-join all ranking tables on the shared feature-name column 'index', left to right.
final_results = dfs[0]
for ranking in dfs[1:]:
    final_results = pd.merge(final_results, ranking, on='index')

In [None]:
columns = ['IV', 'RF', 'Extratrees', 'Chi_Square']

# Start from the feature names, then add one 0/1 vote column per selection method.
score_table = pd.DataFrame({'index': final_results['index']})

# Score-based methods vote 1 for their five highest-scoring features.
for metric in columns:
    top_five = final_results.nlargest(5, metric)['index'].tolist()
    score_table[metric] = final_results['index'].isin(top_five).astype(int)

# RFE and L1 already produce a boolean keep/drop flag per feature.
for flag in ('RFE', 'L1'):
    score_table[flag] = final_results[flag].astype(int)

In [None]:
# Add up the 0/1 vote columns only. The 'index' column holds feature names (summing it
# raises a TypeError in pandas >= 2.0), and leaving out 'final_score' keeps this cell
# idempotent when it is re-run.
vote_columns = score_table.columns.difference(['index', 'final_score'])
score_table['final_score'] = score_table[vote_columns].sum(axis=1)

In [None]:
score_table.sort_values('final_score',ascending=0)

final table for importances of various features is above

In [None]:
x = df.lead_time
y = df.is_canceled
area = np.pi*3

# Plot, labelling the axes with the plotted columns instead of the placeholders 'x' / 'y'.
plt.scatter(x, y, s=area, alpha=0.5)
plt.title('Scatter plot lead_time vs cancellation')
plt.xlabel('lead_time')
plt.ylabel('is_canceled')
plt.show()

In [None]:
x = df.deposit_type
y = df.is_canceled
area = np.pi*3

# Plot, labelling the axes with the plotted columns instead of the placeholders 'x' / 'y'.
plt.scatter(x, y, s=area, alpha=0.5)
plt.title('Scatter plot deposit_type vs cancellation')
plt.xlabel('deposit_type')
plt.ylabel('is_canceled')
plt.show()

In [None]:
x = df.country
y = df.is_canceled
area = np.pi*3

# Plot, labelling the axes with the plotted columns instead of the placeholders 'x' / 'y'.
plt.scatter(x, y, s=area, alpha=0.5)
plt.title('Scatter plot country vs cancellation')
plt.xlabel('country')
plt.ylabel('is_canceled')
plt.show()

**Multicollinearity check**

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
def calculate_vif(features):
    """Return a frame with one row per column of ``features`` and its VIF.

    The result has two columns: "Features" (the column names) and "VIF" (the variance
    inflation factor of that column, computed against all the other columns).
    """
    design_matrix = features.values
    column_count = features.shape[1]
    scores = [
        variance_inflation_factor(design_matrix, position)
        for position in range(column_count)
    ]
    return pd.DataFrame({"Features": features.columns, "VIF": scores})

In [None]:
# Keep only the features that received at least two votes in the score table.
voted_features = score_table.loc[score_table['final_score'] >= 2, 'index'].tolist()
features = features[voted_features]

In [None]:
vif = calculate_vif(features)
# Drop the feature with the highest VIF, one at a time, until every VIF is <= 10.
# Stop at a single remaining column so the loop can never prune the frame to nothing.
while (vif['VIF'] > 10).any() and features.shape[1] > 1:
    worst_feature = vif.sort_values('VIF', ascending=False)['Features'].iloc[0]
    # Rebind instead of dropping in place: `features` was derived by column selection,
    # so an in-place drop triggers SettingWithCopyWarning and depends on copy semantics.
    features = features.drop(columns=[worst_feature])
    vif = calculate_vif(features)

In [None]:
list(vif['Features'])

**Make a new dataframe with only the relevant features to end the curse of dimensionality**

In [None]:
# Surviving feature names followed by the target column, which therefore comes last.
final_vars = [*vif['Features'], "is_canceled"]

In [None]:
# Modelling frame: the selected columns plus the target, with any missing value set to 0.
df1 = df[final_vars].fillna(0)

In [None]:
df1.describe()

In [None]:
# Colours for the bars and for the percentage labels drawn above them.
bar_color = '#058caa'
num_color = '#ed8549'

# Recompute the per-bin WOE table on the reduced frame and exclude any rows for the target.
final_iv,_ = data_vars(df1,df1['is_canceled'])
final_iv = final_iv[(final_iv.VAR_NAME != 'is_canceled')]
grouped = final_iv.groupby(['VAR_NAME'])
# One bar chart per feature: bin lower bound (MIN_VALUE) on the x axis, EVENT_RATE as bar height.
for key, group in grouped:
    ax = group.plot('MIN_VALUE','EVENT_RATE',kind='bar',color=bar_color,linewidth=1.0,edgecolor=['black'])
    ax.set_title(str(key) + " vs " + str('is_canceled'))
    ax.set_xlabel(key)
    ax.set_ylabel(str('is_canceled') + " %")
    rects = ax.patches
    # Write each bar's height as a percentage just above the bar.
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., 1.01*height, str(round(height*100,1)) + '%', 
                ha='center', va='bottom', color=num_color, fontweight='bold')

# Model Building

In [None]:
# Predictors are every column but the last; the last column is the target.
x, y = df1.iloc[:, :-1], df1.iloc[:, -1]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing. The seed is fixed (matching the random_state used
# by the hyper-parameter search) so the reported scores are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and apply the same scaling to the test split.
sc_X = StandardScaler()
x_train = sc_X.fit_transform(x_train)
x_test = sc_X.transform(x_test)
# The target is not scaled since it is already a categorical dependent variable.
# (Previously written as a triple-quoted string, which showed up as the cell's output.)
# sc_y = StandardScaler()
# y_train = sc_y.fit_transform(y_train)


**Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the scaled training split.
classifier = LogisticRegression() #classifier is the object of logistic reg class
classifier.fit(x_train, y_train)


In [None]:
pred_train = classifier.predict(x_train)
pred_test = classifier.predict(x_test)

In [None]:
from sklearn.metrics import confusion_matrix 
# Confusion matrix on the test split; rows are actual classes, columns are predicted classes.
cm = confusion_matrix(y_test, pred_test)                                           
cm

Not great distinction of true positives and true negatives

In [None]:
# Compare actual and predicted training labels by position. y_train keeps the shuffled row
# labels of the original frame, whereas pd.Series(pred_train) would get a fresh 0..n-1
# index, so crosstab would align on index and pair up the wrong rows.
pd.crosstab(np.asarray(y_train), np.asarray(pred_train), rownames=['ACTUAL'], colnames=['PRED'])

In [None]:
from sklearn.metrics import accuracy_score
# Share of correct predictions on the training split and on the test split.
accuracy_train, accuracy_test = (
    accuracy_score(predicted, actual)
    for predicted, actual in ((pred_train, y_train), (pred_test, y_test))
)

print(accuracy_train,accuracy_test)

good score but we can check for better and worse

In [None]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score

# Fit a Gaussian naive Bayes model and predict both splits.
classifier = GaussianNB()
classifier.fit(x_train, y_train)
pred_train, pred_test = classifier.predict(x_train), classifier.predict(x_test)

# Share of correct predictions on each split.
accuracy_train = accuracy_score(pred_train, y_train)
accuracy_test = accuracy_score(pred_test, y_test)
print(accuracy_train,accuracy_test)

Terrible score! Naive Bayes is clearly a poor fit for this data, so we drop it.

**Random Forest Classifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fix the seed so the reported scores are reproducible across runs.
classifier = RandomForestClassifier(random_state=42)

classifier.fit(x_train,y_train)

pred_train = classifier.predict(x_train)
pred_test = classifier.predict(x_test)

from sklearn.metrics import accuracy_score
# accuracy_score takes (y_true, y_pred).
accuracy_train = accuracy_score(y_train, pred_train)
accuracy_test = accuracy_score(y_test, pred_test)

print(accuracy_train,accuracy_test)

wuhooo!! good score..
let's check the confusion matrix

In [None]:
# Scatter of actual test labels (x axis) against predicted test labels (y axis).
plt.scatter(y_test, pred_test, alpha = 0.5)
plt.xlabel("y_test")
plt.ylabel("pred_test")
plt.show()

In [None]:
cm = confusion_matrix(y_test, pred_test)                                           
cm

Better than Logistic Regression

# Tuning of model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Candidate values for each hyper-parameter of the random forest.
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 500, num = 10)]
# 'auto' was removed in scikit-learn 1.3 (for classifiers it meant 'sqrt'); use valid options.
max_features = ['sqrt', 'log2']
# num=1 produced the single depth 3; num=8 gives the intended range 3, 4, ..., 10.
max_depth = [int(x) for x in np.linspace(3, 10, num = 8)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Seed the base estimator too, so candidate scores do not vary between runs.
rf = RandomForestClassifier(random_state=42)

# Sample 10 parameter combinations and score each with 2-fold cross-validation.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(x_train, y_train)

print(rf_random.best_params_)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Refit a random forest using the best parameter combination found by the search.
best_params = rf_random.best_params_
classifier = RandomForestClassifier(**best_params)
classifier.fit(x_train, y_train)

pred_train, pred_test = classifier.predict(x_train), classifier.predict(x_test)

# Share of correct predictions on each split.
accuracy_train = accuracy_score(pred_train, y_train)
accuracy_test = accuracy_score(pred_test, y_test)

In [None]:
print(accuracy_train, accuracy_test)

**Final scores look fine. With this we can expect good prediction model**