## Introduction

> This data set contains booking information for a city hotel and a resort hotel, and includes information such as when the booking was made, length of stay, the number of adults, children, and/or babies, and the number of available parking spaces, among other things.

> Our analysis examines how the time between making a booking and the reserved stay, along with other factors, affects the reservation price. We then extend the price analysis to find the combination of independent variables that minimizes the dependent variable, which in this case is the booking price.

important commands
> - Shift + Enter (executes cell)
> - Alt + Enter (executes and creates new cell)
> - Esc + Shift + Up/Down (expands selection)
> - Up/Down (moves cells up/down)

> Esc + M (converts cell to markup)
> Esc + Y (converts cell to code)
> Esc + H (shows all commands)

> Underscore twice (converts to bold)
> Asterisk once (converts to italics)


In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoLars
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

## Model Analysis

Apply model analysis to prepared data from Data Prep notebook.

# is_canceled Prediction


In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

# Load the train/test feature frames and the matching target series from
# pickle files in the working directory.
# NOTE(review): the names suggest normalized, PCA-reduced features, presumably
# written by the Data Prep notebook -- confirm. Only unpickle files from a
# trusted source: loading a pickle can execute arbitrary code.
X_train_norm_clf_pca = pd.read_pickle('X_train_norm_clf_pca.pkl')
X_test_norm_clf_pca = pd.read_pickle('X_test_norm_clf_pca.pkl')
y_train_clf = pd.read_pickle('y_train_clf.pkl')
y_test_clf = pd.read_pickle('y_test_clf.pkl')
# Not the last expression of the cell, so a notebook shows nothing for it.
X_train_norm_clf_pca.head()

# Preview of the test labels (the is_canceled series); this is what the cell displays.
y_test_clf.head()


118260    0
40600     1
78329     0
62276     1
81700     0
Name: is_canceled, dtype: int64

In [4]:
def calc_metrics(model, X, y):
    """Score a fitted model's predictions on ``X`` against the labels ``y``.

    Returns a list ordered as
    ``[accuracy, precision, recall, f1, roc_auc]``. Every metric, including
    ROC AUC, is computed from the output of ``model.predict``.
    """
    predictions = model.predict(X)
    scorers = (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
    results = []
    for scorer in scorers:
        results.append(scorer(y, predictions))
    return results

def plot_precision_recall_vs_threshold(y, y_pred):
    """Plot precision and recall against the decision threshold.

    Parameters:
        y: true labels.
        y_pred: predicted scores (or labels) handed to
            ``precision_recall_curve`` as its second argument.
    """
    # Create the figure BEFORE drawing. Calling plt.figure() after the plot
    # calls opens a new, empty figure and leaves the curves on whatever
    # figure was current, so the requested size was never applied to them.
    plt.figure(figsize=(8, 4))
    precisions, recalls, thresholds = precision_recall_curve(y, y_pred)
    # precisions/recalls carry one more entry than thresholds; drop the last
    # point so the x and y arrays have equal length.
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

    
def plot_roc_curve(y, y_pred, label=None):
    """Plot the ROC curve together with the chance diagonal.

    Parameters:
        y: true labels.
        y_pred: predicted scores (or labels) handed to ``roc_curve``.
        label: optional legend entry for the curve; a legend is drawn only
            when a label is given.
    """
    # roc_curve is not imported by the notebook's import cells, so the
    # original call raised NameError; import it locally.
    from sklearn.metrics import roc_curve

    # Create the figure BEFORE drawing so the size applies to this plot
    # instead of opening an empty figure afterwards.
    plt.figure(figsize=(8, 6))
    fpr, tpr, _thresholds = roc_curve(y, y_pred)
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)
    if label is not None:
        # Without this the label passed to plt.plot is never shown.
        plt.legend(loc="lower right", fontsize=16)


In [5]:
# k-nearest-neighbours classifier and the parameter grid searched for it.
KNNclf = KNeighborsClassifier(n_neighbors=2)
KNNclf_params = {"weights": ["uniform", "distance"], "n_neighbors": [2, 5, 10]}

# Random-forest classifier and its parameter grid.
# bootstrap must be a real boolean: the string "true" is only accidentally
# truthy and is rejected by scikit-learn's parameter validation.
forestclf = RandomForestClassifier(n_estimators=10, bootstrap=True, random_state=42)
# Spelled `forestclf_params` because that is the name the grid-search cell
# uses; the earlier misspelling made that cell fail with NameError.
forestclf_params = {"criterion": ["gini"], "n_estimators": [10, 20, 50]}
# Misspelled legacy name kept as an alias so any existing reference still works.
forerstclf_params = forestclf_params

# Disabled generic loop over (estimator, param_grid) pairs. `models` is
# commented out and the loop itself lives in a bare string literal, so none of
# it executes; the cells below run each grid search explicitly instead.
# NOTE(review): the plotting call inside the string passes (y_pred, y_test_clf),
# the reverse of plot_precision_recall_vs_threshold(y, y_pred) -- fix the
# order if this loop is ever re-enabled.
#models = [(KNNclf, KNNclf_params)]

"""
for model in models:
    model_name = str(model[0])
    print(model_name)
    grid = GridSearchCV(estimator = model[0], param_grid = model[1], cv = 3)
    model = grid.fit(X_train_norm_clf_pca, y_train_clf)
    y_pred = grid.predict(X_test_norm_clf_pca)
    plot_precision_recall_vs_threshold(y_pred, y_test_clf)
    print('parameters: ', grid.best_params_)"""

"\nfor model in models:\n    model_name = str(model[0])\n    print(model_name)\n    grid = GridSearchCV(estimator = model[0], param_grid = model[1], cv = 3)\n    model = grid.fit(X_train_norm_clf_pca, y_train_clf)\n    y_pred = grid.predict(X_test_norm_clf_pca)\n    plot_precision_recall_vs_threshold(y_pred, y_test_clf)\n    print('parameters: ', grid.best_params_)"

In [None]:
# Tune the KNN classifier with a 3-fold cross-validated grid search, then
# predict on the test features.
grid = GridSearchCV(estimator=KNNclf, param_grid=KNNclf_params, cv=3)
grid.fit(X_train_norm_clf_pca, y_train_clf)
y_pred = grid.predict(X_test_norm_clf_pca)
# NOTE: y_pred holds hard class labels, so both curves have very few points.
# Passing grid.predict_proba(...)[:, 1] instead would give full curves.
plot_precision_recall_vs_threshold(y_test_clf, y_pred)
# plot_roc_curve requires both the true labels and the predictions; the
# original call omitted the predictions and raised TypeError.
plot_roc_curve(y_test_clf, y_pred)

In [None]:
# Parameter combination selected by the most recently fitted grid search.
grid.best_params_

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true
# classes, columns are predicted classes. The arguments were previously
# swapped, which transposed the matrix (false positives <-> false negatives).
confusion_matrix(y_test_clf, y_pred)

In [None]:
# Random forest: 3-fold cross-validated search over forestclf_params,
# followed by predictions on the test features and a precision/recall plot.
grid = GridSearchCV(
    estimator=forestclf,
    param_grid=forestclf_params,
    cv=3,
)
grid.fit(X_train_norm_clf_pca, y_train_clf)
y_pred = grid.predict(X_test_norm_clf_pca)
plot_precision_recall_vs_threshold(y_test_clf, y_pred)

In [None]:
# best_params_ is an attribute of the fitted GridSearchCV object; the original
# `grid_best_params_` was an undefined name and raised NameError.
grid.best_params_

In [None]:
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are true
# classes, columns are predicted classes. The arguments were previously
# swapped, which transposed the matrix (false positives <-> false negatives).
confusion_matrix(y_test_clf, y_pred)