# **Predicting whether a person booking a hotel will cancel their booking**

# **Importing Libraries**

In [None]:
import pandas as pd
import pickle

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Load the raw hotel-bookings CSV into the module-level `df`.
# NOTE(review): `df` is rebound further down by `df = cleananomalies()`, which
# reads the same default path again, so this load looks redundant -- confirm
# nothing in between relies on the raw frame before removing it.
df = pd.read_csv("../input/hotel-booking-demand/hotel_bookings.csv", sep=',')

# **Defining helper functions to clean the meal column and to split the reservation status date into year, month and day**

In [None]:
def meal_parser(x):
    """Return the part of a meal code that precedes its first space.

    The sentinel value ``'Undefined'`` is handed back unchanged. Any other
    value is split on single spaces and the first piece is returned, so
    trailing padding such as ``'SC      '`` collapses to ``'SC'``.
    """
    if x == 'Undefined':
        return 'Undefined'

    first_piece, *_ = x.split(' ')
    return first_piece

def clean_RSD(x, dt='d'):
    """Extract the year, month or day field from a ``YYYY-MM-DD`` date string.

    Args:
        x: Either a date string such as ``"2015-07-01"`` or a non-string
            sequence whose first element is such a string.
        dt: Which field to return: ``'y'`` (year), ``'m'`` (month) or
            ``'d'`` (day, the default).

    Returns:
        The selected field as a string, exactly as written in the input
        (for example ``"07"``, not ``7``).

    Raises:
        ValueError: If ``dt`` is not one of ``'y'``, ``'m'`` or ``'d'``.
        IndexError: If the date string has fewer ``-``-separated fields than
            the requested one.
    """
    field_index = {'y': 0, 'm': 1, 'd': 2}
    if dt not in field_index:
        raise ValueError(
            "dt must be one of 'y', 'm' or 'd', got {!r}".format(dt)
        )

    # A str is itself iterable, so "take the first element" must only be
    # applied to non-string containers; otherwise the first *character* of the
    # date would be used and the year of "2015-07-01" would come back as "2".
    date_text = x if isinstance(x, str) else list(x)[0]
    return date_text.split('-')[field_index[dt]]

# **Creating custom function to clean anomalies in data**

In [None]:
def cleananomalies(filepath='../input/hotel-booking-demand/hotel_bookings.csv'):
    """Load the hotel-bookings CSV and return a trimmed, encoded DataFrame.

    Processing steps:

    * ``reservation_status_date`` is parsed as a date and replaced by three
      integer columns: ``ReservationStatusDate_year``,
      ``ReservationStatusDate_month`` and ``ReservationStatusDate_day``.
    * ``arrival_date_month`` is converted from an English month name to its
      month number (1-12).
    * ``reservation_status`` is one-hot encoded with ``pandas.get_dummies``
      (one column per distinct status value) and the original column removed.
    * Columns not used downstream are dropped: ``company``, ``hotel``,
      ``meal``, ``customer_type``, ``distribution_channel``,
      ``market_segment``, ``country``, ``reserved_room_type``,
      ``assigned_room_type``, ``deposit_type``, ``agent`` and ``children``.

    Args:
        filepath: Location of the CSV; passed straight to ``pandas.read_csv``.

    Returns:
        The cleaned DataFrame. Surviving input columns keep their original
        order, followed by the three date-part columns and then the
        reservation-status indicator columns.

    Raises:
        KeyError: If any of the expected columns is missing from the CSV.

    Note:
        An unrecognised month name or a missing/unparseable status date makes
        the integer conversion fail instead of silently producing NaN.
    """
    df = pd.read_csv(filepath, sep=',')

    # Split reservation_status_date through the datetime accessor rather than
    # by slicing its string form; this yields the full four-digit year.
    status_date = pd.to_datetime(df["reservation_status_date"])
    df["ReservationStatusDate_year"] = status_date.dt.year.astype(int)
    df["ReservationStatusDate_month"] = status_date.dt.month.astype(int)
    df["ReservationStatusDate_day"] = status_date.dt.day.astype(int)

    # Month name -> month number.
    month_to_num = {
        'January': 1,
        'February': 2,
        'March': 3,
        'April': 4,
        'May': 5,
        'June': 6,
        'July': 7,
        'August': 8,
        'September': 9,
        'October': 10,
        'November': 11,
        'December': 12,
    }
    df["arrival_date_month"] = df["arrival_date_month"].map(month_to_num).astype(int)

    # One indicator column per reservation status, appended after the date parts.
    df = pd.concat([df, pd.get_dummies(df["reservation_status"])], axis=1)

    # Everything below is discarded without being encoded. The meal,
    # customer_type, distribution_channel and market_segment dummies were never
    # attached to the frame, so those columns are simply dropped.
    unused_cols = [
        'reservation_status_date',
        'company',
        'hotel',
        'meal',
        'reservation_status',
        'customer_type',
        'distribution_channel',
        'market_segment',
        'country',
        'reserved_room_type',
        'assigned_room_type',
        'deposit_type',
        'agent',
        'children',
    ]
    df = df.drop(columns=unused_cols)

    return df


# **Using Random Forest Classification to predict which person is more likely to cancel their booking**

In [None]:
def RandomForestClassifierImpl(X_train, X_test, y_train, y_test,
                               n_estimators=1000, random_state=None):
    """Fit a random forest, report test metrics and return the predictions.

    Args:
        X_train: Training features.
        X_test: Test features; a copy of these is returned with the
            predictions attached.
        y_train: Training labels.
        y_test: True labels for ``X_test``, used only for the printed metrics.
        n_estimators: Number of trees in the forest (default 1000, the value
            that used to be hard-coded).
        random_state: Seed forwarded to ``RandomForestClassifier``; leave as
            ``None`` for the previous unseeded behaviour, or pass an int to
            make runs repeatable.

    Returns:
        A new DataFrame holding the ``X_test`` data plus a ``'Predictions'``
        column with the predicted label for each row. ``X_test`` itself is
        not modified.

    Side effects:
        Prints the accuracy score and then the mean squared error of the
        predictions against ``y_test``.
    """
    rfc = RandomForestClassifier(n_estimators=n_estimators,
                                 random_state=random_state)
    rfc.fit(X_train, y_train)

    # predicting values
    pred = rfc.predict(X_test)

    print(accuracy_score(y_test, pred))
    # NOTE(review): for 0/1 labels this is just the misclassification rate
    # (1 - accuracy); kept so the printed output is unchanged.
    print(mean_squared_error(y_test, pred))

    # DataFrame.append was removed in pandas 2.0; copying gives the same
    # independent frame without touching the caller's X_test.
    df1 = pd.DataFrame(X_test).copy()
    df1['Predictions'] = pred

    return df1

In [None]:
# Run the cleaning pipeline on the default CSV path and rebind the
# module-level `df` to the cleaned frame it returns.
df = cleananomalies()

In [None]:
# Label: the 'Canceled' indicator produced by one-hot encoding reservation_status
y = df["Canceled"]

# Features: remove the label and the columns that restate the booking outcome.
X = df.drop(['is_canceled', 'Canceled', 'Check-Out'], axis=1)
# 'No-Show' is the remaining reservation_status indicator in the Kaggle
# hotel-booking-demand data. It is a sibling one-hot of the label (a no-show
# row is never 'Canceled'), so leaving it in the features leaks the target.
# errors='ignore' keeps this working on data where that status never occurs.
X = X.drop(columns=['No-Show'], errors='ignore')

# Splitting the dataset into Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y)

df1 = RandomForestClassifierImpl(X_train, X_test, y_train, y_test)

In [None]:
# Preview the first rows of the test features with their 'Predictions' column.
df1.head()

In [None]:
# Write the test-set features and predictions to ./export.csv;
# index=False leaves the DataFrame index out of the file.
df1.to_csv("./export.csv", index=False)