In [None]:
!pip install pycaret

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import category_encoders as ce
from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [None]:
df = pd.read_csv("../input/hotel-booking-demand/hotel_bookings.csv")

In [None]:
df

In [None]:
df.isnull().sum()

In [None]:
df["hotel"].value_counts()

In [None]:
def ordinal_encoding(df,col,mapping):
    """Encode one column of ``df`` using an explicit value-to-integer mapping.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of the column to encode.
        mapping: Dict mapping each category value to its integer code.

    Returns:
        The DataFrame returned by ``OrdinalEncoder.fit_transform`` for ``df``.
    """
    column_spec = [{'col': col, 'mapping': mapping}]
    encoder = ce.OrdinalEncoder(cols=col, mapping=column_spec, return_df=True)
    return encoder.fit_transform(df)


In [None]:
df = ordinal_encoding(df,"hotel",{"City Hotel":0,"Resort Hotel":1})

In [None]:
df["lead_time"].value_counts()

In [None]:
df["arrival_date_year"].value_counts()

In [None]:
def one_hot_encoding(df,col):
    """One-hot encode one column of ``df``, naming new columns after the categories.

    Args:
        df: DataFrame that contains ``col``.
        col: Name of the column to expand into indicator columns.

    Returns:
        The DataFrame returned by ``OneHotEncoder.fit_transform`` for ``df``.
    """
    encoder = ce.OneHotEncoder(cols=col, use_cat_names=True, return_df=True)
    return encoder.fit_transform(df)

In [None]:
df = one_hot_encoding(df,"arrival_date_year")

In [None]:
df["arrival_date_month"].value_counts()

In [None]:
df = one_hot_encoding(df,"arrival_date_month")

In [None]:
df["arrival_date_week_number"].value_counts()

In [None]:
df["arrival_date_day_of_month"].value_counts()

In [None]:
df.columns

In [None]:
df["stays_in_weekend_nights"].value_counts()

In [None]:
df["stays_in_week_nights"].value_counts()

In [None]:
df["adults"].value_counts()

In [None]:
df["children"].value_counts()

In [None]:
df["children"] = df["children"].fillna(3.0)

In [None]:
df['babies'].value_counts()

In [None]:
df['meal'].value_counts()

In [None]:
df = one_hot_encoding(df,"meal")

In [None]:
df["country"].value_counts()

In [None]:
# Sorted list of the distinct, non-null country codes. Nulls are dropped
# explicitly rather than popping a hard-coded position (177), which is only
# correct while the stringified NaN happens to sit at that exact index.
country_list = sorted(df["country"].dropna().unique())
country_list

In [None]:
# 488 random integers drawn from [1, 177). This list is not read anywhere in
# the visible cells; it is kept so the module-level name and RNG draws match.
random = [np.random.randint(1,177) for _ in range(488)]
def change(x):
    """Return a randomly chosen entry of ``country_list``.

    Used to replace a missing country value with a random known country.

    Args:
        x: The value being replaced. It is ignored; it exists so the function
            can be passed to ``Series.map``.

    Returns:
        One element of the module-level ``country_list``, chosen uniformly.
    """
    # Derive the range from the list itself so every entry (including index 0)
    # can be selected and the code does not depend on a hard-coded length.
    idx = np.random.randint(0, len(country_list))
    return country_list[idx]


In [None]:
df[df['country'].isnull()!=True]['country']

In [None]:
country_null=df[df['country'].isnull()]['country'].map(lambda x:change(x))
country_null

In [None]:
country_notnull=df[df['country'].isnull()!=True]['country']

In [None]:
country_null_df=pd.DataFrame(country_null)
country_notnull_df=pd.DataFrame(country_notnull)

In [None]:
country_null_df

In [None]:
df_country = pd.concat([country_null_df,country_notnull_df],axis = 0)

In [None]:
df["country"]=df_country.sort_index(axis=0)

In [None]:
def target_mean_encoding(df,col,target):
    """Target-encode one column of ``df`` against the ``target`` column.

    Args:
        df: DataFrame containing both ``col`` and ``target``.
        col: Name of the categorical column to encode.
        target: Name of the column whose values drive the encoding.

    Returns:
        A single-column DataFrame (column name ``col``) with the encoded values.

    Note:
        The encoder is fitted on every row of ``df`` that is passed in, so the
        encoded values reflect the targets of all those rows.
    """
    # Use the ``col`` argument instead of a hard-coded 'country' so the helper
    # works for whichever column the caller asks for.
    target_encoder = ce.TargetEncoder(cols=[col])
    return target_encoder.fit_transform(df[[col]], df[target])

In [None]:
df["country"] = target_mean_encoding(df,"country","is_canceled")

In [None]:
df = one_hot_encoding(df,"market_segment")

In [None]:
df['distribution_channel'].value_counts()

In [None]:
df = one_hot_encoding(df,"distribution_channel")

In [None]:
df = ordinal_encoding(df,"reserved_room_type",{"P":0,"L":1,"H":2,"G":3,"F":4,"E":5,"D":6,"C":7,"B":8,"A":9})

In [None]:
# Encode assigned_room_type with the same letter-to-code mapping used for
# reserved_room_type.
# NOTE(review): any room code present in the data but absent from this mapping
# is presumably encoded with the encoder's unknown-value fallback rather than
# raising — confirm assigned_room_type has no letters beyond those listed here.
df = ordinal_encoding(df,"assigned_room_type",{"P":0,"L":1,"H":2,"G":3,"F":4,"E":5,"D":6,"C":7,"B":8,"A":9})

In [None]:
df["deposit_type"].value_counts()

In [None]:
df = one_hot_encoding(df,"deposit_type")

In [None]:
# Columns excluded from the feature set; they are dropped from df further down.
columns_to_be_removed = ["agent","company","adr"]

In [None]:
df["days_in_waiting_list"].value_counts()

In [None]:
df["customer_type"].value_counts()

In [None]:
df = one_hot_encoding(df,"customer_type")

In [None]:
df["required_car_parking_spaces"].value_counts()

In [None]:
df["total_of_special_requests"].value_counts()

In [None]:
df["reservation_status"].value_counts()

In [None]:
# Per the dataset's published description, reservation_status records the
# booking's final outcome (canceled / checked out / no-show), which determines
# is_canceled directly. Encoding it as a feature would leak the target into
# the model, so the column is dropped instead of one-hot encoded.
df = df.drop(columns=["reservation_status"])

In [None]:
df["reservation_status_date"].value_counts()

In [None]:
# Split the "-"-separated reservation_status_date string into three parts and
# store them as integers; str.split alone leaves them as strings, which would
# reach the model as object columns. The third part overwrites
# reservation_status_date itself, so this cell must only be run once per load.
df[["reservation_status_year", "reservation_status_month", "reservation_status_date"]] = (
    df["reservation_status_date"].str.split("-", expand = True).astype(int)
)

In [None]:
columns_to_be_removed = ['agent', 'company','adr']
df = df.drop(columns_to_be_removed,axis = 1)

In [None]:
# Print the number of null values in each column.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for column_name, null_count in df.isnull().sum().items():
    print(column_name, null_count)

In [None]:
df.to_csv("data_preprocessed.csv",index=False)

In [None]:
# experiment = setup(df,target = "is_canceled",imputation_type="iterative")

In [None]:
# compare_models()

In [None]:
X = df.drop("is_canceled",axis = 1)
Y = df["is_canceled"]

In [None]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [None]:
# Random forest with 100 trees; random_state is fixed so repeated runs train
# the same model.
clf=RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
clf.fit(X_train,Y_train)

In [None]:
Y_pred=clf.predict(X_test)

In [None]:
print("Accuracy:",accuracy_score(Y_test, Y_pred))

In [None]:
# Print the label on its own line so the report's header row stays aligned
# with its columns.
print("Classification Report:", classification_report(Y_test, Y_pred), sep="\n")