In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Label encoder reused by the label-encoding loop further down.
encoder = LabelEncoder()
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the new name first and fall back for
# older releases.
try:
    OHE = OneHotEncoder(sparse_output=False)
except TypeError:
    OHE = OneHotEncoder(sparse=False)

In [None]:
# Load the hotel bookings CSV and preview its first rows.
caminho_csv = '../input/hotel-booking-demand/hotel_bookings.csv'
dados = pd.read_csv(caminho_csv)
dados.head()

## Preparação dos dados / Preparing and transforming data ##

In [None]:
# plt.subplots returns (figure, axes) in that order.
fig, ax = plt.subplots(figsize=(16, 8))
# Correlate numeric columns only: DataFrame.corr() raises on text columns
# with pandas >= 2.0 (older releases silently dropped them).
correlacoes = dados.select_dtypes(include='number').corr()
# Draw on the axes created above rather than on the implicit current axes.
sns.heatmap(correlacoes, annot=True, ax=ax)

In [None]:
# Number of missing values in each column.
dados.isna().sum()

In [None]:
# Fill missing entries with each column's most frequent value (mode).
# The result is assigned back to the frame: calling fillna(inplace=True) on
# a column selection is chained assignment, which warns on pandas 2.x and
# leaves the frame untouched under copy-on-write.
for coluna in ['country', 'agent', 'company', 'children']:
    dados[coluna] = dados[coluna].fillna(dados[coluna].mode()[0])

In [None]:
# Binarize the `hotel` column and write the encoded values back in place.
binarizer = LabelBinarizer()
hotel_codificado = binarizer.fit_transform(dados['hotel'])
dados['hotel'] = hotel_codificado

In [None]:
# Parse the "YYYY-MM-DD" status-date strings into datetime values.
datas_status = pd.to_datetime(dados['reservation_status_date'], format="%Y-%m-%d")
dados['reservation_status_date'] = datas_status

In [None]:
# Reduce every status date to its "YYYY-MM" label. The vectorised .dt
# accessor replaces a per-row Python lambda and yields a missing value for
# NaT instead of raising.
dados['reservation_status_date'] = dados['reservation_status_date'].dt.strftime("%Y-%m")

In [None]:
# Use the status-date column as the frame's index (the column is moved, not copied).
dados = dados.set_index("reservation_status_date")

In [None]:
# Preview the first five rows after re-indexing.
dados.head(5)

In [None]:
# Columns that the label-encoding loop converts to integer codes.
remover1 = [
    'reserved_room_type',
    'assigned_room_type',
    'reservation_status',
]
# Columns iterated by the one-hot encoding cell, held as a NumPy string array.
remover2 = np.array([
    'arrival_date_month',
    'meal',
    'market_segment',
    'distribution_channel',
    'deposit_type',
    'customer_type',
])

In [None]:
# Replace each column listed in `remover1` with integer codes. The shared
# encoder is refit on every pass, so afterwards it holds the classes of the
# last column only.
for coluna in remover1:
    codigos = encoder.fit_transform(dados[coluna])
    dados[coluna] = codigos

In [None]:
# One-hot encode the remaining text/categorical columns in a single pass.
# Without a `columns=` argument get_dummies already processes every such
# column at once, so looping over `remover2` would only repeat the same
# call on an already-encoded frame.
# NOTE(review): this also expands any text column that is not listed in
# `remover2` (e.g. `country`, if it is still non-numeric here) — confirm
# that is intended before restricting the call with `columns=`.
dados = pd.get_dummies(dados)

In [None]:
# Preview the first five rows of the encoded frame.
dados.head(5)

In [None]:
# Sum `is_canceled` for each index label and draw the totals as a line chart.
cancelamentos_por_periodo = dados.groupby(dados.index).is_canceled.sum()
cancelamentos_por_periodo.plot(
    kind='line',
    figsize=(12,6),
    xlabel="Período",
    ylabel='Número de Cancelamentos',
    title='Número de Cancelamentos de Hospedagem ao Mês',
    grid=True,
)

In [None]:
# Sum `lead_time` for each index label and draw the totals as a line chart.
# The axis label and title name lead time: this series is not a
# cancellation count.
lead_time_por_periodo = dados.groupby(dados.index).lead_time.sum()
lead_time_por_periodo.plot(
    kind='line',
    figsize=(12,6),
    xlabel="Período",
    ylabel='Lead Time Total',
    title='Lead Time Total das Reservas ao Mês',
    grid=True,
)

## Criando modelo / Creating model ##

In [None]:
# Feature matrix: every column except the target. `columns=` is used because
# pandas 2.0 no longer accepts the axis as a second positional argument.
x = dados.drop(columns="is_canceled")
# Target, kept as a one-column DataFrame.
y = dados.loc[:, ['is_canceled']]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
# train_test_split returns (X_train, X_test, y_train, y_test). Unpacking in
# that order makes the model fit on the 80% portion and be evaluated on the
# 20% hold-out, rather than the reverse.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [None]:
# Seed the forest so the fitted model and the metrics derived from it are
# reproducible between runs.
modelo = RandomForestClassifier(random_state=0)
# The target arrives as a one-column frame; flatten it to 1-D, which is the
# shape scikit-learn expects (it emits DataConversionWarning otherwise).
modelo.fit(x_train, np.ravel(y_train))

In [None]:
# Predicted class for every row of the evaluation features.
predicao = modelo.predict(X=x_test)

In [None]:
# Text report of per-class classification metrics for the first model.
relatorio = classification_report(y_true=y_test, y_pred=predicao)
print(relatorio)

In [None]:
# Confusion matrix of true labels versus the first model's predictions.
matriz_confusao = confusion_matrix(y_true=y_test, y_pred=predicao)
print(matriz_confusao)

In [None]:
# Fraction of evaluation rows the first model classified correctly.
acuracia = accuracy_score(y_true=y_test, y_pred=predicao)
print(acuracia)

### Devido ao elevado grau do coeficiente de predição, acima de 0.99, pode-se deduzir que o modelo sofre de sobreajustamento; assim sendo, deve-se reanalisar as variáveis utilizadas ###

### With a prediction score above 0.99, the model looks overfitted, so the variables used should be re-examined ###

In [None]:
# Read a fresh, unprocessed copy of the bookings CSV and preview it.
dados2 = pd.read_csv(filepath_or_buffer='../input/hotel-booking-demand/hotel_bookings.csv')
dados2.head(5)

In [None]:
# Column totals per `is_canceled` value. Restricted to numeric columns: on
# pandas >= 2.0 a plain sum() would otherwise concatenate the text columns
# into very long strings.
dados2.groupby('is_canceled').sum(numeric_only=True)

In [None]:
# Column totals per `reservation_status` value. Restricted to numeric
# columns: on pandas >= 2.0 a plain sum() would otherwise concatenate the
# text columns into very long strings.
dados2.groupby('reservation_status').sum(numeric_only=True)

In [None]:
# Second feature matrix: drop the target and the `reservation_status` column.
x2 = dados.drop(columns=['is_canceled', 'reservation_status'])

In [None]:
# train_test_split returns (X_train, X_test, y_train, y_test). Unpacking in
# that order makes the model fit on the 80% portion and be evaluated on the
# 20% hold-out, rather than the reverse.
x_train, x_test, y_train, y_test = train_test_split(x2, y, test_size=0.2, random_state=0)

In [None]:
# Seed the forest so the fitted model and the metrics derived from it are
# reproducible between runs.
modelo2 = RandomForestClassifier(random_state=0)
# The target arrives as a one-column frame; flatten it to 1-D, which is the
# shape scikit-learn expects (it emits DataConversionWarning otherwise).
modelo2.fit(x_train, np.ravel(y_train))

In [None]:
# Predicted class for every row of the evaluation features (second model).
predicao = modelo2.predict(X=x_test)

In [None]:
# Text report of per-class classification metrics for the second model.
relatorio2 = classification_report(y_true=y_test, y_pred=predicao)
print(relatorio2)

In [None]:
# Confusion matrix of true labels versus the second model's predictions.
matriz_confusao2 = confusion_matrix(y_true=y_test, y_pred=predicao)
print(matriz_confusao2)