In [None]:
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Columns parsed as floating point numbers (counts, flags and rates alike).
_FLOAT_COLUMNS = [
    'is_canceled',
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
]

# Columns parsed as pandas categoricals.
_CATEGORY_COLUMNS = [
    'meal',
    'country',
    'market_segment',
    'distribution_channel',
    'reserved_room_type',
    'assigned_room_type',
    'deposit_type',
    'agent',
    'company',
    'customer_type',
    'reservation_status',
]

# Column name -> dtype mapping handed to pd.read_csv(dtype=...).
dtypes = {name: 'float64' for name in _FLOAT_COLUMNS}
dtypes.update({name: 'category' for name in _CATEGORY_COLUMNS})

In [None]:
import pandas as pd
import numpy as np
from pandas import DataFrame
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller
from tensorflow.keras.layers import Dense
from tensorflow.keras import layers
from tensorflow.keras.layers import LSTM

# Load the bookings using the dtype mapping declared in `dtypes`.
# The converter formats each raw arrival_date_week_number value with '{:0>2}',
# i.e. it is kept as a string and left-padded with '0' to at least two characters.
train_df = pd.read_csv("/kaggle/input/hotel-booking-demand/hotel_bookings.csv", dtype=dtypes, converters={'arrival_date_week_number': '{:0>2}'.format})
train_df

In [None]:
train_df.shape

In [None]:
train_df.isna().sum()

In [None]:
train_df.info()

In [None]:
a=train_df.head()
b=train_df
b

In [None]:
# Order the bookings by arrival year, then by (zero-padded) arrival week,
# and wrap the sorted result in a fresh DataFrame.
sort_keys = ['arrival_date_year', 'arrival_date_week_number']
c = pd.DataFrame(b.sort_values(by=sort_keys, ascending=True))
type(c)

In [None]:
df = DataFrame(c, columns= ['arrival_date_year', 'arrival_date_week_number']) 
df

In [None]:
# https://datatofish.com/concatenate-values-python/

# Build one string key per booking by concatenating the arrival year with the
# arrival week number (both converted with str). The week number was left-padded
# to two characters when the CSV was read, so the resulting keys have a fixed
# width and sort chronologically as plain strings.
df1 = df['arrival_date_year'].map(str) + df['arrival_date_week_number'].map(str)
print (df1)
# Wrap the Series in a single-column DataFrame for the later pd.concat.
df1=pd.DataFrame(df1)

In [None]:
df2= c['is_canceled']
df2

In [None]:
type(df1)

In [None]:
df3=pd.concat([df1, df2], axis = 1)
df3.columns = ['FullDate', 'IsCanceled']
df3

In [None]:
df3.sort_values(['FullDate','IsCanceled'], ascending=True)

In [None]:
# Total cancellations per week key.
# Pass the aggregation by name: handing the Python builtin `sum` to agg() is
# deprecated in recent pandas and emits a FutureWarning.
# groupby() sorts the group keys by default, so df4 is already ordered by
# FullDate; the previous sort_values() call discarded its result and had no effect.
df4 = df3.groupby('FullDate').agg('sum')
df4

In [None]:
tseries=df4['IsCanceled']
tseries

### Visualize the Timeseries and understand TimeSeries Pattern

In [None]:
# Draw the weekly cancellation series on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(tseries)
# The x values are the week keys; hide both the ticks and their labels on the
# x-axis (major and minor, top and bottom) to keep the axis uncluttered.
ax.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
ax.set_ylabel('Cancellations')
ax.set_title("Cancellations Per Week")
plt.show()

**There is a seasonality present in Cancellations**

##### Run the Augmented Dickey-Fuller test to check whether the time series is stationary

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.stattools import adfuller

In [None]:
# Run the ADF test on the series and print out the results
results = adfuller(tseries)
print(results)

In [None]:
# Pull the pieces of the adfuller() result tuple that are reported below:
# position 0 is the test statistic, 1 the p-value, 4 the critical-value mapping.
adf_statistic, p_value = results[0], results[1]
critical_values = results[4]

print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % p_value)
print('Critical Values:')
for level, threshold in critical_values.items():
    print('\t%s: %.3f' % (level, threshold))

p-value < 0.05, so we reject the null hypothesis of the Augmented Dickey-Fuller test, which states that the series has a unit root (i.e. is non-stationary). Hence, the Dickey-Fuller test indicates that the time series is stationary.

In [None]:
df=pd.DataFrame(tseries)
df

In [None]:
df=np.array(df)

**Training and Validation data partition**

In [None]:
# Split by position: the first 80% of the rows form the training set and the
# remaining rows form the validation set (no shuffling).
n_rows = len(df)
train_size = int(n_rows * 0.8)
val_size = n_rows - train_size
train = df[:train_size, :]
val = df[train_size:, :]

**Form dataset matrix**

In [None]:
def create_dataset(df, previous=1):
    """Slice a 2-D array into (input window, next value) training pairs.

    Only column 0 of ``df`` is read. Sample ``i`` uses rows ``i`` to
    ``i + previous - 1`` as its input window and row ``i + previous`` as its
    target.

    Note that ``len(df) - previous - 1`` samples are produced, so the last
    row of ``df`` is never used as a target.

    Parameters
    ----------
    df : 2-D array indexable as ``df[rows, 0]``
    previous : int, number of consecutive rows in each input window

    Returns
    -------
    (X, y) : tuple of numpy arrays; ``X`` holds one window per row and ``y``
        the matching targets. Both are empty when no sample fits.
    """
    starts = range(len(df) - previous - 1)
    windows = [df[start:start + previous, 0] for start in starts]
    targets = [df[start + previous, 0] for start in starts]
    return np.array(windows), np.array(targets)

In [None]:
df

##### Normalize dataset with MinMaxScaler

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
train = scaler.fit_transform(train)
train

In [None]:
# Scale the validation split with the scaler already fitted on the training
# split. Calling fit_transform here would refit the scaler on validation data,
# so validation values would be scaled differently from the data the model is
# trained on, and later inverse_transform calls on training predictions would
# use the validation range instead of the training range.
val = scaler.transform(val)
val

In [None]:
# Lookback period
lookback = 5
X_train, y_train = create_dataset(train, lookback)
X_val, y_val = create_dataset(val, lookback)

In [None]:
X_train

In [None]:
y_train

### Configure LSTM model

In [None]:
# reshape input to be [samples, time steps, features]
# Each window of `lookback` values is laid out as ONE time step carrying
# `lookback` features (shape: samples x 1 x lookback), which matches the
# input_shape=(1, lookback) given to the LSTM layer.
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_val = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

X_train

In [None]:
X_val

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
# Import the Keras metric under an alias: importing it as `mean_squared_error`
# would silently replace sklearn.metrics.mean_squared_error (imported earlier),
# which the RMSE cells below rely on.
from tensorflow.keras.metrics import mean_squared_error as keras_mean_squared_error

In [None]:
# Generate LSTM network
model= Sequential([
    LSTM(4, input_shape=(1, lookback)),
    Dense(1),
])

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
history=model.fit(X_train, y_train, validation_split=0.2, epochs=20, batch_size=1, verbose=2)

#### Loss History Graph

In [None]:
def plot_loss(loss, val_loss):
    """Plot training and validation loss per epoch on a single figure.

    Parameters
    ----------
    loss : sequence of float, training loss, one value per epoch
    val_loss : sequence of float, validation loss, one value per epoch

    The second curve is labelled 'Validation' because it is the loss on the
    held-out validation data recorded during fitting, not a test-set loss.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(loss, label='Train')
    ax.plot(val_loss, label='Validation')
    ax.set_title('Model loss')
    ax.set_ylabel('Loss')
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper right')
    plt.show()


# Plot train vs test loss during training
plot_loss(history.history['loss'], history.history['val_loss'])

In [None]:
# Generate predictions
trainpred = model.predict(X_train)
valpred = model.predict(X_val)

In [None]:
trainpred

In [None]:
valpred

In [None]:
# Convert predictions back to normal values
# The scaler was fitted on a single feature column, so the 1-D target vectors
# are passed as (n_samples, 1) columns, the shape inverse_transform documents,
# rather than as one (1, n) row that only works through broadcasting.
# They are then reshaped to (1, n) so later cells can keep using y_train[0] / y_val[0].
trainpred = scaler.inverse_transform(trainpred)
y_train = scaler.inverse_transform(y_train.reshape(-1, 1)).reshape(1, -1)
valpred = scaler.inverse_transform(valpred)
y_val = scaler.inverse_transform(y_val.reshape(-1, 1)).reshape(1, -1)
predictions = valpred

In [None]:
type(predictions)

In [None]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [None]:
np.mean(y_val)

In [None]:
X_train[:10]

In [None]:
X_val[:10]

In [None]:
import math
from math import sqrt

# calculate RMSE
trainScore = math.sqrt(mean_squared_error(y_train[0], trainpred[:,0]))
print('Train Score: %.2f RMSE' % (trainScore))
valScore = math.sqrt(mean_squared_error(y_val[0], valpred[:,0]))
print('Validation Score: %.2f RMSE' % (valScore))

In [None]:
# Train predictions
trainpredPlot = np.empty_like(df)
trainpredPlot[:, :] = np.nan
trainpredPlot[lookback:len(trainpred)+lookback, :] = trainpred

In [None]:
# Validation predictions
valpredPlot = np.empty_like(df)
valpredPlot[:, :] = np.nan
valpredPlot[len(trainpred)+(lookback*2)+1:len(df)-1, :] = valpred

In [None]:
# Plot all predictions
# `df` holds the raw weekly counts and the two *Plot arrays were filled with
# predictions that were already inverse-transformed, so everything is in
# original units here. Running them through scaler.inverse_transform again
# would rescale the y-axis away from real cancellation counts.
# The line handles get their own names so that the `trainpred` / `valpred`
# prediction arrays are not overwritten by matplotlib objects.
actual_line, = plt.plot(df, label='Actual')
trainpred_line, = plt.plot(trainpredPlot, label='Train predictions')
valpred_line, = plt.plot(valpredPlot, label='Validation predictions')
plt.xlabel('Number of weeks')
plt.ylabel('Cancellations')
plt.title("Predicted vs. Actual Cancellations Per Week")
plt.legend()
plt.show()

In [None]:
y_val=y_val.reshape(-1)
y_val.shape
y_val=pd.Series(y_val)
y_val[:10]

In [None]:
np.mean(y_val)

In [None]:
predictions=predictions.reshape(-1)

In [None]:
predictions.shape

In [None]:
predictions=pd.Series(predictions)
predictions

In [None]:
def mda(actual: np.ndarray, predicted: np.ndarray):
    """ Mean Directional Accuracy

    Fraction of consecutive steps where the predicted series moves in the same
    direction (up, down or flat) as the actual series.

    Parameters
    ----------
    actual : array-like of observed values
    predicted : array-like of predicted values, same length as ``actual``

    Returns
    -------
    float in [0, 1]; NaN (with a NumPy warning) when fewer than two points
    are supplied, because there is no step to compare.
    """
    # Work on plain NumPy arrays. With pandas Series, `x[1:] - x[:-1]` is
    # aligned on index labels rather than positions, which yields zeros/NaN
    # instead of step-to-step differences and makes the score meaningless.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)
    actual_direction = np.sign(actual[1:] - actual[:-1])
    predicted_direction = np.sign(predicted[1:] - predicted[:-1])
    return np.mean((actual_direction == predicted_direction).astype(int))

In [None]:
mda(y_val, predictions)

In [None]:
# https://machinelearningmastery.com/time-series-forecasting-performance-measures-with-python/

mse = mean_squared_error(y_val, predictions)
rmse = sqrt(mse)
print('RMSE: %f' % rmse)

In [None]:
forecast_error = (predictions-y_val)
forecast_error

In [None]:
mean_forecast_error = np.mean(forecast_error)
mean_forecast_error

In [None]:
X_train[:10]

In [None]:
X_train.shape

**Validate Predictions on Test Data**

In [None]:
# Test (unseen) predictions
# (t) and (t-5)
# One input window per row: windows start at positions 95..109 of the series
# and each spans `lookback` consecutive weeks, the width the model was built for.
Xnew = np.array([tseries.iloc[start:start + lookback] for start in range(95, 110)])

In [None]:
Xnew[:10]

In [None]:
Xnew = scaler.fit_transform(Xnew)
Xnew
Xnewformat = np.reshape(Xnew, (Xnew.shape[0], 1, Xnew.shape[1]))
ynew=model.predict(Xnewformat)

In [None]:
Xnew.shape

In [None]:
ynew

In [None]:
type(ynew)

In [None]:
print(ynew.shape)

In [None]:
actual = tseries.iloc[100:115]
actual

In [None]:
actual = np.array(actual)
actual

In [None]:
actual=actual.reshape(15,-1)
actual.shape

In [None]:
actual

In [None]:
ynew=ynew.reshape(-1)
actual=actual.reshape(-1)
actualpd=pd.Series(actual)

In [None]:
np.min(df)

In [None]:
ynew

In [None]:
maxcancel=np.max(tseries)

In [None]:
mincancel=np.min(tseries)

In [None]:
ynew = ynew * np.abs(maxcancel-mincancel) + np.min(tseries)
ynewpd=pd.Series(ynew)

In [None]:
actualpd

In [None]:
ynewpd

In [None]:
mda(actualpd, ynewpd)

In [None]:
mse = mean_squared_error(actualpd, ynewpd)
rmse = sqrt(mse)
print('RMSE: %f' % rmse)

In [None]:
forecast_error = (ynewpd-actualpd)
forecast_error

In [None]:
mean_forecast_error = np.mean(forecast_error)
mean_forecast_error

In [None]:
# Draw predicted vs. actual test values. The line handles are not kept:
# binding them to `actual` would replace the array of observed values with a
# matplotlib Line2D object.
plt.plot(ynewpd, label='Predictions')
plt.plot(actualpd, label='Actual')
plt.xlabel('Number of weeks')
plt.ylabel('Cancellations')
plt.title("Predicted vs. Actual Cancellations Per Week")
plt.legend(loc = 'upper center')
plt.show()

In [None]:
np.mean(ynewpd)