In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense, Bidirectional, Dropout, BatchNormalization
from keras.optimizers import Adam, RMSprop, SGD
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import tensorflow as tf
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the raw training data; the path is relative to the notebook's working directory.
train_data_path = 'train_data.csv'
df = pd.read_csv(train_data_path)
print(df)

## Creates all date related columns

In [None]:
# Parse the date column once, then derive the calendar features from it.
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df = df.assign(
    year=date_parts.year,
    day_of_year=date_parts.dayofyear,
    day_of_month=date_parts.day,
    # Missing rain values are replaced by 0 before the row-wise NaN filter below.
    weather_rain=df['weather_rain'].fillna(0),
)
# Drop every row that still has a missing value in any column.
df = df.dropna()

# Adds School Holidays

In [None]:
def add_city_school_holidays(df, holidays_info, date_column_name):
    """
    Return a copy of ``df`` with one 0/1 school-holiday flag column per city.

    :param df: Input DataFrame; it is not modified.
    :param holidays_info: Iterable of ``(city, start_date, end_date)`` entries.
        Both dates are parsed with ``pd.to_datetime`` and the period is inclusive
        at both ends. A city may appear several times; its periods accumulate.
    :param date_column_name: Name of the column holding the observation dates.
        It is converted to datetime in the copy if it is not already.
    :return: The copied DataFrame with a ``"Lov-<city>"`` column per city, set to
        1 on rows whose date falls inside any of that city's periods.
    """
    flagged = df.copy()

    date_is_parsed = pd.api.types.is_datetime64_any_dtype(flagged[date_column_name])
    if not date_is_parsed:
        flagged[date_column_name] = pd.to_datetime(flagged[date_column_name])

    for city, period_start, period_end in holidays_info:
        flag_column = f"Lov-{city}"
        # Create the flag column on first sight of a city; keep it on later periods.
        if flag_column not in flagged.columns:
            flagged[flag_column] = 0
        in_period = flagged[date_column_name].between(
            pd.to_datetime(period_start), pd.to_datetime(period_end)
        )
        flagged.loc[in_period, flag_column] = 1

    return flagged

# School-holiday periods, one row per period, written as
# ((Stockholm start, Stockholm end), (Norrköping start, Norrköping end)).
school_holiday_periods = [
    (('2021-06-11', '2021-08-18'), ('2021-06-10', '2021-08-15')),
    (('2022-06-11', '2022-08-18'), ('2022-06-10', '2022-08-15')),
    (('2023-05-19', '2023-05-19'), ('2023-05-19', '2023-05-19')),
    (('2023-06-05', '2023-06-05'), ('2023-06-05', '2023-06-05')),
    (('2023-06-14', '2023-08-21'), ('2023-06-16', '2023-08-17')),
    (('2024-04-02', '2024-04-07'), ('2024-04-02', '2024-04-07')),
    (('2024-05-19', '2024-05-19'), ('2024-05-19', '2024-05-19')),
    (('2024-06-07', '2024-06-07'), ('2024-06-07', '2024-06-07')),
    (('2024-06-13', '2024-08-19'), ('2024-06-14', '2024-08-15')),
]

# Flatten to the (city, start, end) entries expected by add_city_school_holidays,
# alternating Stockholm / Norrköping for each period.
holidays_info = [
    (city, start, end)
    for stockholm_period, norrkoping_period in school_holiday_periods
    for city, (start, end) in (('Stockholm', stockholm_period), ('Norrköping', norrkoping_period))
]
df = add_city_school_holidays(df, holidays_info, 'date')


# Adds Bank Holidays

In [None]:
# Easter: Good Friday through Easter Monday for each year.
easter_dates = ['2020-04-10','2020-04-11','2020-04-12','2020-04-13','2021-04-02','2021-04-03','2021-04-04','2021-04-05'
                ,'2022-04-15','2022-04-16','2022-04-17','2022-04-18','2023-04-07','2023-04-08','2023-04-09','2023-04-10'
                ,'2024-03-29','2024-03-30','2024-03-31','2024-04-01']
# Ascension: Thursday through Sunday for each year.
# The 2024 block previously started on Wednesday 2024-05-08; Ascension Day 2024
# was Thursday 2024-05-09, so it is now aligned with the other years.
kristi = ['2020-05-21','2020-05-22','2020-05-23','2020-05-24',
          '2021-05-13','2021-05-14','2021-05-15','2021-05-16','2022-05-26',
          '2022-05-27','2022-05-28','2022-05-29',
          '2023-05-18','2023-05-19','2023-05-20','2023-05-21',
          '2024-05-09','2024-05-10','2024-05-11','2024-05-12']
pingsti = ['2020-05-31','2021-05-23','2022-06-05','2023-05-28','2024-05-19']
midsummer = ['2020-06-19','2021-06-25','2022-06-24','2023-06-23','2024-06-21']
national_dagen = ['2020-06-06','2021-06-06',
                  '2022-06-06','2023-06-06','2024-06-06','2025-06-06']

# Convert every list to datetime before matching: comparing a datetime64 column
# against raw strings with isin() is deprecated in pandas and will stop matching.
easter_dates_datetime = pd.to_datetime(easter_dates)
national_dates = pd.to_datetime(national_dagen)
midsummer_dates = pd.to_datetime(midsummer)
kristihimmel = pd.to_datetime(kristi)
pingst_dates = pd.to_datetime(pingsti)

# One 0/1 indicator column per holiday.
df['Påsk'] = df['date'].isin(easter_dates_datetime).astype(int)
df['Midsommar'] = df['date'].isin(midsummer_dates).astype(int)
df['Nationaldagen'] = df['date'].isin(national_dates).astype(int)
df['Kristihimmel'] = df['date'].isin(kristihimmel).astype(int)
df['Pingst'] = df['date'].isin(pingst_dates).astype(int)


 # Function for Creating Time Sequences For LSTM Model

In [None]:
def create_sequences(data, sequence_length):
    """
    Build overlapping sliding windows of consecutive observations.

    Window ``i`` contains rows ``i`` through ``i + sequence_length - 1`` of ``data``,
    so ``len(data) - sequence_length + 1`` windows are produced.

    :param data: Input data as a 2D NumPy array where each row is an observation.
    :param sequence_length: The number of time steps in each sequence (>= 1).
    :return: A 3D NumPy array in the format (number of samples, time steps, features).
        When ``data`` has fewer rows than ``sequence_length`` the result is an
        empty array of shape ``(0, sequence_length, features)``.
    :raises ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be at least 1")

    data = np.asarray(data)
    n_sequences = len(data) - sequence_length + 1
    if n_sequences <= 0:
        # Keep the 3D layout even when no complete window fits, so callers
        # can rely on the (samples, time steps, features) shape.
        return np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + sequence_length] for start in range(n_sequences)])

# Training Process

The `bookings_increases` column is created in the statistical model notebook.

In [None]:
# Rows used for fitting and validating the model.
train_df = df[df['date'].between('2022-01-01', '2024-04-01')].copy()

# Model inputs. Each column is listed exactly once: the list previously repeated
# 'Lov-Stockholm' and 'Lov-Norrköping', which fed two duplicated inputs to the network.
# NOTE(review): the 'Pingst' indicator is created earlier in the notebook but is not
# part of this list — confirm whether that is intentional.
features = ['year','day_of_year','day_of_month','weather_rain','bookings','weather_temp','weather_available','days_before',
            'Lov-Stockholm','Lov-Norrköping','bookings_increases',
            'Påsk', 'Nationaldagen', 'Midsommar', 'Kristihimmel']

X = train_df[features]
y = train_df['visitors']

# Standardise inputs and target separately so predictions can be mapped back
# to visitor counts with scaler_y.inverse_transform.
# NOTE(review): both scalers are fitted on the whole training window, i.e. also on
# the rows that end up in the validation split below.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Each sample is a window of `sequence_length` consecutive rows; the target of a
# window is the value belonging to its last row.
sequence_length = 1
X_scaled_sequences = create_sequences(X_scaled, sequence_length)
y_scaled_adjusted = y_scaled[sequence_length - 1:]

# Ordered split (no shuffling): the last `test_size` share of the windows is
# used for validation.
test_size = 0.2

split_idx = int(len(X_scaled_sequences) * (1 - test_size))

X_train = X_scaled_sequences[:split_idx]
X_test = X_scaled_sequences[split_idx:]

y_train = y_scaled_adjusted[:split_idx]
y_test = y_scaled_adjusted[split_idx:]

# Add params here if you want to test several values

In [None]:
# Search grid: every key maps to the list of candidate values to try.
# The search loop evaluates the Cartesian product of these lists.
hyperparam_space = dict(
    units=[64],
    learning_rate=[0.001],
    dropout_rate=[0.5],
    activation=['relu'],
    lstm_layers=[1],
    optimizer=[Adam],
)

# Model Architecture

In [None]:
def build_and_compile_model(units, learning_rate, dropout_rate, activation_option, num_lstm_layers, optimizer_value,input_shape):
    """
    Build a stacked-LSTM regression network and compile it with an MSE loss.

    Each LSTM layer is followed by a Dropout and a BatchNormalization layer; a
    single linear Dense unit produces the output.

    :param units: Number of units in every LSTM layer.
    :param learning_rate: Learning rate passed to the optimizer.
    :param dropout_rate: Rate for the Dropout layer after each LSTM layer.
    :param activation_option: Activation used inside the LSTM layers.
    :param num_lstm_layers: How many LSTM/Dropout/BatchNormalization groups to stack.
    :param optimizer_value: Optimizer class (or factory) called as
        ``optimizer_value(learning_rate=...)``.
    :param input_shape: Shape of one input sample, passed to the LSTM layers.
    :return: The compiled Sequential model.
    """
    network = Sequential()
    final_lstm_index = num_lstm_layers - 1

    for layer_index in range(num_lstm_layers):
        # All but the last LSTM layer must emit the full sequence so the
        # following LSTM layer receives a 3D input.
        emits_sequence = layer_index < final_lstm_index
        network.add(
            LSTM(
                units=units,
                activation=activation_option,
                return_sequences=emits_sequence,
                input_shape=input_shape,
            )
        )
        network.add(Dropout(dropout_rate))
        network.add(BatchNormalization())

    network.add(Dense(1, activation='linear'))

    network.compile(
        optimizer=optimizer_value(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
    )
    return network

In [None]:
from itertools import product

# Grid search: train one model per hyperparameter combination and keep the one
# with the lowest loss on the validation split.
# NOTE: X_test / y_test here are the validation arrays created in the training
# cell. A later cell rebinds both names, so re-run the training cell before
# re-running this one.
best_score = float('inf')
best_model = None
best_params = None

# Read the grid by explicit key so the unpacking below does not silently depend
# on the insertion order of hyperparam_space.
search_keys = ('units', 'learning_rate', 'dropout_rate', 'activation', 'lstm_layers', 'optimizer')
search_grid = product(*(hyperparam_space[key] for key in search_keys))

for units, learning_rate, dropout_rate, activation, lstm_layers, optimizer in search_grid:

    model = build_and_compile_model(units, learning_rate, dropout_rate, activation, lstm_layers, optimizer, input_shape=(X_train.shape[1], X_train.shape[2]))
    model.fit(X_train, y_train, epochs=300, batch_size=32,validation_data=(X_test, y_test))
    # evaluate() returns [loss, mean_squared_error] because the model is compiled
    # with one metric; the first entry is the loss.
    evaluation = model.evaluate(X_test, y_test, verbose=0)
    val_loss = evaluation[0]
    params = {
            'units': units,
            'learning_rate': learning_rate,
            'dropout_rate': dropout_rate,
            'activation': activation,
            'layers': lstm_layers,
            # The optimizer is part of the search space, so report it as well.
            'optimizer': getattr(optimizer, '__name__', str(optimizer))
           }
    if val_loss < best_score:
        best_score = val_loss
        best_model = model
        best_params = params

print(f"Bästa val_loss: {best_score}")
print(f"Bästa hyperparametrar: {best_params}")

# Test

In [None]:
# Hold-out set: rows dated 2024-04-02 .. 2024-12-02 whose days_before equals 4.
in_test_period = df['date'].between('2024-04-02', '2024-12-02')
has_days_before_4 = df['days_before'].isin([4])
test_df = df[in_test_period & has_days_before_4].copy()

# Same preprocessing as for training: reuse the fitted feature scaler and the
# same window length. From here on X_test / y_test refer to the hold-out set.
X_test = test_df[features]
X_test_scaled = scaler_X.transform(X_test)
X_test_scaled_sequences = create_sequences(X_test_scaled, sequence_length)
# Drop the leading targets that have no complete window.
y_test = test_df['visitors'].iloc[sequence_length - 1:]

# Predict in scaled space, then map back to the original target scale.
predicted_y = best_model.predict(X_test_scaled_sequences)
predictions = scaler_y.inverse_transform(predicted_y).flatten()

In [None]:
# Actual vs. predicted values; points on the dashed diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, alpha=0.6, color='red', label='Förutsagda')
min_max_range = [y_test.min(), y_test.max()]
ax.plot(min_max_range, min_max_range, 'k--', label='Ideala linjen')
ax.set_xlabel('Faktiska Värden')
ax.set_ylabel('Förutsagda Värden')
ax.set_title('Faktiska vs. Förutsagda Värden')
ax.legend()
plt.show()


# Creates the resulting dataframe and prints the values

In [None]:
# Collect actual values, predictions and their difference in one frame indexed by date.
actual = y_test
predicted = predictions.flatten()
# Keep only the trailing targets that have a matching prediction.
actual_adjusted = actual.iloc[-len(predicted):]
diff = actual_adjusted - predicted

# Take bookings and dates from the very same test_df rows as the targets, so all
# columns stay aligned for any sequence_length. The date was previously
# concatenated from test_df and then overwritten from the full df.
matched_rows = test_df.loc[actual_adjusted.index]
results_df = pd.DataFrame({
    'Actual': actual_adjusted,
    'Predicted': predicted,
    'Diff': diff,
    'Bookings': matched_rows['bookings'],
    'date': pd.to_datetime(matched_rows['date']),
})
results_df = results_df.set_index('date')
print(results_df)



# Percentage of predictions with an absolute error below 1000, 500 and 250

In [None]:
# Share of predictions whose absolute error is below each threshold.
abs_diff = results_df['Diff'].abs()
total_rows = len(results_df)

count_under_1000, count_under_500, count_under_250 = (
    int((abs_diff < limit).sum()) for limit in (1000, 500, 250)
)

for limit, hits in ((1000, count_under_1000), (500, count_under_500), (250, count_under_250)):
    print(f"Andel förekomster av värden under {limit} i procentform:", (hits / total_rows) * 100, "%")

# Plot of prediction vs actual visitors

In [None]:
# Predicted and actual visitors over time (x axis is the date index of results_df).
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(results_df['Predicted'], label='Prediktioner')
ax.plot(results_df['Actual'], label='Riktiga Värden')
ax.set_title('Tensorflow Modell Maj')
ax.set_xlabel('Datum')
ax.set_ylabel('Besökare')
ax.legend()
plt.show()

# Uncomment the rows below to save the model and scalers

In [None]:
# tf.keras.models.save_model(best_model, 'new_model')

In [None]:
# import joblib

# joblib.dump(scaler_y, 'new_scaler_y.pkl')
# joblib.dump(scaler_X, 'new_scaler_X.pkl')