In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Flatten, Conv3D
from keras.optimizers import Adam
import tensorflow as tf
import matplotlib.pyplot as plt
from itertools import product





## Load Data

The data should contain the columns date, visitors, bookings, weather_rain, weather_temp, weather_available, days_before and bookings_increase. bookings_increase is the statistical increase in bookings X days before the target date and can be created in the simple statistical model file.

In [None]:
# Read the raw training table; every later cell works on this frame.
df = pd.read_csv('train_data_for_cnn_model.csv')
print(df)

# Mean-impute the listed columns in one column-aligned fillna call.
columns_to_fill = ['bookings_increase']
df[columns_to_fill] = df[columns_to_fill].fillna(df[columns_to_fill].mean())

# Report the remaining gaps (weather_rain has not been filled yet at this point).
print(df.isna().sum())

# Missing rain readings are replaced by 0.
df['weather_rain'] = df['weather_rain'].fillna(0)


## Creates all date related columns

In [None]:
# Parse the date column, derive the calendar features from it, then drop
# every row that still contains a missing value in any column.
df['date'] = pd.to_datetime(df['date'])
date_parts = df['date'].dt
df = df.assign(
    year=date_parts.year,
    day_of_year=date_parts.dayofyear,
    day_of_month=date_parts.day,
).dropna()


# Adds Bank Holidays

In [None]:
import holidays

# Output column -> substrings that identify that holiday in the calendar's
# holiday name. A column is set to 1 when any of its substrings occurs in the
# name reported for the row's date, otherwise 0.
# NOTE(review): these are English names; whether the Swedish calendar reports
# English names depends on the installed `holidays` version / language
# setting -- confirm that the flags are not all zero.
holiday_markers = {
    'Good Friday': ('Good Friday',),
    'Easter Sunday': ('Easter Sunday',),
    'Easter Monday': ('Easter Monday',),
    'Ascension Day': ('Ascension Day',),
    'National Day': ('National Day',),
    'Midsummer Eve': ('Midsummer Eve',),
    'Midsummer Day': ('Midsummer Day',),
    'Whit Sunday': ('Whit Sunday', 'Whitsun'),
}

# Build the calendar once for 2010-2024 instead of once per year.
sweden_calendar = holidays.Sweden(years=range(2010, 2025))

# Look up every distinct day a single time. The previous version rescanned the
# whole frame for each row (df.loc[df['date'] == date, ...]) inside a per-year
# loop, which was quadratic in the number of rows.
day_keys = df['date'].dt.strftime('%Y-%m-%d')
names_by_day = {
    day: sweden_calendar.get(day, '')
    for day in day_keys.dropna().unique()
}
# Days that are not holidays (or are missing) map to an empty name.
holiday_names = day_keys.map(names_by_day).fillna('')

for column, markers in holiday_markers.items():
    matches = holiday_names.apply(
        lambda name, markers=markers: any(marker in name for marker in markers)
    )
    df[column] = matches.astype(int)
print(df)

# Creates 3D Sequences

In [None]:
def create_sequences_3d(data, sequence_length):
    """
    Builds overlapping sliding windows over the rows of ``data``, shaped for Conv3D.

    Window ``i`` holds rows ``i .. i + sequence_length - 1``; consecutive
    windows are shifted by one row.

    Parameters:
        data (array-like): Row-major samples, typically (n_rows, n_features).
            Anything accepted by ``np.asarray`` works.
        sequence_length (int): Number of consecutive rows per window (>= 1).
    Returns:
        numpy.ndarray: Array of shape
            (n_windows, sequence_length, 1, n_features, 1) where
            n_windows = max(len(data) - sequence_length + 1, 0). When there
            are too few rows for a single window the result is empty but
            still has that 5-D shape.
    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    data = np.asarray(data)
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    n_windows = len(data) - sequence_length + 1
    if n_windows <= 0:
        # Keep the rank consistent so callers can still inspect .shape[1:].
        n_features = int(np.prod(data.shape[1:]))
        return np.empty((0, sequence_length, 1, n_features, 1), dtype=data.dtype)

    windows = [
        # (sequence_length, n_features) -> (sequence_length, 1, n_features, 1)
        data[start:start + sequence_length].reshape((sequence_length, 1, -1, 1))
        for start in range(n_windows)
    ]
    return np.array(windows)

# Training Process

In [None]:
# Keep only the rows dated 2013-01-01 through 2023-03-31 for training.
in_training_window = df['date'].between('2013-01-01', '2023-03-31')
train_df = df.loc[in_training_window].copy()

# Model inputs; the order fixes the column order of the scaled matrix.
features = [
    'year', 'day_of_year', 'day_of_month', 'weather_rain', 'bookings',
    'weather_temp', 'weather_available', 'days_before',
    'Easter Sunday', 'bookings_increase',
    'Easter Monday', 'Ascension Day', 'National Day', 'Midsummer Eve',
    'Midsummer Day', 'Whit Sunday', 'Good Friday',
]

print(train_df)

X = train_df[features]
y = train_df['visitors']

# Standardise inputs and target separately; both scalers are reused later to
# transform the test inputs and to invert the predictions.
# NOTE(review): both scalers are fitted on all rows before the split below, so
# the held-out 20 % contributes to the scaling statistics.
scaler_X = StandardScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1, 1)).ravel()

# Possible to adjust sequence length
sequence_length = 1

# Each window is paired with the target of its last row.
X_scaled_sequences = create_sequences_3d(X_scaled, sequence_length)
y_scaled_adjusted = y_scaled[sequence_length - 1:]

# Ordered (unshuffled) split: the first 80 % of windows train the model and
# the remaining 20 % are held out.
test_size = 0.2
split_idx = int(len(X_scaled_sequences) * (1 - test_size))
X_train, X_test = X_scaled_sequences[:split_idx], X_scaled_sequences[split_idx:]
y_train, y_test = y_scaled_adjusted[:split_idx], y_scaled_adjusted[split_idx:]

# Add params here if you want to test several values

In [None]:
# Search grid: every key lists the candidate values to try, and the search
# cell trains one model per combination. The key order matters because the
# search loop unpacks the values positionally.
hyperparam_space = dict(
    num_filters=[32],
    filter_size=[(1, 1, 17)],  # last dimension equals the 17 entries in `features`
    learning_rate=[0.001],
    dropout_rate=[0.3],
    activation_option=['relu'],
    num_conv_layers=[20],
    optimizer=[Adam],  # optimizer class; instantiated with the learning rate at compile time
)

# Model Architecture

In [None]:
def build_and_compile_cnn_model(num_filters, filter_size, learning_rate, dropout_rate, activation_option, num_conv_layers, optimizer_value, input_shape):
    """
    Builds and compiles a stacked Conv3D regression network.

    The network is ``num_conv_layers`` repetitions of
    Conv3D -> Dropout -> BatchNormalization, followed by Flatten and a single
    linear output unit. It is compiled with mean squared error as both loss
    and metric.

    Parameters:
        num_filters (int): Filters in every Conv3D layer.
        filter_size (tuple): Kernel size of every Conv3D layer.
        learning_rate (float): Learning rate handed to the optimizer.
        dropout_rate (float): Rate of the Dropout layer after each convolution.
        activation_option (str): Activation of the Conv3D layers.
        num_conv_layers (int): Number of convolutional stages.
        optimizer_value (callable): Optimizer class/factory, called as
            ``optimizer_value(learning_rate=...)``.
        input_shape (tuple): Shape of one sample, passed to the first Conv3D.
    Returns:
        keras.models.Sequential: The compiled model.
    """
    network = Sequential()

    for layer_index in range(num_conv_layers):
        conv_options = {'activation': activation_option, 'padding': 'same'}
        if layer_index == 0:
            # Only the first layer declares the input shape.
            conv_options['input_shape'] = input_shape
        network.add(Conv3D(num_filters, filter_size, **conv_options))
        network.add(Dropout(dropout_rate))
        network.add(BatchNormalization())

    network.add(Flatten())
    network.add(Dense(units=1, activation='linear'))

    network.compile(
        optimizer=optimizer_value(learning_rate=learning_rate),
        loss='mean_squared_error',
        metrics=['mean_squared_error'],
    )
    return network

In [None]:

# Exhaustive search over hyperparam_space: train one model per combination and
# keep the one with the lowest loss on the held-out split.
input_shape = X_train.shape[1:]
best_score, best_model, best_params = float('inf'), None, None

for num_filters, filter_size, learning_rate, dropout_rate, activation_option, num_conv_layers, optimizer in product(*hyperparam_space.values()):
    model = build_and_compile_cnn_model(
        num_filters=num_filters,
        filter_size=filter_size,
        learning_rate=learning_rate,
        dropout_rate=dropout_rate,
        activation_option=activation_option,
        num_conv_layers=num_conv_layers,
        optimizer_value=optimizer,
        input_shape=input_shape,
    )

    model.fit(X_train, y_train, epochs=300, batch_size=64, validation_data=(X_test, y_test))

    # evaluate() returns [loss, metric]; the loss is the selection criterion.
    evaluation = model.evaluate(X_test, y_test, verbose=0)
    val_loss = evaluation[0]

    params = dict(
        num_filters=num_filters,
        filter_size=filter_size,
        learning_rate=learning_rate,
        dropout_rate=dropout_rate,
        activation_option=activation_option,
        num_conv_layers=num_conv_layers,
        optimizer=optimizer.__name__,  # store the class name, not the class
    )

    if not val_loss < best_score:
        continue
    best_score, best_model, best_params = val_loss, model, params

print(f"Bästa val_loss: {best_score}")
print(f"Bästa hyperparametrar: {best_params}")

# Test

In [None]:
import keras

# Evaluation window: rows dated 2024-04-01 through 2024-12-02.
in_test_window = df['date'].between('2024-04-01', '2024-12-02')
test_df = df.loc[in_test_window].copy()
print(test_df)

# Keep only the rows recorded 10 days before the target date.
test_df = test_df.loc[test_df['days_before'] == 10].copy()

# From here on X_test / y_test refer to this evaluation set (unscaled target),
# no longer to the held-out split used during training.
X_test = test_df[features]
y_test = test_df['visitors']

# Reuse the scaler fitted on the training inputs, then build the windows.
X_test_scaled = scaler_X.transform(X_test)
X_test_scaled_sequences = create_sequences_3d(X_test_scaled, sequence_length)

if len(y_test) < sequence_length:
    raise ValueError("y_test has fewer elements than required for the given sequence_length")
# Drop the leading targets that have no complete window.
y_test = y_test.iloc[sequence_length - 1:]

# Predict in scaled space and map back to visitor counts.
predicted_y = best_model.predict(X_test_scaled_sequences)
predictions = scaler_y.inverse_transform(predicted_y).flatten()


In [None]:
# Actual vs. predicted scatter; the dashed diagonal marks perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, alpha=0.6, color='red', label='Förutsagda')

min_max_range = [y_test.min(), y_test.max()]
ax.plot(min_max_range, min_max_range, 'k--', label='Ideala linjen')

ax.set_xlabel('Faktiska Värden')
ax.set_ylabel('Förutsagda Värden')
ax.set_title('Faktiska vs. Förutsagda Värden')
ax.legend()
plt.show()


# Creates the dataframe containing the result and prints the values

In [None]:
actual = y_test.to_numpy()
predicted = predictions
n_predicted = len(predicted)

# If there are more actual values than predictions, keep only the trailing ones.
actual_adjusted = actual[-n_predicted:] if len(actual) > n_predicted else actual

diff = actual_adjusted - predicted

results_df = pd.DataFrame({
    'Actual': actual_adjusted,
    'Predicted': predicted,
    'Diff': diff,
})

# Attach bookings and dates, trimmed to the trailing rows in the same way.
bookings_values = test_df['bookings'].values
date_values = test_df['date'].values
if len(test_df) >= n_predicted:
    bookings_values = bookings_values[-n_predicted:]
    date_values = date_values[-n_predicted:]
results_df['Bookings'] = bookings_values
results_df['Date'] = date_values

# Index the results by date.
results_df['Date'] = pd.to_datetime(results_df['Date'])
results_df = results_df.set_index('Date')

print(results_df)

# Percentage of predictions with an absolute error under 1000, 500 and 250

In [None]:
# Share of predictions whose absolute error falls below each threshold.
abs_diff = results_df['Diff'].abs()
total_rows = len(results_df)

count_under_1000 = int((abs_diff < 1000).sum())
count_under_500 = int((abs_diff < 500).sum())
count_under_250 = int((abs_diff < 250).sum())

print("Andel förekomster av värden under 1000 i procentform:", (count_under_1000 / total_rows) * 100, "%")
print("Andel förekomster av värden under 500 i procentform:", (count_under_500 / total_rows) * 100, "%")
print("Andel förekomster av värden under 250 i procentform:", (count_under_250 / total_rows) * 100, "%")

# Plot of prediction vs actual visitors

In [None]:
# Predicted and actual visitors over time (x axis is the date index of results_df).
fig, ax = plt.subplots(figsize=(15, 7))
ax.plot(results_df['Predicted'], label='Prediktioner')
ax.plot(results_df['Actual'], label='Riktiga Värden')
ax.set_title('Tensorflow Modell Maj')
ax.set_xlabel('Datum')
ax.set_ylabel('Besökare')
ax.legend()
plt.show()

# Uncomment the rows below to save the model and scalers

In [None]:
# Saves `best_model` (the model selected by the search) rather than `model`,
# which is only the last model trained in the loop.
# NOTE(review): newer Keras releases may require a '.keras' or '.h5' suffix on
# the path -- confirm against the installed version.
# tf.keras.models.save_model(best_model, 'model')

In [None]:
# import joblib

# joblib.dump(scaler_y, 'scaler_y.pkl')
# joblib.dump(scaler_X, 'scaler_X.pkl')