# Import

In [None]:
# Standard library
import os
import random
import warnings

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from statsmodels.tsa.stattools import adfuller

# Silence all warnings; still placed ahead of the `holidays` import, as before
warnings.filterwarnings('ignore')
import holidays

# Snapshot the installed packages of the current environment
os.system("pip freeze > requirements.txt")

In [None]:
# Load the raw London bike-share data (path is relative to the working directory)
df = pd.read_csv('Data/london_merged.csv')
# Preview the first two rows
display(df.head(2))

- Metadata:
  - "timestamp" - timestamp field for grouping the data
  - "cnt" - the count of new bike shares
  - "t1" - real temperature in C
  - "t2" - temperature in C "feels like"
  - "hum" - humidity in percentage
  - "wind_speed" - wind speed in km/h
  - "weather_code" - category of the weather
  - "is_holiday" - boolean field - 1 holiday / 0 non holiday
  - "is_weekend" - boolean field - 1 if the day is weekend
  - "season" - category field meteorological seasons: 0-spring ; 1-summer; 2-fall; 3-winter.
  - "weather_code" category description:
     - 1 = Clear ; mostly clear but have some values with haze/fog/patches of fog/ fog in vicinity 
     - 2 = scattered clouds / few clouds 
     - 3 = Broken clouds 
     - 4 = Cloudy 
     - 7 = Rain/ light Rain shower/ Light rain 
     - 10 = rain with thunderstorm 
     - 26 = snowfall 
     - 94 = Freezing Fog

In [None]:
# Print column dtypes and non-null counts
df.info()

# Data Wrangling

In [None]:
# Parse the timestamp column into datetime values
df['timestamp'] = pd.to_datetime(df['timestamp'])
# Keep only 2015 and 2016 records, ordered chronologically
df = df[df['timestamp'].dt.year.isin([2015, 2016])].sort_values('timestamp')

In [None]:
# Number of NaN entries in each column
df.isna().sum()

- No missing values. But there might be missing timestamps.

In [None]:
#Check for missing timestamps
# Full grid between the first and last observation. Despite the name,
# freq='h' makes `all_days` an hourly range, not a daily one.
all_days = pd.date_range(start=df['timestamp'].min(), end=df['timestamp'].max(), freq='h')
# Grid points that have no matching row in df
missing_days = all_days[~all_days.isin(df['timestamp'])]
print('Number of missing timestamps:', len(missing_days))

In [None]:
# Show the first missing timestamp (raises IndexError if nothing is missing)
missing_days[0]

- 130 timestamps are missing. We will impute them using existing values.

In [None]:
#London holidays
# Request every calendar year spanned by the data rather than only the first
# and last one, so no intermediate year is skipped if the window ever widens.
# NOTE(review): this uses the UK-wide calendar; London is in England, so an
# England-specific subdivision may be more accurate — confirm against the
# installed `holidays` version.
data_years = df['timestamp'].dt.year
uk_holidays = holidays.UK(years=range(int(data_years.min()), int(data_years.max()) + 1))
uk_holidays

In [None]:
#Create new dataframe using all days
# (one row per timestamp of the complete hourly grid)
df_full = pd.DataFrame(all_days, columns=['timestamp'])
#Merge with df to get cnt, t1, t2, hum, wind_speed, weather_code, season
# Left join: grid timestamps absent from df get NaN in every merged column.
df_full = df_full.merge(df[['timestamp', 'cnt', 't1', 't2', 'hum', 'wind_speed', 'weather_code', 'season']], on='timestamp', how='left')
#is_holiday column: 1 if holiday, 0 if not
# Recomputed from the holiday calendar; the dataset's own is_holiday column is not merged.
df_full['is_holiday'] = np.where(df_full['timestamp'].dt.date.isin(uk_holidays), 1, 0)
#is_weekend column: 1 when dayofweek is 5 or 6 (Saturday/Sunday), 0 otherwise
df_full['is_weekend'] = np.where(df_full['timestamp'].dt.dayofweek.isin([5, 6]), 1, 0)

#Forward-fill missing values: each gap takes the last value observed before it
df_full = df_full.ffill()
df = df_full.copy()

In [None]:
# Re-count NaN entries per column after the fill
df.isna().sum()

In [None]:
# Re-run the gap check against the rebuilt frame; overwrites the earlier `missing_days`
missing_days = all_days[~all_days.isin(df['timestamp'])]
print('Number of missing timestamps:', len(missing_days))

In [None]:
# Index the frame by timestamp, then express that index as hourly periods
df = df.set_index('timestamp')
df.index = df.index.to_period('h')
df.head(2)

In [None]:
def _most_frequent(values):
    """Return the most common value in ``values`` (first entry of ``value_counts``)."""
    return values.value_counts().index[0]


# Collapse the hourly rows into one row per day:
#   - cnt is summed, continuous weather readings take the daily median
#   - categorical codes take the most frequent value of the day
#   - the 0/1 flags take the daily maximum
df = df.resample('D').agg({
    'cnt': 'sum',
    't1': 'median',
    't2': 'median',
    'hum': 'median',
    'wind_speed': 'median',
    'weather_code': _most_frequent,
    'season': _most_frequent,
    'is_holiday': 'max',
    'is_weekend': 'max',
})
df.head(2)

In [None]:
#Remove duplicates
# De-duplicate on the date index, not on column values. DataFrame.drop_duplicates
# ignores the index, so it would silently delete a legitimate day whose aggregated
# values happen to equal another day's and leave a gap in the daily series.
df = df[~df.index.duplicated(keep='first')]

In [None]:
# Boxplot of all the columns
cols = df.columns
print(cols)

# Size the grid from the number of columns instead of assuming a fixed 3x3
n_grid_cols = 3
n_grid_rows = -(-len(cols) // n_grid_cols)  # ceiling division

plt.figure(figsize=(10, 12))
# Subplot positions are 1-based; enumerate keeps position and column aligned so
# every column, including the last one, gets its own panel.
for position, col in enumerate(cols, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(df[col])
    plt.title(col)
plt.tight_layout()
plt.show()

- There is no abnormal data in the dataset.

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
plt.show()

- Real and feels like temperature are highly correlated. Let's use feels like temperature since it is more likely to impact the decision.

In [None]:
# Remove the real-temperature column; the "feels like" column t2 is kept
df = df.drop(columns=['t1'])

In [None]:
# Pairwise scatter plots between the target and the remaining features
# NOTE(review): hue='cnt' colours by the continuous target, i.e. one hue level
# per distinct count — confirm this is intended.
pairplot_columns = ['cnt', 't2', 'wind_speed', 'is_holiday', 'is_weekend', 'weather_code', 'season']
sns.pairplot(
    df[pairplot_columns],
    hue='cnt',
    palette='coolwarm',
    height=3,
    aspect=1.5,
)
plt.show()

In [None]:
# Replace the numeric category codes with readable labels

# Weather code -> label
weather_desc = {
    1: 'Clear',
    2: 'Scattered_Clouds',
    3: 'Broken_Clouds',
    4: 'Cloudy',
    7: 'Rain',
    10: 'Storm',
    26: 'Snowfall',
    94: 'Freezing_Fog',
}
df['weather_code'] = df['weather_code'].map(weather_desc)

# Season code -> label
seasons = {
    0: 'Spring',
    1: 'Summer',
    2: 'Fall',
    3: 'Winter',
}
df['season'] = df['season'].map(seasons)

df.head(2)

In [None]:
#One hot encoding for categorical variables
# drop_first=True omits the first level of each encoded column, so one category
# per variable acts as the implicit baseline; dtype=int yields 0/1 integers.
df = pd.get_dummies(df, drop_first=True, dtype=int)
df.head(2)

In [None]:
# Encode the month on a 12-step circle as a (sin, cos) pair
month_angle = 2*np.pi*df.index.month/12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

In [None]:
#Training df till June 2016 and testing df from July 2016
# Label-based slices on the daily index; .loc slices include both endpoints,
# so the two sets meet at 2016-06-30 / 2016-07-01 without overlapping.
train_df = df.loc[:'2016-06-30'].copy()
test_df = df.loc['2016-07-01':].copy()

In [None]:
#In case some models need validation set, split train_df to get validation set. Use validation from April 2016 to June 2016. 
# train_train_df holds everything in train_df up to 2016-03-31; val_df holds the rest of train_df.
train_train_df = train_df.loc[:'2016-03-31'].copy()
val_df = train_df.loc['2016-04-01':].copy()

# Diagnosis

### Check for seasonality

In [None]:
#Plot seasonal decomposition
from statsmodels.tsa.seasonal import seasonal_decompose

# Work on a copy with a daily DatetimeIndex instead of the PeriodIndex
fig_df = train_df.copy()
fig_df = fig_df.asfreq('D')
fig_df.index = pd.DatetimeIndex(fig_df.index.to_timestamp())
print(fig_df.index.freq)

# Store the result under its own name so the imported seasonal_decompose
# function is not overwritten by its own return value.
decomposition = seasonal_decompose(fig_df['cnt'], model='additive')

# One panel per component, top to bottom
fig, axes = plt.subplots(4, 1, figsize=(20, 8))
components = [
    ('Observed', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonal', decomposition.seasonal),
    ('Residual', decomposition.resid),
]
for panel, (title, series) in zip(axes, components):
    series.plot(ax=panel, title=title)
plt.tight_layout()
plt.show()

There is clear seasonality in the data. Also, the data does not look stationary.

### Check for stationarity

In [None]:
def check_stationarity(data, alpha=0.05):
    """Run an Augmented Dickey-Fuller test on ``data`` and print the verdict.

    Parameters
    ----------
    data : array-like
        Series to test; passed to ``adfuller`` unchanged.
    alpha : float, default 0.05
        Significance level. A p-value above ``alpha`` is reported as
        "not stationary" (the unit-root null hypothesis is not rejected).

    Returns
    -------
    None
        All results are printed.
    """
    print('Null Hypothesis: Presence of unit root (Data is not stationary)')
    print('Alternate Hypothesis: Absence of unit root (Data is stationary)')
    result = adfuller(data, autolag='AIC')
    print(result)
    # The first two entries of the adfuller result are the statistic and the p-value
    adf_statistic, p_value = result[0], result[1]
    print('ADF Statistic:', adf_statistic)
    print('p-value:', p_value)
    if p_value > alpha:
        print('Data is not stationary')
    else:
        print('Data is stationary')

check_stationarity(train_df['cnt'])

Hence, the models to be used should be able to handle seasonality and non-stationarity. SARIMAX might be a good choice. However, there are multiple seasonalities in the data which might be difficult to capture with SARIMAX. We will use XGBoost, LSTM and Prophet for this task. Let's still try SARIMAX and see how it performs.

# Models

## SARIMAX

- We will use Auto ARIMA to find the best parameters rather than relying only on the ACF and PACF plots. However, the ACF and PACF plots are still useful for defining the search space for Auto ARIMA. Note that Auto ARIMA can run into memory issues with high-order models, so we will limit the search space.

In [None]:
# SARIMAX helpers are imported from the `sarimax` module (presumably project-local — verify).
# NOTE(review): SARIMAX_model presumably fits on train_df and forecasts over test_df — confirm in that module.
from sarimax import SARIMAX_model, SARIMAX_metrics, SARIMAX_plot
sarimax_y_train, sarimax_y_test, sarimax_y_pred = SARIMAX_model(train_df, test_df)

In [None]:
# Compare SARIMAX predictions with the test targets; the export cell reads the
# keys 'RMSE', 'MAE', 'MAPE' and 'R2' from this result.
sarimax_test_metrics = SARIMAX_metrics(sarimax_y_test, sarimax_y_pred)
sarimax_test_metrics

In [None]:
#Plot predictions
# Delegated to SARIMAX_plot, which receives the train targets, test targets and predictions.
SARIMAX_plot(sarimax_y_train, sarimax_y_test, sarimax_y_pred)

## XGBoost

In [None]:
# Helpers come from a module named `XGBoost` (capitalised; presumably a project-local wrapper, not the xgboost package — verify).
# Unlike SARIMAX, this model receives the train/validation split rather than the full train_df.
from XGBoost import xgboost_model, xgboost_metrics, xgboost_plot
xgboost_y_train, xgboost_y_test, xgboost_y_pred = xgboost_model(train_train_df, val_df, test_df)

In [None]:
# Compare XGBoost predictions with the test targets; the export cell reads the
# keys 'RMSE', 'MAE', 'MAPE' and 'R2' from this result.
xgb_test_metrics = xgboost_metrics(xgboost_y_test, xgboost_y_pred)
xgb_test_metrics

In [None]:
#Plot predictions
# Passes the full train_df (training + validation period) together with test_df and the predictions.
xgboost_plot(train_df, test_df, xgboost_y_pred)

## LSTM with encoder and decoder

In [None]:
# Helpers come from the `lstm_encode_decode` module (presumably project-local — verify).
# NOTE(review): lookback=28 / forecast_horizon=7 presumably mean 28 past days in,
# 7 days out on this daily series — confirm in that module.
from lstm_encode_decode import lstm_model, lstm_metrics, lstm_plot
lstm_y_test, lstm_y_pred = lstm_model(train_train_df,
                                        val_df,
                                        test_df,
                                        lookback=28,
                                        forecast_horizon=7,
                                        )

In [None]:
# Compare LSTM predictions with the test targets; the export cell reads the
# keys 'RMSE', 'MAE', 'MAPE' and 'R2' from this result.
lstm_test_metrics = lstm_metrics(lstm_y_test, lstm_y_pred)
lstm_test_metrics

In [None]:
#Plot predictions
# lookback repeats the value passed to lstm_model; presumably the two must stay in sync — verify.
lstm_plot(train_df, test_df, lstm_y_pred, lookback=28)

## Prophet

In [None]:
# Helpers come from a module named `Prophet` (capitalised; presumably a project-local wrapper, not the prophet package — verify).
# This model receives the full train_df and test_df; no validation split is passed.
from Prophet import prophet_model, prophet_metrics, prophet_plot
prophet_test_df, prophet_y_pred = prophet_model(train_df, test_df)

In [None]:
# Compare Prophet predictions with the returned test frame; the export cell reads the
# keys 'RMSE', 'MAE', 'MAPE' and 'R2' from this result.
prophet_test_metrics = prophet_metrics(prophet_test_df, prophet_y_pred)
prophet_test_metrics

In [None]:
#Plot predictions
# Delegated to prophet_plot, which receives train_df, test_df and the predictions.
prophet_plot(train_df, test_df, prophet_y_pred)

# Export Results

In [None]:
# Export Results to csv
models = ['sarimax', 'xgboost', 'lstm', 'prophet']
metrics_dict = [sarimax_test_metrics, xgb_test_metrics, lstm_test_metrics, prophet_test_metrics]

# One row per model, built in a single pass so `results` is only ever a DataFrame
results = pd.DataFrame([
    {'model': model, 'rmse': metrics['RMSE'], 'mae': metrics['MAE'], 'mape': metrics['MAPE'], 'r-squared': metrics['R2']}
    for model, metrics in zip(models, metrics_dict)
])

# to_csv does not create missing directories, so make sure the output folder exists
os.makedirs('Results', exist_ok=True)
results.to_csv('Results/metrics_bad_feature_engineering.csv', index=False)