## Data Analysis

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
from data import get_data, extract_date_data
from model import load_model_data
from utils import get_path
# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

In [None]:
# Fetch the features (with the 'date' column included) and the target.
X, y = get_data(include_date=True)
# df is a copy of the features, date included, with the target attached as 'total'.
df = X.assign(total=y.copy())
# The feature matrix itself goes on without the 'date' column.
X.drop(columns=['date'], inplace=True)
df.head()

In [None]:
# Correlation heat-map of the numeric (and boolean) columns of df.
# df still carries the 'date' column, whose values are parsed with strptime
# further down, i.e. they are strings. Recent pandas raises when corr() meets
# non-numeric data, and older versions silently drop such columns, which left
# the tick labels (built from every df column) out of step with the matrix.
# Computing the matrix once and labelling the axes from it keeps them aligned.
corr = df.select_dtypes(include=['number', 'bool']).corr()
f = plt.figure(figsize=(14, 11))
plt.matshow(corr, fignum=f.number, cmap='coolwarm', vmin=-1, vmax=1)
plt.xticks(range(len(corr.columns)), corr.columns, fontsize=14, rotation=65)
plt.yticks(range(len(corr.columns)), corr.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)

In [None]:
# Line plot of df['total'] against the row index; every 5th row (starting at
# row 1) gets a tick that is labelled with the matching entry of df['date'].
xticks = [*range(1, len(df), 5)]
ax = df['total'].plot(
    figsize=(15, 7),
    title='Total BSRT demand',
    rot=45,
    xticks=xticks,
    color='green',
)
ax.set_xticklabels(df['date'][xticks])
plt.show()

## Using the model to impute data

In [None]:
# Read the tab-separated weather table stored under the project's 'data' path.
weather_path = get_path('data') / 'weather_data.tsv'
weather_df = pd.read_csv(weather_path, sep='\t')
weather_df.head()

In [None]:
from datetime import datetime

# Parse each 'YYYY-MM-DD' string of the weather table into a datetime, then
# hand the parsed dates to extract_date_data to build the date features.
dates = weather_df['date'].apply(lambda day: datetime.strptime(day, '%Y-%m-%d'))
date_df = extract_date_data(dates)
date_df.head()

In [None]:
# Place the date features and the weather columns side by side (column-wise concat).
pred_df = pd.concat((date_df, weather_df), axis='columns')
pred_df.head()

In [None]:
from utils import month, day_of_week, precip_type

# Run each categorical column through its mapping imported from utils.
for column, mapping in (
    ('month', month),
    ('day_of_week', day_of_week),
    ('precip_type', precip_type),
):
    pred_df[column] = pred_df[column].map(mapping)
pred_df.head()

### Imputing with Gaussian process

In [None]:
# Load the stored 'gaussian_process' bundle and keep its 'model' entry.
gaussian_bundle = load_model_data('gaussian_process')
gaussian_process = gaussian_bundle['model']
gaussian_process

In [None]:
# Predict a mean and a standard deviation for every row of pred_df (all
# columns except 'date') and store the mean +/- 1.96 * std band.
gaussian_df = pred_df.copy()
gaussian_features = gaussian_df.drop(columns=['date'])
means, stds = gaussian_process.predict(gaussian_features, return_std=True)
half_width = 1.96 * stds
gaussian_df['lower_bound'] = means - half_width
gaussian_df['upper_bound'] = means + half_width
gaussian_df.head()

In [None]:
# Shaded band between the Gaussian-process lower and upper bounds per date,
# with the df.total values drawn on top as green dots.
fig, ax = plt.subplots(figsize = (40, 13))
plt.fill_between(gaussian_df.date, gaussian_df.lower_bound, 
                 gaussian_df.upper_bound, figure = fig, color = 'purple')
plt.scatter(df.date, df.total, color = 'green')
# Label every 25th row, starting at row 1, with its date.
plt.xticks(range(1, len(gaussian_df.index), 25), 
           gaussian_df.date[list(range(1, len(gaussian_df.index), 25))], 
           rotation = 45)
# Title added so this figure matches the random-forest imputation plot.
plt.title(f'Imputing total demand with {type(gaussian_process).__name__}', fontsize = 30)
plt.show()

### Imputing with random forest

In [None]:
# Load the stored 'random_forest' bundle and keep its 'model' entry.
forest_bundle = load_model_data('random_forest')
random_forest = forest_bundle['model']
random_forest

In [None]:
# Predict for every row of pred_df (all columns except 'date') and keep the
# result next to the inputs in the 'predictions' column.
random_df = pred_df.copy()
forest_features = random_df.drop(columns=['date'])
random_df['predictions'] = random_forest.predict(forest_features)
random_df.head()

In [None]:
# Purple line: random-forest predictions per date; green dots: df.total values.
fig, ax = plt.subplots(figsize=(40, 13))
plt.plot(random_df['date'], random_df['predictions'], figure=fig, color='purple')
plt.scatter(df['date'], df['total'], color='green')
# Label every 25th row, starting at row 1, with its date.
plt.xticks(
    range(1, len(random_df), 25),
    random_df['date'][list(range(1, len(random_df), 25))],
    rotation=45,
)
plt.title(f'Imputing total demand with {type(random_forest).__name__}', fontsize = 30)
plt.show()

## Visualising model performance

In [None]:
def earlier_than(date1: str, date2: str):
    """Return True when ``date1`` is strictly before ``date2``.

    Both arguments are parsed with ``datetime.strptime`` using the
    ``'%Y-%m-%d'`` format, so a string that does not match that format
    raises ``ValueError``.
    """
    from datetime import datetime

    first, second = (datetime.strptime(text, '%Y-%m-%d') for text in (date1, date2))
    return first < second

In [None]:
def viz_model(model_fname: str = 'random_forest', cutoff_date = '2019-09-01'):
    """Refit a stored model on early data and plot its predictions on the rest.

    Rows of the notebook-level ``df`` / ``X`` / ``y`` whose date is strictly
    before ``cutoff_date`` form the training set; all remaining rows form the
    validation set. The model is refitted on the training rows, then the
    ``df.total`` values and the model's predictions for the validation rows
    are drawn against their dates.

    Args:
        model_fname: Name handed to ``load_model_data``; the 'model' entry of
            the loaded object is the estimator that gets refitted.
        cutoff_date: Split date as a ``'%Y-%m-%d'`` string.
    """
    # A single boolean mask in row order drives every selection below. The
    # previous version fed index labels to positional ``.iloc`` and took the
    # validation rows in set-iteration order while sorting the x-axis
    # separately, so predictions could be drawn against the wrong dates.
    is_train = df.date.map(lambda date: earlier_than(date, cutoff_date)).to_numpy(dtype=bool)
    is_val = ~is_train

    # Boolean-array selection is positional for both pandas objects and
    # numpy arrays, so it does not depend on how X or y are indexed.
    X_train, X_val = X.loc[is_train], X.loc[is_val]
    y_train = y[is_train]

    model = load_model_data(model_fname)['model'].fit(X_train, y_train)

    val_dates = df.date[is_val]
    fig, ax = plt.subplots(figsize = (15, 7))
    plt.plot(val_dates, df.total[is_val], label = 'true', color = 'grey')
    plt.plot(val_dates, model.predict(X_val), label = 'prediction', color = 'blue')
    plt.legend(fontsize = 17)
    plt.xticks(rotation = 60)
    plt.title(f'Predictions - {type(model).__name__}', fontsize = 18)

    plt.show()

In [None]:
# Validation plot for the stored random forest, split at 2019-09-01.
viz_model(model_fname='random_forest', cutoff_date='2019-09-01')

In [None]:
# Validation plot for the stored Gaussian process, split at 2019-09-01.
viz_model(model_fname='gaussian_process', cutoff_date='2019-09-01')