In [2]:
import configparser
import os
from joblib import dump, load
import json
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels as sm
import numpy as np
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA
import scienceplots
import latex
import random

from sklearn.metrics import accuracy_score
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization

# Set plotting parameters
plt.style.use(['science', 'ieee'])
plt.rcParams['figure.dpi'] = 100

pd.set_option('display.max_rows', 200)

# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')

# Read data
df_og = load(config['PATH']['DATA_DIR'] + '/full_df_daily_cap95.joblib')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Construct target as bad (0-6), okay (6-8), good (8-10)
df = df_og.copy()
# df = df.drop(columns = ['time'])
df['target'] = df['target'].apply(lambda x: 0 if x < 7 else 1)

# Impute outliers (larger than 1.5 IQR) with mean of column in all _sum columns
# for col in df.columns:
#     if col.endswith('_sum'):
#         df[col] = df[col].mask(df[col] > df[col].quantile(0.75) + 1.5 * (df[col].quantile(0.75) - df[col].quantile(0.25)), df[col].mean())

In [4]:
X, y = df.drop(columns=['target'], axis = 1), df['target']

X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_ts(X, y, split_ratios=[0.5, 0.2])

# Full train set for final model
X_train_full = pd.concat([X_train, X_val])
y_train_full = pd.concat([y_train, y_val])

# Get non-dummy columns
if len([var for var in df.columns if var.startswith('day')]) == 7:
    df = df.drop(columns=['day_of_week_0'])
dummy_cols = [var for var in df.columns if var.startswith('day')] + ['target', 'activity_idc', 'call_idc', 'sms_idc', 'id']
cols_no_dummy = [var for var in X_train.columns if var not in dummy_cols]

# Scale data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

ct = ColumnTransformer([('scaler', StandardScaler(), cols_no_dummy)], remainder='passthrough')
ct.set_output(transform='pandas')
X_train_scaled = ct.fit_transform(X_train)
X_val_scaled = ct.transform(X_val)


In [5]:
def shape_lstm(X, y, seq_length):
    import numpy as np

    X_lstm = []
    y_lstm = []
    
    # Iterate over persons:
    for person in X['remainder__id'].unique():
        # Get all days for this person
        X_person = X[X['remainder__id'] == person].drop(columns = ['remainder__id']).values
        y_person = y[X['remainder__id'] == person].drop(columns = ['remainder__id']).values

        i = 0
        while i < len(X_person):
            # If days are less than seq_length, pad with zeros at the beginning of X
            if len(X_person) < seq_length:
                n_missing = seq_length - len(X_person)
                X_padded = np.pad(X_person, ((n_missing, 0), (0, 0)), mode='constant', constant_values=0)
                X_lstm.append(X_padded)
                y_lstm.append(y_person[-1])
                break

            # If days are more than seq_length, create sequences of seq_length days
            elif i + seq_length <= len(X_person):
                X_lstm.append(X_person[i:i+seq_length])
                y_lstm.append(y_person[i+seq_length-1])
                i += 1

            # If the remaining days are not enough to form a full sequence, stop iterating
            else:
                break
                
        # Add sequenced data for this person to the list of all sequences

    # Convert lists to numpy arrays and reshape y_lstm
    X_lstm = np.array(X_lstm).astype('float32'	)
    y_lstm = np.array(y_lstm).reshape(-1, 1).astype('float32')

    # Return full sequenced dataframe
    return X_lstm, y_lstm


# Define the LSTM model
def build_lstm_model(input_shape, hunits = 128):
    model = Sequential()
    model.add(LSTM(hunits, input_shape=input_shape, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    return model



In [6]:
seq_length = 7
X_lstm_train, y_lstm_train = shape_lstm(X_train_scaled, y_train, seq_length=seq_length)
X_lstm_val, y_lstm_val = shape_lstm(X_val_scaled, y_val, seq_length=seq_length)


In [7]:
# Define the objective function for the optimization
def objective(trial: Trial):
    hidden_units = trial.suggest_int('hunits', 16, 256, step=16)
    epochs = trial.suggest_int('epochs', 5, 40, step=5)


    input_shape = (seq_length, X_lstm_train.shape[2])
    model = build_lstm_model(input_shape, hidden_units)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(X_lstm_train, y_lstm_train, epochs=epochs, verbose=0)
    score = model.evaluate(X_lstm_val, y_lstm_val, verbose=0)

    return score[1]

# Optimize the pipeline using Optuna
manual_seed = 42
np.random.seed(manual_seed)
random.seed(manual_seed)
tf.random.set_seed(manual_seed)
sampler = TPESampler(seed=manual_seed)

study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=50)

# Print the best trial
best_trial = study.best_trial
print(f"Best trial: {best_trial.number}, Accuracy: {best_trial.value}")
print(f"Best params: {best_trial.params}")


[32m[I 2023-04-20 16:59:16,443][0m A new study created in memory with name: no-name-d85f60d7-fc45-40d6-9394-a296eb81202d[0m
[32m[I 2023-04-20 16:59:27,048][0m Trial 0 finished with value: 0.698113203048706 and parameters: {'hunits': 96, 'epochs': 40}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 16:59:34,175][0m Trial 1 finished with value: 0.5849056839942932 and parameters: {'hunits': 192, 'epochs': 25}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 16:59:38,896][0m Trial 2 finished with value: 0.6415094137191772 and parameters: {'hunits': 48, 'epochs': 10}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 16:59:46,408][0m Trial 3 finished with value: 0.6603773832321167 and parameters: {'hunits': 16, 'epochs': 35}. Best is trial 0 with value: 0.698113203048706.[0m




[32m[I 2023-04-20 16:59:55,718][0m Trial 4 finished with value: 0.6226415038108826 and parameters: {'hunits': 160, 'epochs': 30}. Best is trial 0 with value: 0.698113203048706.[0m




[32m[I 2023-04-20 17:00:03,735][0m Trial 5 finished with value: 0.6037735939025879 and parameters: {'hunits': 16, 'epochs': 40}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 17:00:10,661][0m Trial 6 finished with value: 0.6603773832321167 and parameters: {'hunits': 224, 'epochs': 10}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 17:00:15,348][0m Trial 7 finished with value: 0.6415094137191772 and parameters: {'hunits': 48, 'epochs': 10}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 17:00:22,747][0m Trial 8 finished with value: 0.6415094137191772 and parameters: {'hunits': 80, 'epochs': 25}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 17:00:29,269][0m Trial 9 finished with value: 0.6415094137191772 and parameters: {'hunits': 112, 'epochs': 15}. Best is trial 0 with value: 0.698113203048706.[0m
[32m[I 2023-04-20 17:00:50,128][0m Trial 10 finished with value: 0.6226415038108826 an

Best trial: 18, Accuracy: 0.7358490824699402
Best params: {'hunits': 48, 'epochs': 35}


In [8]:
# Train the model on the full training set
ct = ColumnTransformer([('scaler', StandardScaler(), cols_no_dummy)], remainder='passthrough')
ct.set_output(transform='pandas')

X_train_full_scaled = ct.fit_transform(X_train_full)
X_test_scaled = ct.transform(X_test)

X_train_full_lstm, y_train_full_lstm = shape_lstm(X_train_full_scaled, y_train_full, seq_length=7)
X_test_lstm, y_test_lstm = shape_lstm(X_test_scaled, y_test, seq_length=7)

# Set the input_shape for your model based on your input data
input_shape = (seq_length, X_train_full_lstm.shape[2])
model2 = build_lstm_model(input_shape, hunits=best_trial.params['hunits'])
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.summary()
hist_full = model2.fit(X_train_full_lstm, y_train_full_lstm, epochs=best_trial.params['epochs'], batch_size=32)


# Predict on the test set
y_pred = model2.predict(X_test_lstm)

# Evaluate the model
score = model2.evaluate(X_test_lstm, y_test_lstm, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Print the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

y_pred = np.round(y_pred)
cm = confusion_matrix(y_test_lstm, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
sns.heatmap(cm, annot=True, cmap='Blues', fmt='.2f')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.savefig('figures/lstm_confusion_matrix.pdf')
plt.show()


Model: "sequential_50"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_50 (LSTM)              (None, 48)                18432     
                                                                 
 dense_50 (Dense)            (None, 1)                 49        
                                                                 
Total params: 18,481
Trainable params: 18,481
Non-trainable params: 0
_________________________________________________________________
Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35
Test loss: 1.028864860534668
Test 

ModuleNotFoundError: No module named 'seaborn'

In [186]:
# dataframe with x = 1,2,3,4
df = pd.DataFrame({'x': [1,2,3,4]})
df.shift(-1)

Unnamed: 0,x
0,2.0
1,3.0
2,4.0
3,


In [31]:
# classification report
from sklearn.metrics import classification_report
print(classification_report(y_test_lstm, y_pred.flatten()))


              precision    recall  f1-score   support

         0.0       0.68      0.56      0.62       108
         1.0       0.68      0.77      0.72       127

    accuracy                           0.68       235
   macro avg       0.68      0.67      0.67       235
weighted avg       0.68      0.68      0.67       235



In [29]:
y_pred.flatten()

array([0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 1., 1.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0.