**TASK 2**

In [23]:
import numpy as np
import pandas as pd

**TASK 2A**
Random Forest and LSTM

In [24]:
#shift mood by 1 (since we're predicting for the day after)
data = pd.read_csv('ARIMA_imputed_data.csv')
data['mood'] = data['mood'].shift(1)


In [25]:
data['time'] = pd.to_datetime(data['time'])

In [26]:
print("Number of unique IDs:", data['id'].nunique())

Number of unique IDs: 27


In [10]:
#Split train/test - with logical temporal order and perform Random Forest classifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report

#FOR RF, CHANGE DATETIME TO DAY OF THE WEEK (RF NOT HANDLING DT OBJECT)
data['day_of_week'] = data['time'].dt.dayofweek
#drop time column
data.drop('time', axis=1, inplace=True)

#drop rows where mood == NaN (can't train on missing values)
data = data.dropna(subset=['mood'])

#1-hot encode ID
encoder = OneHotEncoder()
encoded_ids = encoder.fit_transform(data[['id']]).toarray()
encoded_id_df = pd.DataFrame(encoded_ids, 
                             columns=["id_" + str(i) for i in range(encoded_ids.shape[1])])
data.reset_index(drop=True, inplace=True)
encoded_id_df.reset_index(drop=True, inplace=True)
data = pd.concat([data, encoded_id_df], axis=1)
data.drop('id', axis=1, inplace=True)

#Create bins for the mood (classification)
bins = [0,4,7,10]
labels = ["sad", "neutral", "happy"]
data['mood_category'] = pd.cut(data['mood'], bins=bins, labels=labels, include_lowest=True)
data.drop('mood', axis=1, inplace=True)

#TIME SERIE SPLIT - MAYYBE OTHER TRAIN TEST SPLIT WOULD GIVE BETTER RESULTS
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(data):
    
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]
    
    X_train = train_data.drop('mood_category', axis='columns')
    y_train = train_data['mood_category']


    X_test = test_data.drop('mood_category', axis='columns')
    y_test = test_data['mood_category']
    
    # Train 
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_train, y_train)

    #test
    y_pred = rf.predict(X_test)

    #score
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(classification_report(y_test, y_pred))



Accuracy: 38.23%
              precision    recall  f1-score   support

       happy       0.18      0.56      0.28        68
     neutral       0.73      0.34      0.47       255
         sad       0.00      0.00      0.00         4

    accuracy                           0.38       327
   macro avg       0.30      0.30      0.25       327
weighted avg       0.61      0.38      0.42       327



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 48.32%
              precision    recall  f1-score   support

       happy       0.46      0.24      0.31       161
     neutral       0.49      0.73      0.59       165
         sad       0.00      0.00      0.00         1

    accuracy                           0.48       327
   macro avg       0.32      0.32      0.30       327
weighted avg       0.47      0.48      0.45       327



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 51.68%
              precision    recall  f1-score   support

       happy       0.34      0.35      0.35       119
     neutral       0.62      0.61      0.62       208

    accuracy                           0.52       327
   macro avg       0.48      0.48      0.48       327
weighted avg       0.52      0.52      0.52       327

Accuracy: 49.85%
              precision    recall  f1-score   support

       happy       0.59      0.27      0.37       179
     neutral       0.47      0.77      0.58       148

    accuracy                           0.50       327
   macro avg       0.53      0.52      0.48       327
weighted avg       0.53      0.50      0.47       327

Accuracy: 43.12%
              precision    recall  f1-score   support

       happy       0.71      0.26      0.38       220
     neutral       0.34      0.78      0.47       107

    accuracy                           0.43       327
   macro avg       0.52      0.52      0.43       327
weighted avg       0.59

In [27]:
#Perform RNN LSTM
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import classification_report, accuracy_score

# Data preprocessing
data['day_of_week'] = data['time'].dt.dayofweek
data.drop('time', axis=1, inplace=True)
data = data.dropna(subset=['mood'])  # Ensure no NaNs in mood

# Encode categorical data
encoder = OneHotEncoder()
encoded_ids = encoder.fit_transform(data[['id']]).toarray()
encoded_id_df = pd.DataFrame(encoded_ids, columns=["id_" + str(i) for i in range(encoded_ids.shape[1])])
data = pd.concat([data.reset_index(drop=True), encoded_id_df.reset_index(drop=True)], axis=1)
data.drop('id', axis=1, inplace=True)

# Binning mood for classification
bins = [1, 4, 7, 10]
labels = [0, 1, 2]  # Use numeric labels for to_categorical
data['mood_category'] = pd.cut(data['mood'], bins=bins, labels=labels, include_lowest=True)
data.drop('mood', axis=1, inplace=True)

# Normalize features
scaler = MinMaxScaler()
data[data.columns.difference(['mood_category'])] = scaler.fit_transform(data[data.columns.difference(['mood_category'])])

# Prepare data for LSTM
tscv = TimeSeriesSplit(n_splits=5)
for train_index, test_index in tscv.split(data):
    train_data, test_data = data.iloc[train_index], data.iloc[test_index]
    
    # Assuming here we're just using 'n' days of data directly as features, without additional sequencing.
    X_train = train_data.drop('mood_category', axis=1).values
    y_train = to_categorical(train_data['mood_category'].values)
    X_test = test_data.drop('mood_category', axis=1).values
    y_test = to_categorical(test_data['mood_category'].values)
    
    # Reshape for LSTM [samples, time steps, features]
    X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
    X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    
    # Build LSTM model
    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))  # Output layer: 3 units for 3 categories
    
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    # Fit model
    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test), verbose=2)
    
    # Predict
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Evaluate the model
    #print("Accuracy:", accuracy_score(y_true_classes, y_pred_classes))
    accuracy = accuracy_score(y_true_classes, y_pred_classes)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    print(classification_report(y_true_classes, y_pred_classes))


Epoch 1/20
11/11 - 2s - loss: 1.0451 - accuracy: 0.4848 - val_loss: 1.0436 - val_accuracy: 0.2080 - 2s/epoch - 187ms/step
Epoch 2/20
11/11 - 0s - loss: 0.9898 - accuracy: 0.5394 - val_loss: 1.0099 - val_accuracy: 0.2080 - 64ms/epoch - 6ms/step
Epoch 3/20
11/11 - 0s - loss: 0.9399 - accuracy: 0.5515 - val_loss: 0.9774 - val_accuracy: 0.2080 - 69ms/epoch - 6ms/step
Epoch 4/20
11/11 - 0s - loss: 0.8861 - accuracy: 0.5788 - val_loss: 0.9396 - val_accuracy: 0.2080 - 60ms/epoch - 5ms/step
Epoch 5/20
11/11 - 0s - loss: 0.8501 - accuracy: 0.5636 - val_loss: 0.9192 - val_accuracy: 0.2080 - 59ms/epoch - 5ms/step
Epoch 6/20
11/11 - 0s - loss: 0.8123 - accuracy: 0.5394 - val_loss: 0.8928 - val_accuracy: 0.2080 - 55ms/epoch - 5ms/step
Epoch 7/20
11/11 - 0s - loss: 0.7820 - accuracy: 0.6152 - val_loss: 0.8770 - val_accuracy: 0.2080 - 52ms/epoch - 5ms/step
Epoch 8/20
11/11 - 0s - loss: 0.7673 - accuracy: 0.6061 - val_loss: 0.8461 - val_accuracy: 0.2232 - 56ms/epoch - 5ms/step
Epoch 9/20
11/11 - 0s - 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
21/21 - 2s - loss: 1.0286 - accuracy: 0.5967 - val_loss: 0.9984 - val_accuracy: 0.5046 - 2s/epoch - 113ms/step
Epoch 2/20
21/21 - 0s - loss: 0.9244 - accuracy: 0.6180 - val_loss: 0.9270 - val_accuracy: 0.5046 - 70ms/epoch - 3ms/step
Epoch 3/20
21/21 - 0s - loss: 0.8418 - accuracy: 0.6195 - val_loss: 0.8698 - val_accuracy: 0.5046 - 83ms/epoch - 4ms/step
Epoch 4/20
21/21 - 0s - loss: 0.7848 - accuracy: 0.6195 - val_loss: 0.8285 - val_accuracy: 0.5046 - 80ms/epoch - 4ms/step
Epoch 5/20
21/21 - 0s - loss: 0.7405 - accuracy: 0.6256 - val_loss: 0.7985 - val_accuracy: 0.5046 - 78ms/epoch - 4ms/step
Epoch 6/20
21/21 - 0s - loss: 0.7113 - accuracy: 0.6484 - val_loss: 0.7853 - val_accuracy: 0.5046 - 67ms/epoch - 3ms/step
Epoch 7/20
21/21 - 0s - loss: 0.6883 - accuracy: 0.6636 - val_loss: 0.7754 - val_accuracy: 0.5046 - 79ms/epoch - 4ms/step
Epoch 8/20
21/21 - 0s - loss: 0.6599 - accuracy: 0.6834 - val_loss: 0.7708 - val_accuracy: 0.5046 - 66ms/epoch - 3ms/step
Epoch 9/20
21/21 - 0s - 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1/20
31/31 - 2s - loss: 1.0166 - accuracy: 0.5081 - val_loss: 0.9239 - val_accuracy: 0.6361 - 2s/epoch - 70ms/step
Epoch 2/20
31/31 - 0s - loss: 0.8552 - accuracy: 0.5925 - val_loss: 0.8053 - val_accuracy: 0.6361 - 84ms/epoch - 3ms/step
Epoch 3/20
31/31 - 0s - loss: 0.7724 - accuracy: 0.5935 - val_loss: 0.7513 - val_accuracy: 0.6361 - 85ms/epoch - 3ms/step
Epoch 4/20
31/31 - 0s - loss: 0.7284 - accuracy: 0.6220 - val_loss: 0.7305 - val_accuracy: 0.6361 - 81ms/epoch - 3ms/step
Epoch 5/20
31/31 - 0s - loss: 0.7006 - accuracy: 0.6484 - val_loss: 0.7209 - val_accuracy: 0.5933 - 83ms/epoch - 3ms/step
Epoch 6/20
31/31 - 0s - loss: 0.6946 - accuracy: 0.6596 - val_loss: 0.7172 - val_accuracy: 0.5780 - 83ms/epoch - 3ms/step
Epoch 7/20
31/31 - 0s - loss: 0.6730 - accuracy: 0.6860 - val_loss: 0.7147 - val_accuracy: 0.5749 - 79ms/epoch - 3ms/step
Epoch 8/20
31/31 - 0s - loss: 0.6488 - accuracy: 0.7033 - val_loss: 0.7182 - val_accuracy: 0.5566 - 75ms/epoch - 2ms/step
Epoch 9/20
31/31 - 0s - l