In [122]:
import configparser
import os
from joblib import dump, load
import json
from tqdm import tqdm
from helpers.helper_functions import *
from helpers.helper_classes import *
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels as sm
import numpy as np
import pmdarima as pm
from statsmodels.tsa.arima.model import ARIMA

pd.set_option('display.max_rows', 500)

# Read config.ini file
config = configparser.ConfigParser()
config.read('src/config.ini')
# os.chdir(config['PATH']['ROOT_DIR'])

# # Load data
df = pd.read_csv(config['PATH']['DATA_DIR'] + '/dataset_mood_smartphone.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

# time to datetime
df['time'] = pd.to_datetime(df['time'])


In [123]:
# TODO: 
# DONE: Forward fill valence and arousal
# DONE: Remove appCat.builtin negative values
# DONE: Remove appCat outliers
# Impute valence, arousal and mood
# Impute long term missing values with mean instead of ffill (for valence and arousal)
# Aggregate to daily mood
# Decide on aggregation method for each variable

In [124]:
# # Forward fill valence and arousal
# # Iterate over people
# for person in tqdm(df['id'].unique()):
#     # Forward fill valence and arousal
#     idx_arousal = np.logical_and(df['id'] == person, df['variable'] == 'circumplex.arousal')
#     df.loc[idx_arousal] = df.loc[idx_arousal].fillna(method='ffill')
df

Unnamed: 0,id,time,variable,value
0,AS14.01,2014-02-26 13:00:00.000,mood,6.000
1,AS14.01,2014-02-26 15:00:00.000,mood,6.000
2,AS14.01,2014-02-26 18:00:00.000,mood,6.000
3,AS14.01,2014-02-26 21:00:00.000,mood,7.000
4,AS14.01,2014-02-27 09:00:00.000,mood,6.000
...,...,...,...,...
376907,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
376908,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
376909,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
376910,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033


## Extreme values appCat variables
There are many outliers in the appCat variables, this is not ideal for numerical stability. We will one hot encode these outliers per variable and remove the outlier from the original observation

In [125]:
#  We will one hot encode these outliers per variable and remove the outlier from the original observation
#  Moreover we will remove all negative values

all_vars = df['variable'].unique()
appVars = [var for var in all_vars if 'appCat' in var]
appVars

for var in appVars:
    df_var_cur = df[df['variable'] == var]
    # Iterate over observations

    # Get 95th percentile
    perc98 = np.percentile(df_var_cur['value'], 98)

    # Get all idx where value is smaller than 0
    idx = df_var_cur[df_var_cur['value'] < 0].index
    df.drop(idx, axis=0, inplace=True)

    # Get all idx where value is larger than 95th percentile
    idx_98 = df_var_cur[df_var_cur['value'] > perc98].index

    # Change variable name to var_outlier
    df.loc[idx_98, 'variable'] = var + '_outlier'
    df.loc[idx_98, 'value'] = 1    


In [126]:
mood_vars = ['mood', 'circumplex.valence', 'circumplex.arousal']	

idx_mood = np.logical_and(df['id'] == "AS14.28" , df['variable'] == 'mood')

df_mood = df[idx_mood].copy()['value']
df_mood
print(df_mood)

# Iterate over people
for person in tqdm(df['id'].unique()[:3]):
    # Dataframe for mood of current person
    idx_mood = np.logical_and(df['id'] == person , df['variable'] == 'mood')
    df_mood = df[idx_mood].copy()['value']
    model = pm.auto_arima(df_mood, suppress_warnings=True, seasonal=False, stepwise=True, d = 0, stationary = True)

    # # Extract the best (p, d, q) orders
    p, d, q = model.order
    # print(f"Person: {person}")
    # print(f"Estimated ARIMA orders: p={p}, d={d}, q={q}")

    # Fit the ARIMA model using the estimated orders
    arima_model = ARIMA(df_mood, order=(p, d, q))
    arima_model_fit = arima_model.fit()

    print("ARIMA model summary:")
    print(arima_model_fit.summary())

4543     7.0
4544     6.0
4545     8.0
4546     8.0
4547     7.0
4548     7.0
4549     6.0
4550     5.0
4551     9.0
4552     6.0
4553     8.0
4554     8.0
4555    10.0
4556     9.0
4557     6.0
4558     7.0
4559     3.0
4560     6.0
4561     6.0
4562     8.0
4563     7.0
4564     7.0
4565     8.0
4566     8.0
4567     8.0
4568     7.0
4569     6.0
4570     8.0
4571     7.0
4572     8.0
4573     9.0
4574     8.0
4575     7.0
4576     8.0
4577     8.0
4578     8.0
4579     7.0
4580     7.0
4581     8.0
4582     9.0
4583     7.0
4584     7.0
4585     6.0
4586     6.0
4587     9.0
4588     7.0
4589     6.0
4590     7.0
4591     7.0
4592     9.0
4593     8.0
4594     7.0
4595     9.0
4596     8.0
4597     8.0
4598     7.0
4599     7.0
4600     7.0
4601     7.0
4602     7.0
4603     8.0
4604     5.0
4605     7.0
4606     7.0
4607     8.0
4608     7.0
4609     6.0
4610     7.0
4611     5.0
4612     8.0
4613     8.0
4614     8.0
4615     7.0
4616     6.0
4617     4.0
4618     6.0
4619     8.0

 33%|███▎      | 1/3 [00:02<00:04,  2.14s/it]

ARIMA model summary:
                               SARIMAX Results                                
Dep. Variable:                  value   No. Observations:                  222
Model:                 ARIMA(1, 0, 1)   Log Likelihood                -247.092
Date:                Wed, 12 Apr 2023   AIC                            502.184
Time:                        19:10:32   BIC                            515.795
Sample:                             0   HQIC                           507.679
                                - 222                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.0702      0.257     27.515      0.000       6.567       7.574
ar.L1          0.9551      0.034     28.229      0.000       0.889       1.021
ma.L1         -0.7744      0.05

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
 67%|██████▋   | 2/3 [00:04<00:02,  2.08s/it]

ARIMA model summary:
                               SARIMAX Results                                
Dep. Variable:                  value   No. Observations:                  159
Model:                 ARIMA(0, 0, 3)   Log Likelihood                -253.972
Date:                Wed, 12 Apr 2023   AIC                            517.945
Time:                        19:10:34   BIC                            533.290
Sample:                             0   HQIC                           524.176
                                - 159                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          6.7786      0.152     44.588      0.000       6.481       7.077
ma.L1          0.1069      0.081      1.312      0.189      -0.053       0.267
ma.L2          0.2584      0.08

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
100%|██████████| 3/3 [00:07<00:00,  2.44s/it]

ARIMA model summary:
                               SARIMAX Results                                
Dep. Variable:                  value   No. Observations:                  221
Model:                 ARIMA(1, 0, 1)   Log Likelihood                -223.883
Date:                Wed, 12 Apr 2023   AIC                            455.766
Time:                        19:10:37   BIC                            469.358
Sample:                             0   HQIC                           461.254
                                - 221                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.6277      0.156     48.956      0.000       7.322       7.933
ar.L1          0.9784      0.024     40.110      0.000       0.931       1.026
ma.L1         -0.9163      0.04




In [49]:
# Get all variables
variables = df['variable'].unique()

# Iterate over variables
for variable in variables:
    # Get the data for this variable
    df_variable = df[df['variable'] == variable]

    # Get 95th percentile
    percentile_95 = np.percentile(df_variable['value'], 95)

    # Describe value column for variable
    print(f"Variable {variable}:")
    print(df_variable['value'].describe())

Variable mood:
count    5641.000000
mean        6.992555
std         1.032769
min         1.000000
25%         7.000000
50%         7.000000
75%         8.000000
max        10.000000
Name: value, dtype: float64
Variable circumplex.arousal:
count    5597.000000
mean       -0.098624
std         1.051868
min        -2.000000
25%        -1.000000
50%         0.000000
75%         1.000000
max         2.000000
Name: value, dtype: float64
Variable circumplex.valence:
count    5487.000000
mean        0.687808
std         0.671298
min        -2.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: value, dtype: float64
Variable activity:
count    22965.000000
mean         0.115958
std          0.186946
min          0.000000
25%          0.000000
50%          0.021739
75%          0.158333
max          1.000000
Name: value, dtype: float64
Variable screen:
count    96578.000000
mean        75.335206
std        253.822497
min          0.035000
25%        

In [92]:
df.append(df_variable)
# print(df.loc[idx_arousal])

AttributeError: 'DataFrame' object has no attribute 'append'

In [120]:
idx_mood = np.logical_and(df['id'] == "AS14.30" , df['variable'] == 'mood')
df_mood = df[idx_mood].copy()
df_mood

# Time sections

# Get first date
first_date = df_mood['time'].min().date()

# Last date
last_date = df_mood['time'].max().date()

# Iterate over dates by day
for date in pd.date_range(start=first_date, end=last_date, freq='D'):
    # Get all rows for this date
    idx_date = df_mood['time'].dt.date == date.date()
    df_date = df_mood[idx_date].copy()

    if len(df_date) == 5:
        print(date, len(df_date), df_date)
        continue

    # check for observation between 9.00 and 12.00
    hour_sets = [[9,12], [12,15], [15,18], [18,21], [21,24]]

    for hour_set in hour_sets:
        idx_cur = np.logical_and(df_date['time'].dt.hour >= hour_set[0], df_date['time'].dt.hour < hour_set[1])
        if len(df_date[idx_cur]) == 0:
            print(f"Missing observation for {date} between {hour_set[0]} and {hour_set[1]}")
            # Create new row
            new_row = df_date.iloc[0].copy()
            new_row['time'] = date.replace(hour=hour_set[0])
            new_row['value'] = np.nan

            df = pd.concat([df, pd.DataFrame(new_row).T], ignore_index=True)
            df_date = pd.concat([df_date, pd.DataFrame(new_row).T], ignore_index=True)



2014-03-20 00:00:00 5              id                time variable value
4899    AS14.30 2014-03-20 15:00:00     mood   8.0
4900    AS14.30 2014-03-20 18:00:00     mood   7.0
4901    AS14.30 2014-03-20 21:00:00     mood   6.0
376912  AS14.30 2014-03-20 09:00:00     mood   NaN
376913  AS14.30 2014-03-20 12:00:00     mood   NaN
2014-03-21 00:00:00 5            id                time variable value
4902  AS14.30 2014-03-21 09:00:00     mood   7.0
4903  AS14.30 2014-03-21 12:00:00     mood   8.0
4904  AS14.30 2014-03-21 15:00:00     mood   7.0
4905  AS14.30 2014-03-21 18:00:00     mood   9.0
4906  AS14.30 2014-03-21 21:00:00     mood   8.0
2014-03-22 00:00:00 5            id                time variable value
4907  AS14.30 2014-03-22 09:00:00     mood   8.0
4908  AS14.30 2014-03-22 12:00:00     mood   8.0
4909  AS14.30 2014-03-22 15:00:00     mood   7.0
4910  AS14.30 2014-03-22 18:00:00     mood   8.0
4911  AS14.30 2014-03-22 21:00:00     mood   7.0
2014-03-23 00:00:00 5            id     

In [118]:
df

Unnamed: 0,id,time,variable,value
0,AS14.01,2014-02-26 13:00:00,mood,6.0
1,AS14.01,2014-02-26 15:00:00,mood,6.0
2,AS14.01,2014-02-26 18:00:00,mood,6.0
3,AS14.01,2014-02-26 21:00:00,mood,7.0
4,AS14.01,2014-02-27 09:00:00,mood,6.0
...,...,...,...,...
376918,AS14.30,2014-04-14 15:00:00,mood,
376919,AS14.30,2014-04-20 12:00:00,mood,
376920,AS14.30,2014-04-25 15:00:00,mood,
376921,AS14.30,2014-05-05 18:00:00,mood,


In [88]:
KF = KalmanFilter(y=df['Nile'], 
                    a_init=np.zeros(np.maximum()).reshape(1,1), 
                    P_init=np.array([10e7]).reshape(1,1),
                    H=np.array([15099]).reshape(1,1), 
                    Q = np.array([1469.1]).reshape(1,1), 
                    R = np.array([1]).reshape(1,1))

def KF_arma(p, q, df):
    KF = KalmanFilter(y=df['Nile'],
                    a_init=np.zeros(np.maximum(p, q+1)).reshape(np.maximum(p, q+1),1),
                    P_init = np.diag(10e7 * np.ones(np.maximum(p, q+1))),
                    H = np.array([0]).reshape(1,1),
                    Q)
    

22

In [114]:
np.diag(10e7 * np.ones(q+1)))

Unnamed: 0,id,time,variable,value
376903,AS14.30,2014-04-06 11:38:32.033,appCat.weather,4.117
376904,AS14.30,2014-04-07 18:13:31.111,appCat.weather,11.039
376905,AS14.30,2014-04-07 18:16:49.107,appCat.weather,60.829
376906,AS14.30,2014-04-07 18:21:04.197,appCat.weather,3.018
376907,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
376908,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
376909,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
376910,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033
376911,AS14.32,2014-04-07 18:25:14.036,appCat.weather,22.431
376912,AS14.30,2014-03-20 09:00:00.000,mood,


Unnamed: 0,id,time,variable,value
0,AS14.01,2014-02-26 13:00:00,mood,6.0
1,AS14.01,2014-02-26 15:00:00,mood,6.0
2,AS14.01,2014-02-26 18:00:00,mood,6.0
3,AS14.01,2014-02-26 21:00:00,mood,7.0
4,AS14.01,2014-02-27 09:00:00,mood,6.0
...,...,...,...,...
376918,AS14.30,2014-04-14 15:00:00,mood,
376919,AS14.30,2014-04-20 12:00:00,mood,
376920,AS14.30,2014-04-25 15:00:00,mood,
376921,AS14.30,2014-05-05 18:00:00,mood,


In [129]:
# Get params
arima_model_fit.params

const     7.627699
ar.L1     0.978374
ma.L1    -0.916313
sigma2    0.443160
dtype: float64