In [1]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [2]:
# Load the raw measurements in long format: one row per (id, time, variable).
df = pd.read_csv('dataset_mood_smartphone.csv')

candidates = df['id'].unique()
data_columns = df['variable'].unique()

# Reshape to wide format: one row per (id, time), one column per variable.
df_pivoted = df.pivot_table(index=['id', 'time'], columns='variable', values='value').reset_index()

# Join on the (id, time) keys of the mood rows so that only moments with a
# mood entry in the raw data are kept.
mood_moments = df.loc[df['variable'] == 'mood', ['id', 'time']]
df_pivoted = pd.merge(df_pivoted, mood_moments, on=['id', 'time'], suffixes=('_variable', '_mood'))

# Parse the timestamps once and derive every calendar part from them.
timestamps = pd.to_datetime(df_pivoted['time'])
time_index = df_pivoted.columns.get_loc('time')

# Replace the raw 'time' column by time_of_day/day/month/year at the same
# position; the midnight-normalised 'date' column is appended at the end.
df_pivoted = df_pivoted.drop(columns='time')
df_pivoted['date'] = pd.to_datetime(timestamps.dt.date)
derived_columns = [
    ('time_of_day', timestamps.dt.time),
    ('day', timestamps.dt.day),
    ('month', timestamps.dt.month),
    ('year', timestamps.dt.year),
]
for offset, (name, values) in enumerate(derived_columns):
    df_pivoted.insert(time_index + offset, name, values)

print(df_pivoted)  # 5641 rows

# Number of recorded mood values per participant.
mood_count = df_pivoted.groupby('id')['mood'].count()




           id time_of_day  day  month  year  activity  appCat.builtin  \
0     AS14.01    13:00:00   26      2  2014       NaN             NaN   
1     AS14.01    15:00:00   26      2  2014       NaN             NaN   
2     AS14.01    18:00:00   26      2  2014       NaN             NaN   
3     AS14.01    21:00:00   26      2  2014       NaN             NaN   
4     AS14.01    09:00:00   27      2  2014       NaN             NaN   
...       ...         ...  ...    ...   ...       ...             ...   
5636  AS14.33    09:00:00   30      5  2014  0.252101             NaN   
5637  AS14.33    13:00:00   30      5  2014  0.466667             NaN   
5638  AS14.33    19:00:00   30      5  2014  0.035714             NaN   
5639  AS14.33    20:00:00   30      5  2014  0.008403             NaN   
5640  AS14.33    12:00:00   31      5  2014       NaN             NaN   

      appCat.communication  appCat.entertainment  appCat.finance  ...  \
0                      NaN                   NaN  

In [5]:
# Disabled draft: per-participant daily averages of every column.
# NOTE(review): if this is re-enabled, `time_of_day` holds datetime.time
# objects, so `.mean()` will probably need `numeric_only=True` on recent
# pandas versions — confirm before use.
# # Average score per participant per day for all columns
# start_date = df_pivoted['date'].min()
# end_date = df_pivoted['date'].max()

# print(start_date, end_date)
# df_pivoted['day_count'] = (df_pivoted['date'] - start_date).dt.days
# print(df_pivoted)

# daily_participant_avg = df_pivoted.groupby(['id','day_count']).mean()

# print(daily_participant_avg)

### Notes and questions
1. Which order should we use? Option A: fill in the missing data first, then average per day, then calculate the correlation with mood. Option B: calculate the daily averages first, see what is still missing, fill in those variables, then calculate the correlation with mood.
2. 

In [None]:
# Linearly interpolate missing measurements in the raw long-format table.
# Interpolation is restricted to the numeric 'value' column and done separately
# for every (participant, variable) series, so a gap is never filled from a
# neighbouring row that belongs to another participant or another variable.
# It also keeps the string columns ('id', 'variable') out of the call;
# interpolating object-dtype columns is deprecated in recent pandas.
# NOTE(review): rows are assumed to be in chronological order within each
# (id, variable) group — confirm, or sort by 'time' first.
df['value'] = (
    df.groupby(['id', 'variable'])['value']
    .transform(lambda series: series.interpolate(method='linear'))
)

In [6]:
# Disabled draft: MICE-style imputation with scikit-learn's IterativeImputer.
# NOTE(review): `imp_values` comes from fit_transform, which presumably returns
# a NumPy array; indexing it with the DataFrame mask `missing_values` and
# assigning the result back may not line up cell by cell — verify before
# re-enabling.
# # Implementation of MICE
# df_numeric = df_pivoted.select_dtypes(include='number')
# df_mice = df_numeric.copy()
# missing_values = df_mice.isna()
# imp = IterativeImputer(max_iter=10, random_state=0)
# imp_values = imp.fit_transform(df_mice)
# df_mice[missing_values] = imp_values[missing_values]
# print(df_mice)


## Simple imputation

In [72]:
# Load the cleaned table; missing entries are encoded as 0 from here on.
df = pd.read_csv('data_cleaned.csv').fillna(value=0)

# Mood matrix: one row per 'time' value, one column per participant id.
df_mood = df.pivot(columns='id', index='time', values='mood')
# Independent working copy, so df_mood itself stays unmodified.
df_mood_simple = df_mood.copy()

# Simple imputation: last observation carried forward, per participant.
# A 0 marks a missing mood (missing values were replaced by 0 before pivoting);
# NaN marks a (time, id) pair for which the pivot found no row at all.
# Hide both kinds of gaps so that only real observations are propagated.
observed_mood = df_mood_simple.where(df_mood_simple != 0)
# ffill works column by column, so a value never leaks from one participant
# into the next, and a NaN gap can never become the carried value.
carried_mood = observed_mood.ffill()
# Only overwrite 0-cells that have an earlier observation; leading zeros and
# NaN cells are left unchanged.
fillable = df_mood_simple.eq(0) & carried_mood.notna()
df_mood_simple = df_mood_simple.mask(fillable, carried_mood)

print(df_mood_simple)

id          AS14.01   AS14.02  AS14.03   AS14.05  AS14.06  AS14.07  AS14.08  \
time                                                                          
2014-03-14      0.0       NaN      NaN  8.000000      NaN      NaN      7.4   
2014-03-15      0.0       NaN      NaN  6.200000      NaN      NaN      6.4   
2014-03-16      0.0  6.333333      NaN  6.333333      NaN      NaN      7.0   
2014-03-17      0.0  6.750000      NaN  6.000000      NaN      NaN      5.4   
2014-03-18      0.0  8.200000      NaN  7.500000      NaN      NaN      6.4   
...             ...       ...      ...       ...      ...      ...      ...   
2014-05-11      NaN       NaN      NaN       NaN      NaN      NaN      NaN   
2014-05-12      NaN       NaN      NaN       NaN      NaN      NaN      NaN   
2014-05-13      NaN       NaN      NaN       NaN      NaN      NaN      NaN   
2014-05-14      NaN       NaN      NaN       NaN      NaN      NaN      NaN   
2014-05-15      NaN       NaN      NaN       NaN    

In [None]:
# Rolling-window imputation: fill a missing mood with the mean of the same
# participant's observed moods in the preceding ROLLING_WINDOW_DAYS rows.
# NOTE(review): this assumes every row of df_mood is one consecutive calendar
# day — confirm; otherwise convert the index to datetimes and use a '5D' window.
ROLLING_WINDOW_DAYS = 5

# 0 is the missing-value marker; NaN means the pivot found no row for that
# (time, id) pair. Hide both so only real observations enter the mean.
rolling_observed = df_mood.where(df_mood != 0)
# Mean over the window ending at each row, then shifted one row down so that a
# cell is only ever filled from earlier rows of the same column.
previous_window_mean = (
    rolling_observed.rolling(window=ROLLING_WINDOW_DAYS, min_periods=1).mean().shift(1)
)
# Only 0-cells with at least one observation in the window are replaced; all
# other cells (including NaN cells) are left unchanged.
rolling_fillable = df_mood.eq(0) & previous_window_mean.notna()
df_mood_rolling = df_mood.mask(rolling_fillable, previous_window_mean)