In [48]:
# Standard library
import configparser
import json
import os

# Third-party
import matplotlib.pyplot as plt
import pandas as pd
from joblib import dump, load
from scipy.stats import ttest_ind
from tqdm import tqdm

# Project-local helpers (wildcard imports come last, so any name they export
# takes precedence over the imports above)
from helpers.helper_functions import *
from helpers.helper_classes import *

pd.set_option('display.max_rows', 500)

In [None]:
# TODO (general):
# - Run a PCA to judge how important valence and arousal are; if they add little we may drop both, otherwise decide on an imputation method.
# - Investigate the extremely large values: which variables they occur in, why, and how to deal with them.
# - Find out whether the data is recorded daily or at another frequency (per variable).

In [73]:
# Read config.ini file.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config was not
# found. Failing here gives a clear message instead of a later KeyError on 'PATH'.
CONFIG_PATH = 'src/config.ini'  # resolved relative to the current working directory
config = configparser.ConfigParser()
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(
        f"Could not read {CONFIG_PATH!r} (current working directory: {os.getcwd()!r})"
    )
# os.chdir(config['PATH']['ROOT_DIR'])

# Load data: one row per (id, time, variable, value) observation.
data_path = os.path.join(config['PATH']['DATA_DIR'], 'dataset_mood_smartphone.csv')
df = (
    pd.read_csv(data_path)
    # 'Unnamed: 0' is dropped right after loading, before any analysis uses the frame
    .drop(columns='Unnamed: 0')
    # time to datetime
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
)


## Descriptive data over the whole dataset and by person
### Questions answered:
1. How many people are there in the dataset?
2. How many variables are there in the dataset?
3. How many days of data do we have for each person?
4. How many variables do we have for each person?



In [74]:
# Per-person descriptive statistics, keyed by person id
descriptive_dict = {}

# 1. How many people are there in the dataset?
tot_persons = len(df['id'].unique())
print(f'There are {tot_persons} people in the dataset.')

# 2. How many variables are in the dataset?
variable_names = df['variable'].unique()  # computed once, reused for the listing below
print(f'There are {len(variable_names)} variables in the dataset.')

# 3. + 4. How many days of data and how many variables do we have for each person?
# groupby splits the frame in a single pass (instead of re-filtering the whole
# frame for every person); sort=False keeps persons in order of first appearance.
for person, df_person in df.groupby('id', sort=False):

    # Get the number of distinct variables recorded for this person
    num_vars = df_person['variable'].unique().shape[0]
    time_min, time_max = df_person['time'].min(), df_person['time'].max()

    # Whole days elapsed between the first and the last record.
    # NOTE: this is the length of the recording span, not the number of
    # distinct calendar days that contain data (gaps are not subtracted).
    num_days = (time_max - time_min).days

    # Get number of observations (rows)
    num_obs = df_person.shape[0]

    # Get mean mood (Series.mean skips NaN values)
    mean_mood = df_person.loc[df_person['variable'] == 'mood', 'value'].mean()

    # Store the result in a dictionary
    descriptive_dict[person] = {
        'num_vars': num_vars,
        'num_days': num_days,
        'num_obs': num_obs,
        'mean_mood': mean_mood,
    }

# print all variables
print(variable_names)

# Convert the dictionary to a dataframe (one row per person)
df_descriptive = pd.DataFrame.from_dict(descriptive_dict, orient='index')
df_descriptive

There are 27 people in the dataset.
There are 19 variables in the dataset.
['mood' 'circumplex.arousal' 'circumplex.valence' 'activity' 'screen'
 'call' 'sms' 'appCat.builtin' 'appCat.communication'
 'appCat.entertainment' 'appCat.finance' 'appCat.game' 'appCat.office'
 'appCat.other' 'appCat.social' 'appCat.travel' 'appCat.unknown'
 'appCat.utilities' 'appCat.weather']


Unnamed: 0,num_vars,num_days,num_obs,mean_mood
AS14.01,19,76,21999,7.067568
AS14.02,14,67,14581,6.773585
AS14.03,18,79,14425,7.58371
AS14.05,16,76,15745,6.726141
AS14.06,17,80,18092,7.172414
AS14.07,16,48,16045,6.114583
AS14.08,16,77,7902,6.749164
AS14.09,19,77,10886,6.918782
AS14.12,19,75,17311,6.221622
AS14.13,16,75,19592,7.286885


## Examine NA values and distribution of values

In [79]:
# TODO: weirdly large values, find out which and why

# Summary statistics of the 'value' column, pooled over all variables and persons
value_summary = df['value'].describe()
value_summary

count    376710.000000
mean         40.665313
std         273.726007
min      -82798.871000
25%           2.025000
50%           7.029000
75%          29.356000
max       33960.246000
Name: value, dtype: float64

In [80]:
# Boolean frame flagging every missing cell; built once and reused below
na_mask = df.isna()

# Number of missing cells per column
print(f"Number of NA values: \n{na_mask.sum()}")

# Names of the variables that occur in at least one row containing a missing cell
print("Variables with NA values: ")
rows_with_na = na_mask.any(axis=1)
na_vars = df.loc[rows_with_na, 'variable'].unique()
print(na_vars)


Number of NA values: 
id            0
time          0
variable      0
value       202
dtype: int64
Variables with NA values: 
['circumplex.arousal' 'circumplex.valence']


In [83]:
# How many people are missing valence and arousal?
# A person is counted as soon as at least one of their rows has a NaN value.
people_na = df[df['value'].isna()]['id'].unique()
print(f"Number of people missing valence and arousal: {people_na.shape[0]}, ({people_na.shape[0] / tot_persons * 100}%)")

# Count number of observations with variable in na_vars
print(f"Number of observations of {na_vars}: {df[(df['variable'].isin(na_vars))].shape[0]}")

# Split the mood ratings by whether the person has any missing value.
# The masks are built once and shared by the means and the t-test, so both
# are guaranteed to be computed on exactly the same two samples.
is_mood = df['variable'] == 'mood'
has_missing = df['id'].isin(people_na)
mood_missing = df.loc[is_mood & has_missing, 'value']
mood_not_missing = df.loc[is_mood & ~has_missing, 'value']

# Mean mood of people in people_na versus everyone else
mean_missing = mood_missing.mean()
mean_not_missing = mood_not_missing.mean()
print(f"Mean mood of people with missing values: {mean_missing}")
print(f"Mean mood of people without missing values: {mean_not_missing}")

# Test whether difference between mean_missing and mean_not_missing is significant.
# ttest_ind is imported in the notebook's import cell. nan_policy='omit' makes
# the test ignore NaN ratings, matching Series.mean() above (the default would
# propagate NaN into the statistic).
# NOTE(review): this is the pooled-variance t-test on individual ratings, which
# treats repeated ratings from the same person as independent -- confirm this
# is the intended comparison.
ttest = ttest_ind(mood_missing, mood_not_missing, nan_policy='omit')

# print results
print(f"t-statistic: {ttest.statistic}")
print(f"p-value: {ttest.pvalue}")

Number of people missing valence and arousal: 15, (55.55555555555556%)
Number of observations of ['circumplex.arousal' 'circumplex.valence']: 11286
Mean mood of people with missing values: 6.825787106446777
Mean mood of people without missing values: 7.2337380745880315
t-statistic: -14.86669247267248
p-value: 4.562622541426819e-49
