In [None]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns

# Root folder of the Kaggle dataset; later cells build every file path from it.
prefix = '/kaggle/input/the-depression-dataset/data'

import os

# Echo the full path of every file under the dataset root so the available
# inputs are visible in the notebook output.
all_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk(prefix)
    for name in names
)
for file_path in all_paths:
    print(file_path)

# **Finding the between-participants matches**

Here each condition participant is matched with the control participants who share the same number of days wearing the tracker, the same gender and the same age. This allows individuals with similar attributes to be compared.

In [None]:
# Summary table with one row per participant (condition and control alike).
df = pd.read_csv(f'{prefix}/scores.csv')

# Split by group using the participant identifier stored in 'number'.
is_condition = df['number'].str.contains('condition')
is_control = df['number'].str.contains('control')

df_cond = df[is_condition]
# Only the identifier and the three matching attributes are needed from controls.
df_cont = df[is_control][['number', 'days', 'gender', 'age']]

# Pair every condition row with every control row that has identical
# days / gender / age; the clashing 'number' column gets the suffixes.
match_keys = ['days', 'gender', 'age']
sample_df = df_cond.merge(df_cont, how='inner', on=match_keys,
                          suffixes=('_cond', '_cont'))

# Show the matched control identifier as the first column.
other_columns = [name for name in sample_df.columns if name != 'number_cont']
sample_df = sample_df[['number_cont'] + other_columns]

sample_df

# **Cleaning and pairing controls vs conditions**
It appears that some users forgot to take off the actigraph, as the readings extend well beyond the number of days the device was stated to be worn. The assumption is that recording started on the first day, as intended, so only the stated number of days from scores.csv is kept, counted from the first recorded date.

The conditions and controls are then merged together into a single dataframe, and outliers are removed by keeping only the readings with $|z| < 3$.

In [None]:
# Number of matched condition/control pairs; also the number of subplot rows later on.
nrow_count = len(sample_df)

def load_and_prep(path, col, i):
    """Load one participant's actigraphy CSV, truncated to the stated wear period.

    Reads ``{prefix}/{path}/{participant}.csv`` where the participant
    identifier comes from row ``i`` of the module-level ``sample_df``.

    Parameters
    ----------
    path : str
        Sub-folder under ``prefix`` (the callers pass 'condition' or 'control').
    col : str
        Column of ``sample_df`` holding the participant identifier
        (the callers pass 'number_cond' or 'number_cont').
    i : int
        Positional row index into ``sample_df`` selecting the matched pair.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``date`` is earlier than the first recorded date plus the
        pair's ``days`` value, with an added ``minute_index`` column holding
        the row's index label from the original file.
    """
    pair = sample_df.iloc[i]
    df = pd.read_csv(f'{prefix}/{path}/{pair[col]}.csv',
                     parse_dates=['date', 'timestamp'])

    # Keep only the stated number of days, counted from the first recorded date.
    cutoff = df['date'].min() + dt.timedelta(days=int(pair['days']))

    # .copy() gives an independent frame, so adding a column below does not
    # write into a slice of the raw read (avoids SettingWithCopyWarning).
    df = df[df['date'] < cutoff].copy()
    df['minute_index'] = df.index

    return df

# Activity columns that are screened for outliers, in the order they are filtered.
activity_list = ['activity_cond', 'activity_cont']

# One merged, outlier-filtered dataframe per matched pair, in sample_df row order.
data = []

for i in range(nrow_count):
    cond_plot = load_and_prep('condition', 'number_cond', i)
    cont_plot = load_and_prep('control', 'number_cont', i)

    # Align the two recordings row by row on their shared minute_index.
    new_df = cond_plot.merge(cont_plot, how='inner', on='minute_index',
                             suffixes=('_cond', '_cont'))

    keep_columns = ['timestamp_cont', 'timestamp_cond',
                    'minute_index', 'activity_cond', 'activity_cont']
    new_df = new_df[keep_columns]

    # Filter one column at a time: the second z-score is computed on the rows
    # that survived the first filter.
    for activity in activity_list:
        z_scores = stats.zscore(new_df[activity])
        within_limit = np.abs(z_scores) < 3
        new_df = new_df[within_limit]

    data.append(new_df)

# **Plotting the controls vs conditions**
Plots each matched pair of control and condition determined above.

In [None]:
# squeeze=False + ravel() always yields a 1-D array of axes, so ax1[i] also
# works when there is only a single matched pair (plt.subplots would otherwise
# return a bare Axes object in that case).
_, ax1 = plt.subplots(nrow_count, 1, figsize=(20, 80), squeeze=False)
ax1 = ax1.ravel()

for i, pair_df in enumerate(data):
    ax = ax1[i]
    pair = sample_df.iloc[i]

    # Condition as a red line, control as blue dots, both semi-transparent so
    # overlapping readings stay visible.
    ax.plot(pair_df.minute_index, pair_df.activity_cond, color='r', alpha=0.5)
    ax.plot(pair_df.minute_index, pair_df.activity_cont, color='b',
            marker='o', markersize=2, linestyle='', alpha=0.5)

    ax.legend((f'{pair["number_cond"]}', f'{pair["number_cont"]}'),
              loc='upper right', shadow=True)
    ax.set_xlabel('minute_index')
    ax.set_ylabel('activity')

    peak = max(pair_df.activity_cond.max(), pair_df.activity_cont.max())

    # Round the upper limit UP to the next multiple of 1000. Rounding to the
    # nearest 1000 could put the limit below the peak (clipping the data) or
    # at 0 for low-activity pairs. Skipped when the pair has no rows (NaN peak).
    if np.isfinite(peak):
        y_max = max(1000, int(np.ceil(peak / 1000) * 1000))
        ax.set_ylim(0, y_max)

In general, the activity levels are lower for the condition participant than for the matched control. This suggests that depression is associated with reduced activity.

# **Plotting box plots**
This quantifies the difference in activity for each matched pair. The box plots make it quite clear that there is a marked difference between the activity levels of the condition and the control.

In [None]:
# squeeze=False + ravel() always yields a 1-D array of axes, so ax2[i] also
# works when there is only a single matched pair.
_, ax2 = plt.subplots(nrow_count, 1, figsize=(20, 80), squeeze=False)
ax2 = ax2.ravel()

for i, pair_df in enumerate(data):
    cond_id = f'{sample_df.iloc[i]["number_cond"]}'
    cont_id = f'{sample_df.iloc[i]["number_cont"]}'

    # Long format: one row per (minute_index, participant) with the reading in
    # 'value' and the participant identifier in 'activity'.
    long_df = (
        pair_df[["minute_index", "activity_cond", "activity_cont"]]
        .rename(columns={"activity_cond": cond_id, "activity_cont": cont_id})
        .melt(id_vars=['minute_index'], var_name='activity')
    )

    # Same colours as the line plots: condition red, control blue.
    my_pal = {cond_id: "r", cont_id: "b"}

    sns.boxplot(x='activity', y='value', data=long_df, orient='v',
                ax=ax2[i], palette=my_pal)

    # Make the box fills semi-transparent. Older matplotlib registers the
    # boxes in ax.artists, newer releases in ax.patches; iterating only
    # ax.artists silently does nothing on the newer ones, so cover both.
    for patch in list(ax2[i].artists) + list(ax2[i].patches):
        r, g, b, a = patch.get_facecolor()
        patch.set_facecolor((r, g, b, .5))

# **Plotting violin plots**
This shows the distribution of activity for each matched pair. The violin plots make it quite clear that there is a marked difference between the activity levels of the condition and the control.

In [None]:
# squeeze=False + ravel() always yields a 1-D array of axes, so ax3[i] also
# works when there is only a single matched pair (plt.subplots would otherwise
# return a bare Axes object in that case).
_, ax3 = plt.subplots(nrow_count, 1, figsize=(20, 80), squeeze=False)
ax3 = ax3.ravel()

for i, pair_df in enumerate(data):
    cond_id = f'{sample_df.iloc[i]["number_cond"]}'
    cont_id = f'{sample_df.iloc[i]["number_cont"]}'

    # Long format: one row per (minute_index, participant) with the reading in
    # 'value' and the participant identifier in 'activity'.
    long_df = (
        pair_df[["minute_index", "activity_cond", "activity_cont"]]
        .rename(columns={"activity_cond": cond_id, "activity_cont": cont_id})
        .melt(id_vars=['minute_index'], var_name='activity')
    )

    # Same colours as the line plots: condition red, control blue.
    my_pal = {cond_id: "r", cont_id: "b"}

    sns.violinplot(x='activity', y='value', data=long_df, orient='v',
                   ax=ax3[i], palette=my_pal)

The next step is to use the actigraphy data to predict sleep. This is a difficult task, as we have a univariate time series for each individual, with only one channel of information.

The approach isn't obvious - one is to define features on a sliding window then feed into an unsupervised model.

One for the future...