In [1]:
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
import re
import os
import math
import time
import statsmodels.api as sm

In [2]:
def unix_to_datetime(ts):
    return datetime.utcfromtimestamp(ts)

def get_unique_days(df_column):
    realtime = df_column.apply(lambda x: unix_to_datetime(x))
    days = np.array(realtime.apply(lambda x: x.day))
    return days, np.unique(days)

def get_day_into_study(days, prev_days = 0):
    adj = list(range(0, len(days)))
    res = [x + prev_days for x in adj]
    return res

def search(l, n):
    for i in range(len(l)):
        if l[i] == n:
            return i
    return -1

def adjust_day(uniq, n, blind = 0):
    haystack = uniq[blind:]
    idx = search(haystack, n)
    return idx + blind

In [3]:
# all participants
PIDS = [
    '1_EXP05/', '2_EXP01/', '3_EXP02/', 
    '4_EXP06/', '5_EXP07/', '6_EXP04/', '7_EXP03/', 
    '8_EXP11/', '9_EXP12/', '11_EXP13/',
    '12_EXP08/', '13_EXP15/', '14_EXP14/'
]

In [4]:
frames_df = []

df_demographics = pd.read_csv('demographics.csv')

for p in PIDS:
    print('processing', p)
    
    # read data
    df_c_w1 = pd.read_csv(p + 'week_1/' + 'parent.csv')
    df_c_w2 = pd.read_csv(p + 'week_2/' + 'parent.csv')
    df_c_w3 = pd.read_csv(p + 'week_3/' + 'parent.csv')
    df_c_w4 = pd.read_csv(p + 'week_4/' + 'parent.csv')
    
    # remove rows where the duration is 0
    df_c_w1 = df_c_w1[df_c_w1['duration'] != 0]
    df_c_w2 = df_c_w2[df_c_w2['duration'] != 0]
    df_c_w3 = df_c_w3[df_c_w3['duration'] != 0]
    df_c_w4 = df_c_w4[df_c_w4['duration'] != 0]
    
    # remove outliers
    df_c_w1 = df_c_w1[(np.abs(stats.zscore(df_c_w1['duration'])) < 3)].reset_index()
    df_c_w2 = df_c_w2[(np.abs(stats.zscore(df_c_w2['duration'])) < 3)].reset_index()
    df_c_w3 = df_c_w3[(np.abs(stats.zscore(df_c_w3['duration'])) < 3)].reset_index()
    df_c_w4 = df_c_w4[(np.abs(stats.zscore(df_c_w4['duration'])) < 3)].reset_index()

    # determine unique days in dataset
    _, unique_days_w1 = get_unique_days(df_c_w1['start'])
    _, unique_days_w2 = get_unique_days(df_c_w2['start'])
    _, unique_days_w3 = get_unique_days(df_c_w3['start'])
    _, unique_days_w4 = get_unique_days(df_c_w4['start'])

    unique_days = np.concatenate((unique_days_w1, unique_days_w2, unique_days_w3, unique_days_w4), axis = 0)
    
    # set day (of study) identifier
    df_c_w1['day'] = df_c_w1['start'].apply(lambda x: adjust_day(unique_days, unix_to_datetime(x).day))
    df_c_w2['day'] = df_c_w2['start'].apply(lambda x: adjust_day(unique_days, unix_to_datetime(x).day, blind=len(unique_days_w1)))
    df_c_w3['day'] = df_c_w3['start'].apply(lambda x: adjust_day(unique_days, unix_to_datetime(x).day, blind=len(unique_days_w1)+len(unique_days_w2)))
    df_c_w4['day'] = df_c_w4['start'].apply(lambda x: adjust_day(unique_days, unix_to_datetime(x).day, blind=len(unique_days_w1)+len(unique_days_w2)+len(unique_days_w3)))

    # set week identifier
    df_c_w1['week'] = 1
    df_c_w2['week'] = 2
    df_c_w3['week'] = 3
    df_c_w4['week'] = 4
    
    # apply data transform
    df_c_w1['log_duration'] = df_c_w1['duration'].apply(lambda x: math.log(x))
    df_c_w2['log_duration'] = df_c_w2['duration'].apply(lambda x: math.log(x))
    df_c_w3['log_duration'] = df_c_w3['duration'].apply(lambda x: math.log(x))
    df_c_w4['log_duration'] = df_c_w4['duration'].apply(lambda x: math.log(x))
    
    df_c_w1['prev_object'] = df_c_w1['object'].shift(periods=1)
    df_c_w2['prev_object'] = df_c_w1['object'].shift(periods=1)
    df_c_w3['prev_object'] = df_c_w1['object'].shift(periods=1)
    df_c_w4['prev_object'] = df_c_w1['object'].shift(periods=1)
    
    df_c_w1['next_object'] = df_c_w1['object'].shift(periods=-1)
    df_c_w2['next_object'] = df_c_w1['object'].shift(periods=-1)
    df_c_w3['next_object'] = df_c_w1['object'].shift(periods=-1)
    df_c_w4['next_object'] = df_c_w1['object'].shift(periods=-1)
    
    # combine all week dataframes
    frames = [df_c_w1, df_c_w2, df_c_w3, df_c_w4]
    df_c_p = pd.concat(frames)
    df_c_p['ID'] = p
    
    # get demographics
    pid = p.partition('_')[2][:-1]
    df_d = df_demographics[df_demographics['ID'].str.contains(pid)]
    for key, value in df_d.iteritems():
        if 'Date' in key:
            dt = datetime.strptime(df_d.iloc[0][key], '%M/%d/%Y').date()
            dt = time.mktime(dt.timetuple())
            df_c_p[key] = dt
        else:
            df_c_p[key] = df_d.iloc[0][key]
    
    # remove index
    # df_c_p = df_c_p[(np.abs(stats.zscore(df_c_p['log_duration'])) < 3)].reset_index() # remove overall outliers
    df_c_p = df_c_p.iloc[:, 2:]
    # df_c_p['prev_object'] = df_c_p['object'].shift(periods=1)
    # df_c_p['next_object'] = df_c_p['object'].shift(periods=-1)
    
    frames_df.append(df_c_p)
    
df_c = pd.concat(frames_df, ignore_index=True)

processing 1_EXP05/
processing 2_EXP01/
processing 3_EXP02/
processing 4_EXP06/
processing 5_EXP07/
processing 6_EXP04/
processing 7_EXP03/
processing 8_EXP11/
processing 9_EXP12/
processing 11_EXP13/
processing 12_EXP08/
processing 13_EXP15/
processing 14_EXP14/


In [5]:
df_c

Unnamed: 0,object,duration,start,day,week,log_duration,prev_object,next_object,ID,ADOS_Date,...,DAS_Date,DAS_VerbalReasoningSS,DAS_NonVerbalReasoningSS,DAS_SpatialReasoningSS,DAS_GCASS,ADI_Date,ADI_Atotal,ADI_Btotal,ADI_Ctotal,ADI_Dtotal
0,nodetection,14.193398,1499457802,0,1,2.652777,,outside,EXP05,1.451624e+09,...,1.422508e+09,51,71,66,55,1.422508e+09,26,16,8,4
1,outside,6.396810,1499457816,0,1,1.855799,nodetection,nodetection,EXP05,1.451624e+09,...,1.422508e+09,51,71,66,55,1.422508e+09,26,16,8,4
2,nodetection,2.498476,1499457823,0,1,0.915681,outside,outside,EXP05,1.451624e+09,...,1.422508e+09,51,71,66,55,1.422508e+09,26,16,8,4
3,outside,5.397454,1499457825,0,1,1.685927,nodetection,nodetection,EXP05,1.451624e+09,...,1.422508e+09,51,71,66,55,1.422508e+09,26,16,8,4
4,nodetection,2.299140,1499457831,0,1,0.832535,outside,outside,EXP05,1.451624e+09,...,1.422508e+09,51,71,66,55,1.422508e+09,26,16,8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73391,screen,0.499969,1506382771,14,4,-0.693208,,,EXP14,1.453266e+09,...,1.453352e+09,77,78,105,83,1.453352e+09,17,20,5,5
73392,nodetection,10.299658,1506382771,14,4,2.332111,,,EXP14,1.453266e+09,...,1.453352e+09,77,78,105,83,1.453352e+09,17,20,5,5
73393,nodetection,0.599943,1506382782,14,4,-0.510921,,,EXP14,1.453266e+09,...,1.453352e+09,77,78,105,83,1.453352e+09,17,20,5,5
73394,screen,0.200080,1506382783,14,4,-1.609036,,,EXP14,1.453266e+09,...,1.453352e+09,77,78,105,83,1.453352e+09,17,20,5,5


In [7]:
pids = df_c['ID'].unique()
total = 0
w1 = []
w2 = []
w3 = []
w4 = []
obj = 'screen'
p_obj = 'child'

for pid in pids:
    n_value = df_c[(df_c['ID'] == pid) & (df_c['object'] == obj)].shape[0]
    
    w1_n = df_c[(df_c['ID'] == pid) & (df_c['object'] == obj) & (df_c['prev_object'] == p_obj) & (df_c['week'] == 1)].shape[0] / df_c[(df_c['ID'] == pid) & (df_c['week'] == 1)].shape[0]
    w2_n = df_c[(df_c['ID'] == pid) & (df_c['object'] == obj) & (df_c['prev_object'] == p_obj) & (df_c['week'] == 2)].shape[0] / df_c[(df_c['ID'] == pid) & (df_c['week'] == 2)].shape[0]
    w3_n = df_c[(df_c['ID'] == pid) & (df_c['object'] == obj) & (df_c['prev_object'] == p_obj) & (df_c['week'] == 3)].shape[0] / df_c[(df_c['ID'] == pid) & (df_c['week'] == 3)].shape[0]
    w4_n = df_c[(df_c['ID'] == pid) & (df_c['object'] == obj) & (df_c['prev_object'] == p_obj) & (df_c['week'] == 4)].shape[0] / df_c[(df_c['ID'] == pid) & (df_c['week'] == 4)].shape[0]
    
    w1.append(w1_n)
    w2.append(w2_n)
    w3.append(w3_n)
    w4.append(w4_n)
    
print('1-2', stats.ttest_rel(w1, w2))
print('1-3', stats.ttest_rel(w1, w3))
print('1-4', stats.ttest_rel(w1, w4))

print('2-3', stats.ttest_rel(w2, w3))
print('2-4', stats.ttest_rel(w2, w4))

print('3-4', stats.ttest_rel(w3, w4))

1-2 Ttest_relResult(statistic=0.681493252699196, pvalue=0.5084985330538739)
1-3 Ttest_relResult(statistic=0.5622960733047174, pvalue=0.5842654567255563)
1-4 Ttest_relResult(statistic=1.3298425300761572, pvalue=0.2082947733885231)
2-3 Ttest_relResult(statistic=-0.23050154024383426, pvalue=0.8215848737982718)
2-4 Ttest_relResult(statistic=-0.11640266112801045, pvalue=0.9092588095224295)
3-4 Ttest_relResult(statistic=0.05911152801241137, pvalue=0.9538362657638134)
