## Data exploration
I made this notebook to explore the data, the understand it and decide upon how we should preprocess it.
I make no alterations to the data here.

In [39]:
import pandas as pd
import numpy as np

In [40]:
df = pd.read_csv('dataset_mood_smartphone.csv')
df.head(), df.shape

(   Unnamed: 0       id                     time variable  value
 0           1  AS14.01  2014-02-26 13:00:00.000     mood    6.0
 1           2  AS14.01  2014-02-26 15:00:00.000     mood    6.0
 2           3  AS14.01  2014-02-26 18:00:00.000     mood    6.0
 3           4  AS14.01  2014-02-26 21:00:00.000     mood    7.0
 4           5  AS14.01  2014-02-27 09:00:00.000     mood    6.0,
 (376912, 5))

In [41]:
df.variable.unique(), print(len(df.variable.unique()))

19


(array(['mood', 'circumplex.arousal', 'circumplex.valence', 'activity',
        'screen', 'call', 'sms', 'appCat.builtin', 'appCat.communication',
        'appCat.entertainment', 'appCat.finance', 'appCat.game',
        'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel',
        'appCat.unknown', 'appCat.utilities', 'appCat.weather'],
       dtype=object),
 None)

**The variables and their descriptions**
<br> 
mood= The mood scored by the user on a scale of 1-10
<br> 
circumplex.arousal= The arousal scored by the user, on a scale between -2 to 2
<br> 
circumplex.valence= The valence scored by the user, on a scale between -2 to 2 (positivity vs. negativity of general emotional feeling)
<br>activity= Activity score of the user (number between 0 and 1)
<br>screen= Duration of screen activity (time)
<br>call= Call made (indicated by a 1)
<br>sms= SMS sent (indicated by a 1)
<br>appCat.builtin= Duration of usage of builtin apps (time)
<br>appCat.communication= Duration of usage of communication apps (time) 
<br>appCat.entertainment= Duration of usage of entertainment apps (time)
<br>appCat.finance= Duration of usage of finance apps (time)
<br>appCat.game= Duration of usage of game apps (time)
<br>ppCat.office= Duration of usage of office apps (time)
<br>appCat.other= Duration of usage of other apps (time)
<br>appCat.social= Duration of usage of social apps (time)
<br>appCat.travel= Duration of usage of travel apps (time)
<br>appCat.unknown= Duration of usage of unknown apps (time)
<br>appCat.utilities= Duration of usage of utilities apps (time)
<br>appCat.weather= Duration of usage of weather apps (time)

In [42]:
#checking the occurence of vairables per patient per day
df[df['variable']=='activity']

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
16927,16928,AS14.01,2014-03-20 22:00:00.000,activity,0.071429
16928,16929,AS14.01,2014-03-20 23:00:00.000,activity,0.091667
16929,16930,AS14.01,2014-03-21 00:00:00.000,activity,0.008333
16930,16931,AS14.01,2014-03-21 01:00:00.000,activity,0.000000
16931,16932,AS14.01,2014-03-21 02:00:00.000,activity,0.000000
...,...,...,...,...,...
39887,39888,AS14.33,2014-05-30 16:00:00.000,activity,0.250000
39888,39889,AS14.33,2014-05-30 19:00:00.000,activity,0.035714
39889,39890,AS14.33,2014-05-30 20:00:00.000,activity,0.008403
39890,39891,AS14.33,2014-05-30 21:00:00.000,activity,0.017094


In [43]:
#Number of patients
print("N patients based on unique ID's: ", len(df.id.unique()) )

N patients based on unique ID's:  27


## Data Cleaning

In [44]:
#checking for missing values
df.isnull().sum()

Unnamed: 0      0
id              0
time            0
variable        0
value         202
dtype: int64

In [45]:
df = df.dropna()

In [46]:
df.head(), df.shape

(   Unnamed: 0       id                     time variable  value
 0           1  AS14.01  2014-02-26 13:00:00.000     mood    6.0
 1           2  AS14.01  2014-02-26 15:00:00.000     mood    6.0
 2           3  AS14.01  2014-02-26 18:00:00.000     mood    6.0
 3           4  AS14.01  2014-02-26 21:00:00.000     mood    7.0
 4           5  AS14.01  2014-02-27 09:00:00.000     mood    6.0,
 (376710, 5))

In [53]:
#checking for duplicates
df.drop_duplicates()

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.000
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.000
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.000
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.000
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.000
...,...,...,...,...,...
376907,2770399,AS14.30,2014-04-11 07:51:16.948,appCat.weather,8.032
376908,2772465,AS14.30,2014-04-19 11:00:32.747,appCat.weather,3.008
376909,2774026,AS14.30,2014-04-26 10:19:07.434,appCat.weather,7.026
376910,2774133,AS14.30,2014-04-27 00:44:48.450,appCat.weather,23.033


In [48]:
display(df.dtypes)

Unnamed: 0      int64
id             object
time           object
variable       object
value         float64
dtype: object

In [52]:
##Checking ranges using statistics

In [57]:
df_mood= df[df['variable']=='mood']
df_mood

Unnamed: 0.1,Unnamed: 0,id,time,variable,value
0,1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
1,2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
2,3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
3,4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
4,5,AS14.01,2014-02-27 09:00:00.000,mood,6.0
...,...,...,...,...,...
5636,5637,AS14.33,2014-05-30 09:00:00.000,mood,8.0
5637,5638,AS14.33,2014-05-30 13:00:00.000,mood,6.0
5638,5639,AS14.33,2014-05-30 19:00:00.000,mood,8.0
5639,5640,AS14.33,2014-05-30 20:00:00.000,mood,6.0


In [68]:
variables =df.variable.unique()

for i in variables:
    print("Range of '{0}': ".format(i), np.amin(df[df['variable']==i]['value']), np.amax(df[df['variable']==i]['value']))

Range of 'mood':  1.0 10.0
Range of 'circumplex.arousal':  -2.0 2.0
Range of 'circumplex.valence':  -2.0 2.0
Range of 'activity':  0.0 1.0
Range of 'screen':  0.0350000858306885 9867.00699996948
Range of 'call':  1.0 1.0
Range of 'sms':  1.0 1.0
Range of 'appCat.builtin':  -82798.871 33960.246
Range of 'appCat.communication':  0.006 9830.777
Range of 'appCat.entertainment':  -0.011000000000000001 32148.677000000003
Range of 'appCat.finance':  0.131 355.51300000000003
Range of 'appCat.game':  1.003 5491.793000000001
Range of 'appCat.office':  0.003 32708.818
Range of 'appCat.other':  0.013999999999999999 3892.038
Range of 'appCat.social':  0.094 30000.906000000003
Range of 'appCat.travel':  0.08 10452.615
Range of 'appCat.unknown':  0.111 2239.937
Range of 'appCat.utilities':  0.24600000000000002 1802.649
Range of 'appCat.weather':  1.003 344.86300000000006
Range of 'Mood':  1.0 10.0


In [72]:
#replacing the negative values with NaN for the relevant variables
##To use on the transformed dataframe! Doesnt work here
negative_variables = ['appCat.entertainment']
for i in negative_variables:
    df[df[i] < 0] = np.nan


TypeError: 'float' object is not subscriptable

In [None]:
#replacing low values with 0
## n seconds can be changed
low_variables = ['screen','appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance',
                 'appCat.game', 'appCat.office', 'appCat.social' , 'appCat.travel', 'appCat.utilities' ,'appCat.weather' ]
for i in low_variables:
    df[df[i] < 5] = 0