### Import

In [1]:
import pandas as pd
import numpy as np
import sys
import glob

In [2]:
sys.path.insert(0, '../code/')

In [3]:
from loading_routines import *
from preprocessing import *
from feature_engineering import *
from mov_ampl import *

### Load data

In [4]:
# Collect the joint-tracking XML recordings of both data batches.
# NOTE(review): glob.glob() returns paths in arbitrary (filesystem-dependent) order, so the
# position of a subject in every list derived from these files can differ between machines.
all_files = glob.glob('../data/behavior_AND_personality_dataset/joints/*.xml')
#print all_files
all_files_new = glob.glob('../data/data_recordings_master/joints/*.xml')

In [5]:
# Parse every XML recording (first batch, then second batch) into its own DataFrame.
all_subjects_dfs = list(map(load_df_from_xml, all_files + all_files_new))

Loaded 0 tracks for "subject6"
Loaded 1000 tracks for "subject6"
Loaded 2000 tracks for "subject6"
Loaded 3000 tracks for "subject6"
Loaded 4000 tracks for "subject6"
Loaded 5000 tracks for "subject6"
Loaded 6000 tracks for "subject6"
Loaded 7000 tracks for "subject6"
Loaded 8000 tracks for "subject6"
Loaded 9000 tracks for "subject6"
Loaded 10000 tracks for "subject6"
Loaded 11000 tracks for "subject6"
Loaded 12000 tracks for "subject6"
Loaded 13000 tracks for "subject6"
Loaded 14000 tracks for "subject6"
Loaded 15000 tracks for "subject6"
Loaded 16000 tracks for "subject6"
Loaded 17000 tracks for "subject6"
Loaded 18000 tracks for "subject6"
Loaded 19000 tracks for "subject6"
Loaded 20000 tracks for "subject6"
Loaded 21000 tracks for "subject6"
Loaded 22000 tracks for "subject6"
Loaded 23000 tracks for "subject6"
Loaded 24000 tracks for "subject6"
Loaded 25000 tracks for "subject6"
Loaded 26000 tracks for "subject6"
Loaded 27000 tracks for "subject6"
Loaded 0 tracks for "subject11"
L

KeyboardInterrupt: 

## Preprocessing

#### Remove outliers and normalize

In [None]:
def _strip_outliers(subject_df):
    """Run remove_outliers on one subject frame with the 0.05 / 0.95 percentile bounds."""
    return remove_outliers(subject_df, low_percentil=0.05, high_percentil=0.95)


# One outlier-filtered frame per subject, in the same order as all_subjects_dfs.
dfs_prep = list(map(_strip_outliers, all_subjects_dfs))

In [None]:
# Normalize each outlier-filtered frame using normalize_data's default arguments.
df_prep_norm = list(map(normalize_data, dfs_prep))

#### Inspect the normalized data

In [None]:
# Preview the first rows of the first subject's normalized frame.
df_prep_norm[0].head()

#### Detect frames with little movement

* A sequence counts as "little movement" when none of the head coordinates (`head_x`, `head_y`, `head_z`) moves more than 0.03 (`max_mov`) within it.

In [None]:
def _little_movement_sequences(subject_df):
    """Extract the low-movement sequences of one subject, judged on the head coordinates."""
    return get_sequences_with_little_movement(
        subject_df,
        variables_to_check=['head_x', 'head_y', 'head_z'],
        max_mov=0.03)


# Result is a list of lists: one list of sequences per subject.
# Open question from the author: is max_mov=0.03 strict enough to count as "little movement"?
dfs_little_movement = list(map(_little_movement_sequences, df_prep_norm))

In [None]:
# Print, per subject, the length of every low-movement sequence that was found.
for subject_sequences in dfs_little_movement:
    if not subject_sequences:
        # No sequence was found for this subject, so there is no frame to read the
        # subject name from; skip it instead of failing with an IndexError.
        continue
    subject_name = subject_sequences[0]['subject'].iloc[0]
    # Single-argument print() behaves the same on Python 2 and 3; the '%s %s' join
    # reproduces the space the print statement inserted between its two items.
    print('%s %s' % ('Length of sequences with little movement for ', subject_name))
    print('*'*5)
    for sequence in subject_sequences:
        print(len(sequence))

In [None]:
#dfs_little_movement[-1][-1]

In [None]:
# Preview the first low-movement sequence of the first subject.
dfs_little_movement[0][0].head()

In [None]:
# Merge each subject's sequences into one frame per subject.
# pd.concat raises ValueError for an empty list, so every subject needs at least one sequence.
all_little_movement_per_person = list(map(pd.concat, dfs_little_movement))

In [None]:
# Number of rows kept per subject after merging the low-movement sequences.
for subject_frames in all_little_movement_per_person:
    # print() with one argument works on both Python 2 and Python 3.
    print(len(subject_frames))

### Extract Posture Features for upper joints

In [None]:
# Upper-body joints used for the posture features.
upper_joint_names = ['head', 'neck', 'spineShoulder',
                     'shoulderR', 'elbowR', 'wristR', 'handR',
                     'shoulderL', 'elbowL', 'wristL', 'handL']
# Expand every joint into its coordinate column names: head_x, head_y, head_z, neck_x, ...
upper_joints = ['%s_%s' % (joint_name, axis)
                for joint_name in upper_joint_names
                for axis in ('x', 'y', 'z')]

In [None]:
def _upper_body_posture(subject_frames):
    """Compute the joint differences of one subject, restricted to the upper_joints columns."""
    return calculate_joint_differences(subject_frames, only_for_columns=upper_joints)


# One posture-feature frame per subject, aligned with all_little_movement_per_person.
posture_per_person = list(map(_upper_body_posture, all_little_movement_per_person))

In [None]:
# Report how many posture feature columns were produced (taken from the first subject).
# Single-argument print() runs on Python 2 and 3; '%s %s' keeps the space that the
# print statement put between its two items.
print('%s %s' % ('Total posture features: ', len(posture_per_person[0].columns)))

In [None]:
# Preview the posture features of the first subject.
posture_per_person[0].head()

In [None]:
# Collapse each subject's posture frame to a single row holding the column means.
mean_posture_features = []
for position, posture_df in enumerate(posture_per_person):
    # Compute the means once and reuse them for both the values and the column labels.
    column_means = posture_df.mean()
    pos_mean = pd.DataFrame([column_means.values], columns=column_means.index)
    # The subject label is read from the frame the posture features were computed from.
    pos_mean['subject'] = all_little_movement_per_person[position].iloc[0]['subject']
    mean_posture_features.append(pos_mean)

In [None]:
# Show the one-row mean-posture frame of the first subject.
mean_posture_features[0]

In [None]:
# Stack the one-row frames into a single table.
# Every input frame carries the single index label 0, so the result has a repeated index.
posture_feat_df = pd.concat(mean_posture_features)

#### Look at specific set of posture features


In [None]:
# (first joint, second joint) pairs; each pair yields '<first>_<c>-<second>_<c>' for c in x, y, z.
posture_joint_pairs = [('head', 'shoulderL'),
                       ('shoulderR', 'head'),
                       ('shoulderR', 'shoulderL'),
                       ('wristR', 'neck')]
specific_posture = []
for first_joint, second_joint in posture_joint_pairs:
    for coord in ['x', 'y', 'z']:
        specific_posture.append(first_joint + '_' + coord + '-' + second_joint + '_' + coord)
specific_posture

In [None]:
# Put the subject label back on each posture frame (it was lost while the posture
# features were calculated). The frames in posture_per_person are modified in place.
for position, posture_df in enumerate(posture_per_person):
    source_frames = all_little_movement_per_person[position]
    # A scalar is broadcast to every row of the frame.
    posture_df['subject'] = source_frames.iloc[0]['subject']

In [None]:
# Print the mean of the selected posture features for every subject.
for posture_df in posture_per_person:
    # Single-argument print() runs on Python 2 and 3; '%s %s' keeps the space that the
    # print statement put between its two items.
    print('%s %s' % ('Means for subject ', posture_df['subject'].iloc[0]))
    print('*'*5)
    print(posture_df[specific_posture].mean())

In [None]:
# NOTE(review): plt is first needed by the plotting cells further down; moving this import
# to the import cell at the top would keep all dependencies in one place.
import matplotlib.pyplot as plt
# Row count of the first subject's posture frame, restricted to the selected features.
len(posture_per_person[0][specific_posture])

In [None]:
# Stack the posture frames of all subjects row-wise; each row keeps its 'subject' label.
big_posture_df = pd.concat(posture_per_person)
# Total number of rows across all subjects.
len(big_posture_df)

In [None]:
# Box plots of the selected posture features, grouped by subject.
#fig=plt.figure(figsize=(20, 20))
big_posture_df.boxplot(column = specific_posture,
                       by='subject', figsize=(20,20))
plt.show()

## Extract Movement Amplitude features

In [None]:
# Build one movement-amplitude feature frame per subject: mov_amplitude is evaluated for
# every (still joint, moving joint) combination and the results are placed side by side.
amplitude_feature_dfs = []
still_joints = ['spineMid', 'neck']
# NOTE(review): 'wristL' is listed but 'wristR' is not — confirm this is intentional.
mov_joints = ['handR', 'handL', 'wristL', 'head']
for df in all_subjects_dfs:
    subject = df['subject'].iloc[0]
    subject_amp_dfs = []
    for still in still_joints:
        for mov in mov_joints:            
            # reset_index() adds the old index as a column (named 'index' for an unnamed
            # index); the cell that concatenates all subjects drops 'index' again.
            subject_amp_dfs.append(mov_amplitude(df, s_joint=still, m_joint=mov).reset_index())
    # axis=1 joins the per-pair results column-wise.
    amp_df = pd.concat(subject_amp_dfs, axis=1)
    amp_df['subject'] = subject
    #print(amp_df)
    amplitude_feature_dfs.append(amp_df)
# df, still and mov stay bound to their last loop values; the following cell reuses them.

In [None]:
# Ad-hoc look at a single mov_amplitude result. It uses df, still and mov as left behind by
# the loop in the previous cell (last subject, last joint pair), so it only works after that
# cell has been run.
mov_amplitude(df, s_joint=still, m_joint=mov)

In [None]:
# Stack the per-subject amplitude frames and remove the index bookkeeping columns:
# 'index' comes from the per-pair reset_index() calls, 'level_0' from the reset_index() here.
amplitude_df = (
    pd.concat(amplitude_feature_dfs)
    .reset_index()
    .drop(['index', 'level_0'], axis=1)
)
amplitude_df.head()
#amplitude_df.join(mean_posture_features, on='subject')

In [None]:
# Join the amplitude features and the mean posture features on the subject label.
all_features = amplitude_df.merge(posture_feat_df, on='subject')
# NOTE(review): row label 17 is hard-coded. Row order follows the order in which the
# recordings were globbed, which is not guaranteed, so confirm this still targets the
# mislabeled recording. If the wrong label collides with another subject's name, the merge
# above will already have cross-matched their rows — verify.
all_features.loc[17, 'subject'] = 'subject17' #fix naming error
#all_features['subject']

In [None]:
# Keep only the features whose variance over the rows of all_features exceeds 0.001.
feature_variances = all_features.var()
has_enough_variance = (feature_variances > 0.001).values
var_features = list(feature_variances.index[has_enough_variance])
var_features

In [None]:
# Pairwise correlation matrix of the selected features.
all_features[var_features].corr()

In [None]:
import seaborn as sns
%matplotlib inline

In [None]:
# Heatmap of the absolute pairwise correlations between the selected features.
fig = plt.figure(figsize=(16, 16))
abs_feature_corr = all_features[var_features].corr().abs()
sns.heatmap(abs_feature_corr)

### Sensor features

In [None]:
import sensor_data

In [None]:
# Text files in the 'binary' folders of both data batches (presumably the sensor
# recordings this section is about — TODO confirm).
sensor_files1 = glob.glob('../data/behavior_AND_personality_dataset/binary/*.txt')
sensor_files2 = glob.glob('../data/data_recordings_master/binary/*.txt')
# Show one example path; raises IndexError if fewer than two files were found.
sensor_files1[1]

In [None]:
#TODO: extract sensor features and concat to feature dataframe

### Speed features

In [None]:
import preprocessing
import speed_feature

In [None]:
#TODO: extract speed features and concat to feature

### Load personality data

In [None]:
import personality_data

In [None]:
# Load the personality results of the first data batch.
personality_df = personality_data.personality_data_to_data_frame('../data/behavior_AND_personality_dataset/big5_personality_result.txt')
# Subject names with underscores removed (presumably to match the 'subject' labels of the
# joint recordings — verify).
names = [s.replace('_', '') for s in personality_df['name']]
# After the transpose, the original columns are rows and every subject is a column.
personality_df = personality_df.transpose()
#print(personality_df)
# Labels of the original columns from position 11 onwards.
# assumes the first 11 columns are the name plus ten raw question answers, as in the
# second batch (get_column_names()[:11]) — TODO confirm
cols = personality_df.index[11:]
#print(cols)
# NOTE(review): normalize_data receives the subjects as columns here, so it presumably
# normalizes each subject across its own score rows rather than each score across
# subjects — confirm this is intended.
personality_df = normalize_data(personality_df[11:], columns=personality_df.columns).transpose()
personality_df.columns = cols
personality_df['subject'] = names
personality_df.head()

In [None]:
# Parse the whitespace-separated personality file of the second data batch into rows of
# one text token (presumably the subject name) plus integer questionnaire answers.
new_rows = []
with open('../data/data_recordings_master/personality.txt', 'r') as f:
    for line in f:
        # split() without arguments already discards '\r', '\n' and repeated blanks.
        tokens = line.split()
        if not tokens:
            # Skip blank lines (e.g. a trailing empty line) so they do not end up as
            # empty rows in the frame.
            continue
        # Tokens starting with 's' are kept as text; everything else must be an integer.
        new_rows.append([v if v.startswith('s') else int(v) for v in tokens])

# The first 11 column names are used for the raw file columns.
columns = personality_data.get_column_names()[:11]
personality_df2 = pd.DataFrame(new_rows, columns=columns)
personality_df2.head()

In [None]:
# Each trait score combines two questionnaire items: one is used as answered, the other is
# subtracted from 5 first.
# NOTE(review): this looks like reverse-keying of one item per trait; on a 1-5 answer scale
# the usual reversal is 6 - x. The difference is a constant offset — confirm that the later
# normalization removes it.
answers = personality_df2
extraversion = (5 - answers['question_1'].values) + answers['question_6'].values
agreeableness = (5 - answers['question_7'].values) + answers['question_2'].values
conscientiousness = (5 - answers['question_3'].values) + answers['question_8'].values
neuroticism = (5 - answers['question_4'].values) + answers['question_9'].values
openess = (5 - answers['question_5'].values) + answers['question_10'].values

In [None]:
# Append the five trait scores as new columns, in this fixed order.
trait_columns = [
    ('extraversion', extraversion),
    ('agreeableness', agreeableness),
    ('conscientiousness', conscientiousness),
    ('neuroticism', neuroticism),
    ('openness_to_experience', openess),
]
for trait_name, trait_scores in trait_columns:
    personality_df2[trait_name] = trait_scores
personality_df2.head()

In [None]:
# Bring the second batch into the same layout as personality_df.
# NOTE(review): this cell is not re-runnable — after the reassignments below,
# personality_df2 no longer has a 'name' column, so a second run fails on the first line.
names = [s.replace('_', '') for s in personality_df2['name']]
# After the transpose, the original columns are rows and every subject is a column.
personality_df2 = personality_df2.transpose()
#print(personality_df)
# The frame was built with 11 raw columns before the trait columns were appended, so the
# labels from position 11 onwards are the five trait columns.
cols = personality_df2.index[11:]
#print(cols)
# NOTE(review): normalize_data receives the subjects as columns here, so it presumably
# normalizes each subject across its own five trait scores rather than each trait across
# subjects — confirm this is intended.
personality_df2 = normalize_data(personality_df2[11:], columns=personality_df2.columns).transpose()
personality_df2.columns = cols
personality_df2['subject'] = names
personality_df2.head()

In [None]:
# Stack both batches row-wise; the original row labels are kept, so labels may repeat.
personality_full = pd.concat([personality_df, personality_df2], axis=0)
personality_full.head()

## Correlations

In [None]:
# Attach the personality scores to the selected features. merge() defaults to an inner
# join, so subjects missing from either table are dropped.
feat_pers_df = all_features[var_features+['subject']].merge(personality_full, on = 'subject')
# Names of the five trait columns as they were added to personality_df2.
big5 = ['extraversion', 'agreeableness', 'conscientiousness', 'neuroticism', 'openness_to_experience']

In [None]:
# Correlation of every column with the five trait columns (the traits themselves also
# appear as rows).
feat_pers_df.corr()[big5]

In [None]:
# Annotated heatmap of the absolute correlations between all columns and the five traits.
fig = plt.figure(figsize=(20, 20))
big5_abs_corr = feat_pers_df.corr()[big5].abs()
sns.heatmap(big5_abs_corr, annot=True)