In [1]:
import pandas as pd
import os
import numpy as np

In [2]:
def data_path(i):
    """Build the relative path of the CSV file for video number ``i``."""
    template = '../_data/realeyes_dsc_homework_video_{}.csv'
    return template.format(i)


def load_data(path):
    """Read the three per-video CSV files and combine them.

    ``path`` is a callable that maps a video number (1, 2, 3) to a CSV path.

    Returns a 4-tuple ``(df, df1, df2, df3)``: the concatenation of the three
    files sorted by video, subject, frame number and timestamp (with a fresh
    0..n-1 index), followed by the three individual frames in file order.
    """
    per_video = [pd.read_csv(path(video_no)) for video_no in (1, 2, 3)]

    sort_keys = ['video_id', 'subject_id', 'frame_no', 'millisecond_from_start']
    combined = pd.concat(per_video, ignore_index=True)
    combined = combined.sort_values(by=sort_keys, ignore_index=True)

    return (combined, *per_video)


## Load data

In [3]:
# read the three video files; keep df1's column names for the per-column checks

df_all, df1, df2, df3 = load_data(data_path)
columns = df1.columns.tolist()

df_all.head()

Unnamed: 0,video_id,subject_id,frame_no,millisecond_from_start,positive_1,positive_2,negative_1,negative_2,negative_3
0,70728,12433421,1435,39,0.0,0.0,0.0,0.0,0.0
1,70728,12433421,1436,104,0.0,0.0,0.0,0.0,0.0
2,70728,12433421,1437,169,0.0,0.0,0.0,0.0,0.0
3,70728,12433421,1438,236,0.0,0.0,0.0,0.0,0.0
4,70728,12433421,1439,302,0.0,0.0,0.0,0.0,0.0


In [4]:
# each file corresponds to only one video: show the distinct video_id values per file

print(*(set(frame.video_id) for frame in (df1, df2, df3)))

{'67928', 'No value'} {'70727', 'No value'} {70728}


In [5]:
# each subject watched only one video, so each video can be treated separately:
# print the subject ids shared by files 1 & 2, then by files 2 & 3

for left, right in ((df1, df2), (df2, df3)):
    print(set(left.subject_id).intersection(right.subject_id))

{'No value'}
set()


**Empty values**

There are no NaN values; missing values are labeled as 'No value'.

In [6]:
# True if any cell of the combined frame is NaN
df_all.isna().to_numpy().any()

False

The only non-numeric value is 'No value'.


Within each of the column groups {video_id, subject_id} and {emotions}, values are missing ('No value') together. For example,
if video_id = 'No value', then subject_id = 'No value' as well.

In [7]:

def no_value(df, columns=None):
    """Print the non-numeric values of each column and their share of rows.

    For every inspected column one line is printed: the list of distinct
    values that cannot be parsed as numbers, followed by the percentage of
    rows holding such values (rounded to a whole percent).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    columns : iterable of str, optional
        Columns to check. Defaults to every column of ``df``, so the function
        does not rely on a notebook-level ``columns`` variable being defined.

    Notes
    -----
    Values that are already NaN are reported too, because the criterion is
    "does not survive ``pd.to_numeric(..., errors='coerce')``".
    """
    if columns is None:
        columns = df.columns

    n_rows = len(df)
    for column in columns:
        non_numeric = pd.to_numeric(df[column], errors='coerce').isna()
        # .loc with a boolean mask replaces the chained lookup df[mask][column]
        offending = df.loc[non_numeric, column]
        # an empty frame has no rows to take a share of; report 0% instead of
        # raising ZeroDivisionError
        share = len(offending) / n_rows * 100 if n_rows else 0
        print(list(set(offending)), "{}%".format(round(share)))

In [8]:
# report the non-numeric values per column, file by file and for the combined frame
for label, frame in (('video 1', df1), ('video 2', df2), ('video 3', df3), ('video all', df_all)):
    print(label)
    no_value(frame)
    print('')

video 1
['No value'] 1%
['No value'] 1%
[] 0%
[] 0%
['No value'] 23%
['No value'] 23%
['No value'] 23%
['No value'] 23%
['No value'] 23%

video 2
['No value'] 1%
['No value'] 1%
[] 0%
[] 0%
['No value'] 26%
['No value'] 26%
['No value'] 26%
['No value'] 26%
['No value'] 26%

video 3
[] 0%
[] 0%
[] 0%
[] 0%
['No value'] 29%
['No value'] 29%
['No value'] 29%
['No value'] 29%
['No value'] 29%

video all
['No value'] 1%
['No value'] 1%
[] 0%
[] 0%
['No value'] 27%
['No value'] 27%
['No value'] 27%
['No value'] 27%
['No value'] 27%



Replace missing values in video_id and subject_id with the preceding values.

Remove entries with missing emotions.

In [None]:
# replace 'No value' in video_id & subject_id by preceding value

df = df_all.copy()

for col in ['video_id', 'subject_id']:
    # 'No value' (and anything else non-numeric) becomes NaN ...
    ids = pd.to_numeric(df[col], errors='coerce')
    # ... and is filled from the closest preceding row. The result is assigned
    # back rather than using `df[col].fillna(method='ffill', inplace=True)`:
    # that chained in-place call does not update `df` under pandas
    # Copy-on-Write, and `fillna(method=...)` is deprecated.
    ids = ids.ffill()
    # NOTE(review): df_all comes out of load_data sorted by video_id/subject_id,
    # so "preceding" means preceding in sorted order, not in file order -
    # confirm this is the intended neighbour.
    # int64 keeps large ids intact before they are turned into string labels.
    df[col] = ids.astype('int64').astype('str')

Look at the distribution of missing values across subjects.

In [None]:
# 1 where positive_1 is 'No value', 0 otherwise; then total the flags per subject
df['no_value'] = df['positive_1'].astype(str).eq('No value').astype('int64')
df['num_missing_values_per_subject'] = df.groupby('subject_id')['no_value'].transform('sum')

In [None]:
# display the working frame, including the no_value and
# num_missing_values_per_subject helper columns
df

In [None]:
def num_of_subjects(df):
    """Return the number of distinct subjects in ``df``.

    Only ids that parse as numbers are counted: the 'No value' placeholder
    (and NaN) is not a subject and must not inflate the sample size. A numeric
    string and an integer that denote the same id are counted once.
    """
    subject_ids = pd.to_numeric(df['subject_id'], errors='coerce')
    # nunique() skips NaN, i.e. the coerced placeholders
    return subject_ids.nunique()
    
print('Sample size per each video:', *(num_of_subjects(frame) for frame in (df1, df2, df3)))

## Process data