# Data Pre-processing

In [None]:
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Apply seaborn's default theme so every matplotlib figure below uses it
sns.set()

### Load data

In [None]:
# Collect the raw result files. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to make the concatenation order
# (and therefore the row order of the saved output) reproducible.
all_files = sorted(glob.glob('./*.txt'))
if not all_files:
    # pd.concat on an empty list fails with an opaque "No objects to
    # concatenate" error; report the actual problem instead.
    raise FileNotFoundError("No '*.txt' data files found in the current directory")

# Each file is read as a headerless, comma-separated table; stack them into
# a single dataframe with a fresh 0..n-1 index.
frames = [pd.read_csv(filename, sep=",", header=None) for filename in all_files]
df = pd.concat(frames, axis=0, ignore_index=True)
df.columns = ["id", "idx", "filename", "label", "time"]
# Optional: uncomment to keep only the rows whose id is 101 or 213
# df = df.loc[~((df['id'] != 101) & (df['id'] != 213))]
df.head()

### Filter data

In [None]:
# Keep only rows whose reaction time lies strictly between 0.2 and 1;
# values outside this window are discarded as too low or too high.
time_mask = (df['time'] > 0.2) & (df['time'] < 1)

# Take an explicit copy: later cells add columns with df.loc[...] = ...,
# and writing into a boolean-indexed slice of the unfiltered dataframe
# triggers pandas' SettingWithCopyWarning.
df = df[time_mask].copy()

### Get normalized reaction times

In [None]:
# Add a 'time_norm' column holding the reaction times min-max scaled
# separately within each id (the scaler is refit for every id).
scaler = MinMaxScaler()
for current_id in df['id'].unique():
    rows = df['id'] == current_id
    times = df.loc[rows, ['time']]
    # fit_transform learns this id's min/max and rescales in one step
    df.loc[rows, 'time_norm'] = scaler.fit_transform(times)

### Create the continuous index

In [None]:
# Rows labelled 'Smiling' get a non-positive index, all other rows a
# non-negative one, so the combined index spans -1 to 1.
smile_mask = df['label'] == 'Smiling'

df['cidx'] = np.where(
    smile_mask,
    df['time_norm'] - 1,   # smiling:     time_norm - 1
    1 - df['time_norm'],   # not smiling: 1 - time_norm
)
df.head()

In [None]:
# Histogram of the continuous index over all rows, saved as cidx.png
plt.figure()
ax = df['cidx'].plot.hist(bins=50, title="Continuous Index")
ax.set_xlabel('continuous index value')
plt.savefig('cidx.png', dpi=None, facecolor='w', edgecolor='w')
plt.show()

### Average the continuous index across the different subjects

In [None]:
# Mean continuous index per filename: the result is indexed by
# 'filename' and has a single 'cidx' column.
df_avg = df.groupby('filename')[['cidx']].mean()
df_avg.head()

In [None]:
# Plot the histogram of the averaged continuous index and save it as
# cidx_avg.png
plt.figure()
df_avg['cidx'].plot(
    kind='hist', bins=50,
    title="Avg. Continuous Index",
)
# The x-axis label is set exactly once, here; no xlabel is passed to plot()
# because this call would overwrite it anyway.
plt.xlabel('continuous index value')
plt.savefig('cidx_avg.png', dpi=None, facecolor='w', edgecolor='w')
plt.show()

### Save to csv

In [None]:
# Persist both results as CSV files.
# The averaged table keeps its 'filename' index as the first column
# (to_csv writes the index by default).
df_avg.to_csv("cidx_avg.csv")
# The full processed table is written without its positional index.
df.to_csv("processed_data.csv", index=False)