In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from tslearn.metrics import dtw, soft_dtw
from tslearn.utils import to_time_series_dataset
from sklearn.utils.validation import _check_large_sparse
from tslearn.clustering import TimeSeriesKMeans
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from sklearn.metrics import pairwise_distances_argmin_min, jaccard_score, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

In [2]:
def boolean_df(item_lists, unique_items):
# Create empty dict
    bool_dict = {}
    
    # Loop through all the tags
    for i, item in enumerate(unique_items):
        
        # Apply boolean mask
        bool_dict[item] = item_lists.apply(lambda x: 1 if item in x else 0)
            
    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)

def to_1D(series):
    return pd.Series([x for _list in series for x in _list])

In [3]:
X_df = pd.read_csv('../new_data/NA/na_dataset.csv', index_col=None)
Y_df = pd.read_csv('../new_data/NA/na_labels.csv', usecols=['filename', 'emotions'], index_col='filename')
Y_df["emotions"] = Y_df["emotions"].apply(eval)
unique_items = to_1D(Y_df["emotions"]).unique()
labels_expanded = boolean_df(Y_df['emotions'], unique_items)
labels_expanded

Unnamed: 0_level_0,none,furious,anger,annoyed,contempt,disgust,hatred
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
na/vid_1.mp4,1,0,0,0,0,0,0
na/vid_100.mp4,0,1,0,0,0,0,0
na/vid_101.mp4,0,1,0,0,0,0,0
na/vid_102.mp4,0,0,1,0,0,0,0
na/vid_104.mp4,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...
na/vid_90.mp4,0,0,1,0,0,0,0
na/vid_92.mp4,0,0,1,0,0,0,0
na/vid_93.mp4,0,0,1,0,0,0,0
na/vid_95.mp4,0,0,1,0,0,0,1


In [4]:
labels_expanded.loc['na/vid_1.mp4']['none':'hatred'].to_list()

for c in labels_expanded.columns: 
    X_df[c]=np.nan 


In [5]:
X_df.head()

Unnamed: 0,filename,culture,frame,face_id,timestamp,confidence,success,AU01_r,AU02_r,AU04_r,...,pose_Rz,gaze_angle_x,gaze_angle_y,none,furious,anger,annoyed,contempt,disgust,hatred
0,na/vid_1.mp4,north american,1,0,0.0,0.98,1,1.45,1.86,0.0,...,-0.063,0.107,0.353,,,,,,,
1,na/vid_1.mp4,north american,2,0,0.017,0.98,1,1.5,1.98,0.0,...,-0.064,0.101,0.347,,,,,,,
2,na/vid_1.mp4,north american,3,0,0.033,0.98,1,1.57,1.98,0.0,...,-0.065,0.1,0.348,,,,,,,
3,na/vid_1.mp4,north american,4,0,0.05,0.98,1,1.56,1.99,0.0,...,-0.065,0.1,0.348,,,,,,,
4,na/vid_1.mp4,north american,5,0,0.067,0.98,1,1.4,1.94,0.0,...,-0.075,0.093,0.356,,,,,,,


In [6]:
for index, row in X_df.iterrows():
    # print(index, row)
    filename = X_df.iloc[index]['filename']
    # print(labels_expanded.loc[filename]['none':'hatred'].to_list())
    X_df.at[index,'none'] = labels_expanded.at[filename,'none']
    X_df.at[index,'furious'] = labels_expanded.at[filename,'furious']
    X_df.at[index,'anger'] = labels_expanded.at[filename,'anger']
    X_df.at[index,'annoyed'] = labels_expanded.at[filename,'annoyed']
    X_df.at[index,'contempt'] = labels_expanded.at[filename,'contempt']
    X_df.at[index,'disgust'] = labels_expanded.at[filename,'disgust']
    X_df.at[index,'hatred'] = labels_expanded.at[filename,'hatred']

### Min-Max Scaling

In [7]:
X_df.describe()

Unnamed: 0,frame,face_id,timestamp,confidence,success,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,...,pose_Rz,gaze_angle_x,gaze_angle_y,none,furious,anger,annoyed,contempt,disgust,hatred
count,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,...,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0,6876.0
mean,62.283886,0.172484,2.202004,0.964308,1.0,0.68962,0.905599,0.453013,0.536358,0.550711,...,0.01255,0.010423,0.318281,0.104421,0.115765,0.199389,0.385835,0.173211,0.195172,0.017597
std,54.266866,0.430042,1.992598,0.033154,0.0,0.743647,1.202467,0.69554,0.800871,0.701084,...,0.160888,0.314508,0.164934,0.305828,0.319966,0.39957,0.486827,0.378457,0.396362,0.131493
min,1.0,0.0,0.0,0.88,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.564,-0.864,-0.544,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.0,0.0,0.767,0.98,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.09,-0.183,0.193,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48.0,0.0,1.635,0.98,1.0,0.49,0.18,0.0,0.0,0.255,...,0.017,-0.015,0.3175,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,84.0,0.0,2.97,0.98,1.0,1.19,1.57,0.81,0.89,0.9,...,0.104,0.222,0.44,0.0,0.0,0.0,1.0,0.0,0.0,0.0
max,301.0,2.0,10.01,0.98,1.0,3.75,5.0,3.47,4.47,3.6,...,1.51,0.977,0.825,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
cols_to_scale = list (
    set(X_df.columns.to_list()) - set(['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success', 'none', 'furious', 'anger', 'annoyed', 'contempt', 'disgust', 'hatred'])
)
scaler = MinMaxScaler()
X_df[cols_to_scale] = scaler.fit_transform(X_df[cols_to_scale])

In [13]:
X_df.tail()

Unnamed: 0,filename,culture,frame,face_id,timestamp,confidence,success,AU01_r,AU02_r,AU04_r,...,pose_Rz,gaze_angle_x,gaze_angle_y,none,furious,anger,annoyed,contempt,disgust,hatred
6871,na/vid_97.mp4,north american,92,0,3.792,0.88,1,0.176,0.0,0.778098,...,0.299421,0.241173,0.589481,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6872,na/vid_97.mp4,north american,93,0,3.833,0.88,1,0.0,0.0,0.789625,...,0.305689,0.247148,0.584368,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6873,na/vid_97.mp4,north american,94,0,3.875,0.88,1,0.021333,0.0,0.821326,...,0.3081,0.248778,0.579985,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6874,na/vid_97.mp4,north american,95,0,3.917,0.88,1,0.0,0.0,0.853026,...,0.302797,0.239001,0.598977,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6875,na/vid_97.mp4,north american,96,0,3.958,0.88,1,0.0,0.0,0.884726,...,0.296528,0.24063,0.604091,0.0,1.0,0.0,0.0,0.0,0.0,0.0


## Splitting into train and test

In [57]:
metadata_cols = ['frame', 'face_id', 'culture', 'filename', 'timestamp']
print(X_df.head())
videos = X_df['filename'].unique()
test_videos = pd.Series(videos).sample(frac=0.20)
train_videos = np.array(list(set(videos) - set(test_videos)))
test_df = X_df[X_df['filename'].isin(test_videos)]
metadata_test = test_df[metadata_cols]
y_test = test_df[['none', 'furious', 'anger', 'annoyed', 'contempt', 'disgust', 'hatred']].values
X_test = test_df.drop(columns = ['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success']).values

       filename         culture  frame  face_id  timestamp  confidence  \
0  na/vid_1.mp4  north american      1        0      0.000        0.98   
1  na/vid_1.mp4  north american      2        0      0.017        0.98   
2  na/vid_1.mp4  north american      3        0      0.033        0.98   
3  na/vid_1.mp4  north american      4        0      0.050        0.98   
4  na/vid_1.mp4  north american      5        0      0.067        0.98   

   success    AU01_r  AU02_r  AU04_r  ...   pose_Rz  gaze_angle_x  \
0        1  0.386667   0.372     0.0  ...  0.241562      0.527431   
1        1  0.400000   0.396     0.0  ...  0.241080      0.524172   
2        1  0.418667   0.396     0.0  ...  0.240598      0.523628   
3        1  0.416000   0.398     0.0  ...  0.240598      0.523628   
4        1  0.373333   0.388     0.0  ...  0.235776      0.519826   

   gaze_angle_y  none  furious  anger  annoyed  contempt  disgust  hatred  
0      0.655223   1.0      0.0    0.0      0.0       0.0      0.

In [58]:
metadata_test

Unnamed: 0,frame,face_id,culture,filename,timestamp
388,1,0,north american,na/vid_11.mp4,0.000
389,2,0,north american,na/vid_11.mp4,0.033
390,3,0,north american,na/vid_11.mp4,0.067
391,4,0,north american,na/vid_11.mp4,0.100
392,5,0,north american,na/vid_11.mp4,0.133
...,...,...,...,...,...
6642,116,0,north american,na/vid_92.mp4,4.792
6643,117,0,north american,na/vid_92.mp4,4.833
6644,118,0,north american,na/vid_92.mp4,4.875
6645,119,0,north american,na/vid_92.mp4,4.917


## Cross-validation

In [59]:
from sklearn.model_selection import KFold

kfold = KFold(5, True, 1)


splits = kfold.split(train_videos)
for (i, (train, test)) in enumerate(splits):
    print('%d-th split: train: %d, test: %d' % (i+1, len(videos[train]), len(videos[test])))
    train_df = X_df[X_df['filename'].isin(videos[train])]
    train_metadata = train_df[metadata_cols]
    print('Training+validation data size: ', train_df.shape[0])
    y = train_df[['none', 'furious', 'anger', 'annoyed', 'contempt', 'disgust', 'hatred']].values
    X = train_df.drop(columns = ['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success']).values
    X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    print('Training data size: ', X_train.shape[0])
    print('Validation data size: ', X_valid.shape[0])
    base_knn =  KNeighborsClassifier(n_neighbors=5,)
    chains = [ClassifierChain(base_knn, order='random', random_state=i)
            for i in range(7)]
    for j, model in enumerate(chains):
        model.fit(X_train, y_train)
        # valid_pred = model.predict(X_valid)
        # val_score =jaccard_score(y_valid, valid_pred, average='samples')
        # print('Validation score in model %d: %d' % (j, val_score) )
        
    # predict on validation data
    valid_pred_chains = np.array([chain.predict(X_valid) for chain in
                            chains])
    chain_accuracy_scores = [jaccard_score(y_valid, valid_pred_chain,
                                    average='micro')
                    for valid_pred_chain in valid_pred_chains]
    
    print("Validation Jaccard Score:\n ", chain_accuracy_scores)
    # test on test data
    Y_pred_chains = np.array([chain.predict(X_test) for chain in
                            chains])
    chain_accuracy_scores = [jaccard_score(y_test, Y_pred_chain,
                                    average='micro')
                    for Y_pred_chain in Y_pred_chains]

    print("Test Jaccard Score: \n ", chain_accuracy_scores)


      



1-th split: train: 60, test: 15
Training+validation data size:  4494
Training data size:  3370
Validation data size:  1124
Validation Jaccard Score:  [array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.]), array([1., 1., 1., 1., 1., 1., 0.])]
Test Jaccard Score:  [array([1.        , 1.        , 1.        , 1.        , 0.75471698,
       1.        , 0.        ]), array([1.        , 1.        , 1.        , 1.        , 0.75471698,
       1.        , 0.        ]), array([1.        , 1.        , 1.        , 1.        , 0.75471698,
       1.        , 0.        ]), array([1.        , 1.        , 1.        , 1.        , 0.75471698,
       1.        , 0.        ]), array([1.        , 1.        , 1.        , 1.        , 0.75471698,
       1.        , 0.        ]), array([1.        , 1.        , 1.        , 1.        , 0.75471698,
 

In [56]:
print(Y_pred_chains[3][200:205])

[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]]


In [54]:
print(y_test[200:205])

[[0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0.]]


## Break data into chunks of 50 frames or less

In [None]:
grouped = X_df.groupby(by=['filename', 'face_id'])
## Separating test data
test_ts_list = list()
test_metadata = list()
# X_list is video/face frames, divided into 50 frames chunks
X_list = []
Y_list = []
metadata = []
frame_limit = 50
for key in grouped.groups:
    X_group = grouped.get_group(key)
    # X_group = X_group.drop(['frame', 'face_id', 'culture', 'filename', 'emotion', 'confidence','success'], axis=1)
    if len(X_group) >= frame_limit:
        splitted_group = np.array_split(X_group, math.ceil(len(X_group) / frame_limit))
        for g in splitted_group:
            X_list.append(g.drop(['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'], axis=1).to_numpy())
            metadata.append({'filename': g.loc[g.index[0], 'filename'], 'face_id':g.loc[g.index[0], 'face_id']})
            Y_list.append(Y_df.loc[g.loc[g.index[0], 'filename']].to_list())
    else:
        X_list.append(X_group.drop(['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'], axis=1).to_numpy())
        metadata.append({'filename': X_group.loc[X_group.index[0], 'filename'],  'face_id':X_group.loc[X_group.index[0], 'face_id']})
        Y_list.append(Y_df.loc[g.loc[g.index[0], 'filename']].to_list())

In [None]:
X_ts = to_time_series_dataset(X_list)

n_series = len(X_ts)
distance_matrix = np.zeros(shape=(n_series, n_series))

# Build distance matrix
for i in range(n_series):
    for j in range(n_series):
        x = X_ts[i]
        y = X_ts[j]
        if i != j:
            dist = soft_dtw(x, y)
            distance_matrix[i, j] = dist

https://scikit-learn.org/stable/modules/multiclass.html#classifierchain


In [None]:
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay
X_train, X_test, Y_train, Y_test = train_test_split(X_ts, Y_list, test_size=.2,
                                                    random_state=0)

base_knn =  KNeighborsClassifier(n_neighbors=5)
chains = [ClassifierChain(base_knn, order='random', random_state=i)
          for i in range(7)]
for model in chains:
    model.fit(X_train, Y_train)

Y_pred_chains = np.array([chain.predict(X_test) for chain in
                          chains])



In [None]:
chain_accuracy_scores = [jaccard_score(Y_test, Y_pred_chain,
                                      average='samples')
                        for Y_pred_chain in Y_pred_chains]
chain_f1_scores = [f1_score(Y_test, Y_pred_chain,  average='samples')
                        for Y_pred_chain in Y_pred_chains]

# multilabel_confusion_matrix(y_true, y_pred, samplewise=True)

In [None]:

    # kf = KFold(n_splits=5)
    # kf.get_n_splits(X)

    # for train_index, test_index in kf.split(X):

    #     X_train, X_test = X.iloc[train_index, train_index], X.iloc[test_index, train_index]
    #     Y_train, Y_test = Y[train_index], Y[test_index]

    #     model = KNeighborsClassifier(n_neighbors=3, metric='precomputed')
    #     model.fit(X_train, Y_train)
    #     predictions = model.predict(X_test)

    #     print(classification_report(Y_test,predictions))