In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib.pyplot as plt
from tslearn.metrics import dtw, soft_dtw
from tslearn.utils import to_time_series_dataset
from sklearn.utils.validation import _check_large_sparse
from tslearn.clustering import TimeSeriesKMeans
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from sklearn.metrics import pairwise_distances_argmin_min, jaccard_score, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import ClassifierChain
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay

In [2]:
def boolean_df(item_lists, unique_items):
# Create empty dict
    bool_dict = {}
    
    # Loop through all the tags
    for i, item in enumerate(unique_items):
        
        # Apply boolean mask
        bool_dict[item] = item_lists.apply(lambda x: 1 if item in x else 0)
            
    # Return the results as a dataframe
    return pd.DataFrame(bool_dict)

def to_1D(series):
    return pd.Series([x for _list in series for x in _list])

In [127]:
persian_emoji_map = {'neutral': 'none', 'smirk':'contempt', 'furious':'furious', 'weary':'annoyed', 'expressionless':'none', 'unamused': 'annoyed', 'rollingeyes':'contempt', 'none':'none', 'skeptical':'none', 'angry':'anger', 'nauseated':'disgust', 'vomiting':'disgust', 'triumph':'anger', 'hatred':'hatred'}
na_emoji_map = {'neutral': ['none'], 'smirk':['contempt'], 'furious':['furious', 'anger'], 'weary':['disgust'], 'expressionless':['contempt','annoyed'], 'unamused': ['annoyed', 'contempt'], 'rollingeyes':['annoyed'], 'none':['none'], 'skeptical':['none'], 'angry':['anger', 'annoyed'], 'nauseated':['disgust'], 'vomiting':['disgust'], 'triumph':['anger'], 'hatred':['hatred', 'furious']}

In [115]:
X_df = pd.read_csv('../new_data/NA/na_dataset.csv', index_col=None)
Y_df = pd.read_csv('../new_data/NA/na_emoji_labels.csv', usecols=['filename', 'emoji'], index_col='filename')
emotion_df = pd.read_csv('../new_data/NA/na_labels.csv', usecols=['filename', 'emotions'], index_col='filename')

Y_df["emoji"] = Y_df["emoji"].apply(eval)
emotion_df["emotions"] = emotion_df["emotions"].apply(eval)
label_cols = to_1D(Y_df["emoji"]).unique() 
emotion_cols = to_1D(emotion_df["emotions"]).unique() 
# label_cols.append('smilingimp')
labels_expanded = boolean_df(Y_df['emoji'], label_cols )
emotions_expanded = boolean_df(emotion_df['emotions'], emotion_cols )

order = [2, 11, 9, 13, 10, 12, 3, 4, 5, 6, 8, 7, 0]

In [113]:
emotion_cols

array(['none', 'furious', 'anger', 'annoyed', 'contempt', 'disgust',
       'hatred'], dtype=object)

In [None]:
labels_expanded.loc['na/vid_5.mp4']

In [None]:
for l in label_cols:
    X_df[label_cols]  = np.NaN
    

In [None]:
X_df.head()

In [None]:
label_cols.tolist()

In [None]:
print(labels_expanded.at['na/vid_5.mp4', 'hatred'])

In [116]:
for index, row in X_df.iterrows():
    # print(index, row)
    filename = X_df.iloc[index]['filename']
    # print(filename)
    for l in label_cols.tolist():
        try:
    # print(labels_expanded.loc[filename]['none':'hatred'].to_list())
            X_df.at[index, l] = labels_expanded.at[filename, l]
        except Exception:
            # print("here")
            print('expanded labels: filename: {}, label: {}, {}'.format(filename, l, labels_expanded.at[filename, l]))
            print(X_df.at[index, l])
        

In [117]:
for index, row in X_df.iterrows():
    # print(index, row)
    filename = X_df.iloc[index]['filename']
    # print(filename)
    for l in emotion_cols.tolist():
        try:
    # print(labels_expanded.loc[filename]['none':'hatred'].to_list())
            X_df.at[index, l] = emotions_expanded.at[filename, l]
        except Exception:
            # print("here")
            print('expanded labels: filename: {}, label: {}, {}'.format(filename, l, emotions_expanded.at[filename, l]))
            print(X_df.at[index, l])

In [118]:
X_df.head()

Unnamed: 0,filename,culture,frame,face_id,timestamp,confidence,success,AU01_r,AU02_r,AU04_r,...,skeptical,angry,nauseated,vomiting,triumph,hatred,anger,annoyed,contempt,disgust
0,na/vid_1.mp4,north american,1,0,0.0,0.98,1,1.45,1.86,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,na/vid_1.mp4,north american,2,0,0.017,0.98,1,1.5,1.98,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,na/vid_1.mp4,north american,3,0,0.033,0.98,1,1.57,1.98,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,na/vid_1.mp4,north american,4,0,0.05,0.98,1,1.56,1.99,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,na/vid_1.mp4,north american,5,0,0.067,0.98,1,1.4,1.94,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Min-Max Scaling

In [None]:
## Ablation cols
ablation_cols = ['AU01_r','AU02_r','AU04_r','AU05_r','AU06_r','AU07_r','AU09_r', 'AU10_r','AU12_r','AU14_r','AU15_r','AU17_r','AU20_r','AU23_r','AU25_r','AU26_r','AU45_r']

In [None]:
X_df = X_df.drop(columns=ablation_cols)
X_df

In [119]:
cols_to_scale = list (
    set(X_df.columns.to_list()) - set(['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'])- set(label_cols)-set(emotion_cols)
    )
scaler = MinMaxScaler()
X_df[cols_to_scale] = scaler.fit_transform(X_df[cols_to_scale])

In [None]:
X_df.tail()

## Splitting into train and test

In [None]:
['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'] + label_cols.tolist()

In [120]:
metadata_cols = ['frame', 'face_id', 'culture', 'filename', 'timestamp']
cols_to_drop = ['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'] + label_cols.tolist() + emotion_cols.tolist()
print(X_df.head())
videos = X_df['filename'].unique()
# test_videos = pd.Series(videos).sample(frac=0.35)
test_videos = ['na/vid_52.mp4', 'na/vid_13.mp4', 'na/vid_54.mp4', 'na/vid_92.mp4', 'na/vid_48.mp4', 'na/vid_93.mp4', 'na/vid_6.mp4', 'na/vid_50.mp4', 'na/vid_14.mp4', 'na/vid_10_1.mp4', 'na/vid_34.mp4', 'na/vid_86.mp4', 'na/vid_83.mp4', 'na/vid_90.mp4', 'na/vid_55.mp4', 'na/vid_60.mp4', 'na/vid_24.mp4', 'na/vid_10_3.mp4', 'na/vid_10_2.mp4', 'na/vid_87.mp4', 'na/vid_32.mp4', 'na/vid_79.mp4', 'na/vid_68.mp4', 'na/vid_56.mp4']
# test_videos = ['persian/vid_52.mp4', 'persian/vid_40.mp4', 'persian/vid_66.mp4', 'persian/vid_18.mp4', 'persian/vid_96.mp4', 'persian/vid_65.mp4', 'persian/vid_51.mp4', 'persian/vid_85.mp4', 'persian/vid_87.mp4', 'persian/vid_70.mp4', 'persian/vid_76.mp4', 'persian/vid_27.mp4', 'persian/vid_88.mp4', 'persian/vid_21.mp4', 'persian/vid_4.mp4', 'persian/vid_61.mp4', 'persian/vid_12.mp4', 'persian/vid_46.mp4', 'persian/vid_81.mp4', 'persian/vid_93.mp4', 'persian/vid_56.mp4', 'persian/vid_55.mp4', 'persian/vid_68.mp4', 'persian/vid_38.mp4', 'persian/vid_62.mp4', 'persian/vid_14.mp4', 'persian/vid_8.mp4', 'persian/vid_83.mp4', 'persian/vid_45.mp4', 'persian/vid_91.mp4', 'persian/vid_22.mp4', 'persian/vid_36.mp4', 'persian/vid_7.mp4']
train_videos = np.array(list(set(videos) - set(test_videos)))
test_df = X_df[X_df['filename'].isin(test_videos)]
metadata_test = test_df[metadata_cols]
y_test = test_df[label_cols].values
y_test_emotions = test_df[emotion_cols]
X_test = test_df.drop(columns = cols_to_drop).values

       filename         culture  frame  face_id  timestamp  confidence  \
0  na/vid_1.mp4  north american      1        0      0.000        0.98   
1  na/vid_1.mp4  north american      2        0      0.017        0.98   
2  na/vid_1.mp4  north american      3        0      0.033        0.98   
3  na/vid_1.mp4  north american      4        0      0.050        0.98   
4  na/vid_1.mp4  north american      5        0      0.067        0.98   

   success    AU01_r  AU02_r  AU04_r  ...  skeptical  angry  nauseated  \
0        1  0.386667   0.372     0.0  ...        0.0    0.0        0.0   
1        1  0.400000   0.396     0.0  ...        0.0    0.0        0.0   
2        1  0.418667   0.396     0.0  ...        0.0    0.0        0.0   
3        1  0.416000   0.398     0.0  ...        0.0    0.0        0.0   
4        1  0.373333   0.388     0.0  ...        0.0    0.0        0.0   

   vomiting  triumph  hatred  anger  annoyed  contempt  disgust  
0       0.0      0.0     0.0    0.0      0.0

In [None]:
print(test_videos.to_list())

In [None]:
metadata_test.iloc[800:805]

## Cross-validation

In [121]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

kfold = KFold(5, True, 1)
frames_mean_hm_test = []
frames_mean_jac_test = []
videos_mean_jac_test = []
videos_mean_hm_test = []
# metadata_test.reset_index(inplace=True)
order = [2, 11, 9, 13, 10, 12, 3, 4, 5, 6, 8, 7, 0]
col_indices = {i:label for (i,label) in enumerate(label_cols)}

splits = kfold.split(train_videos)
for (i, (train, test)) in enumerate(splits):
    # print(videos[train])
    # print(videos[test])
    print('%d-th split: train: %d, test: %d' % (i+1, len(train_videos[train]), len(train_videos[test])))
    train_df = X_df[X_df['filename'].isin(train_videos[train])]
    print(train_videos[train])
    train_metadata = train_df[metadata_cols]
    print('Training+validation data size: ', train_df.shape[0])
    y_train = train_df[label_cols].values
    X_train = train_df.drop(columns = cols_to_drop).values
    valid_df = X_df[X_df['filename'].isin(train_videos[test])]
    y_valid = valid_df[label_cols].values
    X_valid = valid_df.drop(columns = cols_to_drop).values
    # X_train, X_valid, y_train, y_valid = train_test_split(X, y)
    print('Training data size: ', X_train.shape[0])
    print('Validation data size: ', X_valid.shape[0])
    base_knn =  KNeighborsClassifier(n_neighbors=5,)
    base_lr = LogisticRegression()
    base_rf = RandomForestClassifier()
    base_xgb = XGBClassifier(objective="binary:logistic", eval_metric='logloss')

    multiclass_xg = MultiOutputClassifier(XGBClassifier(objective='binary:logistic', eval_metric='auc'))
    # ovr = LogisticRegression()
    multiclass_xg.fit(X_train, y_train)
    valid_pred_xgb = multiclass_xg.predict(X_valid)
    ovr_jaccard_score = jaccard_score(y_valid, valid_pred_xgb, average='samples') # TODO
    ovr_ham_loss = metrics.hamming_loss(y_valid, valid_pred_xgb)
    print("XGB-multi validation Jaccard score:" , ovr_jaccard_score)
    print("XGB-multi validation Hamming loss: " , ovr_ham_loss)

    Y_pred_ovr = multiclass_xg.predict(X_test)
    a = jaccard_score(y_test, Y_pred_ovr, average='samples') #TODO
    b = metrics.hamming_loss(y_test, Y_pred_ovr)
    # print(Y_pred_ovr[800:805,:])
    # print(y_test[800:805,:])

    print("XGB-multi test Jaccard score: ", a)
    print("XGB-multi test Hamming loss: " , b)
    chains = [ClassifierChain(base_xgb, order='random', random_state=i) for i in range(15)]
    best_model_index = 0
    best_jac = 0
    for j, model in enumerate(chains):
        model.fit(X_train, y_train)
        valid_pred = model.predict(X_valid)
        val_score =jaccard_score(y_valid, valid_pred, average='samples')
        if val_score > best_jac:
            best_model_index = j
            best_jac = val_score
        
        
    # predict on validation data
    valid_pred_chains = chains[best_model_index].predict(X_valid)
    chain_jaccard_scores = jaccard_score(y_valid, valid_pred_chains >= .5,
                                    average='samples')
                    
    
    print("CC Validation Jaccard Score:\n ", chain_jaccard_scores)

    chain_hamming_loss = metrics.hamming_loss(y_valid, valid_pred_chains >= .5)
                
    print("CC Validation Hamming Loss:\n ", chain_hamming_loss)

    # test on test data
    Y_pred_chains = chains[best_model_index].predict(X_test)
    chain_jaccard_scores = jaccard_score(y_test, Y_pred_chains >= .5,
                                    average='samples')
                    
    frames_mean_jac_test.append(np.mean(chain_jaccard_scores))
    print("CC Test Jaccard Score: \n ", chain_jaccard_scores)

    chain_hamming_loss = metrics.hamming_loss(y_test, Y_pred_chains) 
               
    frames_mean_hm_test.append(chain_hamming_loss)
    print("CC Test Hamming Loss:\n ", chain_hamming_loss)
    ## voting predicted labels
    # test_result_df = pd.DataFrame(columns=metadata_cols, data=metadata_test.values)
    # test_result_df.update(metadata_test)
    temp_df = pd.DataFrame(data=Y_pred_chains, columns=label_cols)

    # TODO: map emojis to emotions using temp_df

    test_result_df = pd.concat([metadata_test, temp_df], axis=1)
    video_groups = test_result_df.groupby('filename')[label_cols].sum()
    ground_truth_video_labels = []
    for v in video_groups.index.to_list():
        # number of 1s in ground truth labels
        ground_truth_video_labels.append(test_df[test_df['filename'] == v].iloc[0][label_cols])
        
        num_1s = test_df[test_df['filename'] == v].iloc[0][label_cols].sum()
        num_1s = int(num_1s)
        a = np.argsort(video_groups.loc[v].values)
        for i in range(len(a) - 1, len(a) - num_1s - 1, -1):
            video_groups.loc[v][a[i]] = 1
        for i in range(0, len(a) - num_1s):
            video_groups.loc[v][a[i]] = 0
        # print("          &&&&&&&&&&&&&&&&            ")
    print(np.array(ground_truth_video_labels,  dtype=int).shape)
    print(video_groups.values.shape)

    j = metrics.jaccard_score(np.array(ground_truth_video_labels,  dtype=int), video_groups.values, average='samples')
    h = metrics.hamming_loss(np.array(ground_truth_video_labels,  dtype=int), video_groups.values)
    videos_mean_jac_test.append(j)
    videos_mean_hm_test.append(h)

1-th split: train: 56, test: 14
['na/vid_15.mp4' 'na/vid_44.mp4' 'na/vid_69.mp4' 'na/vid_26.mp4'
 'na/vid_38.mp4' 'na/vid_21.mp4' 'na/vid_33.mp4' 'na/vid_1.mp4'
 'na/vid_20.mp4' 'na/vid_74.mp4' 'na/vid_43.mp4' 'na/vid_51.mp4'
 'na/vid_31.mp4' 'na/vid_46.mp4' 'na/vid_95.mp4' 'na/vid_62.mp4'
 'na/vid_11.mp4' 'na/vid_75.mp4' 'na/vid_67.mp4' 'na/vid_81.mp4'
 'na/vid_104.mp4' 'na/vid_84.mp4' 'na/vid_45.mp4' 'na/vid_23.mp4'
 'na/vid_25.mp4' 'na/vid_4.mp4' 'na/vid_85.mp4' 'na/vid_77.mp4'
 'na/vid_80.mp4' 'na/vid_78.mp4' 'na/vid_53.mp4' 'na/vid_22.mp4'
 'na/vid_65.mp4' 'na/vid_101.mp4' 'na/vid_76.mp4' 'na/vid_63.mp4'
 'na/vid_27.mp4' 'na/vid_47.mp4' 'na/vid_5.mp4' 'na/vid_29.mp4'
 'na/vid_2.mp4' 'na/vid_70.mp4' 'na/vid_16.mp4' 'na/vid_30.mp4'
 'na/vid_42.mp4' 'na/vid_88.mp4' 'na/vid_100.mp4' 'na/vid_17.mp4'
 'na/vid_18.mp4' 'na/vid_19.mp4' 'na/vid_73.mp4' 'na/vid_8.mp4'
 'na/vid_66.mp4' 'na/vid_7.mp4' 'na/vid_3.mp4' 'na/vid_12.mp4']
Training+validation data size:  4196
Training data size:  419

In [54]:
emotion_mapped_df.columns

Index(['none', 'furious', 'anger', 'annoyed', 'contempt', 'disgust', 'hatred'], dtype='object')

In [66]:
emotion_mapped_df.at[np.random.randint(1, 100, size=(20, )), 'none'] = 1


In [123]:
emotion_mapped_df.sum()

none         83
furious     150
anger        81
annoyed     519
contempt     28
disgust     117
hatred        0
dtype: int64

In [128]:
emotion_mapped_df = pd.DataFrame(columns=emotion_cols, data=np.zeros((temp_df.shape[0], len(emotion_cols)), dtype=int))
for emoji in label_cols.tolist():
    indices = temp_df[temp_df[emoji]==1].index
    emotions = na_emoji_map[emoji]
    # print(indices)
    # print(emotion)
    for emotion in emotions:
        emotion_mapped_df.at[indices.to_list(), emotion] = 1

In [99]:
X_test

array([[0.15733333, 0.614     , 0.        , ..., 0.33124397, 0.50678979,
        0.7027027 ],
       [0.17066667, 0.602     , 0.        , ..., 0.33510125, 0.51656708,
        0.6990504 ],
       [0.10933333, 0.568     , 0.        , ..., 0.33751205, 0.51330798,
        0.70197224],
       ...,
       [0.84533333, 0.442     , 0.1037464 , ..., 0.32594021, 0.46876697,
        0.51424397],
       [0.90666667, 0.484     , 0.0778098 , ..., 0.32594021, 0.47256926,
        0.49963477],
       [0.864     , 0.436     , 0.04034582, ..., 0.31918997, 0.4904943 ,
        0.50255661]])

In [86]:
type(emotion_df.values)

numpy.ndarray

In [124]:
y_test_emotions

Unnamed: 0,none,furious,anger,annoyed,contempt,disgust,hatred
149,0.0,0.0,0.0,1.0,1.0,0.0,0.0
150,0.0,0.0,0.0,1.0,1.0,0.0,0.0
151,0.0,0.0,0.0,1.0,1.0,0.0,0.0
152,0.0,0.0,0.0,1.0,1.0,0.0,0.0
153,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
6694,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6695,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6696,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6697,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [130]:
emotion_mapped_df.head()

Unnamed: 0,none,furious,anger,annoyed,contempt,disgust,hatred
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,1,1,0,0
4,0,0,0,0,0,0,0


In [129]:
print(metrics.classification_report(emotion_mapped_df.values,  y_test_emotions.values, target_names=emotion_cols))

              precision    recall  f1-score   support

        none       0.00      0.00      0.00        83
     furious       0.09      0.03      0.05       150
       anger       0.34      0.61      0.43       230
     annoyed       0.59      0.66      0.62       595
    contempt       0.21      0.10      0.13       425
     disgust       0.19      0.91      0.31       117
      hatred       0.00      0.00      0.00         0

   micro avg       0.34      0.43      0.38      1600
   macro avg       0.20      0.33      0.22      1600
weighted avg       0.35      0.43      0.36      1600
 samples avg       0.35      0.27      0.29      1600



In [131]:
print(metrics.classification_report(np.array(ground_truth_video_labels,  dtype=int),  video_groups.values, target_names=label_cols.tolist()))
print(metrics.classification_report(y_test,  Y_pred_chains, target_names=label_cols.tolist()))

                precision    recall  f1-score   support

       neutral       0.00      0.00      0.00         1
         smirk       0.00      0.00      0.00         0
       furious       0.00      0.00      0.00         1
         weary       0.00      0.00      0.00         2
expressionless       0.00      0.00      0.00         6
      unamused       0.80      0.67      0.73         6
   rollingeyes       0.00      0.00      0.00         2
          none       0.00      0.00      0.00         2
     skeptical       0.00      0.00      0.00         0
         angry       1.00      0.17      0.29         6
     nauseated       0.50      0.33      0.40         3
      vomiting       1.00      0.33      0.50         3
       triumph       0.00      0.00      0.00         0
        hatred       0.00      0.00      0.00         0

     micro avg       0.22      0.22      0.22        32
     macro avg       0.24      0.11      0.14        32
  weighted avg       0.48      0.22      0.27 

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))
ax.grid(True)
ax.set_title('CC Jaccard Score')
ax.set_xticks(range(0,7))
ax.set_xticklabels(range(0,7))
ax.set_ylabel('Jaccard Similarity Score')
ax.set_ylim([0.00, 1.0])

plt.plot(frames_mean_jac_test, label='Frames Jaccard Score', color='blue')
plt.plot(videos_mean_jac_test, label='Video Jaccard Score', color='green')
plt.legend(loc='best')
plt.show()

In [None]:
videos_mean_jac_test

In [None]:
print(metrics.classification_report(y_test, Y_pred_chains, target_names=label_cols.tolist()))

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))
ax.grid(True)
ax.set_title('CC Hamming Loss')
ax.set_xticks(range(0,7))
ax.set_xticklabels(range(0,7))
ax.set_ylabel('Hamming Distance Loss')
ax.set_ylim([0.00, 1.0])
# ax.legend(('Frames Hamming Loss','Video Hamming Loss' ))
plt.plot(frames_mean_hm_test, label='Frames Hamming Loss', color='red')
plt.plot(videos_mean_hm_test, label='Video Hamming Loss', color='green')
plt.legend(loc='upper right')
plt.show()

In [None]:
videos_mean_jac_test

In [None]:
from skmultilearn.adapt import MLkNN
import sklearn.metrics as metrics

## MLTSVM is not compatible with later versions of numpy

kfold = KFold(5, True, 1)
frames_mean_hm_test = []
frames_mean_jac_test = []
videos_mean_jac_test = []
videos_mean_hm_test = []
col_indices = {i:label for (i,label) in enumerate(label_cols)}

splits = kfold.split(train_videos)
for (i, (train, test)) in enumerate(splits):
    print('%d-th split: train: %d, validation: %d' % (i+1, len(videos[train]), len(videos[test])))
    train_df = X_df[X_df['filename'].isin(train_videos[train])]
    train_metadata = train_df[metadata_cols]
    print('Training+validation data size: ', train_df.shape[0])
    y_train = train_df[label_cols].values
    X_train = train_df.drop(columns = ['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'] + label_cols.tolist()).values
    valid_df = X_df[X_df['filename'].isin(train_videos[test])]
    y_valid = valid_df[label_cols].values
    X_valid = valid_df.drop(columns = ['frame', 'face_id', 'culture', 'filename', 'timestamp', 'confidence','success'] + label_cols.tolist()).values
    
    print('Training data size: ', X_train.shape[0])
    print('Validation data size: ', X_valid.shape[0])
    classifier = MLkNN(k=5)
    # classifier = MLTSVM(c_k = 2**-1)
    prediction = classifier.fit(X_train, y_train).predict(X_valid)

    # Predicting on validation set
    print("Validation Hamming Loss:\n ", metrics.hamming_loss(y_valid, prediction))

    # Predicting on test set
    y_test_pred = classifier.predict(X_test)
    hm_loss = metrics.hamming_loss(y_test, y_test_pred)
    frames_mean_hm_test.append(hm_loss)
    print("Test Hamming Loss:\n ", hm_loss)
    jac_score = jaccard_score(y_test, y_test_pred,  average='samples')
    frames_mean_jac_test.append(jac_score)
    print("Test Jaccard Score:\n ", jac_score)

    # building test dataframe to vote labels
    test_result_df = pd.DataFrame(columns=metadata_cols, data=metadata_test.values)
    test_result_df.update(metadata_test)
    # print("^^^^^^^^", y_test_pred.toarray().shape)
    temp_df = pd.DataFrame(data=y_test_pred.toarray(), columns=label_cols)
    test_result_df = pd.concat([test_result_df, temp_df], axis=1)
    
    # print(test_result_df.head())
    video_groups = test_result_df.groupby('filename')[label_cols].apply(lambda x : x.astype(int).sum())
    # for name, group in video_groups:
    #     print(name)
    #     print(group)
    #     print("\n") 
    ground_truth_video_labels = []
    for v in video_groups.index.to_list():
        # number of 1s in ground truth labels
        ground_truth_video_labels.append(test_df[test_df['filename'] == v].iloc[0][label_cols])
        print(len(ground_truth_video_labels))
        num_1s = test_df[test_df['filename'] == v].iloc[0][label_cols].sum()
        num_1s = int(num_1s)
        a = np.argsort(video_groups.loc[v].values)
        for i in range(len(a) - 1, len(a) - num_1s - 1, -1):
            video_groups.loc[v][a[i]] = 1
        for i in range(0, len(a) - num_1s):
            video_groups.loc[v][a[i]] = 0

    j = metrics.jaccard_score(np.array(ground_truth_video_labels,  dtype=int), video_groups.values, average='samples')
    h = metrics.hamming_loss(np.array(ground_truth_video_labels,  dtype=int), video_groups.values)
    videos_mean_jac_test.append(j)
    videos_mean_hm_test.append(h)
        

Multilabel confusion matrix puts TN at (0,0) and TP at (1,1) position thanks @Kenneth Witham for pointing out.


In [None]:
multilabel_confusion_matrix(y_test, Y_pred_chains[1], labels=range(0,14))


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, Y_pred_chains[-1], target_names=label_cols))

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))
ax.grid(True)
ax.set_title('MLKNN Jaccard Score')
ax.set_xticks(range(0,7))
ax.set_xticklabels(range(0,7))
ax.set_ylabel('Jaccard Similarity Score')
ax.set_ylim([0.55, 1.0])

plt.plot(frames_mean_jac_test, label='Frames Jaccard Score', color='blue')
plt.plot(videos_mean_jac_test, label='Video Jaccard Score', color='green')
plt.legend(loc='lower right')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(4, 4))
ax.grid(True)
ax.set_title('MLKNN Hamming Loss')
ax.set_xticks(range(0,7))
ax.set_xticklabels(range(0,7))
ax.set_ylabel('Hamming Distance Loss')
ax.set_ylim([0.00, .1])
# ax.legend(('Frames Hamming Loss','Video Hamming Loss' ))
plt.plot(frames_mean_hm_test, label='Frames Hamming Loss', color='red')
plt.plot(videos_mean_hm_test, label='Video Hamming Loss', color='green')
plt.legend(loc='upper right')
plt.show()

In [None]:
print(y_test[800:805])
print(metadata_test[800:805])

In [None]:
print(len(y_test))
print(len(metadata_test))

In [None]:
def clean(input: str):
    input = 'persian/' + input +".mp4"
    return input
X_df = pd.read_csv('../new_data/Persian/persian_dataset.csv', index_col=None)
X_df['filename'] = X_df['filename'].apply(clean)
X_df.to_csv('../new_data/Persian/persian_dataset.csv', index=False)