## Remove duplicate lines

In [83]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt

In [3]:
# Columns that together identify an annotation: rows agreeing on all of them
# are treated as duplicates of one another.
dedup_keys = [
    "filename", "emotions", "gender", "confidence",
    "comment", "emoji", "annotator_individuality", "intensity",
]

# Load the raw annotations and keep only the first row of every duplicate group.
data = pd.read_csv("../new_data/annotation.csv").drop_duplicates(
    subset=dedup_keys,
    keep="first",
)

# Persist the de-duplicated annotations without writing the index column.
data.to_csv("../new_data/annotations_wo_dupes.csv", index=False)

## Separate North American (NA) and Persian annotations
Also removes the `video_culture` column.

In [4]:
# Remove the video-culture column. errors='ignore' keeps this cell re-runnable:
# a plain drop raises KeyError on a second execution because the column is
# already gone from `data`.
data = data.drop(columns=['video_culture'], errors='ignore')

# Boolean masks selecting the rows of each annotator culture.
p_mask = data['annotator_culture'] == 'persian'
na_mask = data['annotator_culture'] == 'north american'

persian_data = data[p_mask]
na_data = data[na_mask]

# Write each culture's annotations to its own CSV (index column omitted).
persian_data.to_csv("../new_data/persian_annotations.csv", index=False)
na_data.to_csv("../new_data/na_annotations.csv", index=False)

## Duplicate lines with multiple labels (one row per emotion label)

In [49]:
persian_data = pd.read_csv("../new_data/persian_annotations.csv")
cols = ["id", "filename","emotions","emoji","gender","confidence","comment","intensity"]

# An annotation may carry several comma-separated emotion labels. Split the
# field into a list and explode it so that each label gets its own row, with
# all other columns repeated. This replaces a row-by-row iloc loop with
# vectorized pandas operations.
# Note: a row whose `emotions` value is missing is kept with NaN instead of
# raising on `.split`.
flattened_df = (
    persian_data[cols]
    .assign(emotions=lambda frame: frame["emotions"].str.split(","))
    .explode("emotions")
    .reset_index(drop=True)  # fresh 0..n-1 index, one entry per (annotation, label)
)

flattened_df

Unnamed: 0,id,filename,emotions,emoji,gender,confidence,comment,intensity
0,1,persian/vid_58.mp4,contempt,none,female,3,,2
1,1,persian/vid_58.mp4,disgust,none,female,3,,2
2,2,persian/vid_27.mp4,none,unamused,male,5,,5
3,3,persian/vid_89.mp4,anger,hatred,female,5,,2
4,4,persian/vid_61.mp4,annoyed,angry,female,5,,4
...,...,...,...,...,...,...,...,...
665,884,persian/vid_24.mp4,contempt,smirk,female,4,,4
666,884,persian/vid_24.mp4,disgust,smirk,female,4,,4
667,885,persian/vid_62.mp4,annoyed,angry,female,5,,4
668,885,persian/vid_62.mp4,furious,angry,female,5,,4


### Flattening emojis into new columns

In [50]:
# Split the comma-separated emoji field into one column per position, named
# emoji0, emoji1, ... after the position of the emoji within the field.
emoji_expanded = (
    flattened_df['emoji']
    .str.split(',', expand=True)
    .add_prefix('emoji')
)

# Place the per-position emoji columns next to the flattened annotations.
flattened_df_concat = pd.concat([flattened_df, emoji_expanded], axis=1)

## Voting

In [52]:
# Majority vote: for each (filename, gender) group take the most frequent
# emotion label. When several labels are tied, the cell holds all of them
# rather than a single label.
emotion_votes = flattened_df.groupby(['filename', 'gender'])['emotions']
labels_df = emotion_votes.agg(pd.Series.mode).to_frame()

print(labels_df.head(20))
labels_df.to_csv("../new_data/labels.csv")

# Overall frequency of every emotion label across the flattened annotations.
flattened_df['emotions'].value_counts()

                                     emotions
filename           gender                    
persian/vid_1.mp4  male               annoyed
persian/vid_10.mp4 male      [contempt, none]
persian/vid_11.mp4 male              contempt
persian/vid_12.mp4 male      [contempt, none]
persian/vid_13.mp4 male                 anger
persian/vid_14.mp4 female                none
persian/vid_15.mp4 male                hatred
persian/vid_16.mp4 male              contempt
persian/vid_17.mp4 male              contempt
persian/vid_18.mp4 female             disgust
persian/vid_19.mp4 male                  none
persian/vid_2.mp4  male               annoyed
persian/vid_20.mp4 male              contempt
persian/vid_21.mp4 male                  none
persian/vid_22.mp4 female             annoyed
persian/vid_23.mp4 female            contempt
persian/vid_24.mp4 female  [contempt, hatred]
persian/vid_25.mp4 female                none
persian/vid_26.mp4 female                none
                   male           

annoyed     134
anger       131
contempt     97
none         91
furious      89
hatred       74
disgust      54
Name: emotions, dtype: int64

### Average of reported intensity

In [56]:
# Mean reported intensity for each (filename, gender) group.
average_intensity = flattened_df.groupby(['filename', 'gender'])['intensity'].mean()

# Attach the mean intensity to the voted labels. Any `intensity` column left
# over from a previous run of this cell is dropped first, so re-running does
# not append a duplicate column.
labels_df = pd.concat(
    [labels_df.drop(columns=['intensity'], errors='ignore'), average_intensity],
    axis=1,
)
labels_df

Unnamed: 0_level_0,Unnamed: 1_level_0,emotions,intensity
filename,gender,Unnamed: 2_level_1,Unnamed: 3_level_1
persian/vid_1.mp4,male,annoyed,2.500000
persian/vid_10.mp4,male,"[contempt, none]",2.500000
persian/vid_11.mp4,male,contempt,2.625000
persian/vid_12.mp4,male,"[contempt, none]",2.400000
persian/vid_13.mp4,male,anger,1.750000
...,...,...,...
persian/vid_93.mp4,female,hatred,3.666667
persian/vid_94.mp4,male,none,1.500000
persian/vid_95.mp4,male,"[contempt, hatred, none]",3.250000
persian/vid_96.mp4,male,contempt,2.200000


In [71]:
# Number of times each emotion label was given to each video.
emotion_counts = flattened_df.groupby(['filename', 'emotions']).size()

# Pivot to one row per video and one column per emotion (0 where a label was
# never used), then turn `filename` back into a regular column and clear the
# leftover columns-axis name.
df = (
    emotion_counts
    .unstack(fill_value=0)
    .reset_index()
    .rename_axis(None, axis=1)
)
df

Unnamed: 0,filename,anger,annoyed,contempt,disgust,furious,hatred,none
0,persian/vid_1.mp4,0,3,0,1,0,0,2
1,persian/vid_10.mp4,0,1,2,1,0,0,2
2,persian/vid_11.mp4,0,2,3,1,0,0,2
3,persian/vid_12.mp4,0,1,2,0,0,0,2
4,persian/vid_13.mp4,4,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...
91,persian/vid_93.mp4,3,0,1,2,2,4,0
92,persian/vid_94.mp4,1,1,1,0,0,0,3
93,persian/vid_95.mp4,1,1,2,0,0,2,2
94,persian/vid_96.mp4,0,0,3,0,1,0,1


In [85]:
# Pearson correlation between the per-video counts of each emotion label.
# The string `filename` column is dropped explicitly: relying on corr() to
# skip non-numeric columns stops working on pandas >= 2.0, where it raises
# ValueError instead.
emotions_corr = df.drop(columns=['filename']).corr(method = "pearson")
emotions_corr

# fig, ax = plt.subplots(figsize = (9,5))
# sn.heatmap(emotions_corr, cmap = "Blues")
# plt.xticks(rotation=50)
# plt.savefig("heatmap.png", dpi = 300)

Unnamed: 0,anger,annoyed,contempt,disgust,furious,hatred,none
anger,1.0,-0.058781,-0.40069,-0.17059,0.371841,0.319699,-0.537176
annoyed,-0.058781,1.0,-0.315634,0.115,-0.153939,-0.198191,-0.122242
contempt,-0.40069,-0.315634,1.0,0.152478,-0.417921,-0.131181,0.166083
disgust,-0.17059,0.115,0.152478,1.0,-0.111244,0.169772,-0.243032
furious,0.371841,-0.153939,-0.417921,-0.111244,1.0,0.267434,-0.465288
hatred,0.319699,-0.198191,-0.131181,0.169772,0.267434,1.0,-0.372277
none,-0.537176,-0.122242,0.166083,-0.243032,-0.465288,-0.372277,1.0


In [76]:
import krippendorff

# Rows are videos, columns are emotion labels, and each entry is how many
# times that label was assigned to that video.
emotion_count_matrix = df.drop('filename', axis=1).values

# Inter-annotator agreement over the label counts, treating the emotion
# labels as nominal categories.
nominal_alpha = krippendorff.alpha(value_counts=emotion_count_matrix,
                                   level_of_measurement='nominal')
print("Krippendorff's alpha for nominal metric: ", nominal_alpha)


Krippendorff's alpha for nominal metric:  0.07589428340718574
