# AngriBERT - Data Preparation
Emotion data is still being created across domains, and the various datasets use different classification schemes. This notebook maps the datasets onto Ekman's model as closely as possible. Ekman's emotion model contains six basic emotions: one positive and five negative:  

joy 😊, anger 😡, disgust 🤢, fear 😱, sadness 😢, surprise 😲

All the original datasets should be available in in_path location.

This notebook requires a JSON file that maps each target emotion to the list of sub-emotions it includes and, optionally, the type of the emotion.
```
{
     # Emotion to group into
     "anticipation" : 
         {   
              #List of all of the sub-emotions that should be grouped into this emotion
             "sub_emotions":["anticipation", "vigilence"],
              # Type of emotion   
             "type":"positive"
         }
}
```


In [2]:
import json
import csv 
import os
import pandas as pd 
from collections import Counter 


## Define variables

In [3]:
in_path = './data'                  # Location where the downloaded datasets are available

# Exactly one (out_path, emotion_f) pair below should be active.
# Mapping to the Ekman categories:
out_path = './data/final_ekman'     # Folder to write the output files into
emotion_f = './emotions_ekman.json' # JSON file mapping each summary emotion to its sub-emotions
# For mapping to 11 categories
# out_path = './data/finaldata'
# emotion_f = './emotions.json'

# Everything below is derived from out_path, so it has to come after the
# selection above; otherwise switching configuration would leave the stats
# file and the created folder pointing at the previous location.
outstats_f = out_path + '/stats.csv'  # Statistics of the files processed
# makedirs creates missing parent folders too and is a no-op when the folder
# already exists (os.mkdir fails if './data' is absent).
os.makedirs(out_path, exist_ok=True)
 

This section defines all the metadata required for processing. Only this section would need to be changed for re-runs. Rest of the code should not have any "magic" hardcoded values.

To disable a particular dataset from being processed, comment out the lines for the corresponding metadata variable.

In [4]:
# For the datasets below, the i-th entry of "in" is paired with the i-th entry
# of "out", so both lists must have the same number of elements.
# Records from each input file are consolidated and written to the matching
# output file.

# SemEval 2018 - annotated tweets
# Obtained from: https://competitions.codalab.org/competitions/17751
semeval = {
    "in": [in_path + "/semeval2018/2018-E-c-En-" + part + ".txt"
           for part in ("train", "dev", "test-gold")],
    "out": [out_path + "/semeval_" + part + ".tsv"
            for part in ("train", "dev", "test")],
}

# Friends EmotionLines - annotated dialogs from Friends
# Obtained from: http://doraemon.iis.sinica.edu.tw/emotionlines/download.html
friends = {
    "in": [in_path + "/friends/friends_" + part + ".json"
           for part in ("train", "dev", "test")],
    "out": [out_path + "/friends_" + part + ".tsv"
            for part in ("train", "dev", "test")],
}

# GoEmotions - annotated reddit comments
# Obtained from: https://github.com/google-research/google-research/tree/master/goemotions
# The data files hold emotion indexes only; the names behind the indexes are
# listed in "emotions_file".
goemotions = {
    "in": [in_path + "/goemotions/" + part + ".tsv"
           for part in ("train", "dev", "test")],
    "out": [out_path + "/goemotions_" + part + ".tsv"
            for part in ("train", "dev", "test")],
    "emotions_file": in_path + "/goemotions/emotions.txt",
}

# Unified Dataset - multiple corpora consolidated into 9 emotions
# Obtained from: https://github.com/sarnthil/unify-emotion-datasets
# The unified dataset ships as a single file, which is first split into one
# file per corpus.
#   file        : the single input file
#   temp_path   : folder the per-corpus files are written to
#   top_classes : the dataset gives an "average" rating per emotion; this is
#                 the number of highest-rated emotions that get labelled
#   out_path    : folder for the train, dev and test files of selected corpora
#   out_corpora : corpora to split into train, dev, test; every corpus whose
#                 name matches one of these strings is split
#   out_split   : fractions used for train, dev, test
unified = {
    "file": in_path + "/unified_dataset/unified-dataset.jsonl",
    "temp_path": in_path + "/unified_dataset",
    "top_classes": 2,
    "out_path": out_path,
    "out_corpora": ["tales", "affectivetext_"],
    "out_split": [0.80, 0.10, 0.10],
}
          

## Class & Function definitions
Common functions and classes

In [5]:
class Emotions(object):
    '''Lookup tables between summary emotions and their sub-emotions.

    The tables are built from a JSON file whose top-level keys are the
    summary emotions; each value holds a "sub_emotions" list, e.g.:
            "anticipation" :
                {
                    "sub_emotions":["anticipation", "vigilence"],
                    "type":"positive"
                }

    Attributes:
        emotions_: the parsed JSON content
        sub_emotions_: sub-emotion name -> summary emotion name
        emo_index: summary emotion name -> position in the JSON file
    '''
    def __init__(self,emotion_f):
        with open(emotion_f) as handle:
            self.emotions_ = json.load(handle)

        # Invert the mapping: every listed sub-emotion points at its group.
        self.sub_emotions_ = {}
        for group, spec in self.emotions_.items():
            for child in spec['sub_emotions']:
                self.sub_emotions_[child] = group

        # Position of each summary emotion, in file order.
        self.emo_index = {}
        for position, group in enumerate(self.emotions_):
            self.emo_index[group] = position

    def length(self):
        ''' Returns the number of summary emotions '''
        return len(self.emo_index)

    def index(self,field_name):
        ''' Returns the index of the summary emotion a sub-emotion belongs to.
            Raises KeyError for an unknown sub-emotion. '''
        group = self.sub_emotions_[field_name]
        return self.emo_index[group]

    def keys(self):
        ''' Returns a new list with the names of the summary emotions '''
        return list(self.emotions_)

    def parent(self,field_name):
        ''' Returns the name of the summary emotion a sub-emotion belongs to.
            Raises KeyError for an unknown sub-emotion. '''
        return self.sub_emotions_[field_name]


In [6]:

def semeval_extract(filein, fileout):
    ''' Converts one SemEval 2018 file into text plus one-hot summary emotions.

        The first input row is read as the header. For every later row, each
        column from the third onwards whose value is "1" marks the summary
        emotion that the column's header name maps to (via the module-level
        `emotions` object).

        Args:
        filein: string
           Complete path of the tab separated input file.
        fileout: string
           Complete path of the output file. It is tab separated, with a
           'Text' column (second input column) followed by one 0/1 column
           per summary emotion.
        Returns:
           Dictionary with the count of each summary emotion in the output
           file, plus "lines" (rows written) and "file" (fileout)
    '''
    summary_names = emotions.keys()
    stats = dict.fromkeys(summary_names, 0)
    stats["lines"] = 0
    stats["file"] = fileout

    with open(filein,encoding='utf-8') as src, \
        open(fileout,mode="w",encoding='utf-8',newline='') as dst:
        reader = csv.reader(src, dialect='excel',delimiter='\t')
        writer = csv.writer(dst,dialect='excel',delimiter='\t')
        columns = next(reader)
        writer.writerow(['Text'] + summary_names)

        for record in reader:
            one_hot = [0] * emotions.length()
            # The first two columns are skipped; the second one is the text.
            for pos, value in enumerate(record[2:], start=2):
                if value == "1":
                    one_hot[emotions.index(columns[pos])] = 1
            for name, flag in zip(summary_names, one_hot):
                stats[name] += flag
            writer.writerow([record[1]] + one_hot)
            stats["lines"] += 1
    return stats



In [7]:
def friends_extract(filein,fileout):
    ''' Converts one Friends dialog file into text plus one-hot summary emotions.

        Every utterance's 'emotion' label is mapped to its summary emotion
        through the module-level `emotions` object. Utterances labelled
        'non-neutral' are resolved from their 'annotation' field instead.

        Args:
        filein: string
           Complete path of the input file. It is loaded as JSON and iterated
           as a list of dialogs, each a list of utterance records.
        fileout: string
           Complete path of the output file. It is tab separated, with a
           'Text' column followed by one 0/1 column per summary emotion.
        Returns:
           Dictionary with the count of each summary emotion in the output
           file, plus "lines" (rows written) and "file" (fileout)
    '''
    summary_names = emotions.keys()
    stats = dict.fromkeys(summary_names, 0)
    stats["lines"] = 0
    stats["file"] = fileout
    # The Friends data has no header: position i of an utterance's
    # 'annotation' corresponds to the i-th label below.
    annotation_labels = ["neutral", "joy", "sadness", "fear", "anger", "surprise", "disgust"]

    with open(filein,encoding='utf-8') as src:
        episodes = json.load(src)

    with open(fileout,mode='w',encoding='utf-8',newline='') as dst:
        writer = csv.writer(dst,dialect='excel',delimiter='\t')
        writer.writerow(['Text'] + summary_names)
        for scene in episodes:
            for utterance in scene:
                one_hot = [0] * emotions.length()
                label = utterance['emotion']
                if label != 'non-neutral':
                    one_hot[emotions.index(label)] = 1
                else:
                    # Annotators disagreed, so the record is tagged
                    # 'non-neutral'. Fall back to the label(s) holding the
                    # highest annotation value; ties mark several emotions.
                    votes = list(utterance['annotation'])
                    best = max(votes)
                    for pos, vote in enumerate(votes):
                        if vote == best:
                            one_hot[emotions.index(annotation_labels[pos])] = 1
                for name, flag in zip(summary_names, one_hot):
                    stats[name] += flag
                writer.writerow([utterance['utterance']] + one_hot)
                stats["lines"] += 1
    return stats

In [8]:
def goemotions_extract(filein,fileout,emotion_file):
    ''' Converts one GoEmotions file into text plus one-hot summary emotions.

    The second column of every input row holds comma separated emotion
    indexes. Each index is turned into a name using emotion_file and then
    mapped to its summary emotion through the module-level `emotions` object.

    Args:
    filein: string
        Complete path of the tab separated input file; every row is treated
        as data (no header row is skipped).
    fileout: string
        Complete path of the output file. It is tab separated, with a 'Text'
        column (first input column) followed by one 0/1 column per summary
        emotion.
    emotion_file: string
        Complete path of the file listing the GoEmotions emotion names, one
        per line; line i names the emotion with index i.
    Returns:
        Dictionary with the count of each summary emotion in the output file,
        plus "lines" (rows written) and "file" (fileout)
    '''
    summary_names = emotions.keys()
    stats = dict.fromkeys(summary_names, 0)
    stats["lines"] = 0
    stats["file"] = fileout

    with open(emotion_file) as names_fh:
        label_names = names_fh.read().splitlines()

    with open(filein,mode='r',encoding='utf-8') as src, \
        open(fileout,mode='w',encoding='utf-8',newline='') as dst:
        reader = csv.reader(src, dialect='excel',delimiter='\t')
        writer = csv.writer(dst,dialect='excel',delimiter='\t')
        writer.writerow(['Text'] + summary_names)

        for record in reader:
            one_hot = [0] * emotions.length()
            # Parse all indexes first, then flag their summary emotions.
            label_ids = list(map(int, record[1].split(',')))
            for label_id in label_ids:
                one_hot[emotions.index(label_names[label_id])] = 1
            for name, flag in zip(summary_names, one_hot):
                stats[name] += flag
            writer.writerow([record[0]] + one_hot)
            stats["lines"] += 1
    return stats

In [9]:
def unified_extract(filein,temp_path,top=2):
    ''' Splits the Unified dataset into one file per corpus, mapping the emotions of
        every record into the summary emotions defined in the Emotions class.

        Args:
        filein: string
           Complete path for the input file. Expected to hold one JSON object per
           line with the keys 'source', 'domain', 'text' and 'emotions'.
           Blank lines are ignored.
        temp_path: string
           Folder where the output files are written. One file named
           <source>_<domain>.tsv is produced per corpus; each is tab separated
           with Text and 1 Hot encoding of summary emotions.
        top: int
           Number of top emotions. Unified dataset gives average rating among
           annotators for each emotion; the `top` highest non-zero ratings are labelled.
        Returns:
           List of dictionaries, one per corpus in order of first appearance, each
           holding the count of every summary emotion written, "lines" and "file".
           A corpus without any labelled record gets an entry with "lines" == 0
           and no file is created for it. An empty input returns an empty list.
    '''
    # Order of corpora in the input file is not guaranteed, so both the output
    # handles and the statistics are keyed by output file name. Records of one
    # corpus may therefore be interleaved with others without fragmenting its stats.
    out_files = {}       # fname -> (file handle, csv writer)
    stats_by_file = {}   # fname -> stats dictionary
    out_stats = []       # stats dictionaries in order of first appearance
    try:
        with open(filein,encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                d = json.loads(line)
                fname = "{}/{}_{}.tsv".format(temp_path,d['source'],d['domain'])
                stats = stats_by_file.get(fname)
                if stats is None:
                    stats = {e:0 for e in emotions.keys()}
                    stats["lines"] = 0
                    stats["file"] = fname
                    stats_by_file[fname] = stats
                    out_stats.append(stats)

                # Emotions in unified dataset is given as average rating for the emotion among annotators
                # For simplicity, pick the `top` highest (2 by default); empty and zero ratings are ignored
                d_emo = d['emotions']
                emo_common = Counter({k:v for k,v in d_emo.items() if v}).most_common(top)
                if not emo_common:
                    continue

                if fname not in out_files: #If we have not written to this file before, open a new handle
                    fw = open(fname,mode='w',encoding='utf-8',newline='')
                    # Register the handle before writing so it is closed even if the write fails
                    out_files[fname] = fw,csv.writer(fw,dialect='excel',delimiter='\t')
                    out_files[fname][1].writerow(['Text'] + emotions.keys())
                emo_arr = [0] * emotions.length()
                for name, _rating in emo_common:
                    emo_arr[emotions.index(name)] = 1
                for i,e in enumerate(emotions.keys()):
                    stats[e] += emo_arr[i]
                stats["lines"] += 1
                out_files[fname][1].writerow([d["text"]] + emo_arr)
    finally:
        #Close all the open output files, also when processing fails part-way.
        for fw, _ in out_files.values():
            fw.close()
    return out_stats

## Main Functions
### Initialize variables

In [10]:
# Load the emotion mapping from the JSON file named by emotion_f. The extract
# functions read this module-level `emotions` object directly.
emotions = Emotions(emotion_f)
# Collects the statistics dictionaries returned by the extract functions.
all_stats = []

### Process files 
Process files only if definitions are present. This allows multiple runs of the notebook for different datasets without overwriting previously generated dataset.  
E.g. if you had already run SemEval and only want to run other files, comment out the definition of semeval dictionary and run the complete notebook.
### Process SemEval Data Set

In [11]:
# Process SemEval only when its metadata dictionary is defined and non-empty.
# The name is looked up through globals() so that commenting out the
# definition skips the dataset on a fresh kernel instead of raising NameError.
if globals().get("semeval"):
    # The i-th input file is converted into the i-th output file.
    for fin,fout in zip(semeval["in"],semeval["out"]):
        stats = semeval_extract(fin,fout)
        all_stats.append(stats)

### Process Friends Data Set

In [12]:
# Process Friends only when its metadata dictionary is defined and non-empty.
# The name is looked up through globals() so that commenting out the
# definition skips the dataset on a fresh kernel instead of raising NameError.
if globals().get("friends"):
    # The i-th input file is converted into the i-th output file.
    for fin,fout in zip(friends["in"],friends["out"]):
        stats = friends_extract(fin,fout)
        all_stats.append(stats)

### Process GoEmotions Data Set

In [13]:
# Process GoEmotions only when its metadata dictionary is defined and non-empty.
# The name is looked up through globals() so that commenting out the
# definition skips the dataset on a fresh kernel instead of raising NameError.
if globals().get("goemotions"):
    # File listing the emotion names behind the indexes used in the data files.
    emo_f = goemotions["emotions_file"]
    # The i-th input file is converted into the i-th output file.
    for fin,fout in zip(goemotions["in"],goemotions["out"]):
        stats = goemotions_extract(fin,fout,emo_f)
        all_stats.append(stats)

### Process Unified Data Set
First split the unified dataset into different corpus files. Then for the corpus defined in **out_corpora** split the file into **train**, **dev**, **test** using the splits defined in **out_split**.


In [14]:
# Process the Unified dataset only when its metadata dictionary is defined and
# non-empty. The name is looked up through globals() so that commenting out
# the definition skips the dataset on a fresh kernel instead of raising NameError.
if globals().get("unified"):
    # Step 1: split the single unified file into one file per corpus.
    stats = unified_extract(unified["file"],unified["temp_path"],unified["top_classes"])
    corpus = unified.get("out_corpora",[])
    splits = unified.get("out_split",[0.80,0.10,0.10])
    split_dir = unified["out_path"]
    # Fixed seed so that re-running the notebook reproduces the same split.
    # Override with an optional "seed" entry in the unified dictionary.
    split_seed = unified.get("seed",42)
    train_size = splits[0]

    # Step 2: split every selected corpus into train / dev / test.
    for corp in corpus:
        train_file = '{}/{}_{}.tsv'.format(split_dir,corp,'train')
        test_file = '{}/{}_{}.tsv'.format(split_dir,corp,'test')
        dev_file = '{}/{}_{}.tsv'.format(split_dir,corp,'dev')
        print (train_file,test_file,dev_file)

        # Collect every per-corpus file whose name matches. Matching is done on
        # the file name only, so folder names cannot produce false matches, and
        # entries without written lines are skipped because no file exists for them.
        frames = [pd.read_csv(s["file"],sep='\t') for s in stats
                  if corp in os.path.basename(s["file"]) and s["lines"] > 0]
        if not frames:
            continue
        # All matches share the same output names, so they are split together
        # rather than overwriting each other's files.
        df = pd.concat(frames,ignore_index=True)

        # Keep the sampled rows' original index labels until the remainder has
        # been computed: dropping by those labels is what keeps the splits
        # disjoint. to_csv is called with index=False, so no reset is needed.
        df_train = df.sample(frac=train_size,random_state=split_seed)
        df_bal = df.drop(df_train.index)
        df_train.to_csv(train_file,sep='\t',index=False)
        if len(splits) == 3:
            # Share of the remaining rows that goes to dev.
            dev_split = splits[1]/ sum(splits[1:])
            df_dev = df_bal.sample(frac=dev_split,random_state=split_seed)
            df_test = df_bal.drop(df_dev.index)
            df_dev.to_csv(dev_file,sep='\t',index=False)
            df_test.to_csv(test_file,sep='\t',index=False)
        else:
            # Without a dev fraction everything outside train becomes test.
            df_bal.to_csv(test_file,sep='\t',index=False)
    all_stats += stats

./data/final_ekman2/tales_train.tsv ./data/final_ekman2/tales_test.tsv ./data/final_ekman2/tales_dev.tsv
./data/final_ekman2/affectivetext__train.tsv ./data/final_ekman2/affectivetext__test.tsv ./data/final_ekman2/affectivetext__dev.tsv


In [15]:
# Tabulate the collected statistics (one row per entry in all_stats), save the
# table to outstats_f, and show it as the cell output.
df = pd.DataFrame(all_stats)
df.to_csv(outstats_f)
df

Unnamed: 0,joy,anger,disgust,fear,sadness,surprise,neemo,lines,file
0,3386,2544,2602,1242,2266,361,0,6838,./data/final_ekman2/semeval_train.tsv
1,489,315,319,121,292,35,0,886,./data/final_ekman2/semeval_dev.tsv
2,1825,1101,1099,485,1049,170,0,3259,./data/final_ekman2/semeval_test.tsv
3,1698,960,594,424,618,1846,5823,10561,./data/final_ekman2/friends_train.tsv
4,167,129,65,64,92,213,593,1178,./data/final_ekman2/friends_dev.tsv
5,392,275,158,95,168,465,1577,2764,./data/final_ekman2/friends_test.tsv
6,19325,5579,2328,726,1864,3469,14219,43410,./data/final_ekman2/goemotions_train.tsv
7,2434,717,289,105,212,398,1766,5426,./data/final_ekman2/goemotions_dev.tsv
8,2356,726,305,98,212,428,1787,5427,./data/final_ekman2/goemotions_test.tsv
9,584,221,104,396,484,675,0,1246,./data/unified_dataset/affectivetext_headlines...
