#### Importing Libraries

In [None]:
from transformers import BertTokenizer
from model import BertForMultiLabelClassification
from multilabel_pipeline import MultiLabelPipeline
from pprint import pprint
import os
import pandas as pd
from tqdm import tqdm

#### Tokenizer, Model and Pipeline Setup

In [None]:
# Fetch the tokenizer that belongs to the pretrained checkpoint named below.
tokenizer = BertTokenizer.from_pretrained(
    "monologg/bert-base-cased-goemotions-original"
)

In [None]:
# Load the multi-label classification weights from the pretrained checkpoint
# named below.
model = BertForMultiLabelClassification.from_pretrained(
    "monologg/bert-base-cased-goemotions-original"
)

In [None]:
# Bundle the model and tokenizer into a single callable pipeline, configured
# with a threshold of 0.3.
goemotions = MultiLabelPipeline(model=model, tokenizer=tokenizer, threshold=0.3)

#### Inference

In [40]:
# Emotion labels used as the output column names, in a fixed order.
# Kept as a list (not a tuple) because it is used directly as a DataFrame
# column key.
all_emotions = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral',
]

In [94]:
def process_line(line):
    """Score one text line against every emotion in ``all_emotions``.

    Calls the ``goemotions`` pipeline on ``line``, takes the first result and
    reads its ``'labels'`` and ``'scores'`` entries. Any emotion listed in
    ``all_emotions`` that is absent from the result gets a score of 0.

    Args:
        line: The text handed to the ``goemotions`` pipeline.

    Returns:
        pd.Series: One score per emotion, indexed in ``all_emotions`` order.
    """
    emotion_prediction = goemotions(line)[0]
    predicted = dict(
        zip(emotion_prediction['labels'], emotion_prediction['scores'])
    )
    # Always emit the scores in the fixed all_emotions order. The previous
    # version put the predicted labels first, so the order changed from row to
    # row; assigning the resulting frame with ``data[all_emotions] = ...``
    # pairs columns by position, which could file scores under the wrong
    # emotion name.
    return pd.Series(
        {emotion: predicted.get(emotion, 0) for emotion in all_emotions}
    )

In [98]:
# Score every CSV in folder_path with process_line and write a copy that has
# one extra column per emotion into output_path under the same file name.
folder_path = './script_csv'
output_path = './script_csv_go_emotion'
# to_csv does not create missing directories; make sure the destination exists
# before spending time on inference.
os.makedirs(output_path, exist_ok=True)
for file_name in tqdm(os.listdir(folder_path)):
    if not file_name.endswith('.csv'):
        continue
    data = pd.read_csv(os.path.join(folder_path, file_name))
    # Create the emotion columns first so they exist even for an empty file.
    data[all_emotions] = 0
    if not data.empty:
        emotion_scores = data['line'].apply(process_line)
        # Assigning a DataFrame to a list of columns pairs them by position,
        # not by name, so select the result's columns by name in all_emotions
        # order to guarantee each score lands under its own label.
        data[all_emotions] = emotion_scores[all_emotions]
    data.to_csv(os.path.join(output_path, file_name))

100%|██████████| 17/17 [28:25<00:00, 100.34s/it]
