# Import data

In [None]:
import tensorflow as tf
import os
# Set OMP_NUM_THREADS=1 for this process (conventionally limits OpenMP-backed
# libraries to a single thread).
# NOTE(review): the variable is set after `tensorflow` is imported -- confirm
# the libraries that should honour it are initialised after this line.
os.environ['OMP_NUM_THREADS'] = '1'

In [None]:
import pandas as pd
import numpy as np

# Load the Instagram posts, keeping only rows whose 'Contents' cell is not missing.
# We might want to do something different here - SN
ins_df = pd.read_csv('data/instagram_data.csv').dropna(subset=['Contents'])
ins_df

In [None]:
from sklearn.utils import shuffle

# Fixed seed so the shuffled row order -- and everything derived from it, such
# as the positional train/val/test split -- is identical on every run.
SHUFFLE_SEED = 42


def _load_emotion_tsv(emotion):
    """Read data/twitter/<emotion>.tsv and drop its 'index' and 'intensity' columns."""
    return pd.read_csv(f'data/twitter/{emotion}.tsv', sep='\t').drop(columns=['index', 'intensity'])


anger_df = _load_emotion_tsv('anger')
fear_df = _load_emotion_tsv('fear')
joy_df = _load_emotion_tsv('joy')
sadness_df = _load_emotion_tsv('sadness')

# Stack the four per-emotion frames, then shuffle so the classes are interleaved.
emotion_df = pd.concat([anger_df, fear_df, joy_df, sadness_df])
emotion_df = shuffle(emotion_df, random_state=SHUFFLE_SEED)

emotion_df

# Feature Engineering

In [None]:
!pip install --upgrade transformers datasets emoji deep-translator

In [None]:
import torch
from transformers import AutoTokenizer
from deep_translator import GoogleTranslator
import emoji
# Checkpoint whose tokenizer is loaded below; the commented lines are the
# alternatives that have been tried (keep exactly one active).
# TOKENIZER_MODEL = "cardiffnlp/twitter-roberta-base-sentiment" 
TOKENIZER_MODEL = "digitalepidemiologylab/covid-twitter-bert-v2"
# TOKENIZER_MODEL = "roberta-base"
# TOKENIZER_MODEL = "gpt2" # - not yet working - SN

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)
# Configured for auto-detected source language -> English. Translation is not
# currently applied by any active code in this notebook.
translator = GoogleTranslator(source='auto', target='en')

# Note: How we preprocess may depend on model we use to transfer. 
# This comes from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
def preprocess(text):
    """Clean one social-media post before tokenization.

    Each whitespace-separated token is processed as follows:
      * tokens that start with "@" (other than a bare "@") and tokens that
        start with "http" are dropped;
      * "#" is replaced by a space, so hashtag words are kept;
      * if the token contains emoji, every emoji is replaced by its textual
        name and any "_", "-" or ":" in that token becomes a space.

    Args:
        text: the raw post (str).

    Returns:
        The cleaned words joined by single spaces, or ``text`` unchanged when
        cleaning would leave nothing.
    """
    cleaned_words = []
    # split() with no argument splits on every whitespace run. Splitting only
    # on " " treated "@user\nword" as one mention (dropping "word"), missed
    # mentions/URLs that follow a newline, and left doubled spaces behind.
    for token in text.split():
        if token.startswith('@') and len(token) > 1:
            continue
        if token.startswith('http'):
            continue

        # Remove hashtags but keep words
        token = token.replace('#', ' ')

        # Emoji are never ASCII, so pure-ASCII tokens skip the emoji lookup.
        if not token.isascii():
            # demojize() exists in both old and new emoji releases, whereas
            # get_emoji_regexp() was removed in emoji 2.0. The padded
            # delimiters keep an emoji name separate from adjacent text.
            described = emoji.demojize(token, delimiters=(' :', ': '))
            if described != token:
                token = described.replace('_', ' ').replace('-', ' ').replace(':', ' ')

        # A token may have expanded into several words (or into nothing).
        cleaned_words.extend(token.split())

    if not cleaned_words:
        return text  # return original text if our cleaning made empty string
    return ' '.join(cleaned_words)

# Copy the relevant columns into numpy arrays (copies, so the cleaning below
# never touches the source DataFrames).
X = np.array(emotion_df['tweet'])
Y = np.array(emotion_df['category'])
category_codes, _ = pd.factorize(emotion_df['category'])
Y_ints = np.array(category_codes)
X_ins = np.array(ins_df['Contents'])
# 1 where the survey answer equals 1, else 0.
is_code_one = ins_df['Q5A.  If yes to Q5, what type of Asian'] == 1
east_asian = np.array(is_code_one, dtype=int)

# Clean every tweet / post in place.
for position, raw_tweet in enumerate(X):
    X[position] = preprocess(raw_tweet)
for position, raw_post in enumerate(X_ins):
    X_ins[position] = preprocess(raw_post)

# Boundaries of the positional train/val/test split; the test set is
# whatever remains after val_idx.
TRAIN_PCT, VAL_PCT, TEST_PCT  = 0.6, 0.2, 0.2
n_tweets = len(X)
train_idx = int(TRAIN_PCT * n_tweets)
val_idx = train_idx + int(VAL_PCT * n_tweets)

In [None]:
# Spot-check the first cleaned Instagram posts. Slicing, unlike indexing over
# range(30), also works when fewer than 30 posts were loaded.
for cleaned_post in X_ins[:30]:
    print(cleaned_post)

In [None]:
# Positional split of the cleaned tweets and their integer labels.
X_train, X_val, X_test = X[:train_idx], X[train_idx:val_idx], X[val_idx:]
Y_train, Y_val, Y_test = Y_ints[:train_idx], Y_ints[train_idx:val_idx], Y_ints[val_idx:]

# Tokenize the data
if TOKENIZER_MODEL == 'gpt2':
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def _encode(texts, **extra_tokenizer_args):
    """Tokenize `texts` into padded, truncated PyTorch tensors."""
    return tokenizer(list(texts), return_tensors='pt', padding=True, truncation=True, **extra_tokenizer_args)


X_train_enc = _encode(X_train)
X_val_enc = _encode(X_val)
X_test_enc = _encode(X_test)
# The Instagram posts are truncated to the padded length of the training set.
X_ins_enc = _encode(X_ins, max_length=X_train_enc['input_ids'].shape[1])

# Model Definition

In [None]:
# TODO: define our machine learning model; from our discussion, we can try deep learning models

import os
from torch.utils.data import (
    Dataset, 
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)

import math 
from transformers import  (
    BertPreTrainedModel, 
    RobertaConfig, 
    RobertaTokenizerFast
)

from transformers.optimization import (
    AdamW, 
    get_linear_schedule_with_warmup
)

from scipy.special import softmax
from torch.nn import CrossEntropyLoss

from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
    roc_curve,
    auc,
    average_precision_score,
    accuracy_score
)

from transformers.models.roberta.modeling_roberta import (
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)

from transformers import AutoModel
from torch import nn

num_labels = 4

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
_gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if _gpu_available else "cpu")
if _gpu_available:
    print('Number of GPUs: ',torch.cuda.device_count())
else:
    print('No GPU, using CPU.')

In [None]:
# --- Batching / training length ---
max_seq_length = 128
train_batch_size = 8
test_batch_size = 8
gradient_accumulation_steps = 1
num_train_epochs = 5

# --- Optimizer and learning-rate schedule ---
learning_rate = 1e-5
adam_epsilon = 1e-8
weight_decay = 0.0
warmup_ratio = 0.06  # fraction of the total optimizer steps spent warming up

# --- Classification head ---
hidden_units = 512  # width of the hidden linear layer

In [None]:
class RobertaClassification(BertPreTrainedModel):
    """RobertaModel encoder followed by a RobertaClassificationHead.

    Only `config` is used by the constructor; the remaining keyword arguments
    are accepted and ignored.
    """

    def __init__(self, config, MODEL=None, num_labels=None, pretrained_output_size=None, hidden_units=None):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, input_ids, attention_mask, labels):
        """Return (loss, logits, *extra encoder outputs) for one labelled batch."""
        encoder_outputs = self.roberta(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoder_outputs[0])

        criterion = CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        # (loss), logits, (hidden_states), (attentions)
        return (loss, logits) + encoder_outputs[2:]

config_class = RobertaConfig
model_class = RobertaClassification

# MODEL is only assigned in a later cell, so referencing it here raised
# NameError on a fresh kernel (Restart & Run All). TOKENIZER_MODEL is already
# defined, and that later cell asserts MODEL == TOKENIZER_MODEL, so it names
# the same checkpoint.
config = config_class.from_pretrained(TOKENIZER_MODEL, num_labels=num_labels)
model = model_class.from_pretrained(TOKENIZER_MODEL, config=config)
print('Model=\n',model,'\n')

In [None]:
# If you want to use a different model than roberta
# Uncomment MODEL you want
# pretrained_output_size is the input width of the first linear layer of
# Model, so it has to equal the width of the backbone output fed into it.
# MODEL, pretrained_output_size = "gpt2", 768 # - not yet working
# MODEL, pretrained_output_size = "roberta-base", 768
# MODEL, pretrained_output_size = "cardiffnlp/twitter-roberta-base-sentiment", 768 
MODEL, pretrained_output_size = "digitalepidemiologylab/covid-twitter-bert-v2", 1024

# The texts were tokenized with TOKENIZER_MODEL's tokenizer, so the backbone
# must be the same checkpoint.
assert MODEL == TOKENIZER_MODEL

class Model(nn.Module):
    """Pretrained transformer backbone with a two-layer classification head.

    Args:
        config: unused; kept so existing positional calls keep working.
        MODEL: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output classes.
        pretrained_output_size: width of the backbone's hidden states.
        hidden_units: width of the hidden linear layer of the head.
    """

    def __init__(self, config, MODEL, num_labels, pretrained_output_size, hidden_units):
        super(Model, self).__init__()
        self.num_labels = num_labels
        self.pretrained_model = AutoModel.from_pretrained(MODEL)

        self.linear1 = nn.Linear(pretrained_output_size, hidden_units)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_units, self.num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch of token sequences.

        Args:
            input_ids: token ids, one row per example.
            attention_mask: mask matching ``input_ids``.
            labels: optional integer class per example. When omitted no loss
                is computed.

        Returns:
            ``(loss, logits)`` where ``logits`` has one row per example and
            ``num_labels`` columns; ``loss`` is the cross-entropy loss, or
            ``None`` when ``labels`` is ``None``.
        """
        output = self.pretrained_model(input_ids, attention_mask=attention_mask)

        # output[0] holds one vector per *token*: (batch, seq_len, hidden).
        # Feeding all of it to the head produced batch * seq_len rows of logits
        # for only `batch` labels, so the loss (and any caller expecting one
        # row of logits per example) could not work. Use the first token's
        # vector as the representation of the whole sequence.
        sequence_repr = output[0][:, 0, :]

        hidden = self.relu(self.linear1(sequence_repr))
        out = self.linear2(hidden)

        if labels is None:
            return None, out

        loss_fct = CrossEntropyLoss()
        loss = loss_fct(out.view(-1, self.num_labels), labels.view(-1))

        return loss, out

# Build the classifier; the first argument (config) is not used by Model.__init__.
model = Model(None, MODEL, num_labels, pretrained_output_size, hidden_units)
print('Model=\n',model,'\n')

In [None]:
class MyClassificationDataset(Dataset):
    """Pairs a tokenizer encoding with one integer label per example."""

    def __init__(self, data,y):
        # `data` maps field names (it must include "input_ids") to per-example
        # batches; `y` holds one label per example and is stored as int64.
        self.examples = data
        self.labels = torch.as_tensor(y, dtype=torch.long)

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        """Return ({field: value at `index`}, label at `index`)."""
        features = {}
        for field in self.examples:
            features[field] = self.examples[field][index]
        return features, self.labels[index]


# Wrap each encoded split together with its integer labels.
train_dataset = MyClassificationDataset(X_train_enc,Y_train)
val_dataset = MyClassificationDataset(X_val_enc, Y_val)
test_dataset = MyClassificationDataset(X_test_enc, Y_test)
# Placeholder zero labels are supplied for the Instagram posts.
ins_dataset = MyClassificationDataset(X_ins_enc, [0.] * len(X_ins))

In [None]:
# Batch sizes used by the DataLoaders below. train_batch_size and
# test_batch_size are also assigned (same values) in the hyperparameter cell.
train_batch_size = 8
val_batch_size = 8
test_batch_size = 8

def get_inputs_dict(batch):
    """Turn one (features, labels) batch into a single dict on `device`.

    Every feature tensor has dimension 1 squeezed (a no-op unless that
    dimension has size 1) and is moved to `device`; the labels are added
    under the "labels" key.
    """
    features, labels = batch[0], batch[1]
    inputs = {}
    for name, tensor in features.items():
        inputs[name] = tensor.squeeze(1).to(device)
    inputs["labels"] = labels.to(device)
    return inputs

# Training batches are drawn in random order; every evaluation split is read
# sequentially so predictions stay aligned with their inputs.
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)
test_sampler = SequentialSampler(test_dataset)
ins_sampler = SequentialSampler(ins_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_dataset, batch_size=val_batch_size, sampler=val_sampler)
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, sampler=test_sampler)
ins_dataloader = DataLoader(ins_dataset, batch_size=test_batch_size, sampler=ins_sampler)

# Sanity check: pull one training batch and show it.
batch = get_inputs_dict(next(iter(train_dataloader)))
input_ids, attention_mask, labels = (
    batch[field].to(device) for field in ('input_ids', 'attention_mask', 'labels')
)
print(batch)

In [None]:
def setup_opts(model):
    """Create the optimizer and learning-rate schedule for `model`.

    Reads the notebook-level settings at call time: `train_dataloader`,
    `gradient_accumulation_steps`, `num_train_epochs`, `warmup_ratio`,
    `weight_decay`, `learning_rate` and `adam_epsilon`.

    Args:
        model: module whose named parameters are optimized.

    Returns:
        (optimizer, scheduler): an AdamW optimizer and a linear
        warmup-then-decay schedule spanning
        len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
        steps.
    """
    t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Biases and LayerNorm weights are never weight-decayed.
    no_decay = ("bias", "LayerNorm.weight")
    decayed, not_decayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            not_decayed.append(param)
        else:
            decayed.append(param)

    optimizer_grouped_parameters = [
        {"params": decayed, "weight_decay": weight_decay},
        {"params": not_decayed, "weight_decay": 0.0},
    ]

    warmup_steps = math.ceil(t_total * warmup_ratio)
    # torch.optim.AdamW is the replacement recommended by transformers for its
    # own deprecated AdamW class. Weight decay is set explicitly per group
    # above, so torch's different default does not apply.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)
    return optimizer, scheduler

optimizer, scheduler = setup_opts(model)

# Training

In [None]:
# TODO: train our model using the loaded data
# Move the current model to the selected device and clear any stored gradients.
model.to(device)

model.zero_grad()

def log_metrics(y, y_preds, target_names=None):
    """Build the per-class classification report as text.

    The report is returned rather than printed: every caller prints the
    return value, which previously was always None.

    Args:
        y: true integer class labels.
        y_preds: predicted integer class labels.
        target_names: optional display name for each class id
            0..num_labels-1. By default the names are looked up from the
            training labels (`Y` / `Y_ints`), so they always follow the actual
            integer encoding instead of a hard-coded order.

    Returns:
        The text produced by sklearn's classification_report.
    """
    if target_names is None:
        target_names = [str(Y[Y_ints == code][0]) for code in range(num_labels)]
    return classification_report(
        y,
        y_preds,
        labels=list(range(num_labels)),  # report every class, even one absent from y
        target_names=target_names,
    )
    

def train_epochs(num_train_epochs):
    """Train the notebook-level `model`, validating after every epoch.

    Uses the notebook-level `optimizer`, `scheduler`, `train_dataloader`,
    `val_dataloader` and `val_dataset`.

    Args:
        num_train_epochs: number of passes over the training set.

    Returns:
        (avg_loss, avg_val_loss, report, accuracy): mean training loss and mean
        validation loss over all epochs, the value returned by
        log_metrics for the last epoch's validation predictions, and their
        accuracy.
    """
    avg_loss = []
    avg_val_loss = []
    for epoch in range(num_train_epochs):

        model.train()
        epoch_loss = []

        for batch in train_dataloader:
            batch = get_inputs_dict(batch)  # tensors are already on `device`
            outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            epoch_loss.append(loss.item())

        # Evaluate on the validation set at the end of the epoch.
        eval_loss = 0.0
        nb_eval_steps = 0
        val_logits = np.empty((len(val_dataset), num_labels))
        model.eval()

        # Rows are filled by a running offset taken from each batch's actual
        # size. The previous index arithmetic used test_batch_size and
        # len(test_dataset) while iterating the *validation* loader.
        row_offset = 0
        for val_batch in val_dataloader:
            with torch.no_grad():
                val_batch = get_inputs_dict(val_batch)
                outputs = model(val_batch['input_ids'], attention_mask=val_batch['attention_mask'], labels=val_batch['labels'])
                tmp_eval_loss, logits = outputs[:2]
                eval_loss += tmp_eval_loss.item()

            nb_eval_steps += 1
            batch_logits = logits.detach().cpu().numpy()
            val_logits[row_offset:row_offset + len(batch_logits)] = batch_logits
            row_offset += len(batch_logits)

        eval_loss = eval_loss / max(nb_eval_steps, 1)  # empty validation set -> 0.0
        preds = np.argmax(val_logits, axis=1)
        epoch_loss = np.mean(epoch_loss)
        print('epoch',epoch,'Training avg loss',epoch_loss)
        print('epoch',epoch,'Validation avg loss',eval_loss)
        print('---------------------------------------------------\n')
        avg_loss.append(epoch_loss)
        avg_val_loss.append(eval_loss)

    # Metrics are reported for the predictions of the final epoch only.
    report = log_metrics(Y_val, preds)
    print(report)
    avg_loss = np.mean(avg_loss)
    avg_val_loss = np.mean(avg_val_loss)
    accuracy = accuracy_score(Y_val, preds)
    return avg_loss,avg_val_loss,report,accuracy
        

# Performance Evaluation

In [None]:
def test():
    """Evaluate the notebook-level `model` on the test split.

    Prints the classification report for the test set.

    Returns:
        (eval_loss, accuracy): mean per-batch loss and accuracy on the test set.
    """
    model.to(device)
    model.eval()

    total_loss = 0.0
    nb_eval_steps = 0
    all_logits = np.empty((len(test_dataset), num_labels))

    # Fill one slice of rows per batch, in loader (sequential) order.
    row_offset = 0
    for test_batch in test_dataloader:
        with torch.no_grad():
            test_batch = get_inputs_dict(test_batch)
            outputs = model(
                test_batch['input_ids'],
                attention_mask=test_batch['attention_mask'],
                labels=test_batch['labels'],
            )
            batch_loss, logits = outputs[:2]
            total_loss += batch_loss.item()

        nb_eval_steps += 1
        batch_logits = logits.detach().cpu().numpy()
        all_logits[row_offset:row_offset + len(batch_logits)] = batch_logits
        row_offset += len(batch_logits)

    eval_loss = total_loss / nb_eval_steps
    preds = np.argmax(all_logits, axis=1)
    print("classification report for test set")
    print(log_metrics(Y_test, preds))
    accuracy = accuracy_score(Y_test, preds)
    return eval_loss,accuracy

In [None]:
# One entry per training budget tried below.
train_loss, val_loss, val_acc = [], [], []
test_loss, test_acc = [], []

for epoch in range(2,12,2):
    print("train with epochs=",epoch)
    # Start every run from a freshly loaded pretrained model.
    model = Model(None, MODEL, num_labels, pretrained_output_size, hidden_units)
    model.to(device)

    # setup_opts() sizes the warmup/decay schedule from the notebook-level
    # num_train_epochs. Without this assignment every run used the schedule of
    # the default epoch count: longer runs kept training after the learning
    # rate had already decayed to zero, shorter ones stopped mid-decay.
    num_train_epochs = epoch
    optimizer, scheduler = setup_opts(model)

    avg_loss,avg_val_loss,report,accuracy = train_epochs(epoch)

    train_loss.append(avg_loss)
    val_loss.append(avg_val_loss)
    val_acc.append(accuracy)

    testloss,testacc = test()
    test_loss.append(testloss)
    test_acc.append(testacc)

In [None]:
import matplotlib.pyplot as plt

# Epoch budgets on the x axis; one curve per data split.
x=[2,4,6,8,10]
fig, ax = plt.subplots(figsize=(10,5))
ax.set_xlabel('epoch')
ax.set_ylabel('Loss')
ax.set_title("Loss for twitter data")
for split_name, losses in (('train', train_loss), ('validation', val_loss), ('test', test_loss)):
    ax.plot(x, losses, marker='o', label=split_name)
ax.legend()

In [None]:
# Accuracy per epoch budget (x comes from the loss-plot cell).
fig, ax = plt.subplots(figsize=(10,5))
ax.set_xlabel('epoch')
ax.set_ylabel('Accuracy')
ax.set_title("Accuracy for twitter data")
for split_name, accuracies in (('validation', val_acc), ('test', test_acc)):
    ax.plot(x, accuracies, marker='o', label=split_name)
ax.legend()

# Prediction

In [None]:
# TODO: predict the submission data
n_batches = len(ins_dataloader)
preds = np.empty((len(ins_dataset), num_labels))
model.eval()

# Fill one slice of logit rows per batch, in loader (sequential) order.
nb_eval_steps = 0
row_offset = 0
for test_batch in ins_dataloader:
    with torch.no_grad():
        test_batch = get_inputs_dict(test_batch)
        outputs = model(
            test_batch['input_ids'],
            attention_mask=test_batch['attention_mask'],
            labels=test_batch['labels'],
        )
        logits = outputs[1]

    nb_eval_steps += 1
    batch_logits = logits.detach().cpu().numpy()
    preds[row_offset:row_offset + len(batch_logits)] = batch_logits
    row_offset += len(batch_logits)

# Keep the raw logits, then reduce each row to its most likely class id.
model_outputs = preds
preds = np.argmax(preds, axis=1)
np.savetxt('instagram_predictions.txt', preds) # We might want to do something different here - SN

In [None]:
from scipy.stats import pearsonr
# from scipy.stats import spearmanr
# Display name of every class id, looked up from the training labels so it
# always follows the actual integer encoding rather than a hard-coded order.
emotions = [str(Y[Y_ints == code][0]) for code in range(num_labels)]

# One indicator column per class. Sizing by num_labels (not preds.max()+1)
# keeps a column for classes that were never predicted, so the loop below
# cannot index past the last column.
preds_one_hot = np.zeros((len(preds), num_labels))
preds_one_hot[np.arange(len(preds)),preds] = 1

for i in range(num_labels):
    corr, _ = pearsonr(preds_one_hot[:,i], east_asian)
    print('Correlation with {}: {}'.format(emotions[i], corr))

In [None]:
# Show up to 20 predictions next to the cleaned and the original text. The raw
# column is materialised once, and the bound respects short prediction arrays.
raw_posts = np.array(ins_df['Contents'])
for i in range(min(20, len(preds))):
    print('Prediction: {} \nProcessed:\n{}\nUnprocessed:\n{}\n\n'.format(emotions[preds[i]], X_ins[i], raw_posts[i]))