### This notebook
+ Exp1: **Do emotion labels help the detection of irony?**
+ Predictors (models): RoBERTa-base trained on the ISEAR dataset (emotion classification, EC) and on SemEval-2018 tweets (irony detection, ID)
+ RE-TRAIN
+ All emojis are removed
+ new_x_test: each tweet has its predicted emotion label appended to it

###Check Requirements/imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import tensorflow as tf
print(tf.version.VERSION)

2.5.0


In [None]:
!pip3 install -q ktrain 

In [None]:
pip install -U sklearn

In [None]:
pip install parse_version

In [None]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

# Turn on memory growth for every physical GPU TensorFlow reports, then print
# how many physical and logical GPUs are visible.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized;
    # the error is only printed, so the notebook keeps running.
    print(e)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

1 Physical GPUs, 1 Logical GPUs
Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


In [None]:
!pip3 install -U emojis

`emojis` library: https://emojis.readthedocs.io/en/latest/api.html#module-emojis

emojis cheat sheet: https://www.webfx.com/tools/emoji-cheat-sheet/

In [None]:
pip install contractions

In [None]:
pip install git+https://github.com/amaiya/eli5@tfkeras_0_10_1

In [None]:
import ktrain

###Load irony data (SemEval tweets)

In [None]:
# Load train data
train_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_corrected.csv'

df_train = pd.read_csv(train_path, header=0, names=['index',
                                                    'irony_label',
                                                    'tweet'])
                                                

In [None]:
df_train.head()

Unnamed: 0,index,irony_label,tweet
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [None]:
# Check if dataset is balanced

# Classes are 1 and 0. Tweet can either be ironic or non-ironic -> binary classification
classes = df_train.irony_label.unique()

print((df_train.irony_label == 0).sum())
print((df_train.irony_label == 1).sum())

# => Balanced

1923
1911


In [None]:
# Load test data
test_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_test.csv'

df_test = pd.read_csv(test_path, sep='\t', header=0, names=['index',
                                                            'irony_label',
                                                            'tweet'])

print((df_test.irony_label == 0).sum())
print((df_test.irony_label == 1).sum())

df_test.head()

473
311


Unnamed: 0,index,irony_label,tweet
0,1,0,@Callisto1947 Can U Help?||More conservatives ...
1,2,1,"Just walked in to #Starbucks and asked for a ""..."
2,3,0,#NOT GONNA WIN http://t.co/Mc9ebqjAqj
3,4,0,@mickymantell He is exactly that sort of perso...
4,5,1,So much #sarcasm at work mate 10/10 #boring 10...


In [None]:
x_train = df_train['tweet'].to_numpy()
y_train = df_train['irony_label'].to_numpy()

x_test = df_test['tweet'].to_numpy()
y_test = df_test['irony_label'].to_numpy()

###Normalisation of input

Normalise:
+ hashtags
+ tagged users
+ emojis (removed)
+ urls

In [None]:
import emojis
from nltk.tokenize import TweetTokenizer
import re
import contractions
import numpy as np


def normalise_tweet(tweet):
    """Normalise a raw tweet into a single-space-separated token string.

    Steps, in order:
      1. "&" becomes "and"; "<" and ">" are dropped.
      2. Every http/https URL is replaced by the placeholder "url"
         (only the URL itself; the text after it is kept).
      3. "@" gets a leading space so user tags are tokenised on their own;
         "#" is replaced by a space (the hashtag word itself is kept).
      4. Contractions are expanded *before* punctuation is stripped, because
         the punctuation step removes the apostrophes they rely on.
      5. Selected punctuation is replaced by spaces and literal
         backslash-n sequences are removed.
      6. The text is tokenised with TweetTokenizer; tokens starting with "@"
         become "tagged_user" (at most 3 per tweet) and emoji tokens are
         dropped.

    Args:
        tweet: the raw tweet text.

    Returns:
        The normalised tweet, tokens joined by single spaces.
    """
    norm_tweet = re.sub("&", "and", tweet)
    norm_tweet = re.sub(r"[<>]", "", norm_tweet)
    # Match only the URL itself; the former "http:.*" pattern swallowed the
    # rest of the tweet and did not match https links.
    norm_tweet = re.sub(r"https?://\S+", "url", norm_tweet)
    norm_tweet = re.sub("@", " @", norm_tweet)
    norm_tweet = re.sub("#", " ", norm_tweet)

    # Must run while apostrophes are still present (e.g. "can't").
    norm_tweet = contractions.fix(norm_tweet)

    norm_tweet = re.sub(r"[-'()/_;:{}=~|,\[\]]", " ", norm_tweet)
    # Matches a literal backslash followed by "n", not a newline character.
    norm_tweet = re.sub(r"\\n", "", norm_tweet)

    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(norm_tweet)

    tag_token = "tagged_user"
    max_user_tags = 3
    kept_user_tags = 0
    final_tweet_list = []

    for token in tweet_tokens:
        if token.startswith("@"):
            # then token is a user tag; keep at most `max_user_tags` of them
            if kept_user_tags < max_user_tags:
                final_tweet_list.append(tag_token)
                kept_user_tags += 1
        elif emojis.count(token) == 1:
            # then token is an emoji: drop it entirely (appending '' would
            # leave double spaces in the joined string)
            continue
        else:
            final_tweet_list.append(token)

    final_tweet = ' '.join(final_tweet_list)

    return final_tweet.strip()

# check normalisation    
#return tweet_tokens, final_tweet.strip()

In [None]:
# Run the normaliser over both splits and keep the results as NumPy arrays
x_train_norm = np.array([normalise_tweet(tweet) for tweet in x_train])
x_test_norm = np.array([normalise_tweet(tweet) for tweet in x_test])

In [None]:
x_train_norm[50:100]

In [None]:
# Check sentence lengths

from statistics import mean

# Whitespace-token count per normalised training tweet; tweets longer than
# 35 tokens are printed together with their position.
seq_len = []

for idx, tweet in enumerate(x_train_norm):
    n_tokens = len(tweet.split())
    if n_tokens > 35:
        print(idx, tweet)
    seq_len.append(n_tokens)

print(max(seq_len))
print(mean(seq_len))

888 This time change is crazy . everyone is all up here like woohoo its 11am let us live life ! and I m like it is 5am and I have not slept at all yet .
948 Kyle it won t let me tagged_user you ? But yeah we are grown ass men with fast cars . Who gives af lol . And bring it to my room hooah ? See ya in a bit .
39
15.220918101199791


#1) Irony detection with ISEAR emotion labels

Assign emotion label to SemEval tweets

ISEAR labels: joy, sadness, fear, guilt, shame, anger, disgust

###Load pre-trained emotion classifier (EC_RoBERTa_nonorm)

In [None]:
pwd

'/content'

In [None]:
import ktrain
from ktrain import text

emotion_predictor = ktrain.load_predictor('/content/drive/MyDrive/TeamLab/my_models/EC_RoBERTa_nonorm')

In [None]:
emotion_predictor

<ktrain.text.predictor.TextPredictor at 0x7faa25a98d10>

In [None]:
y_pred_emotion_test = emotion_predictor.predict(x_test_norm)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
y_pred_emotion_train = emotion_predictor.predict(x_train_norm)

In [None]:
print(x_test_norm[1], y_pred_emotion_test[1])
print(x_test_norm[10], y_pred_emotion_test[10])
print(x_test_norm[100], y_pred_emotion_test[100])
print(x_test_norm[200], y_pred_emotion_test[200])
print(x_test_norm[300], y_pred_emotion_test[300])

Just walked in to Starbucks and asked for a " tall blonde " Hahahaha irony shame
Most important thing I ve learned in school url joy
I just wrote a 13 page paper ... I was awfully tired when I was writing it and now I can t sleep . irony fear
Agree with that ... we are not mind readers haha Repost 9gag ⁰You may think we are but we are ... url shame
tagged_user hold on a minute . Are you saying All blonde white women look the same ? ? sarcasm anger


In [None]:
# append labels

# Pair every normalised tweet with its predicted emotion label and append the
# label as the last whitespace-separated token.
new_x_train = [
    ' '.join(tweet.split() + [emotion])
    for tweet, emotion in zip(x_train_norm, y_pred_emotion_train)
]

new_x_test = [
    ' '.join(tweet.split() + [emotion])
    for tweet, emotion in zip(x_test_norm, y_pred_emotion_test)
]


new_x_test[:5]

['tagged_user Can you Help ? More conservatives needed on TSU + get paid 4 posting stuff like this ! YOU $ can go to url shame',
 'Just walked in to Starbucks and asked for a " tall blonde " Hahahaha irony shame',
 'NOT going to WIN url sadness',
 'tagged_user He is exactly that sort of person . Weirdo ! fear',
 'So much sarcasm at work mate 10 10 boring 100 % dead mate full on shit absolutely sleeping mate can t handle the sarcasm anger']

###Set up irony detector

In [None]:
import ktrain
from ktrain import text


# Class names handed to the ktrain Transformer wrapper
categories = [0, 1]

MODEL_NAME = 'roberta-base'

# Transformer is a wrapper to the Hugging Face transformers library for text classification.
t = text.Transformer(MODEL_NAME, maxlen=100, class_names=categories)

# Using normalised input data (tweets with the predicted emotion label appended)
# NOTE(review): the test split is passed as val_data below; if training uses
# val_data for early stopping / checkpoint selection, scores reported on it are
# optimistic. Consider holding out a separate dev split. TODO confirm.
trn = t.preprocess_train(new_x_train, y_train)
val = t.preprocess_test(new_x_test, y_test)

model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)

preprocessing train...
language: en
train sequence lengths:
	mean : 16
	95percentile : 28
	99percentile : 31


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 17
	95percentile : 29
	99percentile : 32


###Train

In [None]:
best_lr = 5e-5

In [None]:
# Train
# Parameters: LR, epochs
# LR==(5e-5)

# NOTE(review): checkpoints are written to '/my_models' (filesystem root), while the
# copy-to-Drive cell further down reads from '/content/my_models'.
learner.autofit(lr=best_lr, checkpoint_folder='/my_models', verbose=1)

# if epochs is None, early_stopping and reduce_on_plateau are enabled automatically
# (the training log of this run reports patience=5 and patience=2, respectively).
# if lr missing, it will be estimated (initial lr)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 5e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024
Epoch 6/1024

Epoch 00006: Reducing Max LR on Plateau: new max lr will be 2.5e-05 (if not early_stopping).
Epoch 7/1024
Epoch 8/1024

Epoch 00008: Reducing Max LR on Plateau: new max lr will be 1.25e-05 (if not early_stopping).
Epoch 9/1024
Restoring model weights from the end of the best epoch.
Epoch 00009: early stopping
Weights from best epoch have been loaded into model.


<tensorflow.python.keras.callbacks.History at 0x7fa9121c54d0>

###Evaluate predictions

In [None]:
# Set weights to those of the best epoch
# NOTE(review): the checkpoint number is hard-coded and depends on the run. The training
# log above reports early stopping at epoch 9 (patience=5) and that the best-epoch weights
# were already restored -- confirm that weights-05 is the intended checkpoint.
learner.model.load_weights('/my_models/weights-05.hdf5')

In [None]:
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.82      0.67      0.74       473
           1       0.61      0.77      0.68       311

    accuracy                           0.71       784
   macro avg       0.71      0.72      0.71       784
weighted avg       0.73      0.71      0.71       784



array([[317, 156],
       [ 70, 241]])

In [None]:
# the ones that we got most wrong
learner.view_top_losses(n=5, preproc=t)

----------
id:618 | loss:4.64 | true:0 | pred:1)

----------
id:5 | loss:4.61 | true:0 | pred:1)

----------
id:676 | loss:4.52 | true:0 | pred:1)

----------
id:212 | loss:4.37 | true:0 | pred:1)

----------
id:169 | loss:4.33 | true:0 | pred:1)



In [None]:
# print out the instances reported by view_top_losses above to see why...
# (the ids must match that cell's output; they change from run to run)
for top_loss_id in (618, 5, 676, 212, 169):
    print(new_x_test[top_loss_id])

Getting Final Jeopardy correct on Kids Jeopardy boosts my self esteem joy
Felicitats url joy
I m really excited for next semester joy
Corny jokes are my absolute favorite disgust
And then my sister should be home from college by time I get home from babysitting . And it s payday . THIS IS A GOOD FRIDAY joy


###Make predictions on new data

In [None]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
test_sent = ('Cool it is raining again')

In [None]:
predictor.predict(test_sent)

1

In [None]:
# Ask for explanation
predictor.explain(test_sent)

Contribution?,Feature
0.753,Highlighted in text (sum)
-0.838,<BIAS>


###Save Model

In [None]:
predictor.save('/my_models/ID_RoBERTa_with-emo_noemojis')

In [None]:
# Reload to check that model has been saved correctly
reloaded_predictor = ktrain.load_predictor('/my_models/ID_RoBERTa_with-emo_noemojis')

In [None]:
reloaded_predictor.predict(test_sent)

1

In [None]:
# Do reloaded_predictor and original predictor give the same numbers?
reloaded_predictor.predict_proba(test_sent)

array([0.456226, 0.543774], dtype=float32)

In [None]:
predictor.predict_proba(test_sent)

array([0.456226, 0.543774], dtype=float32)

In [None]:
reloaded_predictor.get_classes()

[0, 1]

**Before running the next cell:**
+ keep only the best weights file and put it into the model folder
+ move `my_models` into `/content`

In [None]:
# Copy model files to drive - files on google colab disk space are temporary and get deleted when the session is over

%cp -av "/content/my_models/ID_RoBERTa_with-emo_noemojis" "/content/drive/MyDrive/TeamLab/my_models"

'/content/my_models/ID_RoBERTa_with-emo_noemojis' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/weights-05.hdf5' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis/weights-05.hdf5'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/config.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis/config.json'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/tf_model.h5' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis/tf_model.h5'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/tokenizer_config.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis/tokenizer_config.json'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/special_tokens_map.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_with-emo_noemojis/special_tokens_map.json'
'/content/my_models/ID_RoBERTa_with-emo_noemojis/vocab.json' -> '/content/driv

#2) Irony detection with TweetEval emotion labels

Assign emotion label to SemEval tweets

TweetEval labels: joy, sadness, optimism, anger

__

run cells up to **1)**

train data: x_train_norm, y_train

test data: x_test_norm, y_test

###Load pre-trained emotion classifier (EC_RoBERTa_TweetEval)

In [None]:
pwd

'/content'

In [None]:
import ktrain
from ktrain import text

emotion_predictor = ktrain.load_predictor('/content/drive/MyDrive/TeamLab/my_models/EC_RoBERTa_TweetEval')

###Make emotion prediction and add label to tweet

In [None]:
# Predict the emotion labels for the whole training set in one batched call
# (one predict() call per tweet is far slower), then append each label to its
# tweet as the last whitespace-separated token.
emo_pred_train = emotion_predictor.predict(x_train_norm)

x_train_with_emo = [
    ' '.join(tweet.split() + [emo_pred])
    for tweet, emo_pred in zip(x_train_norm, emo_pred_train)
]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355863.0, style=ProgressStyle(descript…




In [None]:
x_train_with_emo[:10]

['Sweet United Nations video . Just in time for Christmas . imagine NoReligion url joy',
 'tagged_user We are rumored to have talked to Erv s agent ... and the Angels asked about Ed Escobar ... that s hardly nothing anger',
 'Hey there ! Nice to see you Minnesota ND Winter Weather joy',
 '3 episodes left I m dying over here sadness',
 'I can t breathe ! was chosen as the most notable quote of the year in an annual list released by a Yale University librarian joy',
 'You re never too old for Footie Pajamas . url joy',
 'Nothing makes me happier then getting on the highway and seeing break lights light up like a Christmas tree .. joy',
 '4 30 an opening my first beer now going to be a long night day joy',
 'tagged_user Klug do you think you would support a guy who knocked out your daughter ? Rice doesn t deserve support . anger',
 'tagged_user You are not allowed to open that until Christmas day ! joy']

In [None]:
# Predict the emotion labels for the whole test set in one batched call
# (one predict() call per tweet is far slower), then append each label to its
# tweet as the last whitespace-separated token.
emo_pred_test = emotion_predictor.predict(x_test_norm)

x_test_with_emo = [
    ' '.join(tweet.split() + [emo_pred])
    for tweet, emo_pred in zip(x_test_norm, emo_pred_test)
]

In [None]:
x_test_with_emo[:10]

['tagged_user Can you Help ? More conservatives needed on TSU + get paid 4 posting stuff like this ! YOU $ can go to url optimism',
 'Just walked in to Starbucks and asked for a " tall blonde " Hahahaha irony joy',
 'NOT going to WIN url anger',
 'tagged_user He is exactly that sort of person . Weirdo ! joy',
 'So much sarcasm at work mate 10 10 boring 100 % dead mate full on shit absolutely sleeping mate can t handle the sarcasm anger',
 'Corny jokes are my absolute favorite joy',
 'People complain about my backround pic and all I feel is like " hey don t blame me Albert E might have spoken those words " sarcasm life anger',
 'tagged_user Duncan tagged_user Darn my sock joke needs fixing ? anger',
 'if Christian expects Fifa to sleep in my bed with me tonight he s wrong anger',
 'People who tell people with anxiety to " just stop worrying about it " are my favorite kind of people not educateyourself optimism']

###Set up irony detector

In [None]:
# Check max/mean length of tweets
from statistics import mean

# Whitespace-token count per labelled training tweet; tweets longer than
# 35 tokens are printed together with their position.
seq_len = []

for idx, tweet in enumerate(x_train_with_emo):
    n_tokens = len(tweet.split())
    if n_tokens > 35:
        print(idx, tweet)
    seq_len.append(n_tokens)

print(max(seq_len))
print(mean(seq_len))

888 This time change is crazy . everyone is all up here like woohoo its 11am let us live life ! and I m like it is 5am and I have not slept at all yet . joy
948 Kyle it won t let me tagged_user you ? But yeah we are grown ass men with fast cars . Who gives af lol . And bring it to my room hooah ? See ya in a bit . joy
2743 tagged_user ForgetMeNot oh lordy ! Eating out was the hardest or going to someone s place . Esp as I don t eat meat . * eye roll * what can we feed her ? sadness
40
16.220918101199793


In [None]:
import ktrain
from ktrain import text


# Class names handed to the ktrain Transformer wrapper
categories = [0, 1]

MODEL_NAME = 'roberta-base'

# Transformer is a wrapper to the Hugging Face transformers library for text classification.
t = text.Transformer(MODEL_NAME, maxlen=100, class_names=categories)

# Using normalised input data (tweets with the predicted emotion label appended)
# NOTE(review): the test split is passed as val_data below; if training uses
# val_data for early stopping / checkpoint selection, scores reported on it are
# optimistic. Consider holding out a separate dev split. TODO confirm.
trn = t.preprocess_train(x_train_with_emo, y_train)
val = t.preprocess_test(x_test_with_emo, y_test)

model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)

preprocessing train...
language: en
train sequence lengths:
	mean : 16
	95percentile : 28
	99percentile : 31


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 17
	95percentile : 29
	99percentile : 32


###Train

In [None]:
best_lr = 5e-5

In [None]:
# Train
# Parameters: LR, epochs
# LR==(5e-5)

# NOTE(review): checkpoints are written to '/my_models' (filesystem root), not '/content/my_models'.
learner.autofit(lr=best_lr, checkpoint_folder='/my_models', verbose=1)

# if epochs is None, early_stopping and reduce_on_plateau are enabled automatically
# (the training log of this run reports patience=5 and patience=2, respectively).
# if lr missing, it will be estimated (initial lr)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 5e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024
Epoch 5/1024

Epoch 00005: Reducing Max LR on Plateau: new max lr will be 2.5e-05 (if not early_stopping).
Epoch 6/1024
Epoch 7/1024

Epoch 00007: Reducing Max LR on Plateau: new max lr will be 1.25e-05 (if not early_stopping).
Epoch 8/1024
Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping
Weights from best epoch have been loaded into model.


<tensorflow.python.keras.callbacks.History at 0x7fe7d8284e10>

###Evaluate predictions

In [None]:
# Set weights to those of the best epoch
# NOTE(review): the checkpoint number is hard-coded and depends on the run. The training
# log above reports early stopping at epoch 8 (patience=5) and that the best-epoch weights
# were already restored -- confirm that weights-07 is the intended checkpoint.
learner.model.load_weights('/my_models/weights-07.hdf5')

In [None]:
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.81      0.72      0.76       473
           1       0.63      0.75      0.69       311

    accuracy                           0.73       784
   macro avg       0.72      0.73      0.72       784
weighted avg       0.74      0.73      0.73       784



array([[339, 134],
       [ 79, 232]])

In [None]:
# the ones that we got most wrong
learner.view_top_losses(n=5, preproc=t)

----------
id:618 | loss:4.42 | true:0 | pred:1)

----------
id:676 | loss:3.95 | true:0 | pred:1)

----------
id:276 | loss:3.66 | true:0 | pred:1)

----------
id:334 | loss:3.51 | true:0 | pred:1)

----------
id:700 | loss:3.51 | true:0 | pred:1)



In [None]:
# show the text of the top-loss instances to see why they were misclassified...
for top_loss_id in (618, 676, 276, 334, 700):
    print(x_test_with_emo[top_loss_id])

Love it when my mans on a cleaning spree .. Saves me doing it joy
So glad I m off work tonite joy
I m really excited for next semester joy
Also it s amazing how a shower and Taco Bell can make you feel like a new person . tagged_user joy
Today was a very good day in Iceland . joy


###Make predictions on new data

In [None]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
test_sent = ('Cool it is raining again')

In [None]:
predictor.predict(test_sent)

1

In [None]:
# Ask for explanation
predictor.explain(test_sent)

Contribution?,Feature
0.402,Highlighted in text (sum)
-0.248,<BIAS>


To save:


```
predictor.save('/my_models/ID_RoBERTa_with-emo......')
```