###This notebook:
+ ktrain
+ hugging face transformers
+ roberta-base
+ lower LR 5e-5 + triangular policy
+ remove emojis

07/07:
accuracy .75

Note: weight file has been moved into model folder - move out if it causes problems when loading predictor


###Check Requirements/imports

In [None]:
import tensorflow as tf
print(tf.version.VERSION)

2.5.0


In [None]:
import pandas as pd


In [None]:
pip install emoji

In [None]:
pip install contractions

In [None]:
!pip3 install -q ktrain 

In [None]:
pip install -U sklearn

In [None]:
pip install parse_version

In [None]:
pip install git+https://github.com/amaiya/eli5@tfkeras_0_10_1

In [None]:
import os

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub

from keras.utils import np_utils

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt

# Configure TensorFlow to allocate GPU memory on demand, then report the environment.
# List the physical GPUs visible to TensorFlow (empty list when there is none).
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs,
    # so the same setting is applied to every device.
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized;
    # if that already happened, the error is only printed and the run continues.
    print(e)

# Environment summary: TensorFlow version, eager-execution flag, TF-Hub version, GPU availability.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

1 Physical GPUs, 1 Logical GPUs
Version:  2.5.0
Eager mode:  True
Hub version:  0.12.0
GPU is available


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


###Load data

In [None]:
# Read the training split from Google Drive.
# The file's own header row is consumed (header=0) and replaced by the column names given here.
train_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_corrected.csv'

df_train = pd.read_csv(
    train_path,
    header=0,
    names=['index', 'irony_label', 'tweet'],
)
                                                

In [None]:
df_train.head()

Unnamed: 0,index,irony_label,tweet
0,1,1,Sweet United Nations video. Just in time for C...
1,2,1,@mrdahl87 We are rumored to have talked to Erv...
2,3,1,Hey there! Nice to see you Minnesota/ND Winter...
3,4,0,3 episodes left I'm dying over here
4,5,1,I can't breathe! was chosen as the most notabl...


In [None]:
# Sanity check: are the two classes of the training set balanced?

# Binary classification: labels are 1 and 0 (a tweet is either ironic or non-ironic).
classes = df_train.irony_label.unique()

# Print the number of training rows per label: first label 0, then label 1.
for label_value in (0, 1):
    print((df_train.irony_label == label_value).sum())

# => Balanced

1923
1911


In [None]:
# Read the test split from Google Drive.
# Unlike the training file, this one is tab-separated; its header row is replaced by the names below.
test_path = '/content/drive/MyDrive/TeamLab/data/semeval_taskA_test.csv'

df_test = pd.read_csv(
    test_path,
    sep='\t',
    header=0,
    names=['index', 'irony_label', 'tweet'],
)

# Print the number of test rows per label: first label 0, then label 1.
for label_value in (0, 1):
    print((df_test.irony_label == label_value).sum())

# Show the first rows as the cell output.
df_test.head()

473
311


Unnamed: 0,index,irony_label,tweet
0,1,0,@Callisto1947 Can U Help?||More conservatives ...
1,2,1,"Just walked in to #Starbucks and asked for a ""..."
2,3,0,#NOT GONNA WIN http://t.co/Mc9ebqjAqj
3,4,0,@mickymantell He is exactly that sort of perso...
4,5,1,So much #sarcasm at work mate 10/10 #boring 10...


In [None]:
# Pull the raw tweet texts (x_*) and their irony labels (y_*) out of the
# DataFrames as NumPy arrays, separately for the train and the test split.
x_train = df_train['tweet'].to_numpy()
y_train = df_train['irony_label'].to_numpy()

x_test = df_test['tweet'].to_numpy()
y_test = df_test['irony_label'].to_numpy()

###Normalisation of input

Normalise (planned steps - `normalise_tweet` below does not yet normalise laughter tokens or remove stops):
+ hashtags
+ tagged users
+ emoji (+ ones that are made up of characters e.g. :P)
+ urls 
+ laugh (haha, lol..)
+ remove stops

In [None]:
import emoji
from nltk.tokenize import TweetTokenizer
import re
import contractions
import numpy as np


def normalise_tweet(tweet):
    """Normalise a raw tweet into a string of space-separated tokens.

    Steps, in order:
      1. replace "&" with "and" and drop the characters "<" and ">";
      2. replace every http/https URL with the placeholder "url";
      3. insert a space before "@" (so a mention becomes its own token) and
         replace "#" with a space (the hashtag text itself is kept);
      4. remove emojis;
      5. replace a fixed set of punctuation characters with spaces;
      6. expand contractions with ``contractions.fix``;
      7. tokenise with NLTK's ``TweetTokenizer``, mapping tokens that start
         with "@" to "tagged_user" and purely numeric tokens to "digit".

    Args:
        tweet: The raw tweet text.

    Returns:
        The normalised tweet: the resulting tokens joined by single spaces.
    """
    norm_tweet = re.sub("&", "and", tweet)
    norm_tweet = re.sub(r"[<>]", "", norm_tweet)
    # Replace only the URL itself. The previous pattern ("http:.*") ignored
    # https links and swallowed all text following the first URL.
    norm_tweet = re.sub(r"https?://\S+", "url", norm_tweet)
    norm_tweet = re.sub("@", " @", norm_tweet)
    norm_tweet = re.sub("#", " ", norm_tweet)

    # Remove emojis. demojize() rewrites each emoji as <open>name<close>; by
    # using control characters as delimiters (instead of the default ":"),
    # exactly the emoji spans can be deleted afterwards. The previous
    # colon-based pattern missed names containing uppercase letters
    # (e.g. ":OK_hand:"), greedily deleted the text between two emojis, and
    # could also match ordinary text written between colons.
    norm_tweet = emoji.demojize(norm_tweet, delimiters=("\x00", "\x01"))
    # Substitute a space so that words on either side of an emoji stay separate.
    norm_tweet = re.sub(r"\x00[^\x00\x01]*\x01", " ", norm_tweet)

    norm_tweet = re.sub(r"[-()/_;:{}=~|,\[\]]", " ", norm_tweet)

    norm_tweet = contractions.fix(norm_tweet)

    tokens = []
    for token in TweetTokenizer().tokenize(norm_tweet):
        if token.startswith("@"):
            # Anonymise user mentions.
            token = "tagged_user"
        elif token.isnumeric():
            # Collapse every number into one placeholder token.
            token = "digit"
        tokens.append(token)

    return " ".join(tokens)

In [None]:
# Run the normaliser over every tweet of both splits; results are stored as NumPy arrays.
x_train_norm = np.array([normalise_tweet(raw_tweet) for raw_tweet in x_train])
x_test_norm = np.array([normalise_tweet(raw_tweet) for raw_tweet in x_test])

In [None]:
x_train_norm[10:20]

array(['Oh thank GOD our entire office email system is down ... the day of a big event . Santa you know JUST what to get me for xmas .',
       'But instead I am scrolling through Facebook Instagram and Twitter for hours on end accomplishing nothing .',
       'tagged_user no he bloody is not I was upstairs getting changed !',
       "Cold or warmth both suffuse one's cheeks with pink colour tone ... Do you understand the underlying difference and its texture ?",
       'Just great when you are mobile bill arrives by text',
       'crushes are great until you realize they will never be interested in you . p',
       'Buffalo sports media is smarter than all of us . Where else can you get the quality insight offered by Harrington and Busgaglia .',
       'I guess my cat also lost digit pounds when she went to the vet after I have been feeding her a few times a day . Eating food WorkingOut',
       'tagged_user tagged_user Rosenthal trading a SP for a defense only SS ? Brilliant trade .'

In [None]:
from statistics import mean

# Number of whitespace-separated tokens of every normalised training tweet.
seq_len = []

for idx, tweet in enumerate(x_train_norm):
    n_tokens = len(tweet.split())  # split once, reuse for the check and the list
    # Print unusually long tweets (more than 35 tokens) together with their position.
    if n_tokens > 35:
        print(idx, tweet)
    seq_len.append(n_tokens)

# Longest and average tweet length in tokens.
print(max(seq_len))
print(mean(seq_len))

888 This time change is crazy . everyone is all up here like woohoo its 11am let us live life ! and I am like it is 5am and I have not slept at all yet .
948 Kyle it will not let me tagged_user you ? But yeah we are grown ass men with fast cars . Who gives af lol . And bring it to my room hooah ? See ya in a bit .
39
15.141627543035995


##Model (ktrain, roberta-base)

In [None]:
import ktrain
from ktrain import text

from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger

# Class labels handed to ktrain as class names (same values as in the irony_label column).
categories = [0, 1]

# Name of the pretrained Hugging Face checkpoint to fine-tune.
MODEL_NAME = 'roberta-base'

# Transformer is a wrapper to the Hugging Face transformers library for text classification.
# maxlen=100 is the sequence-length limit passed to the preprocessor
# (the longest normalised training tweet printed earlier has 39 whitespace tokens).
t = text.Transformer(MODEL_NAME, maxlen=100, class_names=categories)

# Using normalised input data
trn = t.preprocess_train(x_train_norm, y_train)
# NOTE(review): the *test* split is preprocessed here and passed below as val_data.
# Training in this notebook relies on early stopping and on picking the best epoch,
# so if those decisions are driven by val_data the reported test scores are
# presumably optimistic. Consider holding out part of the training data for
# validation and keeping the test split for the final evaluation only.
val = t.preprocess_test(x_test_norm, y_test)

# Build the classification model and wrap it, together with the data, in a ktrain learner.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)

preprocessing train...
language: en
train sequence lengths:
	mean : 15
	95percentile : 27
	99percentile : 30


Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 16
	95percentile : 27
	99percentile : 31


###Estimate LR

Run the following to let ktrain estimate a good LR:

learner.lr_find(show_plot=True, max_epochs=4)

###Train

In [None]:
best_lr = 5e-5

In [None]:
# Train
# Parameters: LR, epochs
# LR==(5e-5)

# No `epochs` argument is given, so training length is governed by early stopping (see below).
# checkpoint_folder is the absolute path '/my_models' (filesystem root, not /content);
# presumably one weights file per epoch is written there - a later cell loads
# '/my_models/weights-03.hdf5' from it.
learner.autofit(lr=best_lr, checkpoint_folder='/my_models', verbose=1)

# if epochs is None, then early_stopping and reduce_on_plateau are automatically enabled;
# the recorded log of this run reports patience=5 and patience=2, respectively.
# if lr missing, it will be estimated (initial lr)

early_stopping automatically enabled at patience=5
reduce_on_plateau automatically enabled at patience=2


begin training using triangular learning rate policy with max lr of 5e-05...
Epoch 1/1024
Epoch 2/1024
Epoch 3/1024
Epoch 4/1024

Epoch 00004: Reducing Max LR on Plateau: new max lr will be 2.5e-05 (if not early_stopping).
Epoch 5/1024
Epoch 6/1024

Epoch 00006: Reducing Max LR on Plateau: new max lr will be 1.25e-05 (if not early_stopping).
Epoch 7/1024
Restoring model weights from the end of the best epoch.
Epoch 00007: early stopping
Weights from best epoch have been loaded into model.


<tensorflow.python.keras.callbacks.History at 0x7fac77e41e50>

###Evaluate/Inspect model

In [None]:
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.83      0.66      0.73       473
           1       0.60      0.80      0.69       311

    accuracy                           0.71       784
   macro avg       0.72      0.73      0.71       784
weighted avg       0.74      0.71      0.72       784



array([[311, 162],
       [ 63, 248]])

In [None]:
# Set weights to those of the best epoch
# NOTE(review): the checkpoint file name is hard-coded to epoch 3 of the recorded run;
# on a re-run the best epoch may differ and this path must be updated.
# The training log shows autofit had already restored its own best-epoch weights, so this
# line overrides them with the epoch-3 checkpoint (accuracy 0.75 vs. 0.71 in the recorded
# validate() outputs). That choice was made on the data passed as val_data.
learner.model.load_weights('/my_models/weights-03.hdf5')

In [None]:
learner.validate(class_names=t.get_classes())

              precision    recall  f1-score   support

           0       0.84      0.71      0.77       473
           1       0.65      0.79      0.71       311

    accuracy                           0.75       784
   macro avg       0.74      0.75      0.74       784
weighted avg       0.76      0.75      0.75       784



array([[338, 135],
       [ 64, 247]])

In [None]:
# the ones that we got most wrong
learner.view_top_losses(n=10, preproc=t)

----------
id:618 | loss:5.21 | true:0 | pred:1)

----------
id:5 | loss:5.1 | true:0 | pred:1)

----------
id:330 | loss:4.75 | true:0 | pred:1)

----------
id:700 | loss:4.45 | true:0 | pred:1)

----------
id:587 | loss:4.41 | true:0 | pred:1)

----------
id:676 | loss:4.2 | true:0 | pred:1)

----------
id:505 | loss:4.19 | true:0 | pred:1)

----------
id:43 | loss:4.09 | true:0 | pred:1)

----------
id:169 | loss:4.01 | true:0 | pred:1)

----------
id:629 | loss:3.95 | true:0 | pred:1)



In [None]:
# Print the five highest-loss instances (ids taken from the view_top_losses output) to see why...
for top_loss_id in (618, 5, 330, 700, 587):
    print(x_test_norm[top_loss_id])

Love it when my mans on a cleaning spree .. Saves me doing it OK hand
Corny jokes are my absolute favorite
Sarcasm makes you mentally stronger . Which is very effective when dealing with emotional stress and fustration . funfact WhatIfISay
Today was a very good day in Iceland .
That awkward moment when you plane your whole day around your Golf Class and it gets cancelled ! stupidrain


###Make predictions on new data

In [None]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
test_sent = ('Cool it is raining again')

In [None]:
predictor.predict(test_sent)

1

In [None]:
# Ask for explanation
predictor.explain(test_sent)

Contribution?,Feature
0.735,Highlighted in text (sum)
-0.459,<BIAS>


In [None]:
more_sents = ['Going to the dentist for a root canal this afternoon. Yay, I can’t wait.', 
              'It was so nice of my dad to come to my graduation party. #not', 
              'I drank a healthy, homemade fruit smoothie...in a Budweiser glass #irony', 
              'Dogs are really cute, one day I want to live in a big house with many dogs', 
              'some trees are really tall, others not so much', 
              'just came back from dinner at Nandos with my mates']

In [None]:
predictor.predict(more_sents)

[1, 1, 1, 1, 0, 0]

In [None]:
# Ask for explanation
predictor.explain(more_sents[0])

Contribution?,Feature
6.082,Highlighted in text (sum)
-0.821,<BIAS>


In [None]:
# Ask for explanation
predictor.explain(more_sents[1])

Contribution?,Feature
3.781,Highlighted in text (sum)
-0.9,<BIAS>


In [None]:
# Ask for explanation
predictor.explain(more_sents[2])

Contribution?,Feature
3.81,Highlighted in text (sum)
-1.007,<BIAS>


In [None]:
# Ask for explanation
predictor.explain(more_sents[3])

Contribution?,Feature
0.85,Highlighted in text (sum)
-0.917,<BIAS>


In [None]:
# Ask for explanation
predictor.explain(more_sents[4])

Contribution?,Feature
0.761,<BIAS>
-0.068,Highlighted in text (sum)


In [None]:
# Ask for explanation
predictor.explain(more_sents[5])

Contribution?,Feature
0.991,<BIAS>
-0.726,Highlighted in text (sum)


###Save + Reload

In [None]:
predictor.save('/my_models/ID_RoBERTa_noemojis')

In [None]:
# Reload to check that model has been saved correctly
reloaded_predictor = ktrain.load_predictor('/my_models/ID_RoBERTa_noemojis')

In [None]:
reloaded_predictor.predict(test_sent)

1

In [None]:
# Do reloaded_predictor and original predictor give the same numbers?
reloaded_predictor.predict_proba(test_sent)

array([0.03814947, 0.9618505 ], dtype=float32)

In [None]:
predictor.predict_proba(test_sent)

array([0.03814947, 0.9618505 ], dtype=float32)

In [None]:
reloaded_predictor.get_classes()

[0, 1]

In [None]:
# Copy model files to drive - files on google colab disk space are temporary and get deleted when the session is over

%cp -av "/content/my_models/ID_RoBERTa_noemojis" "/content/drive/MyDrive/TeamLab/my_models"

'/content/my_models/ID_RoBERTa_noemojis' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis'
'/content/my_models/ID_RoBERTa_noemojis/config.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/config.json'
'/content/my_models/ID_RoBERTa_noemojis/tf_model.h5' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/tf_model.h5'
'/content/my_models/ID_RoBERTa_noemojis/tokenizer_config.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/tokenizer_config.json'
'/content/my_models/ID_RoBERTa_noemojis/special_tokens_map.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/special_tokens_map.json'
'/content/my_models/ID_RoBERTa_noemojis/vocab.json' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/vocab.json'
'/content/my_models/ID_RoBERTa_noemojis/merges.txt' -> '/content/drive/MyDrive/TeamLab/my_models/ID_RoBERTa_noemojis/merges.txt'
'/content/my_models/ID_RoBERTa_noemojis/tf_model.preproc' -> '/cont


To load and continue training
```
# save model and Preprocessor instance after partially training
ktrain.get_predictor(model, preproc).save('/tmp/my_predictor')

# reload Predictor and extract model
model = ktrain.load_predictor('/tmp/my_predictor').model

# re-instantiate Learner and continue training
learner = ktrain.get_learner(model, train_data=trn, val_data=val)
learner.fit_onecycle(2e-5, 1)
```

