# Sarcasm Detection in News Headlines with Neural Nets

### Loading the data

In the following, we will use the **News Headlines Dataset For Sarcasm Detection** by *Rishabh Misra* to build a classifier that can distinguish sarcastic news headlines from serious ones.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import itertools

# Label encoding: 0 - no sarcasm, 1 - sarcasm
# The CSV is read straight from GitHub, so this cell needs network access.
data = pd.read_csv("https://raw.githubusercontent.com/michabirklbauer/hgb_dse_text_mining/master/data/Sarcasm/sarcasm.csv")
# Last expression of the cell: displays the first rows as the cell output.
data.head()

### A glimpse at class distributions and baseline

In [None]:
# Majority-class baseline: the accuracy of a classifier that always predicts
# the most frequent label. Using the largest count (instead of looking up
# label 0) keeps the number correct whichever class happens to be the majority.
label_counts = data["Label"].value_counts()
print("Baseline: ", label_counts.max() / data.shape[0])
# Trailing semicolon suppresses the textual repr of the Axes object in the notebook.
label_counts.plot(kind = "bar");

### Splitting data into a training, validation and test partition to be able to evaluate our model

In [None]:
# Two-stage split with a fixed seed for reproducibility:
#   1) hold out 20 % of all rows as the test partition,
#   2) split the remaining 80 % into 70 % training / 30 % validation
#      (i.e. roughly 56 % / 24 % / 20 % of the full data set).
not_test, test = train_test_split(
    data,
    test_size = 0.2,
    random_state = 1337,
)
train, val = train_test_split(
    not_test,
    test_size = 0.3,
    random_state = 1337,
)

# One shape per line: training, validation, test.
print(train.shape, val.shape, test.shape, sep = "\n")

### We will use Keras to build our neural net

In [None]:
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Model parameters

- We will use a vocabulary size of 10 000
- We will create word vectors of length 16
- Our texts will be at most 100 words long
  - If they are shorter, we will zero-pad them at the end
  - If they are longer, we will truncate them to 100 words by cutting off the end
- Our Out-Of-Vocabulary token will be \<OOV>

In [None]:
# Hyper-parameters shared by the tokenizer, the padding step and the model.
vocab_size = 10000       # passed to Tokenizer(num_words=...) and used as the Embedding input dimension
embedding_dim = 16       # length of each learned word vector
max_length = 100         # every headline is padded/truncated to this many tokens
trunc_type = "post"      # headlines longer than max_length lose tokens at the end
padding_type = "post"    # headlines shorter than max_length get zeros appended at the end
oov_token = "<OOV>"      # placeholder token for words outside the vocabulary

### Fit the tokenizer to the texts and generate equal length sequences

In [None]:
# The vocabulary is learned from the training headlines only; validation and
# test headlines are merely encoded with it, so no information leaks from them.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_token)
tokenizer.fit_on_texts(train["Headline"])

word_index = tokenizer.word_index

# Step 1: turn every headline into a list of integer word indices.
training_sequences = tokenizer.texts_to_sequences(train["Headline"])
validation_sequences = tokenizer.texts_to_sequences(val["Headline"])
testing_sequences = tokenizer.texts_to_sequences(test["Headline"])

# Step 2: bring all sequences to the same length with identical settings.
pad_options = {"maxlen": max_length, "padding": padding_type, "truncating": trunc_type}
training_padded = pad_sequences(training_sequences, **pad_options)
validation_padded = pad_sequences(validation_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [None]:
# Show the first training rows so they can be compared with the encoded sequences below.
train.head()

The larger a word's index, the less frequently that word occurs in the training texts.

In [None]:
# First five headlines as variable-length lists of word indices (before padding).
training_sequences[:5]

In [None]:
# The same five headlines after padding/truncating to max_length.
training_padded[:5]

In [None]:
# Convert inputs and labels of each partition into NumPy arrays for model.fit / model.predict.
training_padded, training_labels = np.array(training_padded), np.array(train["Label"])
validation_padded, validation_labels = np.array(validation_padded), np.array(val["Label"])
testing_padded, testing_labels = np.array(testing_padded), np.array(test["Label"])

### Building our model

In [None]:
# Architecture: word embeddings -> average over all token positions ->
# small hidden layer -> single sigmoid unit for the sarcasm label.
# NOTE(review): newer Keras releases appear to deprecate `input_length` on
# Embedding - confirm against the installed version before removing it.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length = max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation = "relu"),
    Dense(1, activation = "sigmoid"),
])

# callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
callbacks = [
    # Stop training once the validation loss has not improved for 5 epochs.
    EarlyStopping(monitor = "val_loss", patience = 5, verbose = 0, mode = "auto"),
    # Keep only the weights with the lowest validation loss on disk.
    ModelCheckpoint("sarcasm_check_.h5", save_best_only = True, monitor = "val_loss", mode = "min"),
]

model.compile(
    loss = "binary_crossentropy",
    optimizer = "adam",
    metrics = ["accuracy"],
)

In [None]:
# Print the layer-by-layer overview of the model defined above.
model.summary()

### Training the model on our data

Compare the training with and without callbacks! (Uncomment the line in the next cell to train without them.)

In [None]:
# callbacks = []

In [None]:
# Train for at most 30 epochs; the validation partition is evaluated after
# every epoch and feeds the callbacks (if any are configured).
history = model.fit(
    training_padded,
    training_labels,
    epochs = 30,
    validation_data = (validation_padded, validation_labels),
    callbacks = callbacks,
    verbose = 1,
)

### Plot training history

In [None]:
def plot_graphs(history, metric):
    """Plot the training and validation curve of one metric over the epochs.

    history: object whose ``history`` attribute maps metric names to
             per-epoch values (as returned by ``model.fit``).
    metric:  name of the training metric; the validation curve is read
             from the key ``"val_" + metric``.
    """
    val_metric = "val_" + metric
    for curve in (metric, val_metric):
        plt.plot(history.history[curve])
    plt.title("Model " +  metric)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, val_metric])
    plt.show()

# One figure per tracked quantity.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

### Load best-on-validation model

In [None]:
from tensorflow.keras.models import load_model
# Replace the in-memory model with the checkpoint written by ModelCheckpoint
# (best validation loss). The file only exists if training ran with that
# callback enabled.
model = load_model("sarcasm_check_.h5")

### Get predictions

In [None]:
# Two hand-picked headlines to sanity-check the model.
sentences = [
    "single woman getting all dolled up to watch room full of people make out this new year's eve",
    "there were some things to cheer in donald trump's wild press conference",
]
# New texts must go through the same tokenizer and padding as the training data.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(
    sequences,
    maxlen = max_length,
    padding = padding_type,
    truncating = trunc_type,
)
# One sigmoid output per headline (the model's estimate for label 1).
print(model.predict(padded))

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 threshold.
predictions_train = (model.predict(training_padded) > 0.5).astype(int)
predictions_val = (model.predict(validation_padded) > 0.5).astype(int)
predictions_test = (model.predict(testing_padded) > 0.5).astype(int)

### Evaluate model

In [None]:
# Accuracy of the thresholded predictions on each partition.
train_accuracy = accuracy_score(train["Label"], predictions_train)
val_accuracy = accuracy_score(val["Label"], predictions_val)
test_accuracy = accuracy_score(test["Label"], predictions_test)

print("Training Accuracy: ", train_accuracy)
print("Validation Accuracy: ", val_accuracy)
print("Testing Accuracy: ", test_accuracy)

In [None]:
def plot_confusion_matrix(cm, classes, normalize = False, title = "Confusion matrix", cmap = plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array of counts, rows = true labels, columns = predicted labels.
        classes: tick labels, one per class, in the same order as the rows of cm.
        normalize: if True, every row is divided by its sum so each cell shows
            the fraction of that true class; rows without samples stay 0.
        title: title of the plot.
        cmap: matplotlib colormap used for the heat map.

    Returns:
        The matrix that was drawn (the row-normalized copy when normalize is
        True, otherwise the input as an array).
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing so that colours, colour bar and cell labels
    # all describe the same numbers.
    if normalize:
        row_totals = cm.sum(axis = 1, keepdims = True)
        # Skip the division for rows without samples instead of producing NaN.
        cm = np.divide(cm.astype("float"), row_totals,
                       out = np.zeros(cm.shape, dtype = "float"),
                       where = row_totals != 0)
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation = "nearest", cmap = cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation = 45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shortened to two decimals; raw counts are shown as they are.
        cell_label = format(cm[i, j], ".2f") if normalize else cm[i, j]
        plt.text(j, i, cell_label,
                 horizontalalignment = "center",
                 color="white" if cm[i, j] > thresh else "black")

    # Explicit y-limits keep the first and last row fully visible and put row 0 on top.
    plt.ylim(len(cm) - 0.5, -0.5)
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    # Called after the labels exist so the layout accounts for them.
    plt.tight_layout()

    return cm

In [None]:
# Confusion matrix of the thresholded predictions on the training partition.
conf = confusion_matrix(y_true = train["Label"], y_pred = predictions_train)

plt.figure()
plot = plot_confusion_matrix(
    conf,
    classes = [0, 1],
    title = "Confusion Matrix - Training Split",
)
plt.show()

In [None]:
# Confusion matrix of the thresholded predictions on the validation partition.
conf = confusion_matrix(y_true = val["Label"], y_pred = predictions_val)

plt.figure()
plot = plot_confusion_matrix(
    conf,
    classes = [0, 1],
    title = "Confusion Matrix - Validation Split",
)
plt.show()

In [None]:
# Confusion matrix of the thresholded predictions on the held-out test partition.
conf = confusion_matrix(y_true = test["Label"], y_pred = predictions_test)

plt.figure()
plot = plot_confusion_matrix(
    conf,
    classes = [0, 1],
    title = "Confusion Matrix - Testing Split",
)
plt.show()

### By generating a reverse index we can also decode our sequences again...

In [None]:
# Invert the tokenizer's word -> index mapping so indices can be turned back into words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_sentence(text):
    """Translate a sequence of word indices back into a space-separated string.

    Indices without an entry in ``reverse_word_index`` (such as the padding
    value 0) are rendered as "?".
    """
    words = (reverse_word_index.get(token_id, "?") for token_id in text)
    return " ".join(words)

print(decode_sentence(training_padded[0]))

### ...combining that with the weights from our embedding layer we can extract the word vectors that we trained

In [None]:
# The first layer of the model is the Embedding layer; its first (and only
# relevant) weight array holds one row per word index.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
import io

# Export the learned word vectors in the two-file TSV layout expected by the
# TensorFlow Embedding Projector: vecs.tsv holds one tab-separated vector per
# line, meta.tsv the matching word on the same line number.
#
# Index 0 is the padding value and has no word, so the export starts at 1.
# The upper bound is capped so that every exported index has both a word and
# an embedding row, even when the fitted vocabulary is smaller than vocab_size.
export_limit = min(vocab_size, weights.shape[0], len(reverse_word_index) + 1)

# The with-statement closes both files even if writing fails half-way.
with io.open("vecs.tsv", "w", encoding = "utf-8") as out_v, \
     io.open("meta.tsv", "w", encoding = "utf-8") as out_m:
    for word_num in range(1, export_limit):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_v.write("\t".join(str(x) for x in embeddings) + "\n")
        out_m.write(word + "\n")

In [None]:
# Set colab to True and run this cell if you are working in Google Colab to
# download the generated files to your local machine.
colab = False
if colab:
    try:
        from google.colab import files
    except ImportError:
        # Best effort: outside of Colab there is nothing to download, but say
        # so instead of silently doing nothing after the user opted in.
        print("google.colab is not available - skipping download of vecs.tsv and meta.tsv.")
    else:
        files.download("vecs.tsv")
        files.download("meta.tsv")

### We can load these files in [http://projector.tensorflow.org/](http://projector.tensorflow.org/) to visualize our embeddings

When we sphereize this data, we get two clusters because we trained on a binary classification task.

And clicking a word will show you the closest words in the vector space.

# Sentiment Analysis with Transformer Models

### Loading the data

For this task we will use a sample of the **Amazon Fine Food Reviews Dataset** by *Stanford Network Analysis Project*.

In [None]:
# NOTE: this rebinds `data`, replacing the sarcasm data set loaded earlier.
# The CSV is read straight from GitHub, so this cell needs network access.
data = pd.read_csv("https://raw.githubusercontent.com/michabirklbauer/hgb_dse_text_mining/master/data/AmazonReviews/Reviews_sample.csv")
# Last expression of the cell: displays the first rows as the cell output.
data.head()

In [None]:
# Use the first review of the sample as a running example.
review_texts = data["Text"]
example = review_texts.iloc[0]
print(example)

### We will use TensorFlow models from the *transformers* package

In [None]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax

### Loading RoBERTa

[https://ai.facebook.com/blog/roberta-an-optimized-method-for-pretraining-self-supervised-nlp-systems/](https://ai.facebook.com/blog/roberta-an-optimized-method-for-pretraining-self-supervised-nlp-systems/)

[https://arxiv.org/abs/1907.11692](https://arxiv.org/abs/1907.11692)

In [None]:
# Pre-trained sentiment checkpoint; it is downloaded from the Hugging Face Hub
# on first use, so this cell needs network access.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
# NOTE: `tokenizer` and `model` rebind the names used for the Keras tokenizer
# and the sarcasm model above - re-run those cells before reusing the
# sarcasm classifier.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

### Getting RoBERTa predictions

In [None]:
def polarity_scores_roberta(example, max_length = 512):
    """Score a single text with the loaded RoBERTa sentiment model.

    Args:
        example: the text to classify.
        max_length: maximum number of tokens (including special tokens) fed to
            the model; longer texts are truncated. The default of 512 is
            presumably the limit of this RoBERTa checkpoint - verify against
            the model configuration when switching models.

    Returns:
        dict with the keys "roberta_neg", "roberta_neu" and "roberta_pos"
        holding the softmax-normalized scores of output classes 0, 1 and 2.
    """
    # Without truncation a long review can exceed the number of positions the
    # model supports and make the forward pass fail.
    encoded_text = tokenizer(example, return_tensors = "tf", truncation = True, max_length = max_length)
    output = model(**encoded_text)
    # output[0] holds the logits for the batch; [0] selects the single example.
    scores = output[0][0].numpy()
    scores = softmax(scores)
    scores_dict = {
        "roberta_neg" : scores[0],
        "roberta_neu" : scores[1],
        "roberta_pos" : scores[2]
    }
    return scores_dict

print(example)
print(polarity_scores_roberta(example))

### Sentiment Analysis in 3 lines of code

In [None]:
from transformers import pipeline

# No model is named, so transformers falls back to its default checkpoint for
# the "sentiment-analysis" task.
# NOTE(review): pass model=... explicitly if results must be reproducible
# across library versions.
sent_pipeline = pipeline("sentiment-analysis")
# Last expression of the cell: displays the predicted label and score.
sent_pipeline(example)