In [6]:
import tensorflow as tf
from tensorflow import keras
from keras import layers
import pandas as pd

import re
import string

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!pip install transformers
import transformers



In [7]:
import os
# Set CUDA_VISIBLE_DEVICES to "0" for this process.
# NOTE(review): tensorflow is imported in an earlier cell; presumably the variable is
# still honoured because it is set before any GPU work happens — TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [8]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each of the two sub-layers is followed by dropout, added back onto its
    input, and passed through layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input; also used as the
            per-head ``key_dim`` of the attention layer and as the output
            width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the dropout layers; ``None`` lets Keras
                decide from the calling context.

        Returns:
            The output of the second layer normalisation.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [9]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Output width of both embeddings.
        **kwargs: Forwarded to ``layers.Layer`` (e.g. ``name``, ``dtype``).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position index.

        Positions are ``0 .. length-1`` where ``length`` is the size of the
        last axis of ``x``.
        """
        # Named `length` so it does not shadow the constructor's `maxlen`.
        length = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created from its config."""
        config = super().get_config()
        config.update(
            {
                # int() so NumPy integer scalars become plain Python ints.
                "maxlen": int(self.maxlen),
                "vocab_size": int(self.vocab_size),
                "embed_dim": int(self.embed_dim),
            }
        )
        return config

In [10]:
from google.colab import drive
# Mount Google Drive at /content/gdrive (no forced remount).
drive.mount('/content/gdrive', force_remount=False)
# Despite the name, `data_dir` is the path of the dataset FILE on the mounted
# Drive; it is passed straight to pd.read_json in a later cell.
data_dir = "/content/gdrive/MyDrive/5001/dataset/Sarcasm_Headlines_Dataset_v2.json"

Mounted at /content/gdrive


In [13]:
# Read the JSON-lines file and keep every column except the article link.
data = pd.read_json(data_dir, lines=True).drop(columns=['article_link'])
data.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [14]:
#Use regex to clean the data
def remove_url(text):
    """Return ``text`` with ``http(s)://...`` and ``www....`` tokens deleted."""
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translate() table that maps a code point to None deletes that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)
def remove_html(text):
    """Return ``text`` with every ``<...>`` span (shortest match) deleted."""
    return re.sub(r'<.*?>', r'', text)

# Character class of emoji / pictographic symbols to strip.
# The previous version used the single range U+24C2..U+1F251, which covers
# almost the whole Basic Multilingual Plane above U+24C2 (CJK ideographs,
# kana, Hangul, ...) and therefore deleted ordinary non-Latin text. The
# ranges below are limited to symbol blocks.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only the code points listed in ``_EMOJI_PATTERN`` are deleted; letters of
    any script are left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', text)
# Contractions that are complete words. Matched on word boundaries and
# replaced without extra padding, so "i won't" becomes "i will not" rather
# than "i  will not". Longer forms come first so they are not pre-empted by
# their prefixes.
_WHOLE_WORD_CONTRACTIONS = [
    (re.compile(r"\b" + contraction + r"\b"), expansion)
    for contraction, expansion in (
        ("won't've", "will not have"),
        ("can't've", "can not have"),
        ("sha'n't", "shall not"),
        ("won't", "will not"),
        ("can't", "can not"),
        ("don't", "do not"),
        ("ma'am", "madam"),
        ("let's", "let us"),
        ("ain't", "am not"),
        ("shan't", "shall not"),
        ("o'clock", "of the clock"),
        ("y'all", "you all"),
    )
]

# Contraction suffixes attached to the preceding word; the replacement's
# leading space separates it from that word. Compound suffixes come first.
_SUFFIX_CONTRACTIONS = [
    (re.compile(suffix), expansion)
    for suffix, expansion in (
        ("n't've", " not have"),
        ("'d've", " would have"),
        ("'ll've", " will have"),
        ("n't", " not"),
        ("'re", " are"),
        ("'s", " is"),
        ("'d", " would"),
        ("'ll", " will"),
        ("'t", " not"),
        ("'ve", " have"),
        ("'m", " am"),
    )
]


def decontraction(text):
    """Expand English contractions written with a straight apostrophe.

    Matching is case-sensitive. Must be applied while apostrophes are still
    present in ``text`` (i.e. before any punctuation stripping).

    NOTE(review): the ``'s`` rule cannot tell a contraction from a possessive,
    so "trump's plan" becomes "trump is plan".

    Args:
        text: Input string.

    Returns:
        ``text`` with the known contractions expanded.
    """
    for pattern, expansion in _WHOLE_WORD_CONTRACTIONS:
        text = pattern.sub(expansion, text)
    for pattern, expansion in _SUFFIX_CONTRACTIONS:
        text = pattern.sub(expansion, text)
    return text

def seperate_alphanumeric(text):
    """Split ``text`` into runs of letters and runs of digits, joined by single spaces.

    Everything that is neither a letter nor a digit (whitespace, punctuation,
    underscores) acts as a separator and is dropped.
    """
    tokens = (match.group(0) for match in re.finditer(r"[^\W\d_]+|\d+", text))
    return " ".join(tokens)

def cont_rep_char(text):
    """Replacement callback for ``re.sub``: keep at most the first two characters of a match.

    Args:
        text: A ``re.Match`` object (the name is kept for compatibility).

    Returns:
        The first two characters of the matched text. A shorter match is
        returned unchanged; previously it yielded ``None``, which ``re.sub``
        treats as an empty replacement and so deleted the character.
    """
    return text.group(0)[:2]

def unique_char(rep, text):
    """Rewrite every run of a repeated letter in ``text`` using ``rep``.

    Only letters are considered: the previous ``\\w``-based pattern also
    matched digits, so a number such as "2000" was rewritten to "200".

    Args:
        rep: Replacement string or callable, passed to ``re.sub``.
        text: Input string.

    Returns:
        ``text`` with each run of two or more identical letters replaced.
    """
    return re.sub(r'([^\W\d_])\1+', rep, text)

# Clean the headlines step by step.
# decontraction must run BEFORE remove_punct: remove_punct strips the
# apostrophes that the contraction patterns rely on, so with the previous
# order decontraction never matched anything.
# NOTE(review): remove_html is defined but not part of this pipeline; add it
# before remove_punct if headlines can contain markup.
cleaning_steps = [
    remove_url,
    remove_emoji,
    decontraction,
    remove_punct,
    seperate_alphanumeric,
    lambda text: unique_char(cont_rep_char, text),
]
for clean_step in cleaning_steps:
    data['headline'] = data['headline'].apply(clean_step)

In [15]:
seq_len = 128  # max_length every headline is padded/truncated to in the tokenizer call
batch_size = 64  # mini-batch size passed to model.fit
num_samples = len(data)  # number of rows; used to compute the train/val/test boundaries
model_name = 'cardiffnlp/twitter-roberta-base-sentiment'  # checkpoint name given to AutoTokenizer.from_pretrained

In [16]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

# Encode every headline as a row of exactly `seq_len` token ids.
data_tokens = tokenizer(
    data['headline'].tolist(),
    max_length=seq_len,
    truncation=True,
    padding='max_length',
    add_special_tokens=True,
    return_tensors='np',
)
x_data = data_tokens['input_ids']
y_data = data['is_sarcastic'].values

# Sequential split in row order (no shuffling): first 70% train,
# next 20% validation, last 10% test.
train_end = int(num_samples * 0.7)
val_end = int(num_samples * 0.9)

x_train, x_val, x_test = x_data[:train_end], x_data[train_end:val_end], x_data[val_end:]
y_train, y_val, y_test = y_data[:train_end], y_data[train_end:val_end], y_data[val_end:]

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [32]:
# Display the token-id matrix (the tokenizer's 'input_ids').
x_data

array([[    0,   212, 32430, ...,     1,     1,     1],
       [    0, 12789,  2851, ...,     1,     1,     1],
       [    0, 21046,   110, ...,     1,     1,     1],
       ...,
       [    0,   627,   144, ...,     1,     1,     1],
       [    0,   119,  2726, ...,     1,     1,     1],
       [    0, 27398, 17691, ...,     1,     1,     1]])

In [33]:
# Display the label vector taken from the 'is_sarcastic' column.
y_data

array([1, 0, 0, ..., 0, 1, 1])

In [20]:
# Size the token-embedding table to cover every token id present in the data.
# int() turns the NumPy scalar into a plain Python int.
vocab_size = int(x_data.max()) + 1
# Model input length; tied to the tokenizer padding length so the two cannot drift apart.
maxlen = seq_len
print(len(x_train), "Training sequences")
print(len(x_val), "Validation sequences")

20033 Training sequences
5724 Validation sequences


In [49]:
# Display the training slice of the token-id matrix.
x_train

array([[    0,   212, 32430, ...,     1,     1,     1],
       [    0, 12789,  2851, ...,     1,     1,     1],
       [    0, 21046,   110, ...,     1,     1,     1],
       ...,
       [    0,   397,  2057, ...,     1,     1,     1],
       [    0,   627,   290, ...,     1,     1,     1],
       [    0,  4862,  4473, ...,     1,     1,     1]])

In [50]:
# Display the training slice of the label vector.
y_train

array([1, 0, 0, ..., 1, 0, 1])

In [26]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Token ids -> token+position embeddings -> one transformer block
# -> max-pool over the sequence axis -> small dense head -> 2-way softmax.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
embedded = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
encoded = transformer_block(embedded)
pooled = layers.GlobalMaxPooling1D()(encoded)
hidden = layers.Dense(20, activation="relu")(pooled)
outputs = layers.Dense(2, activation="softmax")(hidden)

model = keras.Model(inputs=inputs, outputs=outputs)

In [27]:
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
# Train for three epochs, validating on the held-out validation slice.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=3,
    validation_data=(x_val, y_val),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
# Evaluate the trained model on the test slice.
# NOTE(review): uses a literal batch_size=32 here while training used the
# `batch_size` variable (64) — confirm this is intentional.
results = model.evaluate(x_test, y_test, batch_size=32)



In [20]:
# Display the evaluation output; the recorded run shows two values,
# presumably [loss, accuracy] in the order given to model.compile — TODO confirm.
results

[0.8830837607383728, 0.8256464004516602]