In [1]:
!pip install -q kaggle

In [2]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"oluwemimosamuel","key":"09f71a8e29c1e1e668423bf9bd4dc707"}'}

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d danofer/sarcasm

Dataset URL: https://www.kaggle.com/datasets/danofer/sarcasm
License(s): copyright-authors
Downloading sarcasm.zip to /content
 99% 214M/216M [00:02<00:00, 111MB/s] 
100% 216M/216M [00:02<00:00, 91.4MB/s]


In [6]:
import zipfile

In [7]:
zipref = zipfile.ZipFile('/content/sarcasm.zip')
zipref.extractall('/content')
zipref.close()

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [9]:
df = pd.read_csv('/content/train-balanced-sarcasm.csv')
df.head()

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [10]:
df.shape

(1010826, 10)

In [11]:
df = df[['comment', 'label']]
df = df[:20000]
df.head()

Unnamed: 0,comment,label
0,NC and NH.,0
1,You do know west teams play against west teams...,0
2,"They were underdogs earlier today, but since G...",0
3,"This meme isn't funny none of the ""new york ni...",0
4,I could use one of those tools.,0


In [12]:
df.dropna(inplace=True)
df.isnull().sum()

Unnamed: 0,0
comment,0
label,0


In [13]:
df['comment'] = df['comment'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

In [14]:
df['comment'] = df['comment'].apply(lambda x: x.lower())

In [15]:
df.head()

Unnamed: 0,comment,label
0,nc and nh,0
1,you do know west teams play against west teams...,0
2,they were underdogs earlier today but since gr...,0
3,this meme isnt funny none of the new york nigg...,0
4,i could use one of those tools,0


# Tokenization

In [16]:
from transformers import BertTokenizer, TFBertModel

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [18]:
def tokenizeText(text, max_length=100):
  return tokenizer(
      text.tolist(),
      truncation=True,
      padding='max_length',
      max_length=max_length,
      return_tensors='np'
  )

tokenizedData = tokenizeText(df['comment'])

In [19]:
tokenizedData

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
       [  101,  2017,  2079, ...,     0,     0,     0],
       [  101,  2027,  2020, ...,     0,     0,     0],
       ...,
       [  101,  1996,  2208, ...,     0,     0,     0],
       [  101,  2227,  3475, ...,     0,     0,     0],
       [  101,   102,     0, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])}

In [20]:
X = tokenizedData['input_ids']
y = df['label']

In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
X_train.shape

(15999, 100)

# Model

In [30]:
class hierarchicalBert(tf.keras.Model):
    def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
        super(hierarchicalBert, self).__init__()

        #bert layer
        self.bert = bert_model

        #sentence encoding layer
        self.dense_sentence = tf.keras.layers.Dense(768, activation='relu')

        #context summarization layer
        self.mean_pooling = tf.keras.layers.GlobalAveragePooling1D()

        #context encoder layer
        self.bilstm_encoder = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(lstm_units, return_sequences=True))

        #cnn
        self.conv1d = tf.keras.layers.Conv1D(cnn_filters, 2, activation='relu')
        self.pool = tf.keras.layers.GlobalMaxPool1D()

        #feed-forward
        self.dense1 = tf.keras.layers.Dense(dense_units, activation='relu')

        #output
        self.output_layer = tf.keras.layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        bert_output = self.bert(inputs)[0]

        #encoded sentence
        sentence_encoded = self.dense_sentence(bert_output)

        #encoded context
        context_encoded = self.mean_pooling(sentence_encoded)

        # expanding dims
        context_dims = tf.expand_dims(context_encoded, 1)

        #context summarization
        context_summarized = self.bilstm_encoder(context_dims)

        #squeesing dims
        context_squeezed = tf.squeeze(context_summarized, axis=1)

        #adding channels
        context_channels = tf.expand_dims(context_squeezed, axis=-1)

        #cnn
        cnn_output = self.conv1d(context_channels)
        cnn_output = self.pool(cnn_output)

        #feed-forward
        dense_output = self.dense1(cnn_output)

        #output
        output = self.output_layer(dense_output)

        return output

In [24]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [31]:
model = hierarchicalBert(bert_model, lstm_units=128, cnn_filters=64, dense_units=32)

In [32]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
model.evaluate(X_test, y_test)



[0.6641179323196411, 0.6205000281333923]