In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build every file path found below the Kaggle input directory,
# then print them one per line in walk order.
kaggle_input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in kaggle_input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sarcasm/train-balanced-sarc.csv.gz
/kaggle/input/sarcasm/train-balanced-sarcasm.csv
/kaggle/input/sarcasm/test-balanced.csv
/kaggle/input/sarcasm/test-unbalanced.csv


## IMPORT ALL THE NECESSARY LIBRARIES AND DATASETS

In [65]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import warnings

from tensorflow import keras
from transformers import BertTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")


In [42]:
# Read the balanced sarcasm training set from the Kaggle input directory
train = pd.read_csv(
    "/kaggle/input/sarcasm/train-balanced-sarcasm.csv"
)

# Preview the first five rows
train.head(n = 5)

Unnamed: 0,label,comment,author,subreddit,score,ups,downs,date,created_utc,parent_comment
0,0,NC and NH.,Trumpbart,politics,2,-1,-1,2016-10,2016-10-16 23:55:23,"Yeah, I get that argument. At this point, I'd ..."
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4,-1,-1,2016-11,2016-11-01 00:24:10,The blazers and Mavericks (The wests 5 and 6 s...
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3,3,0,2016-09,2016-09-22 21:45:37,They're favored to win.
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8,-1,-1,2016-10,2016-10-18 21:03:47,deadass don't kill my buzz
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6,-1,-1,2016-12,2016-12-30 17:00:13,Yep can confirm I saw the tool they use for th...


In [43]:
# (number of rows, number of columns) of the full training frame
train.shape

(1010826, 10)

In [44]:
# Count the missing values in each column of the full training frame
train.isna().sum()

label              0
comment           55
author             0
subreddit          0
score              0
ups                0
downs              0
date               0
created_utc        0
parent_comment     0
dtype: int64

## DATA PROCESSING

In [45]:
# Let's take only the first 10k rows to keep the operations fast

# Keep just the target and the text column. The explicit .copy() makes `df`
# an independent frame, so the cleaning steps that modify it later never
# write into a slice of `train` (no SettingWithCopyWarning, `train` untouched).
df = train.iloc[:10000][['label', 'comment']].copy()
df.head()

Unnamed: 0,label,comment
0,0,NC and NH.
1,0,You do know west teams play against west teams...
2,0,"They were underdogs earlier today, but since G..."
3,0,"This meme isn't funny none of the ""new york ni..."
4,0,I could use one of those tools.


In [46]:
# (number of rows, number of columns) of the working subset
df.shape

(10000, 2)

In [47]:
# Count the missing values per column in the working subset
df.isna().sum()

label      0
comment    1
dtype: int64

In [48]:
# Drop rows with missing values by rebinding the name instead of mutating in
# place: the result is the same, but no frame derived from `train` is modified.
df = df.dropna()

In [49]:
# Re-check after dropping: every column should now report zero missing values
df.isna().sum()

label      0
comment    0
dtype: int64

In [50]:
# Clean the comment text: remove every character that is not an ASCII letter
# or whitespace (digits, punctuation, symbols, emojis, ...).
#
# The character class must be `a-zA-Z`. The previous `A-z` range spans ASCII
# 65-122 and therefore also kept the characters [ \ ] ^ _ and ` in the text.
#
# `assign` returns a new frame instead of writing a column into the existing
# one, so the cell does not trigger SettingWithCopyWarning.
df = df.assign(
    comment = df['comment'].str.replace(r'[^a-zA-Z\s]', '', regex = True)
)
df['comment']

0                                               NC and NH
1       You do know west teams play against west teams...
2       They were underdogs earlier today but since Gr...
3       This meme isnt funny none of the new york nigg...
4                          I could use one of those tools
                              ...                        
9995                          probably a young latino boy
9996                                  Dog filtergiving up
9997                          Saturday Night dead amirite
9998                         Moderators not fact checkers
9999                          She hacked the online votes
Name: comment, Length: 9999, dtype: object

In [52]:
# Tokenization: load the WordPiece tokenizer of the uncased BERT base checkpoint

tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [54]:
# Tokenization Function

def tokenize_data(text, max_length = 100):
    """Tokenize a batch of texts with the module-level `tokenizer`.

    Args:
        text: The texts to encode. May be an object exposing `.tolist()`
            (e.g. a pandas Series or NumPy array), any other iterable of
            strings, or a single string (treated as a batch of one).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer output with NumPy arrays (`return_tensors='np'`),
        each padded/truncated to `max_length` tokens per text.
    """
    if isinstance(text, str):
        # A bare string would otherwise be split into single characters below
        sentences = [text]
    elif hasattr(text, 'tolist'):
        sentences = text.tolist()
    else:
        sentences = list(text)

    return tokenizer(
        sentences,
        max_length = max_length,
        truncation = True,
        padding = 'max_length',
        return_tensors = 'np'
    )

# Encode every cleaned comment (sequences padded/truncated to 100 tokens)
tokenized_data = tokenize_data(text = df['comment'], max_length = 100)

In [57]:
tokenized_data

# The zeros in input_ids are padding, added to reach the max length
# attention mask: 1 - real (unpadded) token, 0 - padding

{'input_ids': array([[  101, 13316,  1998, ...,     0,     0,     0],
       [  101,  2017,  2079, ...,     0,     0,     0],
       [  101,  2027,  2020, ...,     0,     0,     0],
       ...,
       [  101,  5095,  2305, ...,     0,     0,     0],
       [  101, 29420,  2015, ...,     0,     0,     0],
       [  101,  2016, 28719, ...,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])}

## TRAIN TEST SPLIT

In [60]:
# Features: the padded token-id matrix; targets: the sarcasm labels
X, y = tokenized_data['input_ids'], df['label']

# Both must agree on the number of samples
(X.shape, y.shape)

((9999, 100), (9999,))

In [78]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42,
)

# Shapes in the order: X_train, y_train, X_test, y_test
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((7999, 100), (7999,), (2000, 100), (2000,))

In [79]:
# Report the training shapes, one per line
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    sep = "\n",
)

X_train shape: (7999, 100)
y_train shape: (7999,)


## BUILD THE MODEL ACCORDING TO THE PROPOSED ARCHITECTURE
ARCHITECTURE PAPER: https://aclanthology.org/2020.figlang-1.14.pdf

- A mixture of Encoder, CNN, LSTM, max-pooling and Dense layers
- This is the architecture that you can find in the paper

In [93]:
class HierarchicalBert(tf.keras.Model):
    """Binary classifier stacking Dense, pooling, BiLSTM and CNN layers on BERT.

    Pipeline implemented by `call`:
    BERT token embeddings -> Dense(768, relu) -> mean pooling over tokens ->
    bidirectional LSTM -> Conv1D + global max pooling -> Dense(relu) ->
    Dense(1, sigmoid).

    Padding handling: an attention mask is passed to BERT and to the mean
    pooling layer so that padded positions neither receive attention nor
    dilute the sentence average. When the caller supplies only token ids,
    the mask is derived from the padding token id.

    Args:
        bert_model: Model called on the token ids; element ``[0]`` of its
            output is used as the per-token embedding sequence.
        lstm_units: Units of each direction of the bidirectional LSTM.
        cnn_filters: Number of filters of the Conv1D layer (kernel size 2).
        dense_units: Units of the fully connected layer before the output.
    """

    def __init__(self, bert_model, lstm_units, cnn_filters, dense_units):
        super().__init__()
        self.bert = bert_model

        # Token id that marks padding. Read from the model config when it is
        # available; falls back to 0, which is the id of the zero padding
        # produced by the tokenizer in this notebook.
        bert_config = getattr(bert_model, "config", None)
        pad_token_id = getattr(bert_config, "pad_token_id", None)
        self.pad_token_id = 0 if pad_token_id is None else pad_token_id

        # Sentence encoding layer - Dense
        self.dense_sentence = keras.layers.Dense(768, activation='relu')

        # Context summarization layer - Pooling
        self.mean_pooling = keras.layers.GlobalAveragePooling1D()

        # Context encoder layer - Bidirectional LSTM
        self.bilstm_encoder = keras.layers.Bidirectional(keras.layers.LSTM(lstm_units, return_sequences = True))

        # CNN layer to capture the local features
        self.conv = keras.layers.Conv1D(cnn_filters, 2, activation = 'relu')
        self.pool = keras.layers.GlobalMaxPooling1D()

        # Fully connected layer
        self.dense_df = keras.layers.Dense(dense_units, activation = 'relu')

        # Output layer
        self.output_layer = keras.layers.Dense(1, activation = 'sigmoid')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Either a tensor of token ids, or a dict containing
                ``'input_ids'`` and optionally ``'attention_mask'`` (plus any
                other inputs the BERT model accepts).

        Returns:
            The sigmoid output of the final Dense(1) layer, one value per sample.
        """
        # Normalise the input into the dict form accepted by the BERT model
        if isinstance(inputs, dict):
            bert_inputs = dict(inputs)
        else:
            bert_inputs = {"input_ids": inputs}

        # Without an explicit mask BERT attends to every position, padding
        # included, so derive one from the padding token id when it is missing.
        if bert_inputs.get("attention_mask") is None:
            bert_inputs["attention_mask"] = tf.cast(
                tf.not_equal(bert_inputs["input_ids"], self.pad_token_id), tf.int32
            )
        token_mask = tf.cast(bert_inputs["attention_mask"], tf.bool)

        # Bert embeddings (per-token hidden states)
        bert_output = self.bert(bert_inputs)[0]

        # Sentence encoding layer
        sentence_encoded = self.dense_sentence(bert_output)

        # Context summarization: average over the real tokens only
        context_summarized = self.mean_pooling(sentence_encoded, mask = token_mask)

        # Expand the dimensions: the pooled vector becomes a sequence of length 1
        context_summarized = tf.expand_dims(context_summarized, 1)

        # Context encoder layer
        # NOTE(review): the LSTM only ever sees a single time step here, so it
        # cannot model any ordering - confirm this matches the paper's design.
        context_encoded = self.bilstm_encoder(context_summarized)

        # Squeezing out the length-1 time dimension
        context_encoded_squeezed = tf.squeeze(context_encoded, axis=1)

        # Adding the channel dimension to match the required input shape by the CNN layer
        # NOTE(review): the convolution therefore slides over the LSTM feature
        # axis with a single channel rather than over tokens - confirm intent.
        context_encoded_expanded = tf.expand_dims(context_encoded_squeezed, -1)

        # CNN layer
        conv_output = self.conv(context_encoded_expanded)
        pooled_output = self.pool(conv_output)

        # Fully connected layer
        dense_output = self.dense_df(pooled_output)

        # Final Output layer
        final_output = self.output_layer(dense_output)

        return final_output

In [66]:
# Loading the pre-trained BERT model (uncased base checkpoint)

bert_model = TFBertModel.from_pretrained(
    'bert-base-uncased'
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

2025-12-04 08:03:30.993907: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch mode

In [94]:
# Instantiate the hierarchical BERT classifier on top of the loaded BERT model

model = HierarchicalBert(
    bert_model,
    lstm_units = 128,
    cnn_filters = 64,
    dense_units = 32,
)

In [97]:
# Configure training: binary cross-entropy for the sigmoid output,
# Adam with its default settings, and accuracy as the reported metric

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 5 epochs, holding out the last 20% of the training data for validation
history = model.fit(
    x = X_train,
    y = y_train,
    batch_size = 32,
    epochs = 5,
    validation_split = 0.2,
)

Epoch 1/5
[1m 20/200[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m20:01[0m 7s/step - accuracy: 0.6481 - loss: 0.6618

In [None]:
# Score the trained model on the held-out test split
loss, accuracy = model.evaluate(x = X_test, y = y_test)

# Accuracy is reported as a percentage
print(f'Accuracy: {accuracy * 100}')

In [None]:
# Training vs. validation accuracy per epoch, taken from the fit history
plt.figure(figsize = (10, 4))

plt.plot(history.history['accuracy'], label = "Training Accuracy")
plt.plot(history.history['val_accuracy'], label = "Validation Accuracy")

plt.title("Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Accuracy")

plt.legend()
plt.grid()
# The call parentheses were missing: a bare `plt.show` only references the
# function (echoing its repr as the cell output) and never invokes it.
plt.show()