In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv('Small.csv', index_col=False)
texts = df['Tweet']
labels = df['Sarcasm'].map({'yes': 1, 'no': 0})

In [3]:
max_len = 128
batch_size = 32
epochs = 10
# vocab_size = 10000

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_data(texts, tokenizer, max_len):
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len, return_tensors='tf')
    return encodings['input_ids'], encodings['attention_mask']

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
def predict_sarcasm(text, tokenizer, model, max_len):
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='tf',
        truncation=True
    )

    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    prediction = model.predict({'input_ids': input_ids, 'attention_mask': attention_mask})

    prediction_prob = tf.sigmoid(prediction.logits)

    if prediction_prob > 0.5:
        return "This text is predicted to be sarcastic."
    else:
        return "This text is predicted to be non-sarcastic."


In [6]:
X_train_texts = ["I love this!", "This is bad.", "Could be better.", "Absolutely fantastic!"]
y_train = [1, 0, 0, 1]
X_test_texts = ["Not great.", "I enjoyed it.", "Worst ever.", "Really good."]
y_test = [0, 1, 0, 1]

X_train_ids, X_train_mask = encode_data(X_train_texts, tokenizer, max_len)
X_test_ids, X_test_mask = encode_data(X_test_texts, tokenizer, max_len)

y_train = tf.convert_to_tensor(y_train)
y_test = tf.convert_to_tensor(y_test)

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

optimizer = 'adam'
loss = BinaryCrossentropy(from_logits=True)
metrics = [BinaryAccuracy()]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

history = model.fit(
    {'input_ids': X_train_ids, 'attention_mask': X_train_mask},
    y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=({'input_ids': X_test_ids, 'attention_mask': X_test_mask}, y_test),
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]
)

loss, accuracy = model.evaluate({'input_ids': X_test_ids, 'attention_mask': X_test_mask}, y_test)
print(f"Test Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Test Loss: 0.4702, Accuracy: 0.6731


In [7]:
new_text = "My name is Tanisha."
result = predict_sarcasm(new_text, tokenizer, model, max_len)
print(result)

This text is predicted to be non-sarcastic.


In [8]:
new_text = "My mom asked me this question as well."
result = predict_sarcasm(new_text, tokenizer, model, max_len)
print(result)

This text is predicted to be non-sarcastic.


In [9]:
new_text = "Sure, let's just add this to my already overflowing to-do list."
result = predict_sarcasm(new_text, tokenizer, model, max_len)
print(result)

This text is predicted to be non-sarcastic.


In [10]:
new_text = "Sure, let's just add this to my already overflowing to-do list."
result = predict_sarcasm(new_text, tokenizer, model, max_len)
print(result)

This text is predicted to be non-sarcastic.
