In [None]:
!pip install pandas scikit-learn torch transformers tqdm
!pip install --upgrade datasets fsspec huggingface_hub
!pip install pandas scikit-learn torch transformers tqdm

Collecting fsspec
  Using cached fsspec-2025.5.0-py3-none-any.whl.metadata (11 kB)


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing helper: the stop-word
# lists first, then the WordNet data used for lemmatization.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

def preprocess_text(text):
    """Clean a raw text string for downstream modelling.

    Steps, in order: remove URLs, replace every non-word character with a
    space, split on whitespace, drop English stop words, and lemmatize the
    remaining tokens with WordNet.

    Args:
        text: The raw input string.

    Returns:
        The surviving tokens joined by single spaces (original casing kept).
    """
    # URLs must be removed BEFORE punctuation is stripped: once ':' and '/'
    # have been turned into spaces, 'http\S+' would only match the bare
    # token "http" and leave the rest of the URL in the text.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\W', ' ', text)

    tokens = text.split()

    # NLTK's English stop-word list is lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "This" would slip through.
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.lower() not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused when loading the
# classification model so tokenizer and model stay matched.
model_name = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize one example with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a "text" entry and, optionally, a "label" entry.

    Returns:
        The tokenizer output for ``examples["text"]`` (truncated to at most
        512 tokens, not padded). When the example carries a "label", it is
        attached under "labels" so the result can be used for supervised
        training; unlabeled examples are returned as-is.
    """
    tokenized_inputs = tokenizer(examples["text"], truncation=True, max_length=512)
    # Keep the target alongside the encoded text; without it the encoded
    # example cannot be used to compute a training loss.
    if "label" in examples:
        tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs

# Two hand-written examples; predict_sarcasm treats class 1 as "Sarcastic".
dataset = [
    {"text": sentence, "label": target}
    for sentence, target in (
        ("This is such a great day!", 0),
        ("Oh, what a surprise...", 1),
    )
]
# Encode every example individually, preserving order.
tokenized_datasets = list(map(tokenize_function, dataset))

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for two labels.
# The head's weights are newly initialized (see the warning printed below),
# so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the model loaded in this cell.
model_name = "bert-base-uncased"
# Load the tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Encode one labeled example for training.

    Args:
        examples: Mapping with a "text" entry and a "label" entry.

    Returns:
        The tokenizer output for the text (truncated to at most 512 tokens),
        with the example's label stored under the "labels" key.
    """
    encoding = tokenizer(examples["text"], truncation=True, max_length=512)
    # Attach the target under the key name used for training labels.
    encoding.update(labels=examples["label"])
    return encoding

# Two hand-written examples; predict_sarcasm treats class 1 as "Sarcastic".
dataset = [
    {"text": sentence, "label": target}
    for sentence, target in (
        ("This is such a great day!", 0),
        ("Oh, what a surprise...", 1),
    )
]
# Encode every example individually, preserving order.
tokenized_datasets = list(map(tokenize_function, dataset))

from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head sized for two labels.
# The head's weights are newly initialized (see the warning in this cell's output),
# which is why the model is fine-tuned below.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding

# Collator that pads tokenized examples into batches; tokenize_function only
# truncates, it does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir='./results',  # directory for the run's outputs
    eval_strategy="epoch",  # evaluate at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# NOTE(review): eval_dataset is the very same list as train_dataset, so the
# reported validation loss is measured on the training examples.
# NOTE(review): no compute_metrics is passed here, so evaluation reports the
# loss only (the results table below has no accuracy/F1 columns).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
)


# Run fine-tuning; returns a TrainOutput with the step count and training loss.
trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,0.448872
2,No log,0.428974
3,No log,0.407308


TrainOutput(global_step=3, training_loss=0.5455092589060465, metrics={'train_runtime': 19.4701, 'train_samples_per_second': 0.308, 'train_steps_per_second': 0.154, 'total_flos': 30833326800.0, 'train_loss': 0.5455092589060465, 'epoch': 3.0})

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(eval_pred):
    """Score classification predictions against their reference labels.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        Dict with "accuracy", "f1", "precision" and "recall"; the last three
        are computed with ``average="binary"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    scores = {"accuracy": accuracy_score(labels, predictions)}
    # The remaining metrics share one call signature, so compute them in a loop.
    for metric_name, scorer in (
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    ):
        scores[metric_name] = scorer(labels, predictions, average="binary")
    return scores

# The Trainer was built without a metrics callback, so attach compute_metrics
# before evaluating; otherwise evaluate() reports the loss only and the
# accuracy/F1/precision/recall defined above are never computed.
trainer.compute_metrics = compute_metrics
trainer.evaluate()

{'eval_loss': 0.5238103866577148,
 'eval_runtime': 0.4795,
 'eval_samples_per_second': 4.171,
 'eval_steps_per_second': 2.086,
 'epoch': 3.0}

In [None]:
def predict_sarcasm(text):
    """Classify a single text as "Sarcastic" or "Not Sarcastic".

    Uses the notebook-level ``tokenizer`` and ``model``. The model is switched
    to evaluation mode for the forward pass and put back into its previous
    mode afterwards.

    Args:
        text: The text to classify (a single string).

    Returns:
        "Sarcastic" if the highest-scoring class is 1, otherwise "Not Sarcastic".
    """
    import torch  # local import: torch is not imported at the top of this notebook

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Put the inputs on the model's device; after training the model may live
    # on a GPU while freshly tokenized tensors are created on the CPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    was_training = model.training
    model.eval()  # disable dropout so repeated calls give the same answer
    try:
        with torch.no_grad():  # inference only: skip autograd bookkeeping
            logits = model(**inputs).logits
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Reduce to a plain int so the comparison does not rely on the truthiness
    # of a one-element array.
    predicted_class = int(logits.argmax(dim=-1)[0])
    return "Sarcastic" if predicted_class == 1 else "Not Sarcastic"

# Try the classifier on the two training sentences, one result per line.
for sample_text in ("This is such a great day!", "Oh, what a surprise..."):
    print(predict_sarcasm(sample_text))

Not Sarcastic
Sarcastic
