In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader
import torch
from collections import Counter
from torch.utils.data import WeightedRandomSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the dataset
df = pd.read_csv('sarcasm_detection_dataset.csv',encoding='latin-1')

# Calculate class weights to handle class imbalance
class_weights = torch.tensor([1.0 / count for count in Counter(df["Category"]).values()])

# Map category labels to integers
label_map = {label: i for i, label in enumerate(df["Category"].unique())}

# Split the data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [2]:
label_map

{'sarcasm': 0, 'not sarcasm': 1}

In [3]:
# Define custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        review = self.dataframe.iloc[idx]["Review"]
        category = self.dataframe.iloc[idx]["Category"]

        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        label = torch.tensor(label_map[category])

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": label,
        }


In [4]:
# Initialize tokenizer and model
hf_token='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'
tokenizer = AutoTokenizer.from_pretrained("nikesh66/Sarcasm-Detection-using-BERT")
model = AutoModelForSequenceClassification.from_pretrained("nikesh66/Sarcasm-Detection-using-BERT", num_labels=len(label_map))
# Define training arguments
training_args = TrainingArguments(
    output_dir="./sarcasm_results",  # Specify the output directory
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Create datasets and dataloaders
train_dataset = ReviewDataset(train_df, tokenizer, max_length=128)
test_dataset = ReviewDataset(test_df, tokenizer, max_length=128)

# Compute weights for each sample in the dataset
class_weights = [class_weights[label_map[label]] for label in train_df["Category"]]
sampler = WeightedRandomSampler(class_weights, len(train_dataset), replacement=True)

train_dataloader = DataLoader(train_dataset, batch_size=8, sampler=sampler)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Define metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "f1": f1,
        "precision": precision,
        "recall": recall,
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)

print(results)

***** Running training *****
  Num examples = 6563
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2463


Step,Training Loss
500,0.1574
1000,0.0577
1500,0.0211
2000,0.0167


Saving model checkpoint to ./sarcasm_results\checkpoint-500
Configuration saved in ./sarcasm_results\checkpoint-500\config.json
Model weights saved in ./sarcasm_results\checkpoint-500\pytorch_model.bin
Saving model checkpoint to ./sarcasm_results\checkpoint-1000
Configuration saved in ./sarcasm_results\checkpoint-1000\config.json
Model weights saved in ./sarcasm_results\checkpoint-1000\pytorch_model.bin
Saving model checkpoint to ./sarcasm_results\checkpoint-1500
Configuration saved in ./sarcasm_results\checkpoint-1500\config.json
Model weights saved in ./sarcasm_results\checkpoint-1500\pytorch_model.bin
Saving model checkpoint to ./sarcasm_results\checkpoint-2000
Configuration saved in ./sarcasm_results\checkpoint-2000\config.json
Model weights saved in ./sarcasm_results\checkpoint-2000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 1641
  Batch size = 8


{'eval_loss': 0.043757785111665726, 'eval_accuracy': 0.9914686166971359, 'eval_f1': 0.9914686483786789, 'eval_precision': 0.9914716255206522, 'eval_recall': 0.9914686166971359, 'eval_runtime': 1244.9341, 'eval_samples_per_second': 1.318, 'eval_steps_per_second': 0.165, 'epoch': 3.0}


In [5]:
trainer.save_model()

Saving model checkpoint to ./sarcasm_results
Configuration saved in ./sarcasm_results\config.json
Model weights saved in ./sarcasm_results\pytorch_model.bin


In [None]:
!pip install accelerate>=0.21.0
!pip install transformers[torch]
!pip install accelerate -U

In [2]:
from transformers import AutoTokenizer,TFAutoModelForSequenceClassification
label_map={'sarcasm': 0, 'not sarcasm': 1}
model1=TFAutoModelForSequenceClassification.from_pretrained("sarcasm_results", num_labels=len(label_map),from_pt=True)
tokenizer = AutoTokenizer.from_pretrained("nikesh66/Sarcasm-Detection-using-BERT")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [4]:
user_review = input('Enter review: ')

# Tokenize the user review (assuming you have a tokenizer)
tokenized_user_review = tokenizer(user_review, truncation=True, padding=True, return_tensors="tf")

# Make predictions
predictions = model1.predict(dict(tokenized_user_review))

# Extract the predicted probabilities for each class
predicted_probabilities = predictions[0][0]

# Get the predicted class index
predicted_class_index = predicted_probabilities.argmax()

def print_key_by_value(dict, value1):
  for key, value in dict.items():
    if value == value1:
      return key

# Example usage:
predicted_category=print_key_by_value(label_map,predicted_class_index )
# Print the predicted class index
print('predicted index',predicted_class_index)
print("Predicted category:", predicted_category)

Enter review: PM modi is taking much more initiatives since elections are on the way.
predicted index 1
Predicted category: not sarcasm
