In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch

# Step 1: Read the labelled text corpus from its CSV export into a DataFrame
dataset_path = '/content/all_final.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)



In [None]:
# Show the loaded frame; the notebook display elides the middle rows,
# so only the first and last few records appear in the output.
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,result
0,0,1,"হাসি সর্বাধিক সুন্দর গহনা , সর্বদা এটি পরেন , ...",0
1,1,2,"কৃষকদের হাতে পৃথিবীর গান , আন্নাদাতার শ্রম থেক...",0
2,2,3,"সততার সূর্যকে আলোকিত করুন , নামটি আলোকিত করুন ...",0
3,3,5,"ঘৃণার অন্ধকার রাতটি সরিয়ে দিন , ভালবাসার প্রদ...",0
4,4,7,"সততার সংগীত , শ্রদ্ধার সুরেলা রাগ।",0
...,...,...,...,...
31495,31495,4453,"నాటకం చాలా సరదాగా ఉంటుంది , నేను మధ్యలో నవ్వుత...",1
31496,31496,4454,"ఆసుపత్రి , ఇది ధనికులకు చికిత్స చేయడానికి మార...",1
31497,31497,4456,మీరు వ్యాయామశాలలో కొత్త వ్యాయామం చేస్తున్నట్లు...,1
31498,31498,4457,మీరు ఎప్పుడైనా ఇంత చెడ్డ డ్రాయింగ్ చూశారా ?,1


In [None]:
from sklearn.metrics import accuracy_score

import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification

# Tokenizer shared by the training and validation datasets built further down.
# NOTE(review): the preview of `df` shows non-Latin (Bengali/Telugu-looking)
# text, while "distilbert-base-uncased" is presumably an English, lower-casing
# checkpoint — confirm this is the intended choice; a multilingual cased
# checkpoint may tokenize this data better (the model checkpoint must match).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one text/label pair per lookup.

    Args:
        texts: Indexable collection of raw texts; each item is passed through ``str()``.
        labels: Indexable collection of integer labels aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus``; it is called once per item.
        max_len: Forwarded to the tokenizer as ``max_length`` together with
            ``padding='max_length'`` and ``truncation=True``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of examples, taken from the text collection."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, flattened token tensors and label tensor for ``idx``.

        Returns:
            dict with keys ``'text'`` (str), ``'input_ids'`` and
            ``'attention_mask'`` (the tokenizer's tensors flattened to 1-D) and
            ``'labels'`` (a ``torch.long`` scalar tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Options are collected first so the tokenizer call itself stays short
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )
        encoding = self.tokenizer.encode_plus(text, **tokenizer_options)

        # return_tensors='pt' is requested above; flattening yields 1-D tensors
        # for the DataLoader to batch
        return {
            'text': text,
            'input_ids': torch.flatten(encoding['input_ids']),
            'attention_mask': torch.flatten(encoding['attention_mask']),
            'labels': torch.tensor(label, dtype=torch.long),
        }

# Seed torch's global RNG so the randomly initialised classification head and
# the DataLoader shuffling order are repeatable when the notebook is re-run.
torch.manual_seed(42)

# Convert labels to numerical format.
# Series.unique() returns values in order of first appearance, so enumerating
# it directly would tie each class index to the row order of the CSV. Sorting
# the distinct labels first makes the label -> index assignment deterministic.
label_mapping = {label: idx for idx, label in enumerate(sorted(df['result'].unique()))}
df['result'] = df['result'].map(label_mapping)

# Split the dataset: 80% train / 20% validation, with a fixed seed
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Wrap each split as a dataset that tokenizes to 128 tokens per example
train_dataset = SarcasmDataset(train_df['text'].tolist(), train_df['result'].tolist(), tokenizer, max_len=128)
val_dataset = SarcasmDataset(val_df['text'].tolist(), val_df['result'].tolist(), tokenizer, max_len=128)

# Only the training batches are shuffled; validation keeps its row order
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# Step 4: Fine-tuning
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# One output logit per distinct label found in the data
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_mapping))
model.to(device)

# NOTE(review): recent transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # switch the model to training mode for this pass
    total_loss = 0
    train_true_labels, train_pred_labels = [], []

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Move the three tensors the model consumes onto the training device
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # One optimisation step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Keep targets and arg-max predictions so accuracy can be computed per epoch
        train_true_labels.extend(labels.cpu().numpy())
        train_pred_labels.extend(outputs.logits.argmax(dim=1).cpu().numpy())

    # Epoch summary: mean per-batch loss and accuracy over every batch seen
    average_loss = total_loss / len(train_loader)
    epoch_accuracy = accuracy_score(train_true_labels, train_pred_labels)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {average_loss:.4f}, Training Accuracy: {epoch_accuracy * 100:.2f}%")

# Step 5: Evaluation — score the fine-tuned model on the held-out split
model.eval()
all_labels = []
all_predictions = []

# Gradients are not needed for scoring, so disable autograd tracking
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Evaluating'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are only compared on the host and never fed to the model
        # here, so they are not copied to the device and back.
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 3150/3150 [05:03<00:00, 10.37it/s]


Epoch 1/10, Average Training Loss: 0.1703, Training Accuracy: 94.04%


Epoch 2/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.29it/s]


Epoch 2/10, Average Training Loss: 0.1244, Training Accuracy: 96.27%


Epoch 3/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.29it/s]


Epoch 3/10, Average Training Loss: 0.1201, Training Accuracy: 96.38%


Epoch 4/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 4/10, Average Training Loss: 0.1030, Training Accuracy: 97.01%


Epoch 5/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.27it/s]


Epoch 5/10, Average Training Loss: 0.1026, Training Accuracy: 96.98%


Epoch 6/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 6/10, Average Training Loss: 0.1204, Training Accuracy: 96.25%


Epoch 7/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 7/10, Average Training Loss: 0.0988, Training Accuracy: 97.20%


Epoch 8/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 8/10, Average Training Loss: 0.2481, Training Accuracy: 89.50%


Epoch 9/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.27it/s]


Epoch 9/10, Average Training Loss: 0.1395, Training Accuracy: 95.43%


Epoch 10/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.26it/s]


Epoch 10/10, Average Training Loss: 0.0972, Training Accuracy: 97.15%


Evaluating: 100%|██████████| 788/788 [00:23<00:00, 32.93it/s]

Validation Accuracy: 96.94%





In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Seed torch's global RNG so this run's classification-head initialisation and
# batch shuffling are repeatable whenever the cell is re-executed.
torch.manual_seed(42)

# Define the learning rate
learning_rate = 5e-5

# Step 4: Fine-tuning — start again from a freshly loaded checkpoint so this
# run does not continue from the weights trained in the previous cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_mapping))
model.to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)

# Define the number of training steps: the scheduler is stepped once per batch,
# so the schedule has to span every batch of every epoch.
# `num_epochs` is the value assigned in the previous training cell.
total_steps = len(train_loader) * num_epochs

# Create the learning rate scheduler (linear schedule, no warm-up steps)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

for epoch in range(num_epochs):
    model.train()  # switch the model to training mode for this pass
    total_loss = 0
    train_true_labels, train_pred_labels = [], []

    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        # Move the three tensors the model consumes onto the training device
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )

        # One optimisation step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule once per batch
        total_loss += loss.item()

        # Keep targets and arg-max predictions so accuracy can be computed per epoch
        train_true_labels.extend(labels.cpu().numpy())
        train_pred_labels.extend(outputs.logits.argmax(dim=1).cpu().numpy())

    # Epoch summary: mean per-batch loss and accuracy over every batch seen
    average_loss = total_loss / len(train_loader)
    epoch_accuracy = accuracy_score(train_true_labels, train_pred_labels)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Training Loss: {average_loss:.4f}, Training Accuracy: {epoch_accuracy * 100:.2f}%")

# Step 5: Evaluation — score the scheduler-trained model on the held-out split
model.eval()
all_labels = []
all_predictions = []

# Gradients are not needed for scoring, so disable autograd tracking
with torch.no_grad():
    for batch in tqdm(val_loader, desc='Evaluating'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The labels are only compared on the host and never fed to the model
        # here, so they are not copied to the device and back.
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row
        predictions = torch.argmax(outputs.logits, dim=1).cpu().numpy()

        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predictions)

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10: 100%|██████████| 3150/3150 [05:08<00:00, 10.22it/s]


Epoch 1/10, Average Training Loss: 0.1607, Training Accuracy: 94.21%


Epoch 2/10: 100%|██████████| 3150/3150 [05:10<00:00, 10.15it/s]


Epoch 2/10, Average Training Loss: 0.1101, Training Accuracy: 96.63%


Epoch 3/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.27it/s]


Epoch 3/10, Average Training Loss: 0.0977, Training Accuracy: 97.02%


Epoch 4/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.27it/s]


Epoch 4/10, Average Training Loss: 0.0862, Training Accuracy: 97.42%


Epoch 5/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 5/10, Average Training Loss: 0.0750, Training Accuracy: 97.78%


Epoch 6/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.29it/s]


Epoch 6/10, Average Training Loss: 0.0680, Training Accuracy: 98.01%


Epoch 7/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.28it/s]


Epoch 7/10, Average Training Loss: 0.0602, Training Accuracy: 98.21%


Epoch 8/10: 100%|██████████| 3150/3150 [05:07<00:00, 10.26it/s]


Epoch 8/10, Average Training Loss: 0.0555, Training Accuracy: 98.33%


Epoch 9/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.26it/s]


Epoch 9/10, Average Training Loss: 0.0500, Training Accuracy: 98.55%


Epoch 10/10: 100%|██████████| 3150/3150 [05:06<00:00, 10.27it/s]


Epoch 10/10, Average Training Loss: 0.0472, Training Accuracy: 98.63%


Evaluating: 100%|██████████| 788/788 [00:23<00:00, 33.09it/s]

Validation Accuracy: 98.10%



