In [1]:
# Install transformers if not already installed
!pip install transformers

# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import random
import time
import datetime
import os

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix




In [2]:
# Read the BanFake CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/banfakelabelnews/BanFake.csv')

# Preview the first rows and report how many rows were loaded
print(df.head())
print('Number of samples:', len(df))

# Show how many rows carry each value of the 'label' column
label_counts = df['label'].value_counts()
print(label_counts)


   label                                               news
0      1  হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত, ৬ জনকে শো...
1      1  মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারি...
2      1  প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...
3      1  মেডিয়েশনই মামলাজট নিরসনের পথ : বিচারপতি আহমেদ ...
4      1  টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু মাদ...
Number of samples: 49977
label
1    48678
0     1299
Name: count, dtype: int64


In [3]:
# Hold out 10% of the rows for validation. Stratifying on 'label' keeps the
# class proportions the same in both splits; the fixed random_state makes the
# split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=42,
)

print('Training set size:', len(train_df))
print('Validation set size:', len(val_df))


Training set size: 44979
Validation set size: 4998


In [4]:
# Initialize the tokenizer
# The tokenizer is loaded from the same 'sagorsarker/bangla-bert-base'
# checkpoint that the classification model is later built from, so token ids
# line up with the model's vocabulary.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sagorsarker/bangla-bert-base')


config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

In [5]:
def tokenize_and_split(text, tokenizer, max_length):
    """Tokenize ``text`` and cut the tokens into model-sized chunks.

    Each chunk holds at most ``max_length - 2`` tokens, leaving room for the
    two special tokens ([CLS] and [SEP]) that the caller adds afterwards.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()``.
        tokenizer: Any object exposing ``tokenize(str) -> list``.
        max_length: Final sequence length including the two special tokens.
            Must be at least 3.

    Returns:
        A list of token lists. Text that yields no tokens returns ``[[]]``,
        i.e. a single empty chunk, so every input produces at least one chunk.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content token.
    """
    chunk_size = max_length - 2  # reserve two slots for [CLS] and [SEP]
    if chunk_size < 1:
        # A zero step makes range() fail with an unrelated message, and a
        # negative step silently yields no chunks at all; fail loudly instead.
        raise ValueError(
            "max_length must be at least 3 to fit [CLS], [SEP] and one token, got %r"
            % (max_length,)
        )

    if text is None or (isinstance(text, float) and text != text):
        # Missing value (None or NaN): behave as for an empty string.
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    tokens = tokenizer.tokenize(text)
    if not tokens:
        return [tokens]

    return [tokens[start:start + chunk_size] for start in range(0, len(tokens), chunk_size)]


In [6]:
def prepare_data_with_chunking(df, tokenizer, max_length):
    """Encode every article in ``df`` as one or more fixed-length chunks.

    Each article's 'news' text is split by ``tokenize_and_split``; every chunk
    is wrapped in the tokenizer's CLS/SEP tokens, converted to ids and padded
    to ``max_length``. The article's 'label' and DataFrame index value are
    repeated for each of its chunks so predictions can later be regrouped per
    article.

    Args:
        df: DataFrame with a 'news' column (text) and a 'label' column. Its
            index values must be convertible by ``torch.tensor`` (e.g. ints).
        tokenizer: Tokenizer providing ``convert_tokens_to_ids`` and
            ``pad_token_id`` (and, optionally, ``cls_token`` / ``sep_token``).
        max_length: Length of every encoded sequence, special tokens included.

    Returns:
        Tuple ``(input_ids, attention_masks, labels, article_ids)``. The first
        two are stacked to shape (num_chunks, max_length); the last two hold
        one entry per chunk.
    """
    # Prefer the tokenizer's own special tokens; fall back to the BERT
    # literals the original code used if the tokenizer does not define them.
    cls_token = getattr(tokenizer, 'cls_token', None) or '[CLS]'
    sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'
    pad_id = tokenizer.pad_token_id

    input_ids = []
    attention_masks = []
    labels = []
    article_ids = []

    # Iterating the columns directly avoids building a Series per row, which
    # is what makes DataFrame.iterrows() slow on large frames.
    for idx, text, label in zip(df.index, df['news'], df['label']):
        for chunk in tokenize_and_split(text, tokenizer, max_length):
            token_ids = tokenizer.convert_tokens_to_ids([cls_token] + chunk + [sep_token])

            # Defensive truncation; chunks are expected to fit already.
            token_ids = token_ids[:max_length]
            real_length = len(token_ids)
            pad_length = max_length - real_length

            # The mask is built from the real length, not by comparing ids to
            # the pad id, so a genuine token that happens to map to the pad id
            # is still attended to.
            mask = [1] * real_length + [0] * pad_length
            token_ids = token_ids + [pad_id] * pad_length

            input_ids.append(torch.tensor(token_ids))
            attention_masks.append(torch.tensor(mask))
            labels.append(label)
            article_ids.append(idx)  # Keep track of the article ID

    # Convert lists to tensors
    input_ids = torch.stack(input_ids)
    attention_masks = torch.stack(attention_masks)
    labels = torch.tensor(labels)
    article_ids = torch.tensor(article_ids)

    return input_ids, attention_masks, labels, article_ids


In [7]:
MAX_LEN = 512  # Maximum sequence length for BERT

# Chunk and encode the training split
(train_input_ids,
 train_attention_masks,
 train_labels,
 train_article_ids) = prepare_data_with_chunking(train_df, tokenizer, MAX_LEN)

# Chunk and encode the validation split the same way
(val_input_ids,
 val_attention_masks,
 val_labels,
 val_article_ids) = prepare_data_with_chunking(val_df, tokenizer, MAX_LEN)


In [8]:
# Bundle each split's tensors into a TensorDataset. One item is the tuple
# (input_ids, attention_mask, label, article_id) for a single chunk.
train_dataset = TensorDataset(
    train_input_ids,
    train_attention_masks,
    train_labels,
    train_article_ids,
)
val_dataset = TensorDataset(
    val_input_ids,
    val_attention_masks,
    val_labels,
    val_article_ids,
)


In [9]:
# Number of chunks per batch; adjust based on GPU memory
batch_size = 50

# Training batches are drawn in random order, validation batches in dataset order
train_sampler = RandomSampler(train_dataset)
val_sampler = SequentialSampler(val_dataset)

train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=val_sampler)


In [10]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Seed torch BEFORE building the model. The classification head
# ('classifier.weight' / 'classifier.bias') is not part of the checkpoint and
# is randomly initialised by from_pretrained, while the notebook's dedicated
# seeding cell only runs after this one. Without this, the initial weights
# differ from run to run.
model_init_seed = 42
torch.manual_seed(model_init_seed)
torch.cuda.manual_seed_all(model_init_seed)

# Initialize the model
from transformers import AutoModelForSequenceClassification

# One output unit per distinct value found in the 'label' column
model = AutoModelForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base', num_labels=len(df['label'].unique()))

# Move model to the default device
model.to(device)

# Wrap the model with DataParallel so batches are split across all visible GPUs
if torch.cuda.device_count() > 1:
    print("Using", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)


Using device: cuda


model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using 2 GPUs!


In [11]:
# Use the AdamW implementation from torch.optim
from torch.optim import AdamW

# Number of full passes over the training data; adjust as needed
epochs = 5

# Optimizer over all model parameters
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps in total
total_steps = epochs * len(train_loader)

# Linear learning-rate schedule with no warmup phase
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [12]:
# Seed Python's, NumPy's and PyTorch's (CPU and all-GPU) random number
# generators with the same value for reproducibility
seed_val = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)


In [13]:
def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [14]:
# Per-epoch metrics are collected here, one dict per epoch
training_stats = []

# Wall-clock start of the whole run
total_t0 = time.time()

for epoch_i in range(epochs):
    # ----------------------------------------
    # Training pass
    # ----------------------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_loader):
        # Progress line every 40 batches (but not before the first one)
        if step != 0 and step % 40 == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

        # The first three tensors of a batch are ids, mask and labels;
        # the fourth (article ids) is not needed for training
        b_input_ids, b_attention_mask, b_labels = (tensor.to(device) for tensor in batch[:3])

        # Reset gradients left over from the previous step
        model.zero_grad()

        # Forward pass; passing labels makes the model return a loss
        outputs = model(
            b_input_ids,
            attention_mask=b_attention_mask,
            labels=b_labels,
        )

        # With several GPUs the loss holds one value per replica; reduce to a scalar
        loss = outputs.loss.mean()
        logits = outputs.logits

        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Cap the gradient norm at 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Apply the update, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_train_loss / len(train_loader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ----------------------------------------
    # Validation pass
    # ----------------------------------------
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    total_eval_loss = 0
    total_eval_accuracy = 0

    for batch in val_loader:
        b_input_ids, b_attention_mask, b_labels = (tensor.to(device) for tensor in batch[:3])

        # No gradients are needed while evaluating
        with torch.no_grad():
            outputs = model(
                b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_labels,
            )

            # Reduce the per-replica losses to a scalar (multi-GPU)
            loss = outputs.loss.mean()
            logits = outputs.logits

        total_eval_loss += loss.item()

        # Move results to host memory as NumPy arrays
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        # Count correctly classified chunks in this batch
        preds = logits.argmax(axis=1)
        total_eval_accuracy += (preds == label_ids).sum()

    # Fraction of validation chunks classified correctly
    avg_val_accuracy = total_eval_accuracy / len(val_dataset)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Mean of the per-batch validation losses
    avg_val_loss = total_eval_loss / len(val_loader)
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record this epoch's statistics
    epoch_stats = {
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': training_time,
        'Validation Time': validation_time,
    }
    training_stats.append(epoch_stats)

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


  Batch    40  of  1,423.    Elapsed: 0:01:47.
  Batch    80  of  1,423.    Elapsed: 0:03:25.
  Batch   120  of  1,423.    Elapsed: 0:05:03.
  Batch   160  of  1,423.    Elapsed: 0:06:41.
  Batch   200  of  1,423.    Elapsed: 0:08:18.
  Batch   240  of  1,423.    Elapsed: 0:09:56.
  Batch   280  of  1,423.    Elapsed: 0:11:34.
  Batch   320  of  1,423.    Elapsed: 0:13:11.
  Batch   360  of  1,423.    Elapsed: 0:14:49.
  Batch   400  of  1,423.    Elapsed: 0:16:27.
  Batch   440  of  1,423.    Elapsed: 0:18:04.
  Batch   480  of  1,423.    Elapsed: 0:19:42.
  Batch   520  of  1,423.    Elapsed: 0:21:19.
  Batch   560  of  1,423.    Elapsed: 0:22:57.
  Batch   600  of  1,423.    Elapsed: 0:24:35.
  Batch   640  of  1,423.    Elapsed: 0:26:12.
  Batch   680  of  1,423.    Elapsed: 0:27:50.
  Batch   720  of  1,423.    Elapsed: 0:29:27.
  Batch   760  of  1,423.    Elapsed: 0:31:05.
  Batch   800  of  1,423.    Elapsed: 0:32:43.
  Batch   840  of  1,423.    Elapsed: 0:34:20.
  Batch   880

In [15]:
# Put model in evaluation mode
model.eval()

# Chunk-level predictions and the true label, keyed by article id
article_predictions = {}
article_true_labels = {}

# A single no_grad context covers the whole pass; nothing here needs gradients
with torch.no_grad():
    for batch in val_loader:
        b_input_ids = batch[0].to(device)
        b_attention_mask = batch[1].to(device)
        # Labels and article ids are only used for bookkeeping, so they are
        # read straight from the batch instead of being moved to the device
        # and back
        label_ids = batch[2].numpy()
        b_article_ids = batch[3].numpy()

        outputs = model(b_input_ids,
                        attention_mask=b_attention_mask)

        logits = outputs.logits.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)

        # Group each chunk's prediction under the article it came from
        for article_id, pred, true_label in zip(b_article_ids, preds, label_ids):
            article_id = int(article_id)
            article_true_labels.setdefault(article_id, true_label)
            article_predictions.setdefault(article_id, []).append(pred)

# Now, aggregate predictions per article
final_predictions = []
final_true_labels = []

for article_id, chunk_preds in article_predictions.items():
    # Majority vote over the article's chunks. np.bincount tallies the votes
    # per class id in one pass, and argmax returns the first maximum, so a tie
    # is always resolved in favour of the lowest class id rather than
    # depending on set iteration order.
    final_pred = np.bincount(chunk_preds).argmax()

    final_predictions.append(final_pred)
    final_true_labels.append(article_true_labels[article_id])


In [16]:
# Per-class precision / recall / F1 on the article-level predictions
report = classification_report(final_true_labels, final_predictions)
print(report)

# Confusion matrix of true labels against predicted labels
conf_mat = confusion_matrix(final_true_labels, final_predictions)
print("Confusion Matrix:", conf_mat, sep="\n")


              precision    recall  f1-score   support

           0       0.91      0.85      0.88       130
           1       1.00      1.00      1.00      4868

    accuracy                           0.99      4998
   macro avg       0.95      0.92      0.94      4998
weighted avg       0.99      0.99      0.99      4998

Confusion Matrix:
[[ 110   20]
 [  11 4857]]


In [17]:
import os
import torch

# Output directory on Kaggle
output_dir = '/kaggle/working/model_save/'

# Create the directory if needed. exist_ok=True avoids the race between a
# separate existence check and the creation call.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# When the model was wrapped in DataParallel, the underlying model lives in
# .module; that inner model is the one that provides save_pretrained
model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model

# Save model, configuration, and tokenizer
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model to /kaggle/working/model_save/
The OrderedVocab you are attempting to save contains holes for indices [1015, 1016, 1017, 1018, 1053, 1054, 1055, 1056, 1057, 1060, 1061, 1062, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1099, 1101, 1112, 1113, 1556, 1557, 1568], your vocabulary could be corrupted !


('/kaggle/working/model_save/tokenizer_config.json',
 '/kaggle/working/model_save/special_tokens_map.json',
 '/kaggle/working/model_save/vocab.txt',
 '/kaggle/working/model_save/added_tokens.json',
 '/kaggle/working/model_save/tokenizer.json')