In [2]:
# Install transformers if not already installed
!pip install transformers

# Import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import random
import time
import datetime
import os

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix




In [3]:
# Load dataset
df = pd.read_csv('/kaggle/input/banfakelabelnews/BanFake.csv')

# Check data
print(df.head())
print('Number of samples:', df.shape[0])

# Check label distribution
print(df['label'].value_counts())


   label                                               news
0      1  হট্টগোল করায় বাকৃবিতে দুইজন বরখাস্ত, ৬ জনকে শো...
1      1  মালয়েশিয়ায় কর্মী পাঠানোর ব্যবস্থা নেয়ার সুপারি...
2      1  প্রেমের প্রস্তাবে রাজি না হওয়ায় স্কুলছাত্রীকে ...
3      1  মেডিয়েশনই মামলাজট নিরসনের পথ : বিচারপতি আহমেদ ...
4      1  টকশোতে বক্তব্য দিতে গিয়ে জাপা নেতার মৃত্যু মাদ...
Number of samples: 49977
label
1    48678
0     1299
Name: count, dtype: int64


In [4]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

print('Training set size:', train_df.shape[0])
print('Validation set size:', val_df.shape[0])

Training set size: 44979
Validation set size: 4998


In [6]:
train_df['label'].value_counts()

label
1    43810
0     1169
Name: count, dtype: int64

In [7]:
val_df['label'].value_counts()

label
1    4868
0     130
Name: count, dtype: int64

In [9]:
# Initialize the tokenizer
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
MAX_LEN = 512  # You can adjust this value


In [10]:
# Prepare the training data
train_texts = train_df['news'].tolist()
train_labels = train_df['label'].tolist()

# Prepare the validation data
val_texts = val_df['news'].tolist()
val_labels = val_df['label'].tolist()

# Tokenize and encode the training set
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LEN)

# Tokenize and encode the validation set
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_LEN)


In [11]:
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        # Replace 'labels' with 'label' if needed
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item


In [12]:
# Create the datasets
train_dataset = NewsDataset(train_encodings, train_labels)
val_dataset = NewsDataset(val_encodings, val_labels)


In [13]:
# Set batch size
batch_size = 8  # Adjust based on GPU memory

# Create DataLoaders
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)


In [14]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Initialize the model
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large')

# Move model to the device
model.to(device)


Using device: cuda


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, ou

In [15]:
# Set the number of epochs
epochs = 2  # You can adjust this value

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Total number of training steps
total_steps = len(train_loader) * epochs

# Set up the scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)




In [16]:
# Set seed for reproducibility
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


In [17]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))


In [18]:
# Store training statistics
training_stats = []

# Measure the total training time
total_t0 = time.time()

for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    t0 = time.time()
    total_train_loss = 0
    model.train()
    
    for step, batch in enumerate(train_loader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))
        
        # Unpack the inputs from the dataloader
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        
        # Clear any previously calculated gradients
        model.zero_grad()
        
        # Forward pass
        outputs = model(b_input_ids,
                        attention_mask=b_attention_mask,
                        labels=b_labels)
        
        loss = outputs.loss
        logits = outputs.logits
        
        total_train_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and scheduler
        optimizer.step()
        scheduler.step()
    
    # Calculate the average loss over all batches
    avg_train_loss = total_train_loss / len(train_loader)
    
    training_time = format_time(time.time() - t0)
    
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")
    
    t0 = time.time()
    model.eval()
    
    total_eval_loss = 0
    total_eval_accuracy = 0
    nb_eval_steps = 0
    
    for batch in val_loader:
        b_input_ids = batch['input_ids'].to(device)
        b_attention_mask = batch['attention_mask'].to(device)
        b_labels = batch['labels'].to(device)
        
        with torch.no_grad():
            outputs = model(b_input_ids,
                            attention_mask=b_attention_mask,
                            labels=b_labels)
        
        loss = outputs.loss
        logits = outputs.logits
        
        total_eval_loss += loss.item()
        
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate accuracy
        preds = np.argmax(logits, axis=1)
        total_eval_accuracy += np.sum(preds == label_ids)
        nb_eval_steps += len(label_ids)
    
    avg_val_accuracy = total_eval_accuracy / len(val_dataset)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    
    avg_val_loss = total_eval_loss / len(val_loader)
    validation_time = format_time(time.time() - t0)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))
    
    # Record statistics
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
    
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...
  Batch    40  of  5,623.    Elapsed: 0:01:44.
  Batch    80  of  5,623.    Elapsed: 0:03:29.
  Batch   120  of  5,623.    Elapsed: 0:05:14.
  Batch   160  of  5,623.    Elapsed: 0:06:59.
  Batch   200  of  5,623.    Elapsed: 0:08:44.
  Batch   240  of  5,623.    Elapsed: 0:10:30.
  Batch   280  of  5,623.    Elapsed: 0:12:15.
  Batch   320  of  5,623.    Elapsed: 0:14:00.
  Batch   360  of  5,623.    Elapsed: 0:15:45.
  Batch   400  of  5,623.    Elapsed: 0:17:29.
  Batch   440  of  5,623.    Elapsed: 0:19:14.
  Batch   480  of  5,623.    Elapsed: 0:20:59.
  Batch   520  of  5,623.    Elapsed: 0:22:44.
  Batch   560  of  5,623.    Elapsed: 0:24:29.
  Batch   600  of  5,623.    Elapsed: 0:26:15.
  Batch   640  of  5,623.    Elapsed: 0:28:00.
  Batch   680  of  5,623.    Elapsed: 0:29:45.
  Batch   720  of  5,623.    Elapsed: 0:31:30.
  Batch   760  of  5,623.    Elapsed: 0:33:15.
  Batch   800  of  5,623.    Elapsed: 0:35:00.
  Batch   840  of  5,623.    Elapsed: 0:36:45.


In [19]:
# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    
    with torch.no_grad():
        outputs = model(**batch)
    
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = batch['labels'].to('cpu').numpy()
    
    preds = np.argmax(logits, axis=1)
    
    predictions.extend(preds)
    true_labels.extend(label_ids)

# Classification report
print(classification_report(true_labels, predictions))

# Confusion matrix
conf_mat = confusion_matrix(true_labels, predictions)
print("Confusion Matrix:")
print(conf_mat)


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       130
           1       0.97      1.00      0.99      4868

    accuracy                           0.97      4998
   macro avg       0.49      0.50      0.49      4998
weighted avg       0.95      0.97      0.96      4998

Confusion Matrix:
[[   0  130]
 [   0 4868]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
import os

# Define the output directory for saving
output_dir = '/kaggle/working/model_save/'  # Save in working directory

# Create the directory if it doesn't exist
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print(f"Saving model to {output_dir}")

# Save the trained model, configuration, and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)


Saving model to /kaggle/working/model_save/


('/kaggle/working/model_save/tokenizer_config.json',
 '/kaggle/working/model_save/special_tokens_map.json',
 '/kaggle/working/model_save/sentencepiece.bpe.model',
 '/kaggle/working/model_save/added_tokens.json')

In [1]:
from huggingface_hub import login

# Replace 'your_huggingface_token' with your actual token
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [2]:
from huggingface_hub import HfApi, upload_folder

# Define the repository name and your model folder
repo_name = "shafitanvir31/bangla-Roberta-xlm-finetuned"  

In [3]:
model_path = "/kaggle/working/model_save"

# Create a repository
api = HfApi()
api.create_repo(repo_id=repo_name, private=False)

# Upload the entire model folder to the repository
upload_folder(
    folder_path=model_path,
    repo_id=repo_name,
    commit_message="Upload fine-tuned Bangla BERT model"
)


Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shafitanvir31/bangla-Roberta-xlm-finetuned/commit/87c83b803ad4f73768f4076a2825f7fec945ab5b', commit_message='Upload fine-tuned Bangla BERT model', commit_description='', oid='87c83b803ad4f73768f4076a2825f7fec945ab5b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shafitanvir31/bangla-Roberta-xlm-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='shafitanvir31/bangla-Roberta-xlm-finetuned'), pr_revision=None, pr_num=None)