# Preprocessing Data

In [None]:
import random

import pandas as pd
from transformers import BertTokenizer

# pandas must be imported in this cell: read_csv is called here, before the
# later cell that also imports pandas has run.
# Training sentences (a `text` column is read from this frame further down).
df_train = pd.read_csv('./datasets/train_data.csv')
# Confusion sets (`Word` and `Confusions` columns are read further down).
df_confusion_set = pd.read_csv('./datasets/final_confusion_sets.csv')

In [None]:
import pandas as pd


# Maps each word to the list of words it can be confused with. The relation is
# stored symmetrically: if B is listed as a confusion of A, A is also recorded
# as a confusion of B.
confusion_set = {}

# Populate the dictionary from the first two columns of the confusion-set
# frame loaded above (`df_confusion_set`).
for _, row in df_confusion_set.iloc[:, :2].iterrows():
    word = row['Word']
    raw_confusions = row['Confusions']

    # Empty CSV cells come back as NaN (a float); calling .split() on them
    # would raise, so rows without usable text are skipped.
    if not isinstance(word, str) or not isinstance(raw_confusions, str):
        continue

    word = word.strip()
    if not word:
        continue

    # Handle multiple comma-separated confusion words; drop the empty entries
    # produced by stray or trailing commas.
    confusion_words = [part.strip() for part in raw_confusions.split(',') if part.strip()]

    # Merge into any existing entry instead of overwriting it, so reverse
    # mappings recorded while processing earlier rows are not lost.
    forward = confusion_set.setdefault(word, [])
    for confusion_word in confusion_words:
        if confusion_word not in forward:
            forward.append(confusion_word)

        # Add the reverse mapping for this confusion word.
        reverse = confusion_set.setdefault(confusion_word, [])
        if word not in reverse:
            reverse.append(word)

# Print the resulting dictionary
print(confusion_set)

In [None]:
# Training sentences as plain Python strings. The frame is loaded as
# `df_train` above (there is no `df`). Missing values are dropped because the
# whitespace tokenizer used later calls .split() on every sentence.
sentences = df_train["text"].dropna().astype(str).to_list()

In [None]:
# Passed to preprocess_data as `mask_prob`: the chance that a sentence
# containing no confusion-set word still gets one random word masked.
mask_prob = 0.15

# Load the tokenizer for the "nepali-bert" checkpoint
# (presumably a local directory or hub id -- confirm before running).
tokenizer = BertTokenizer.from_pretrained("nepali-bert")


In [None]:


# Function to tokenize sentences
def tokenize(sentence):
    """Split *sentence* into whitespace-delimited words."""
    return sentence.split()


# Prepare masked sentences
def preprocess_data(sentences, confusion_set, mask_prob, tokenizer):
    """Build masked-language-model training pairs from raw sentences.

    For each sentence one word is selected: a random confusion-set word if the
    sentence contains any, otherwise (with probability ``mask_prob``) a random
    word. Every occurrence of the selected word is replaced by mask tokens.
    Sentences for which nothing is selected are left out of the result.

    Args:
        sentences: Iterable of sentence strings.
        confusion_set: Container supporting ``in`` for words (the notebook
            passes a dict keyed by word).
        mask_prob: Probability of masking a random word when the sentence has
            no confusion-set word.
        tokenizer: Hugging Face tokenizer; only used for sentences that
            actually get masked.

    Returns:
        ``(input_ids, labels)``: two parallel lists of tensors as returned by
        the tokenizer with ``return_tensors="pt"`` (presumably shape
        ``(1, 128)`` each -- confirm). ``labels`` holds the original token id
        at masked positions and ``-100`` everywhere else.
    """
    input_ids = []
    labels = []

    for sentence in sentences:
        tokens = tokenize(sentence)
        if not tokens:
            # Empty / whitespace-only sentence: random.choice() would raise
            # IndexError on an empty list.
            continue

        confusion_words = [word for word in tokens if word in confusion_set]

        if confusion_words:
            # Mask a confusion word
            word_to_mask = random.choice(confusion_words)
        elif random.random() < mask_prob:
            # Mask a random word with probability mask_prob
            word_to_mask = random.choice(tokens)
        else:
            continue

        # Use one mask token per sub-word piece of the selected word so the
        # masked encoding and the original encoding stay position-aligned.
        # Assumes tokenizing the word on its own yields the same pieces as
        # tokenizing it inside the sentence (true for BERT WordPiece on
        # whitespace-separated words) -- TODO confirm for this vocabulary.
        piece_count = max(1, len(tokenizer.tokenize(word_to_mask)))
        mask_span = " ".join([tokenizer.mask_token] * piece_count)
        masked_sentence = " ".join(
            mask_span if token == word_to_mask else token for token in tokens
        )

        # Tokenize masked sentence and original sentence for labels
        encoded = tokenizer(masked_sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        label_encoded = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)

        masked_positions = encoded.input_ids == tokenizer.mask_token_id
        if not masked_positions.any():
            # The masked word was truncated away; there is nothing to learn
            # from this sample and an all-ignored label row gives a NaN loss.
            continue

        # Only masked positions contribute to the loss; -100 is the index the
        # model's cross-entropy loss ignores.
        sentence_labels = label_encoded.input_ids.clone()
        sentence_labels[~masked_positions] = -100

        input_ids.append(encoded.input_ids)
        labels.append(sentence_labels)

    return input_ids, labels



In [None]:
# Turn the raw sentences into parallel lists of masked inputs and labels.
input_ids, labels = preprocess_data(
    sentences=sentences,
    confusion_set=confusion_set,
    mask_prob=mask_prob,
    tokenizer=tokenizer,
)


# Dataset Preparation

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class MLMDataset(Dataset):
    """Dataset pairing masked input ids with their MLM labels.

    Args:
        input_ids: Sequence of tensors with a leading batch dimension of 1.
        labels: Sequence of tensors parallel to ``input_ids``.

    Raises:
        ValueError: If ``input_ids`` and ``labels`` differ in length.
    """

    def __init__(self, input_ids, labels):
        if len(input_ids) != len(labels):
            raise ValueError(
                f"input_ids and labels must have the same length, "
                f"got {len(input_ids)} and {len(labels)}"
            )
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        # squeeze(0) removes only the leading batch dimension; a bare
        # squeeze() would also collapse a length-1 sequence dimension.
        return {
            "input_ids": self.input_ids[idx].squeeze(0),
            "labels": self.labels[idx].squeeze(0),
        }



In [None]:
# Wrap the preprocessed tensors and serve them in shuffled batches of 8.
dataset = MLMDataset(input_ids=input_ids, labels=labels)
dataloader = DataLoader(dataset=dataset, shuffle=True, batch_size=8)


# Training the model

In [None]:
import torch
from torch.nn import functional as F
# AdamW comes from torch: the transformers implementation is deprecated and
# no longer shipped in recent transformers releases.
from torch.optim import AdamW
from tqdm import tqdm
from transformers import BertForMaskedLM

# Load pre-trained BERT model
model = BertForMaskedLM.from_pretrained("nepali-bert")

# Move the model to its device before building the optimizer, as PyTorch
# recommends, so the optimizer references the parameters in their final place.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer. weight_decay=0.0 keeps the default of the transformers
# AdamW this replaces (torch's own default is 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.0)

# Put the model in training mode ahead of the training loop
model.train()


In [None]:

epochs = 3
pad_token_id = tokenizer.pad_token_id

for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    loop.set_description(f"Epoch {epoch}")
    total_loss = 0.0

    for batch in loop:
        # Batch-local names: rebinding the notebook-level `input_ids` /
        # `labels` here would clobber the preprocessed data held in those
        # globals and break re-running the dataset cell.
        batch_input_ids = batch["input_ids"].to(device)
        batch_labels = batch["labels"].to(device)

        # Keep the model from attending to padding positions.
        attention_mask = (batch_input_ids != pad_token_id).long()
        # Padding targets must not contribute to the loss; -100 is the index
        # the model's loss ignores.
        batch_labels = batch_labels.masked_fill(batch_labels == pad_token_id, -100)

        # Forward pass
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        batch_loss = loss.item()
        total_loss += batch_loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress bar
        loop.set_postfix(loss=batch_loss)

    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    print(f"Epoch {epoch} Loss: {total_loss / max(len(dataloader), 1)}")



In [None]:

# Write the fine-tuned model and then its tokenizer to the same directory.
output_dir = "./fine_tuned_nepali_bert"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)