In [None]:
# Mount Google Drive at /content/drive so the CSV files referenced later in
# this notebook (all under /content/drive/MyDrive/...) can be read.
# This cell assumes it is executed inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from tqdm import tqdm
import numpy as np

# Load and preprocess data from CSV files
def load_and_preprocess_data(train_filepath, val_filepath):
    """Read the training and validation CSV files and integer-encode their tags.

    Both files must provide a ``Word`` column (the text) and a ``Tag`` column
    (the class label). Rows containing a missing value in any column are
    dropped before encoding.

    Args:
        train_filepath: Path of the training CSV.
        val_filepath: Path of the validation CSV.

    Returns:
        A 5-tuple ``(train_texts, train_labels, val_texts, val_labels,
        label_encoder)``. The encoder is fitted on the training tags only and
        then re-used, unchanged, to encode the validation tags.
    """
    label_encoder = LabelEncoder()

    def _texts_and_labels(csv_path, encode):
        # One pass per file: read, drop incomplete rows, encode the tags.
        frame = pd.read_csv(csv_path).dropna()
        frame['CategoryEncoded'] = encode(frame['Tag'])
        return frame['Word'].tolist(), frame['CategoryEncoded'].tolist()

    # Training data defines the label vocabulary (fit + transform).
    train_texts, train_labels = _texts_and_labels(train_filepath, label_encoder.fit_transform)
    # Validation data is only transformed with the already-fitted encoder.
    val_texts, val_labels = _texts_and_labels(val_filepath, label_encoder.transform)

    return train_texts, train_labels, val_texts, val_labels, label_encoder


In [None]:

# Custom dataset function
def create_dataset(texts, labels, tokenizer, max_length):
    """Build a torch ``Dataset`` that tokenizes one text per item on demand.

    Args:
        texts: Indexable collection of texts; each entry is passed through
            ``str()`` before tokenization.
        labels: Indexable collection of integer class ids aligned with
            ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors that carry a leading batch axis of size 1.
        max_length: Sequence length every item is padded/truncated to.

    Returns:
        A ``Dataset`` whose items are dicts with ``'input_ids'``,
        ``'attention_mask'`` (both 1-D, length ``max_length``) and ``'labels'``
        (0-dim ``torch.long`` tensor).
    """
    class CustomDataset(Dataset):
        """Lazily tokenized (text, label) pairs."""

        def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            text = str(self.texts[idx])
            label = self.labels[idx]
            encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
            return {
                # squeeze(0) removes only the batch axis. A bare squeeze()
                # would also collapse the sequence axis when max_length == 1.
                'input_ids': encoding['input_ids'].squeeze(0),
                'attention_mask': encoding['attention_mask'].squeeze(0),
                'labels': torch.tensor(label, dtype=torch.long)
            }
    return CustomDataset(texts, labels, tokenizer, max_length)


In [None]:
# BERT with LSTM function
def create_bert_lstm_model(bert_model, hidden_size, output_size, num_layers, bidirectional=True):
    """Build a classifier that runs an LSTM over BERT token representations.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        hidden_size: LSTM hidden size per direction.
        output_size: Number of output classes (logit dimension).
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.

    Returns:
        An ``nn.Module`` whose ``forward(input_ids, attention_mask)`` returns
        logits of shape ``(batch, output_size)``.
    """
    class BertLSTMClassifier(nn.Module):
        """BERT encoder -> LSTM -> dropout -> linear classification head."""

        def __init__(self, bert_model, hidden_size, output_size, num_layers, bidirectional=True):
            super(BertLSTMClassifier, self).__init__()
            self.bert_model = bert_model
            self.lstm = nn.LSTM(bert_model.config.hidden_size, hidden_size, num_layers, bidirectional=bidirectional, batch_first=True)
            self.dropout = nn.Dropout(0.2)
            self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, output_size)

        def forward(self, input_ids, attention_mask):
            outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
            hidden_states = outputs.last_hidden_state

            # Inputs are padded to a fixed length, so the last time step is
            # usually padding. Pack the batch with the real lengths so the LSTM
            # stops at the last real token of every sequence.
            # Assumes right-padding (ones followed by zeros in attention_mask).
            lengths = attention_mask.sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                hidden_states, lengths, batch_first=True, enforce_sorted=False
            )
            _, (final_hidden, _) = self.lstm(packed)

            # final_hidden is (num_layers * num_directions, batch, hidden).
            # The top layer sits at the end: [-2] forward and [-1] backward
            # when bidirectional, otherwise just [-1].
            if self.lstm.bidirectional:
                summary = torch.cat([final_hidden[-2], final_hidden[-1]], dim=1)
            else:
                summary = final_hidden[-1]

            summary = self.dropout(summary)
            logits = self.fc(summary)
            return logits
    return BertLSTMClassifier(bert_model, hidden_size, output_size, num_layers, bidirectional)


In [None]:
# BERT with CNN function
def create_bert_cnn_model(bert_model, num_filters, filter_sizes, output_size):
    """Build a classifier that applies parallel 1-D convolutions to BERT output.

    Args:
        bert_model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        num_filters: Number of output channels of every convolution.
        filter_sizes: Iterable of kernel widths; one convolution is created
            per width.
        output_size: Number of output classes (logit dimension).

    Returns:
        An ``nn.Module`` whose ``forward(input_ids, attention_mask)`` returns
        logits of shape ``(batch, output_size)``.
    """
    class BertTextCNNClassifier(nn.Module):
        """BERT encoder -> conv/ReLU/max-over-time branches -> dropout -> linear."""

        def __init__(self, bert_model, num_filters, filter_sizes, output_size):
            super().__init__()
            self.bert_model = bert_model
            self.num_filters = num_filters
            self.filter_sizes = filter_sizes

            embedding_dim = bert_model.config.hidden_size
            branches = []
            for width in filter_sizes:
                branches.append(
                    nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=width)
                )
            self.conv_layers = nn.ModuleList(branches)

            self.dropout = nn.Dropout(0.2)
            # Each branch contributes num_filters features after pooling.
            self.fc = nn.Linear(num_filters * len(filter_sizes), output_size)

        def forward(self, input_ids, attention_mask):
            encoded = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
            # Conv1d expects (batch, channels, length); BERT yields (batch, length, channels).
            channels_first = encoded.last_hidden_state.transpose(1, 2)

            features = []
            for conv in self.conv_layers:
                activated = nn.functional.relu(conv(channels_first))
                # Max over the sequence axis keeps the strongest response per filter.
                features.append(activated.max(dim=2).values)

            combined = torch.cat(features, dim=1)
            return self.fc(self.dropout(combined))
    return BertTextCNNClassifier(bert_model, num_filters, filter_sizes, output_size)

# Training function
def train_model(model, train_loader, criterion, optimizer, device, num_epochs=1):
    """Train ``model`` on ``train_loader`` and report running loss/accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning class logits of shape ``(batch, num_classes)``.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer already bound to the model's parameters.
        device: Device the model and every batch are moved to.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one dict per epoch: ``{'loss': mean batch loss,
        'accuracy': fraction of correctly classified training samples}``.
        An epoch without batches reports 0.0 for both values.
    """
    model.to(device)
    model.train()
    history = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        correct_train = 0
        total_train = 0
        num_batches = 0
        with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch + 1}/{num_epochs}") as t:
            # Count batches explicitly. tqdm's `n` attribute is only
            # synchronized when the bar refreshes, so it can lag behind the
            # real batch count and skew the running mean.
            for num_batches, batch in enumerate(t, start=1):
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                optimizer.zero_grad()
                logits = model(input_ids, attention_mask)
                loss = criterion(logits, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                # detach() replaces the legacy `.data` accessor.
                predicted = logits.detach().argmax(dim=1)
                total_train += labels.size(0)
                correct_train += (predicted == labels).sum().item()

                t.set_postfix({'loss': total_loss / num_batches, 'accuracy': correct_train / total_train})

        history.append({
            'loss': total_loss / num_batches if num_batches else 0.0,
            'accuracy': correct_train / total_train if total_train else 0.0,
        })
    return history

# Evaluation function with classification report
def evaluate_model(model, test_loader, device, label_encoder):
    """Evaluate ``model`` on ``test_loader`` and print classification metrics.

    Prints accuracy and weighted precision/recall/F1, followed by a per-class
    classification report covering every class known to ``label_encoder``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning class logits of shape ``(batch, num_classes)``.
        test_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model and every batch are moved to.
        label_encoder: Fitted encoder providing ``classes_`` and
            ``inverse_transform``.

    Returns:
        ``(predictions, report)``: the predicted classes mapped back to their
        original labels via ``label_encoder.inverse_transform``, and the
        classification report as a string.
    """
    # Move the model explicitly so this function also works when it is not
    # preceded by train_model (which is the only other place that moves it).
    model.to(device)
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        with tqdm(test_loader, unit="batch") as t:
            for batch in t:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)

                logits = model(input_ids, attention_mask)
                predicted = logits.argmax(dim=1)

                # The gold labels are only needed on the host, so they are
                # read straight from the batch without a device round-trip.
                y_true.extend(batch['labels'].tolist())
                y_pred.extend(predicted.cpu().tolist())

    # zero_division=0 keeps the default numeric result (0 for classes with no
    # predicted/true samples) without emitting an undefined-metric warning.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)

    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    # Pass explicit label ids so the report lists every known class, including
    # classes that never occur in this evaluation split.
    class_ids = list(range(len(label_encoder.classes_)))
    class_names = [str(name) for name in label_encoder.classes_]
    print("\nClassification Report:")
    report = classification_report(
        y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0
    )
    print(report)

    # Convert predictions to original labels
    y_pred = np.array(y_pred)
    predict1 = label_encoder.inverse_transform(y_pred)
    return predict1, report


In [None]:
# Updated datasets with correct file paths
# Directory on the mounted Drive holding every language's CSV files.
# The space before the final slash is part of the folder name.
DATA_DIR = "/content/drive/MyDrive/CURRENT HYBRID AND MBERT/all language correct format dataset /"

# Per-language train/validation file paths. File names are kept exactly as
# given; several have no ".csv" extension.
datasets = {
    "Tamil": {
        "train": DATA_DIR + "correct_tamil_dataset.csv",
        "validation": DATA_DIR + "correct_tamil_validation"
    },
    "Malayalam": {
        "train": DATA_DIR + "Final_mal_train(80_)  (1).csv",
        "validation": DATA_DIR + "Final_mal_dev(20_) (1).csv"
    },
    "Tulu": {
        "train": DATA_DIR + "correct_tulu_train_set",
        "validation": DATA_DIR + "correct_tulu_validation_set"
    },
    "Kannada": {
        "train": DATA_DIR + "correct_kannada_train",
        # The original path began with a stray double slash ("//content/...").
        "validation": DATA_DIR + "correct_kannada_validation"
    }
}



In [None]:
# Loop over datasets
# Hyperparameters shared by every language run.
batch_size = 8
max_length = 128
hidden_size = 128   # only used by the optional LSTM model
num_layers = 1      # only used by the optional LSTM model
num_filters = 100
filter_sizes = [2, 3, 4]
bert_model_name = 'bert-base-multilingual-cased'

# The device and tokenizer do not depend on the language, so build them once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# The whole per-language pipeline lives in ONE loop body (and one cell).
# Splitting it across cells leaves the later cells starting with indented code,
# which cannot run on its own.
for language, paths in datasets.items():
    print(f"Evaluating for {language}")

    # Load data
    train_texts, train_labels, val_texts, val_labels, label_encoder = load_and_preprocess_data(paths['train'], paths['validation'])

    # Take the number of classes from the fitted encoder rather than a
    # hard-coded per-language value, so the output layer always matches the
    # label ids produced for this dataset.
    num_classes = len(label_encoder.classes_)

    # Load a fresh pretrained encoder per language so fine-tuning on one
    # language does not carry over to the next. Only last_hidden_state is used
    # by the models, so the per-layer hidden states are not requested.
    bert_model = BertModel.from_pretrained(bert_model_name)

    # Create datasets and dataloaders
    train_dataset = create_dataset(train_texts, train_labels, tokenizer, max_length)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = create_dataset(val_texts, val_labels, tokenizer, max_length)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Create and train the model
    model = create_bert_cnn_model(bert_model, num_filters, filter_sizes, num_classes)
    # Alternatively, use LSTM model:
    # model = create_bert_lstm_model(bert_model, hidden_size, num_classes, num_layers, bidirectional=False)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    print("Start training")
    train_model(model, train_loader, criterion, optimizer, device, num_epochs=1)

    print(f"Evaluating model for {language}")
    predictions, classification_report_str = evaluate_model(model, val_loader, device, label_encoder)

    # Print or save the report separately for each language
    print(f"\nClassification Report for {language}:")
    print(classification_report_str)

    # Optionally, save the report to a file
    with open(f"{language}_classification_report.txt", "w", encoding="utf-8") as f:
        f.write(f"Classification Report for {language}:\n")
        f.write(classification_report_str)
