In [None]:
# Colab-only setup: mount Google Drive at /content/drive, the root that the
# dataset paths used further down in this notebook point into.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
class MultilingualOffensiveLanguageDetector:
    """Holds the data, fitted models and results for per-language offensive-text classifiers.

    All state lives in plain dictionaries that are filled in by ``load_data``
    and by the methods attached to this class in later notebook cells:

    * ``data``               -- ``{lang: {'train': df, 'val': df, 'test': df}}``
    * ``pipelines``          -- fitted estimators
    * ``predictions``        -- test-set predictions
    * ``evaluation_results`` -- metrics
    * ``models``             -- initialised empty; not used by the code in this cell
    """

    def __init__(self):
        self.models = {}
        self.pipelines = {}
        # Languages are processed in this order everywhere the class iterates.
        self.languages = ['kannada', 'malayalam', 'tulu', 'tamil']
        self.predictions = {}
        self.data = {}
        self.evaluation_results = {}

    def preprocess_text(self, text):
        """Normalise one raw comment into a lower-cased, punctuation-free string.

        Args:
            text: The raw value of a ``Text`` cell; may be missing (None/NaN).

        Returns:
            ``""`` for missing values, otherwise the cleaned text with single
            spaces between tokens and no leading/trailing whitespace.
        """
        if pd.isna(text):
            return ""
        text = str(text).lower()
        # Replace everything that is not a word character, whitespace, or a code
        # point in the Tamil (U+0B80-0BFF), Telugu (U+0C00-0C7F), Kannada
        # (U+0C80-0CFF) or Malayalam (U+0D00-0D7F) blocks.  The blocks are listed
        # explicitly so that combining vowel signs, which `\w` does not match,
        # are kept.
        text = re.sub(r'[^\w\s\u0B80-\u0BFF\u0C00-\u0C7F\u0C80-\u0CFF\u0D00-\u0D7F]', ' ', text)
        # Collapse whitespace *after* punctuation removal; doing it before left
        # runs of spaces wherever punctuation had been replaced.
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def load_data(self, file_paths):
        """Read the train/val/test CSVs of every configured language.

        Args:
            file_paths: ``{lang: {'train': path, 'val': path, 'test': path}}``.
                Languages in ``self.languages`` that have no entry are skipped.

        Side effects:
            Stores the three frames under ``self.data[lang]`` and prints the
            frame shapes plus the training label distribution.
        """
        for lang in self.languages:
            if lang not in file_paths:
                continue

            print(f"Loading {lang} data...")
            paths = file_paths[lang]
            train_df = pd.read_csv(paths['train'])
            val_df = pd.read_csv(paths['val'])
            test_df = pd.read_csv(paths['test'])

            # Some files name the target column 'Label' instead of 'Labels'
            # (originally special-cased for Tulu).  Normalise it for any
            # language so the rest of the code can rely on 'Labels'.
            train_df = self._normalise_label_column(train_df)
            val_df = self._normalise_label_column(val_df)

            self.data[lang] = {'train': train_df, 'val': val_df, 'test': test_df}
            print(f"{lang} - Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")
            if 'Labels' in train_df.columns:
                print(train_df['Labels'].value_counts())

    @staticmethod
    def _normalise_label_column(frame):
        """Return ``frame`` with a lone 'Label' column renamed to 'Labels'."""
        if 'Label' in frame.columns and 'Labels' not in frame.columns:
            return frame.rename(columns={'Label': 'Labels'})
        return frame


In [None]:
def analyze_labels(self, lang):
    """Print and return the label frequency tables for one language.

    Args:
        lang: Language key to look up in ``self.data``.

    Returns:
        ``(train_counts, val_counts)`` as produced by ``Series.value_counts()``
        on the ``Labels`` column of each split, or ``(None, None)`` when the
        language has not been loaded.
    """
    if lang not in self.data:
        return None, None

    splits = self.data[lang]
    train_counts = splits['train']['Labels'].value_counts()
    val_counts = splits['val']['Labels'].value_counts()

    heading = lang.upper()
    print(f"\n{heading} - Train Label Dist:\n{train_counts}")
    print(f"\n{heading} - Val Label Dist:\n{val_counts}")
    return train_counts, val_counts

def prepare_data(self, lang):
    """Return the cleaned texts, labels and test IDs for one loaded language.

    Every ``Text`` column is passed through ``self.preprocess_text``; label and
    ID columns are returned as stored.

    Returns:
        A 6-tuple ``(train_texts, train_labels, val_texts, val_labels,
        test_texts, test_ids)``.

    Raises:
        KeyError: if the language, a split, or one of the expected columns
            (``Text``, ``Labels``, ``ID``) is missing.
    """
    splits = self.data[lang]
    train_df, val_df, test_df = splits['train'], splits['val'], splits['test']
    clean = self.preprocess_text

    return (
        train_df['Text'].apply(clean),
        train_df['Labels'],
        val_df['Text'].apply(clean),
        val_df['Labels'],
        test_df['Text'].apply(clean),
        test_df['ID'],
    )

# Attach the two functions defined in this cell to the class as methods, so
# they are callable on any instance (including ones created before this cell ran).
MultilingualOffensiveLanguageDetector.analyze_labels = analyze_labels
MultilingualOffensiveLanguageDetector.prepare_data = prepare_data


In [None]:
def create_model_pipeline(self, model_type='logistic'):
    """Build an unfitted TF-IDF + classifier pipeline.

    Args:
        model_type: One of ``'logistic'``, ``'svm'``, ``'random_forest'`` or
            ``'naive_bayes'``.  Any other value falls back to the logistic
            regression configuration.

    Returns:
        A ``Pipeline`` with the steps ``'tfidf'`` and ``'classifier'``.
    """
    def _logistic():
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases -- confirm against the installed version.
        return LogisticRegression(random_state=42, max_iter=2000, multi_class='ovr', class_weight='balanced')

    # Factories rather than instances so only the requested classifier is built.
    factories = {
        'logistic': _logistic,
        'svm': lambda: SVC(random_state=42, probability=True, class_weight='balanced', kernel='linear'),
        'random_forest': lambda: RandomForestClassifier(
            random_state=42, n_estimators=200, class_weight='balanced', max_depth=10
        ),
        'naive_bayes': lambda: MultinomialNB(alpha=0.1),
    }
    classifier = factories.get(model_type, _logistic)()

    vectorizer = TfidfVectorizer(
        max_features=15000,
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.95,
        sublinear_tf=True,
    )
    return Pipeline([('tfidf', vectorizer), ('classifier', classifier)])

# Attach the pipeline factory to the class as a method.
MultilingualOffensiveLanguageDetector.create_model_pipeline = create_model_pipeline


In [None]:
def train_and_evaluate_model(self, lang, model_type='logistic'):
    """Fit one model for one language, score it on validation data and predict the test split.

    Args:
        lang: Language key; must already be present in ``self.data``.
        model_type: Classifier name understood by ``create_model_pipeline``.

    Side effects (all stored under the key ``f"{lang}_{model_type}"``):
        * ``self.pipelines``          -- the fitted pipeline
        * ``self.evaluation_results`` -- macro-F1, accuracy, true and predicted
          validation labels, and the text classification report
        * ``self.predictions``        -- test IDs and predicted test labels

    Returns:
        The fitted pipeline.
    """
    key = f"{lang}_{model_type}"
    train_texts, train_labels, val_texts, val_labels, test_texts, test_ids = self.prepare_data(lang)

    pipeline = self.create_model_pipeline(model_type)
    print(f"Training {model_type} for {lang}...")
    pipeline.fit(train_texts, train_labels)
    self.pipelines[key] = pipeline

    val_pred = pipeline.predict(val_texts)
    macro_f1 = f1_score(val_labels, val_pred, average='macro')
    accuracy = accuracy_score(val_labels, val_pred)

    self.evaluation_results[key] = {
        'macro_f1': macro_f1,
        'accuracy': accuracy,
        'val_true': val_labels,
        'val_pred': val_pred,
        # digits=4 matches the precision used by every other report in this notebook.
        'classification_report': classification_report(val_labels, val_pred, digits=4)
    }

    test_pred = pipeline.predict(test_texts)
    self.predictions[key] = {'ids': test_ids, 'predictions': test_pred}

    print(f"{lang} - {model_type} | F1: {macro_f1:.4f}, Acc: {accuracy:.4f}")
    return pipeline

def train_all_languages(self, model_types=('logistic', 'random_forest', 'svm', 'naive_bayes')):
    """Train every requested model type for every language that has data loaded.

    Args:
        model_types: Iterable of classifier names passed one by one to
            ``train_and_evaluate_model``.  The default is a tuple so the
            shared default object cannot be mutated between calls.
    """
    for lang in self.languages:
        if lang not in self.data:
            continue
        for model_type in model_types:
            self.train_and_evaluate_model(lang, model_type)

# Attach the training helpers defined in this cell to the class as methods.
MultilingualOffensiveLanguageDetector.train_and_evaluate_model = train_and_evaluate_model
MultilingualOffensiveLanguageDetector.train_all_languages = train_all_languages


In [None]:
def train_voting_ensemble(self, lang, model_types=('logistic', 'random_forest', 'svm', 'naive_bayes')):
    """Fit a hard-voting ensemble over the already-trained pipelines of one language.

    Args:
        lang: Language key.
        model_types: Names of the base models to combine; only those with an
            entry ``f"{lang}_{name}"`` in ``self.pipelines`` are used.

    Side effects:
        Stores validation metrics, test predictions and the fitted ensemble
        under the key ``f"{lang}_voting_ensemble"`` and prints the report.

    Returns:
        The fitted ``VotingClassifier``, or ``None`` when no base pipeline is
        available for ``lang``.
    """
    estimators = [(m, self.pipelines[f"{lang}_{m}"]) for m in model_types if f"{lang}_{m}" in self.pipelines]
    # Check availability before touching the data, so a language that was
    # never loaded/trained yields None instead of a KeyError from prepare_data.
    if not estimators:
        print(f"No models available to create voting ensemble for {lang}")
        return None

    train_texts, train_labels, val_texts, val_labels, test_texts, test_ids = self.prepare_data(lang)

    # VotingClassifier.fit trains clones of the given estimators, so every
    # base pipeline is refitted here from scratch.
    ensemble = VotingClassifier(estimators=estimators, voting='hard')
    ensemble.fit(train_texts, train_labels)

    val_pred = ensemble.predict(val_texts)
    macro_f1 = f1_score(val_labels, val_pred, average='macro')
    accuracy = accuracy_score(val_labels, val_pred)

    key = f"{lang}_voting_ensemble"
    self.evaluation_results[key] = {
        'macro_f1': macro_f1,
        'accuracy': accuracy,
        'val_true': val_labels,
        'val_pred': val_pred,
        'classification_report': classification_report(val_labels, val_pred, digits=4)
    }

    test_pred = ensemble.predict(test_texts)
    self.predictions[key] = {'ids': test_ids, 'predictions': test_pred}
    self.pipelines[key] = ensemble

    print(f"{lang} Voting Ensemble | F1: {macro_f1:.4f}, Acc: {accuracy:.4f}")
    print(self.evaluation_results[key]["classification_report"])
    return ensemble


def train_weighted_ensemble(self, lang, model_types=('logistic', 'random_forest', 'svm', 'naive_bayes')):
    """Combine stored base-model predictions by macro-F1-weighted voting.

    No model is trained here: the method reuses the validation and test
    predictions that the base models stored in ``self.evaluation_results``
    and ``self.predictions``, weighting each model's vote by its validation
    macro-F1 (normalised to sum to 1).

    Args:
        lang: Language key.
        model_types: Names of the base models to combine; models without
            stored results for ``lang`` are skipped.

    Side effects:
        Stores validation metrics and test predictions under the key
        ``f"{lang}_weighted_ensemble"`` and prints the validation report.

    Returns:
        The array of ensemble test predictions, or ``None`` when no base
        model results exist for ``lang``.
    """
    preds_val, preds_test, weights = [], [], []

    # Collect predictions and weights from available models
    for m in model_types:
        key = f"{lang}_{m}"
        if key in self.evaluation_results and key in self.predictions:
            preds_val.append(self.evaluation_results[key]['val_pred'])
            preds_test.append(self.predictions[key]['predictions'])
            weights.append(self.evaluation_results[key]['macro_f1'])

    # Bail out before touching the data so an unloaded language returns None
    # instead of raising KeyError inside prepare_data.
    if not weights:
        print(f"No models available to create weighted ensemble for {lang}")
        return None

    _, _, _, val_labels, _, test_ids = self.prepare_data(lang)

    weights = np.asarray(weights, dtype=float)
    total = weights.sum()
    if total > 0:
        weights = weights / total
    else:
        # Every model scored 0 (or the sum is NaN): dividing would give NaN
        # weights, so fall back to an unweighted vote.
        weights = np.full(len(weights), 1.0 / len(weights))
    preds_val, preds_test = np.array(preds_val), np.array(preds_test)

    def weighted_vote(preds, weights):
        """Per sample, return the label with the largest summed model weight.

        ``preds`` is indexed ``[model, sample]``.  On a tie the label that was
        seen first (i.e. from the earliest model) wins.
        """
        out = []
        for i in range(preds.shape[1]):
            votes = {}
            for j, label in enumerate(preds[:, i]):
                votes[label] = votes.get(label, 0) + weights[j]
            out.append(max(votes, key=votes.get))
        return np.array(out)

    val_final = weighted_vote(preds_val, weights)
    test_final = weighted_vote(preds_test, weights)

    macro_f1 = f1_score(val_labels, val_final, average='macro')
    accuracy = accuracy_score(val_labels, val_final)

    result_key = f"{lang}_weighted_ensemble"
    self.evaluation_results[result_key] = {
        'macro_f1': macro_f1,
        'accuracy': accuracy,
        'val_true': val_labels,
        'val_pred': val_final,
        'classification_report': classification_report(val_labels, val_final, digits=4)
    }

    self.predictions[result_key] = {'ids': test_ids, 'predictions': test_final}

    print(f"{lang} Weighted Ensemble | F1: {macro_f1:.4f}, Acc: {accuracy:.4f}")
    print("Classification Report (Validation):")
    print(self.evaluation_results[result_key]['classification_report'])
    return test_final

# Attach both ensemble builders defined in this cell to the class as methods.
MultilingualOffensiveLanguageDetector.train_weighted_ensemble = train_weighted_ensemble
MultilingualOffensiveLanguageDetector.train_voting_ensemble = train_voting_ensemble



In [None]:
def evaluate_on_test_with_labels(self, lang, model_type, test_file_paths):
    """Score one model on the labelled test file of one language.

    Args:
        lang: Language key.
        model_type: Model name; combined with ``lang`` into the lookup key
            ``f"{lang}_{model_type}"``.
        test_file_paths: ``{lang: csv_path}`` of test files that contain a
            ``Text`` column and a ``Labels`` (or ``Label``) column.

    Side effects:
        Prints macro-F1, accuracy and the classification report, and stores
        them under ``self.evaluation_results[f"{lang}_{model_type}_test"]``.

    Returns:
        The predictions that were scored, or ``None`` when there is no test
        file for ``lang`` or no fitted model / stored predictions for the key.

    Raises:
        ValueError: if stored predictions are reused and their count differs
            from the number of rows in the labelled test file.
    """
    key_pipeline = f"{lang}_{model_type}"

    # Same guard and message as error_analysis_on_test.
    if lang not in test_file_paths:
        print(f"No test file for {lang}")
        return None

    # Decide where predictions come from before doing any file I/O.
    has_pipeline = key_pipeline in self.pipelines
    if not has_pipeline and key_pipeline not in self.predictions:
        print(f"No trained model or predictions found for {lang}-{model_type}")
        return None

    # Load test data
    df_test = pd.read_csv(test_file_paths[lang])
    label_col = "Labels" if "Labels" in df_test.columns else "Label"
    test_labels = df_test[label_col]

    if has_pipeline:
        # A fitted estimator exists: predict on the labelled file's own texts.
        test_texts = df_test['Text'].apply(self.preprocess_text)
        pred = self.pipelines[key_pipeline].predict(test_texts)
    else:
        # Predictions already exist (weighted ensemble has no estimator).
        # NOTE(review): these were stored for the test split held in
        # self.data; this assumes that split has the same rows in the same
        # order as the labelled file -- confirm.
        pred = self.predictions[key_pipeline]['predictions']
        if len(pred) != len(test_labels):
            raise ValueError(
                f"{key_pipeline}: {len(pred)} stored predictions but "
                f"{len(test_labels)} labelled rows in {test_file_paths[lang]}"
            )

    # Compute metrics
    macro_f1 = f1_score(test_labels, pred, average='macro')
    acc = accuracy_score(test_labels, pred)
    cls_report = classification_report(test_labels, pred, digits=4)

    # Print
    print(f"\n{lang.upper()}-{model_type.upper()} | Test F1={macro_f1:.4f}, Acc={acc:.4f}")
    print("Classification Report (Test):")
    print(cls_report)

    # Save results
    self.evaluation_results[f"{lang}_{model_type}_test"] = {
        'macro_f1': macro_f1,
        'accuracy': acc,
        'true': test_labels,
        'pred': pred,
        'classification_report': cls_report
    }

    return pred

# Attach the labelled-test evaluation helper to the class as a method.
MultilingualOffensiveLanguageDetector.evaluate_on_test_with_labels = evaluate_on_test_with_labels


In [None]:
def error_analysis_on_test(self, lang, model_type, test_file_paths):
    """Write the misclassified test rows and a confusion-matrix image for one model.

    Args:
        lang: Language key.
        model_type: Model name; predictions are read from
            ``self.predictions[f"{lang}_{model_type}"]``.
        test_file_paths: ``{lang: csv_path}`` of labelled test files with a
            ``Labels`` (or ``Label``) column.

    Side effects:
        Creates ``error_analysis/`` if needed and writes
        ``{lang}_{model_type}_misclassified.csv`` and
        ``{lang}_{model_type}_confusion_matrix.png`` into it.

    Returns:
        ``None``.  Returns early (after printing a message) when there is no
        test file for ``lang`` or no stored predictions for the key.

    Raises:
        ValueError: if the number of stored predictions differs from the
            number of rows in the labelled test file.
    """
    if lang not in test_file_paths:
        print(f"No test file for {lang}")
        return

    # Check for predictions before reading the CSV: no point in doing file
    # I/O (or failing on a missing file) when there is nothing to analyse.
    key = f"{lang}_{model_type}"
    if key not in self.predictions:
        print(f"No predictions found for {key}")
        return

    df_test = pd.read_csv(test_file_paths[lang])
    label_col = "Labels" if "Labels" in df_test.columns else "Label"
    preds = self.predictions[key]['predictions']

    # NOTE(review): stored predictions are matched to rows by position; this
    # assumes the labelled file has the same rows in the same order as the
    # test split the predictions were made on -- confirm.
    if len(preds) != len(df_test):
        raise ValueError(
            f"{key}: {len(preds)} stored predictions but "
            f"{len(df_test)} rows in {test_file_paths[lang]}"
        )

    df_test["Predicted_Label"] = preds
    df_test["True_Label"] = df_test[label_col]

    misclassified_df = df_test[df_test["True_Label"] != df_test["Predicted_Label"]]
    os.makedirs("error_analysis", exist_ok=True)
    out_csv = f"error_analysis/{lang}_{model_type}_misclassified.csv"
    misclassified_df.to_csv(out_csv, index=False)

    # Use the union of true and predicted labels: confusion_matrix ignores
    # samples whose label is not in `labels`, so building the list from the
    # true labels alone would silently drop predictions of any class that
    # does not occur in the test file.
    label_order = sorted(set(df_test["True_Label"]) | set(df_test["Predicted_Label"]))
    cm = confusion_matrix(df_test["True_Label"], df_test["Predicted_Label"], labels=label_order)

    out_png = f"error_analysis/{lang}_{model_type}_confusion_matrix.png"
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                    xticklabels=label_order,
                    yticklabels=label_order,
                    ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("True")
        ax.set_title(f"Confusion Matrix - {lang}-{model_type}")
        fig.savefig(out_png, dpi=300, bbox_inches="tight")
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    print(f"[{lang}-{model_type}] Errors saved → {out_csv} ({len(misclassified_df)} rows)")
    print(f"[{lang}-{model_type}] Confusion Matrix saved → {out_png}")

# Attach the error-analysis helper to the class as a method.
MultilingualOffensiveLanguageDetector.error_analysis_on_test = error_analysis_on_test


In [None]:
# Initialize detector
detector = MultilingualOffensiveLanguageDetector()

# Every dataset CSV lives in this one Drive folder; change DATA_DIR to run
# the notebook against a copy of the data stored elsewhere.
DATA_DIR = '/content/drive/MyDrive/FIRE-2025/Offensive Language Identification'

# Model names shared by training, ensembling and evaluation below.
BASE_MODEL_TYPES = ['logistic', 'random_forest', 'svm', 'naive_bayes']
ENSEMBLE_TYPES = ['voting_ensemble', 'weighted_ensemble']

# Dataset paths (train / validation / unlabelled test) per language
file_paths = {
    'kannada': {
        'train': f'{DATA_DIR}/kannada_offensive_train.csv',
        'val': f'{DATA_DIR}/kannada_offensive_dev.csv',
        'test': f'{DATA_DIR}/kannada_offensive_test_without_labels.csv'
    },
    'malayalam': {
        'train': f'{DATA_DIR}/mal_full_offensive_train.csv',
        'val': f'{DATA_DIR}/mal_full_offensive_dev.csv',
        'test': f'{DATA_DIR}/mal_offensive_test_without_labels.csv'
    },
    'tulu': {
        'train': f'{DATA_DIR}/Tulu_offensive_train.csv',
        'val': f'{DATA_DIR}/Tulu_offensive_dev.csv',
        'test': f'{DATA_DIR}/Tulu_test_data_without_label.csv'
    },
    'tamil': {
        'train': f'{DATA_DIR}/tamil_offensive_full_train.csv',
        'val': f'{DATA_DIR}/tamil_offensive_full_dev.csv',
        'test': f'{DATA_DIR}/tamil_offensive_test_without_labels.csv'
    }
}

# Labelled test files, used only for the final evaluation and error analysis
test_file_paths = {
    'kannada': f'{DATA_DIR}/kannada_offensive_test_with_labels.csv',
    'malayalam': f'{DATA_DIR}/mal_offensive_test_with_labels.csv',
    'tulu': f'{DATA_DIR}/tulu_offensive_test_with_labels.csv',
    'tamil': f'{DATA_DIR}/tamil_offensive_full_test_with_labels.csv'
}

# Load data and train
detector.load_data(file_paths)
detector.train_all_languages(BASE_MODEL_TYPES)

# Ensembles
for lang in detector.languages:
    if lang in detector.data:
        detector.train_voting_ensemble(lang)
        detector.train_weighted_ensemble(lang)

# Evaluation + Error Analysis
for lang in detector.languages:
    for model in BASE_MODEL_TYPES + ENSEMBLE_TYPES:
        detector.evaluate_on_test_with_labels(lang, model, test_file_paths=test_file_paths)
        detector.error_analysis_on_test(lang, model, test_file_paths=test_file_paths)


Loading kannada data...
kannada - Train: (6217, 2), Val: (777, 2), Test: (778, 2)
Labels
Not_offensive                           3544
not-Kannada                             1522
Offensive_Targeted_Insult_Individual     487
Offensive_Targeted_Insult_Group          329
Offensive_Untargetede                    212
Offensive_Targeted_Insult_Other          123
Name: count, dtype: int64
Loading malayalam data...
malayalam - Train: (16010, 2), Val: (1999, 2), Test: (2001, 2)
Labels
Not_offensive                           14153
not-malayalam                            1287
Offensive_Targeted_Insult_Individual      239
Offensive_Untargetede                     191
Offensive_Targeted_Insult_Group           140
Name: count, dtype: int64
Loading tulu data...
tulu - Train: (2692, 2), Val: (577, 2), Test: (576, 2)
Labels
not offensive           1261
not tulu                 726
offensive untargeted     462
offensive targeted       243
Name: count, dtype: int64
Loading tamil data...
tamil - Train: (

In [None]:
# Re-train both ensembles per language.  The driver cell above already does
# this, so running this cell retrains them and overwrites the stored results.
for lang in detector.languages:
    # Skip languages whose data was not loaded (same guard as the driver
    # cell); prepare_data would otherwise raise KeyError for them.
    if lang in detector.data:
        detector.train_voting_ensemble(lang)
        detector.train_weighted_ensemble(lang)


kannada Voting Ensemble | F1: 0.4415, Acc: 0.6628
                                      precision    recall  f1-score   support

                       Not_offensive     0.7690    0.7113    0.7390       426
     Offensive_Targeted_Insult_Group     0.4000    0.3111    0.3500        45
Offensive_Targeted_Insult_Individual     0.5606    0.5606    0.5606        66
     Offensive_Targeted_Insult_Other     0.1818    0.1250    0.1481        16
               Offensive_Untargetede     0.1905    0.1212    0.1481        33
                         not-Kannada     0.6200    0.8115    0.7029       191

                            accuracy                         0.6628       777
                           macro avg     0.4537    0.4401    0.4415       777
                        weighted avg     0.6567    0.6628    0.6552       777

kannada Weighted Ensemble | F1: 0.4396, Acc: 0.6551
Classification Report (Validation):
                                      precision    recall  f1-score   support



In [None]:
# Score only the two ensemble variants on the labelled test files and write
# their error-analysis artefacts, printing a banner per language/model so the
# long output is easier to scan.
for lang in detector.languages:
    for model in ['voting_ensemble','weighted_ensemble']:
        print(f"\n===== {lang.upper()} - {model.upper()} =====")
        detector.evaluate_on_test_with_labels(lang, model, test_file_paths=test_file_paths)
        detector.error_analysis_on_test(lang, model, test_file_paths=test_file_paths)



===== KANNADA - VOTING_ENSEMBLE =====

KANNADA-VOTING_ENSEMBLE | Test F1=0.4208, Acc=0.6645
Classification Report (Test):
                                      precision    recall  f1-score   support

                       Not_offensive     0.7895    0.7026    0.7435       427
     Offensive_Targeted_Insult_Group     0.3514    0.2955    0.3210        44
Offensive_Targeted_Insult_Individual     0.6250    0.6000    0.6122        75
     Offensive_Targeted_Insult_Other     0.0000    0.0000    0.0000        14
               Offensive_Untargetede     0.1905    0.1212    0.1481        33
                         not-Kannada     0.6008    0.8378    0.6998       185

                            accuracy                         0.6645       778
                           macro avg     0.4262    0.4262    0.4208       778
                        weighted avg     0.6644    0.6645    0.6579       778

[kannada-voting_ensemble] Errors saved → error_analysis/kannada_voting_ensemble_misclassified.