# In [None]:  (notebook cell marker left by the export; commented out so the file parses as Python)
import os
import pandas as pd
import numpy as np
import nltk
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import logging
from lime.lime_text import LimeTextExplainer
import pickle
from collections import OrderedDict
from datetime import datetime

# Initialize logging: INFO and above are appended to process_log.log,
# each record prefixed with its timestamp.
logging.basicConfig(filename='process_log.log', level=logging.INFO, format='%(asctime)s - %(message)s')

# Download necessary NLTK datasets.
# word_tokenize loads the 'punkt' resource on older NLTK releases and the
# 'punkt_tab' resource on recent ones (3.9+), so both are requested; a resource
# that the installed version does not know about is reported by nltk.download
# without raising.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Shared preprocessing state used by preprocess_text:
# English stop words, a WordNet lemmatizer, and the ASCII punctuation characters.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punctuation = set(string.punctuation)

# Configuration variables
CONFIG = {
    # Input workbook read by main().
    'input_file': 'IJJD dataset.xlsx',

    # Output file names (not referenced by the code visible in this file section).
    'output_embeddings_file': 'output_embeddings.xlsx',
    'output_performance_file': 'performance_metrics.xlsx',
    'output_combined_predictions_file': 'final_predictions.xlsx',
    'lime_output_file': 'lime_explanation.pkl',

    # Train/test split and grid-search settings.
    'random_seed': 42,
    'cv': 5,
    'test_size': 0.25,

    # Class names handed to the LIME text explainer.
    'class_names': [0, 1, 2, 3],

    # Maximum token length passed to the transformer tokenizers.
    'max_len': 512,

    # Positional row index (used with .iloc) of the sample to explain with LIME,
    # looked up by embedding name and then by column key.
    'lime_instance': {
        'FastText': {
            'Abstractive_telugu_summary': 50,
            'Abstractive_kannada_summary': 120,
            'Extractive_telugu_summary': 65,
            'Extractive_kannada_summary': 75,
            'Extractive_english_summary': 80,
            'Abstractive_tamil_summary': 95,
            'Abstractive_english_summary': 85,
            'Extractive_tamil_summary': 70,
        },
        'RoBERTa': {
            'Abstractive_telugu_summary': 85,
            'Abstractive_kannada_summary': 130,
            'Extractive_telugu_summary': 75,
            'Extractive_kannada_summary': 70,
            'Extractive_english_summary': 95,
            'Abstractive_tamil_summary': 105,
            'Abstractive_english_summary': 100,
            'Extractive_tamil_summary': 80,
        },
        'InLegalBERT': {
            'Abstractive_telugu_summary': 100,
            'Abstractive_kannada_summary': 115,
            'Extractive_telugu_summary': 80,
            'Extractive_kannada_summary': 90,
            'Extractive_english_summary': 100,
            'Abstractive_tamil_summary': 110,
            'Abstractive_english_summary': 105,
            'Extractive_tamil_summary': 95,
        },
        'IndicBERT': {
            'Abstractive_telugu_summary': 90,
            'Abstractive_kannada_summary': 125,
            'Extractive_telugu_summary': 85,
            'Extractive_kannada_summary': 60,
            'Extractive_english_summary': 90,
            'Abstractive_tamil_summary': 120,
            'Abstractive_english_summary': 115,
            'Extractive_tamil_summary': 100,
        },
    },

    # Column names containing any of these substrings bypass English preprocessing.
    'skip_preprocessing': ['telugu', 'kannada', 'tamil'],
}

# Registry of level-0 classifiers, keyed by the short names that param_grids
# uses for their hyperparameter grids. Every estimator starts from its library
# defaults; SVC is the only one given an explicit argument (probability=True).
base_models = dict(
    RandomForest=RandomForestClassifier(),
    SVM=SVC(probability=True),
    DecisionTree=DecisionTreeClassifier(),
    XGB=XGBClassifier(),
    LGBM=LGBMClassifier(),
    MLP=MLPClassifier(),
    KNN=KNeighborsClassifier(),
    GaussianNB=GaussianNB(),
)

# Hyperparameter grids
# Grid-search spaces, keyed by the same names as base_models.
# An empty grid means the estimator is fitted as-is, without a search.
param_grids = {
    'RandomForest': dict(
        n_estimators=[50, 100, 150],
        max_features=["sqrt", "log2"],
        max_depth=[5, 10, 15, 20],
        min_samples_split=[2, 3, 5, 7, 10],
        min_samples_leaf=[1, 2, 3, 4, 5],
    ),
    'SVM': dict(
        C=[0.1, 1, 10, 100],
        kernel=['linear', 'rbf', 'poly'],
        gamma=['scale', 'auto', 0.1, 1, 10],
        degree=[2, 3, 4],
        coef0=[0.0, 0.1, 0.5, 1.0],
    ),
    'DecisionTree': dict(
        max_depth=[5, 10, 15, 20],
        min_samples_split=[2, 3, 5, 10],
        min_samples_leaf=[1, 2, 4, 5],
    ),
    'XGB': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        min_child_weight=[1, 3, 5],
    ),
    'LGBM': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 7],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        min_child_samples=[10, 20, 30],
    ),
    'MLP': dict(
        hidden_layer_sizes=[(50, 50), (100, 100), (50,)],
        alpha=[0.0001, 0.001, 0.01, 0.1],
        max_iter=[200, 300, 400, 500],
        activation=['logistic', 'tanh', 'relu'],
    ),
    'KNN': dict(
        n_neighbors=[3, 5, 7, 9],
        weights=['uniform', 'distance'],
        p=[1, 2],
    ),
    'GaussianNB': dict(),
}

# Function to preprocess text, skipping preprocessing for telugu,tamil and kannada languages
def preprocess_text(text, column_name):
    """Normalise English text; pass through text from configured language columns.

    Args:
        text: The raw text to clean.
        column_name: Name of the column the text came from. When it contains
            (case-insensitively) any entry of CONFIG['skip_preprocessing'],
            the text is returned untouched.

    Returns:
        The original text for skipped columns; otherwise the lower-cased text,
        tokenised, with stop words and single punctuation characters removed
        and the remaining tokens lemmatised and joined by single spaces.
    """
    for language in CONFIG['skip_preprocessing']:
        if language in column_name.lower():
            logging.info(f"Skipping preprocessing for {column_name}")
            return text

    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words or token in punctuation:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Function to generate FastText embeddings
def generate_fasttext_embeddings(df, column_name, model_path='cc.en.300.bin'):
    """Embed each row of a text column as the mean of its FastText word vectors.

    Args:
        df: DataFrame holding the text. An 'embeddings' column containing the
            per-row vectors is written to it as a side effect.
        column_name: Name of the text column to embed.
        model_path: Path to the FastText binary model to load.

    Returns:
        A 2-D numpy array with one row per DataFrame row and one column per
        FastText dimension.

    Raises:
        Exception: Any error raised while loading the model or embedding is
            logged and re-raised unchanged.
    """
    import fasttext

    logging.info(f"Processing FastText embeddings for column: {column_name}")
    try:
        fasttext_model = fasttext.load_model(model_path)
        dimension = fasttext_model.get_dimension()

        def embed(text):
            # Missing values (e.g. NaN) and blank strings have no words to
            # average; np.mean([]) would yield a NaN scalar and break vstack,
            # so such rows get an all-zero vector of the model's dimension.
            words = text.split() if isinstance(text, str) else []
            if not words:
                return np.zeros(dimension, dtype=np.float32)
            return np.mean([fasttext_model.get_word_vector(word) for word in words], axis=0)

        blank_rows = sum(1 for text in df[column_name] if not (isinstance(text, str) and text.split()))
        if blank_rows:
            logging.warning(f"{blank_rows} empty or non-text rows in {column_name} were given zero FastText vectors")

        df['embeddings'] = df[column_name].apply(embed)
        if len(df) == 0:
            # np.vstack refuses an empty sequence; return a correctly shaped empty matrix.
            return np.empty((0, dimension), dtype=np.float32)
        embeddings = np.vstack(df['embeddings'].values)
        return embeddings
    except Exception as e:
        logging.error(f"Error generating FastText embeddings for {column_name}: {str(e)}")
        raise

# Function to generate RoBERTa embeddings
def generate_roberta_embeddings(df, column_name, model_name='roberta-base'):
    """Embed each row of a text column with a mean-pooled RoBERTa encoder.

    Args:
        df: DataFrame holding the text. An 'embeddings' column containing the
            per-row vectors is written to it as a side effect.
        column_name: Name of the text column to embed.
        model_name: Hugging Face model identifier to load.

    Returns:
        A 2-D numpy array with one row per DataFrame row; each row is the mean
        of the encoder's last hidden states over the text's tokens.

    Raises:
        Exception: Any error raised while loading the model or embedding is
            logged and re-raised unchanged.
    """
    logging.info(f"Processing RoBERTa embeddings for column: {column_name}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)

        def get_roberta_embedding(text):
            # Texts are encoded one at a time, so no padding is requested:
            # padding to max_length would put pad-token states into the mean
            # below and dilute the embedding of every short text.
            inputs = tokenizer(text, return_tensors='pt', max_length=CONFIG['max_len'], truncation=True)
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).numpy().flatten()

        df['embeddings'] = df[column_name].apply(get_roberta_embedding)
        embeddings = np.vstack(df['embeddings'].values)
        return embeddings
    except Exception as e:
        logging.error(f"Error generating RoBERTa embeddings for {column_name}: {str(e)}")
        raise

# Function to generate InLegalBERT embeddings
def generate_inlegalbert_embeddings(df, column_name, model_name='nlpaueb/legal-bert-base-uncased'):
    """Embed each row of a text column with a mean-pooled legal-domain BERT encoder.

    Args:
        df: DataFrame holding the text. An 'embeddings' column containing the
            per-row vectors is written to it as a side effect.
        column_name: Name of the text column to embed.
        model_name: Hugging Face model identifier to load.
            NOTE(review): the default checkpoint is published by nlpaueb as
            LEGAL-BERT; the model published under the name InLegalBERT appears
            to be 'law-ai/InLegalBERT' - confirm which one is intended.

    Returns:
        A 2-D numpy array with one row per DataFrame row; each row is the mean
        of the encoder's last hidden states over the text's tokens.

    Raises:
        Exception: Any error raised while loading the model or embedding is
            logged and re-raised unchanged.
    """
    logging.info(f"Processing InLegalBERT embeddings for column: {column_name}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)

        def get_inlegalbert_embedding(text):
            # Truncate to the same configured length as the other encoders
            # rather than relying on the tokenizer's implicit default. A single
            # text needs no padding, so none is requested.
            inputs = tokenizer(text, return_tensors='pt', max_length=CONFIG['max_len'], truncation=True)
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).numpy().flatten()

        df['embeddings'] = df[column_name].apply(get_inlegalbert_embedding)
        embeddings = np.vstack(df['embeddings'].values)
        return embeddings
    except Exception as e:
        logging.error(f"Error generating InLegalBERT embeddings for {column_name}: {str(e)}")
        raise

# Function to generate IndicBERT embeddings
def generate_indicbert_embeddings(df, column_name, model_name='ai4bharat/indic-bert'):
    """Embed each row of a text column with a mean-pooled IndicBERT encoder.

    Args:
        df: DataFrame holding the text. An 'embeddings' column containing the
            per-row vectors is written to it as a side effect.
        column_name: Name of the text column to embed.
        model_name: Hugging Face model identifier to load.

    Returns:
        A 2-D numpy array with one row per DataFrame row; each row is the mean
        of the encoder's last hidden states over the text's tokens.

    Raises:
        Exception: Any error raised while loading the model or embedding is
            logged and re-raised unchanged.
    """
    logging.info(f"Processing IndicBERT embeddings for column: {column_name}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)

        def get_indicbert_embedding(text):
            # Texts are encoded one at a time, so no padding is requested:
            # padding to max_length would put pad-token states into the mean
            # below and dilute the embedding of every short text.
            inputs = tokenizer(text, return_tensors='pt', max_length=CONFIG['max_len'], truncation=True)
            # Inference only: skip building the autograd graph.
            with torch.no_grad():
                outputs = model(**inputs)
            return outputs.last_hidden_state.mean(dim=1).numpy().flatten()

        df['embeddings'] = df[column_name].apply(get_indicbert_embedding)
        embeddings = np.vstack(df['embeddings'].values)
        return embeddings
    except Exception as e:
        logging.error(f"Error generating IndicBERT embeddings for {column_name}: {str(e)}")
        raise
# Function to split data into train and test
def split_data(embeddings, labels, test_size=None):
    """Split features and labels into train and test partitions.

    Args:
        embeddings: Feature matrix, one row per sample.
        labels: Target values aligned with the rows of ``embeddings``.
        test_size: Optional test fraction/size forwarded to train_test_split.
            Defaults to CONFIG['test_size'] when omitted, so existing
            two-argument calls behave as before while callers that pass
            ``test_size=`` explicitly no longer fail.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``. The split is seeded
        with CONFIG['random_seed'].
    """
    if test_size is None:
        test_size = CONFIG['test_size']
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, labels, test_size=test_size, random_state=CONFIG['random_seed']
    )
    return X_train, X_test, y_train, y_test

# Function to train base models
def train_base_models(X_train, X_test, y_train, base_models, param_grids):
    """Fit every base model and collect its predictions as meta-features.

    Models with a non-empty grid in ``param_grids`` are tuned with
    GridSearchCV (CONFIG['cv'] folds, accuracy scoring); models without one
    are fitted directly on the training data.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        base_models: Mapping of model name to estimator instance.
        param_grids: Mapping of model name to its hyperparameter grid.

    Returns:
        A tuple ``(train_meta_features, test_meta_features, best_params)``:
        two arrays with one column of predicted labels per base model, and a
        mapping of every trained model name to its selected hyperparameters
        (an empty dict for models trained without a search).

    Raises:
        Exception: Any error raised while training is logged and re-raised.
    """
    train_meta_features = []
    test_meta_features = []
    best_params = {}

    for model_name, model in base_models.items():
        try:
            logging.info(f"Training base model: {model_name}")
            param_grid = param_grids.get(model_name, {})
            if param_grid:
                grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=CONFIG['cv'], scoring='accuracy', n_jobs=-1)
                grid_search.fit(X_train, y_train)
                best_model = grid_search.best_estimator_
                best_params[model_name] = grid_search.best_params_
            else:
                best_model = model.fit(X_train, y_train)
                # Record the model even though nothing was tuned: consumers
                # iterate over best_params, so omitting it here silently drops
                # grid-less models (e.g. GaussianNB) from the meta-model stage.
                best_params[model_name] = {}

            # NOTE(review): the training meta-features are predictions on the
            # same rows the model was fitted on, so they are in-sample;
            # out-of-fold predictions are the usual choice for stacking.
            train_preds = best_model.predict(X_train).reshape(-1, 1)
            test_preds = best_model.predict(X_test).reshape(-1, 1)

            train_meta_features.append(train_preds)
            test_meta_features.append(test_preds)

            logging.info(f"Completed training for {model_name}. Best parameters: {best_params.get(model_name, 'None')}")
        except Exception as e:
            logging.error(f"Error training model {model_name}: {str(e)}")
            raise

    # One column per base model, in base_models iteration order.
    train_meta_features = np.hstack(train_meta_features)
    test_meta_features = np.hstack(test_meta_features)

    return train_meta_features, test_meta_features, best_params

# Function to train meta-models using base-model predictions
def train_meta_models(X_train_meta, X_test_meta, y_train_meta, y_test_meta, best_params):
    """Fit one meta-model per entry of ``best_params`` on the meta-features.

    Args:
        X_train_meta: Meta-feature matrix used for fitting.
        X_test_meta: Unused here; kept for signature symmetry with evaluation.
        y_train_meta: Labels aligned with ``X_train_meta``.
        y_test_meta: Unused here.
        best_params: Mapping of model name to the keyword arguments used to
            construct that model. Names without a known constructor are
            skipped (but still logged as trained).

    Returns:
        Mapping of model name to the fitted meta-model.

    Raises:
        Exception: Any error raised while constructing or fitting is logged
            and re-raised.
    """
    # Name -> factory taking the tuned keyword arguments.
    factories = {
        'RandomForest': lambda kwargs: RandomForestClassifier(**kwargs),
        'SVM': lambda kwargs: SVC(**kwargs, probability=True),
        'DecisionTree': lambda kwargs: DecisionTreeClassifier(**kwargs),
        'XGB': lambda kwargs: XGBClassifier(**kwargs),
        'LGBM': lambda kwargs: LGBMClassifier(**kwargs),
        'MLP': lambda kwargs: MLPClassifier(**kwargs),
        'KNN': lambda kwargs: KNeighborsClassifier(**kwargs),
        # GaussianNB is always built with its defaults.
        'GaussianNB': lambda kwargs: GaussianNB(),
    }

    fitted = {}
    for model_name, params in best_params.items():
        try:
            logging.info(f"Training meta-model: {model_name} with best params: {params}")
            make_estimator = factories.get(model_name)
            if make_estimator is not None:
                fitted[model_name] = make_estimator(params).fit(X_train_meta, y_train_meta)

            logging.info(f"Successfully trained meta-model: {model_name}")
        except Exception as e:
            logging.error(f"Error training meta-model {model_name}: {str(e)}")
            raise

    return fitted

# Function to evaluate meta-model performance
def evaluate_meta_models(meta_models, X_train_meta, X_test_meta, y_train_meta, y_test_meta):
    """Score every meta-model on the train and test meta-features.

    Args:
        meta_models: Mapping of model name to fitted estimator.
        X_train_meta: Meta-features the models were trained on.
        X_test_meta: Held-out meta-features.
        y_train_meta: Labels for ``X_train_meta``.
        y_test_meta: Labels for ``X_test_meta``.

    Returns:
        A DataFrame with one row per model and the columns 'Model',
        'Train Accuracy', 'Train F1 Score', 'Test Accuracy', 'Test F1 Score'
        (F1 is the weighted average).

    Raises:
        Exception: Any error raised while predicting or scoring is logged and
            re-raised.
    """
    def score(estimator, features, targets):
        # Accuracy and weighted F1 of the estimator's predictions on one split.
        predicted = estimator.predict(features)
        return accuracy_score(targets, predicted), f1_score(targets, predicted, average='weighted')

    rows = []
    for model_name, meta_model in meta_models.items():
        try:
            logging.info(f"Evaluating meta-model: {model_name}")

            train_accuracy, train_f1 = score(meta_model, X_train_meta, y_train_meta)
            test_accuracy, test_f1 = score(meta_model, X_test_meta, y_test_meta)

            row = {'Model': model_name}
            row['Train Accuracy'] = train_accuracy
            row['Train F1 Score'] = train_f1
            row['Test Accuracy'] = test_accuracy
            row['Test F1 Score'] = test_f1
            rows.append(row)

            logging.info(f"Meta-model: {model_name} - Train Accuracy: {train_accuracy}, Train F1 Score: {train_f1}, "
                         f"Test Accuracy: {test_accuracy}, Test F1 Score: {test_f1}")
        except Exception as e:
            logging.error(f"Error evaluating meta-model {model_name}: {str(e)}")
            raise

    return pd.DataFrame(rows)


# Function to generate LIME explanations for a specific embedding and column
def _lime_instance_key(column_name):
    """Translate a dataset column title into the CONFIG['lime_instance'] key style.

    For example 'Abstractive Summarized Judgements (Telugu)' becomes
    'Abstractive_telugu_summary'; titles naming no known language map to
    'english'.
    """
    words = column_name.split()
    summary_kind = words[0] if words else column_name
    lowered = column_name.lower()
    language = next((lang for lang in ('telugu', 'kannada', 'tamil') if lang in lowered), 'english')
    return f"{summary_kind}_{language}_summary"


def generate_lime_explanation(embedding_name, column_name, df, model, tokenizer):
    """Produce a LIME text explanation for the configured sample of a column.

    Args:
        embedding_name: Key into CONFIG['lime_instance'] (embedding family).
        column_name: DataFrame column holding the text. It is also used to
            find the configured row position: first as an exact key, then
            via its translated '<Kind>_<language>_summary' form, because the
            configuration is keyed in that style while dataset columns carry
            full titles.
        df: DataFrame containing ``column_name``.
        model: Forwarded unchanged to predict_proba_base_models.
        tokenizer: Forwarded unchanged to predict_proba_base_models.

    Returns:
        The explanation object returned by LimeTextExplainer.explain_instance.

    Raises:
        Exception: Any error (missing configuration key, row position out of
            range, prediction failure, ...) is logged and re-raised.
    """
    try:
        configured_rows = CONFIG['lime_instance'][embedding_name]
        # Exact match keeps existing configurations working; the translated
        # key covers callers that pass the dataset's column title.
        row_key = column_name if column_name in configured_rows else _lime_instance_key(column_name)
        instance_index = configured_rows[row_key]
        text_example = df[column_name].iloc[instance_index]

        logging.info(f"Generating LIME explanation for {embedding_name} on column {column_name}, instance index: {instance_index}")

        explainer = LimeTextExplainer(class_names=CONFIG['class_names'])
        explanation = explainer.explain_instance(text_example, lambda x: predict_proba_base_models(x, model, tokenizer))

        logging.info("LIME explanation generated successfully")
        return explanation
    except Exception as e:
        logging.error(f"Error generating LIME explanation for {embedding_name}, column {column_name}: {str(e)}")
        raise

# Function to predict probabilities for LIME explanation
def predict_proba_base_models(texts, model, tokenizer):
    """Return class probabilities for a batch of texts, for use as LIME's classifier.

    Each text is embedded with ``inlegalbert_embed`` and scored by every
    estimator in the module-level ``base_models``; the per-model probability
    matrices are averaged. LIME requires its classifier function to return an
    array of shape (n_texts, n_classes) containing probabilities, which is why
    ``predict_proba`` is used rather than stacking hard label predictions.

    Args:
        texts: Iterable of text strings (LIME passes perturbed samples).
        model: Passed to ``inlegalbert_embed``; presumably the text encoder -
            verify against that helper, which is not defined in this section.
        tokenizer: Passed to ``inlegalbert_embed`` alongside ``model``.

    Returns:
        A numpy array of shape (n_texts, n_classes) with the mean predicted
        probability of each class across the base models.

    Raises:
        Exception: Any error raised while embedding or predicting is logged
            and re-raised.
    """
    try:
        embeddings = np.array([inlegalbert_embed(text, model, tokenizer) for text in texts])
        # NOTE(review): this relies on the estimators in base_models being
        # fitted; train_base_models keeps the grid-searched best estimators
        # separately, so confirm the global instances are fitted before use.
        per_model_probabilities = [base_model.predict_proba(embeddings) for base_model in base_models.values()]
        averaged_probabilities = np.mean(per_model_probabilities, axis=0)

        logging.info("Predictions for LIME generated successfully")
        return averaged_probabilities
    except Exception as e:
        logging.error(f"Error predicting probabilities for LIME: {str(e)}")
        raise


# Main function to run the pipeline
def main():
    """Run the full pipeline for every text column and embedding family.

    For each (column, embedding) pair: build embeddings, split them, train
    the base models, train and evaluate meta-models on the base-model
    predictions, write the metrics to an Excel file, and pickle a LIME
    explanation for the configured sample. Output file names carry the
    embedding name, the column name and a run timestamp.

    Any exception is logged with its traceback and not propagated.
    """
    try:
        # Load data
        df = pd.read_excel(CONFIG['input_file'])
        logging.info("Data loaded successfully.")

        # Generate timestamp shared by every output file of this run
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        columns_to_process = [
            'Abstractive Summarized Judgements', 'Extractive Summarized Judgements',
            'Abstractive Summarized Judgements (Telugu)', 'Extractive Summarized Judgements (Telugu)',
            'Abstractive Summarized Judgements (Kannada)', 'Extractive Summarized Judgements (Kannada)',
            'Abstractive Summarized Judgements (Tamil)', 'Extractive Summarized Judgements (Tamil)'
        ]

        embedding_functions = {
            'FastText': generate_fasttext_embeddings,
            'RoBERTa': generate_roberta_embeddings,
            'InLegalBERT': generate_inlegalbert_embeddings,
            'IndicBERT': generate_indicbert_embeddings
        }

        # The labels are the same for every column/embedding pair.
        labels = df['Judgement Status'].values

        # Encoder and tokenizer handed to the LIME step, loaded once instead of
        # on every iteration. The LIME helpers forward both to an embedding
        # routine as a (model, tokenizer) pair, so the encoder matching the
        # tokenizer is passed - the meta-model dictionary is keyed by
        # classifier name and has no entry for an embedding name.
        # NOTE(review): confirm this is the encoder the LIME step expects.
        lime_checkpoint = 'nlpaueb/legal-bert-base-uncased'
        lime_tokenizer = AutoTokenizer.from_pretrained(lime_checkpoint)
        lime_model = AutoModel.from_pretrained(lime_checkpoint)

        for col in columns_to_process:
            for embedding_name, embedding_func in embedding_functions.items():
                logging.info(f"Starting processing for {col} with {embedding_name} embeddings")

                # Generate embeddings
                embeddings = embedding_func(df, col)

                # Split the data
                X_train, X_test, y_train, y_test = split_data(embeddings, labels)

                # Train base models and get predictions, also save best hyperparameters
                train_meta_features, test_meta_features, best_params = train_base_models(X_train, X_test, y_train, base_models, param_grids)

                # Train meta-models using base-model predictions.
                # split_data already applies CONFIG['test_size'], so it is called
                # with features and labels only.
                # NOTE(review): test_meta_features and y_test are not used; the
                # meta-models are evaluated on a re-split of the training
                # meta-features - confirm that is the intended protocol.
                X_train_meta, X_test_meta, y_train_meta, y_test_meta = split_data(train_meta_features, y_train)
                meta_models = train_meta_models(X_train_meta, X_test_meta, y_train_meta, y_test_meta, best_params)

                # Evaluate meta-models
                performance_metrics_df = evaluate_meta_models(meta_models, X_train_meta, X_test_meta, y_train_meta, y_test_meta)
                performance_metrics_df.to_excel(f'{embedding_name}_{col}_performance_metrics_{timestamp}.xlsx', index=False)

                # Generate LIME explanation for a sample based on the embedding and column
                explanation = generate_lime_explanation(embedding_name, col, df, lime_model, lime_tokenizer)

                # Save the LIME explanation
                with open(f'{embedding_name}_{col}_lime_explanation_{timestamp}.pkl', 'wb') as file:
                    pickle.dump(explanation, file)

                logging.info(f"Completed processing for {col} with {embedding_name} embeddings")

    except Exception as e:
        # Top-level boundary: keep the run from raising, but record the
        # traceback so the failure can be diagnosed from the log.
        logging.exception(f"An error occurred in the main pipeline: {str(e)}")



