In [None]:
# @title pip install
# Placeholder cell: keep pip installs here so the notebook is easy to run on Colab.


In [None]:
# @title imports
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# @title load the models


In [None]:
# For plain majority voting, pass the same weight for every model in `stats`.
def weighted_ensemble_each_model_has_weight(models_list, clues, stats):
    """
    Predict an answer for every clue with a weighted vote over the models in models_list.

    Each model predicts an answer for the clue. The model's weight (its entry in
    ``stats``) is added to the running total of the answer it predicted, and the
    answer with the highest total is chosen. On a tie, the tied answer that was
    predicted first (by the lowest model index) wins.

    Parameters
    ----------
    models_list : iterable of models, length n_models
        Each model must expose ``predict(clue)``; element ``[0]`` of the returned
        value is taken as the model's answer and must be hashable.
        TODO confirm that the real models return their prediction in this form.

    clues : iterable of length n_samples
        Input clues to predict answers for. Any iterable is accepted
        (list, ndarray, ...).

    stats : iterable of numbers, length n_models
        Weight of each model, e.g. its F1 score. Pass the same value for every
        model to get plain majority voting.

    Returns
    -------
    responses : list of length n_samples
        Predicted answer for each clue.

    chosen_models : list of lists
        For each clue, the indices (positions in models_list) of the models that
        predicted the chosen answer.

    Raises
    ------
    ValueError
        If models_list and stats differ in length, or if there are clues to
        answer but no models to vote.
    """
    # Materialise both inputs so they can be length-checked and re-iterated for every clue.
    models = list(models_list)
    weights = list(stats)
    if len(models) != len(weights):
        # zip() would silently drop the unmatched models/weights and skew the vote.
        raise ValueError(
            f"models_list and stats must have the same length, "
            f"got {len(models)} models and {len(weights)} weights"
        )

    responses = []
    chosen_models = []

    for clue in clues:
        # Total weight collected by each distinct answer for this clue.
        prediction_scores = defaultdict(float)
        # Indices of the models that voted for each distinct answer.
        model_contributors = defaultdict(list)

        for model_idx, (model, score) in enumerate(zip(models, weights)):
            # TODO check if the prediction is at this form
            prediction = model.predict(clue)[0]

            prediction_scores[prediction] += score
            # TODO maybe add model name if in the model object
            model_contributors[prediction].append(model_idx)

        # max() keeps the first maximal key in insertion order, hence the tie rule above.
        best_prediction = max(prediction_scores, key=prediction_scores.get)
        responses.append(best_prediction)
        chosen_models.append(model_contributors[best_prediction])

    return responses, chosen_models

In [None]:
def generate_with_confidence(model, clue):
    """
    Placeholder for predicting an answer to clue with model together with a confidence.

    Not implemented yet. It raises instead of silently returning None so that a
    caller relying on the result fails with a clear message.

    Parameters
    ----------
    model : the model to predict with

    clue : the clue to predict an answer for

    Returns
    -------
    Never returns at the moment.
    NOTE(review): presumably meant to return a (word, score) pair, the shape that
    averaging_scoring_for_clue unpacks from its generating_func_with_score - confirm.

    Raises
    ------
    NotImplementedError
        Always, until the function is implemented.
    """
    # TODO: predict with the model and return the prediction along with its confidence
    raise NotImplementedError("generate_with_confidence is not implemented yet")

In [None]:
def averaging_scoring_for_clue(model_list, clue, generating_func_with_score):
    """
    Pick the answer for a single clue by the highest average score across models.

    Every model proposes one word with a score. The scores are averaged per
    distinct word, and the word with the highest average is returned. On a tie,
    the tied word that was proposed first wins.

    Parameters
    ----------
    model_list : iterable of models to query

    clue : the clue passed to every model

    generating_func_with_score : callable
        Called as ``generating_func_with_score(model, clue)``; must return a
        ``(word, score)`` pair.

    Returns
    -------
    final_word : the word with the highest average score

    contributing_llms : list of int
        Positions in model_list of the models that proposed final_word
    """
    # word -> (scores given to it, positions of the models that proposed it)
    votes = {}
    for position, llm in enumerate(model_list):
        candidate, confidence = generating_func_with_score(llm, clue)
        # TODO maybe add model name if in the model object
        scores, proposers = votes.setdefault(candidate, ([], []))
        scores.append(confidence)
        proposers.append(position)

    def mean_score(word):
        scores = votes[word][0]
        return sum(scores) / len(scores)

    # max() walks the words in first-proposed order and keeps the first best one.
    winner = max(votes, key=mean_score)
    return winner, votes[winner][1]

def averaging_scoring(model_list, clues, generating_func_with_score):
    """
    Predict an answer for every clue by delegating to averaging_scoring_for_clue.

    Parameters
    ----------
    model_list : list of models to predict with

    clues : iterable of length n_samples
        Input clues to predict answers for

    generating_func_with_score : callable
        Forwarded unchanged to averaging_scoring_for_clue together with
        model_list and each clue

    Returns
    -------
    responses : list of length n_samples
        First element of the pair averaging_scoring_for_clue returned for each clue

    chosen_models : list of lists
        Second element of that pair for each clue
    """
    # One (final_word, contributing_llms) pair per clue, in clue order.
    per_clue = [
        averaging_scoring_for_clue(model_list, clue, generating_func_with_score)
        for clue in clues
    ]
    responses = [final_word for final_word, _ in per_clue]
    chosen_models = [contributing_llms for _, contributing_llms in per_clue]
    return responses, chosen_models
