# Comparing Different Embedding Sets on AUT Response Utility Prediction

This notebook serves to calculate a range of embeddings (USE, GloVe, BERT, ELMo, Word2Vec, FastText) for a dataset of responses to the Alternate Uses Task. Then, non-generative machine learning models are trained using a single set of embeddings as predictors. The performance of the models is compared to establish which set of embeddings is most useful in predicting human ratings of utility on AUT response datasets.

## Set Up

### Importing Packages

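Before running this notebook, make sure the `utils.py` file has been downloaded from [here](https://github.com/allenai/comet-atomic-2020/tree/master/models/comet_atomic2020_bart) and can be imported (for example, by placing it in the same directory as this notebook).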

In [None]:
import pandas as pd
import numpy as np
import glob
import re
import os
import json
import torch
import argparse
import requests
import zipfile
import time
import seaborn as sns
import chardet
from tqdm import tqdm
from pathlib import Path
import joblib

from matplotlib import pyplot as plt
from IPython.display import display, HTML

import tensorflow_text
import tensorflow_hub as hub
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

from scipy import spatial
from scipy.stats import pearsonr
from utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch

from sklearn.preprocessing import OrdinalEncoder, RobustScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BertTokenizer, BertModel
from allennlp.modules.elmo import Elmo, batch_to_ids
from gensim.models import KeyedVectors
import gensim.downloader as api
import fasttext
import fasttext.util

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
# Download the NLTK resources this notebook needs: the WordNet corpus, the Punkt
# tokenizer models (required by word_tokenize) and the stop-word lists.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
# English stop words as a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

### Importing Data

In [None]:
# Function for reading in Stevenson, 2020 data files and uniting rater_01 and rater_02 files
def read_files(path):
    """
    Load every csv file in a directory and join rater 1 and rater 2 ratings.

    Files whose path contains '_rater01' are stacked as the rater 1 table; every
    other csv contributes only its id columns and the two rater 2 rating columns.
    The rater 2 ratings are then left-joined onto the rater 1 table.

    :param path: string with path to the directory holding the csv files
    :return dataset: merged dataset with 'translated_response', 'response_id'
                     and 'respondent_id' converted to strings
    """
    id_columns = ['response_id', 'respondent_id']
    rater2_columns = id_columns + ['originality_rater02', 'utility_rater02']
    rater1_frames = []
    rater2_frames = []

    for csv_path in glob.glob(path + "/*.csv"):
        # Peek at the first row: a single parsed column means the file is
        # semicolon-separated rather than comma-separated.
        probe = pd.read_csv(csv_path, index_col=None, header=0, nrows=1, encoding='latin1')
        read_options = {'index_col': None, 'header': 0, 'encoding': 'latin1'}
        if len(probe.columns) == 1:
            read_options['sep'] = ';'
        frame = pd.read_csv(csv_path, **read_options)

        if '_rater01' in csv_path:
            rater1_frames.append(frame)
        else:
            rater2_frames.append(frame.loc[:, rater2_columns])

    rater1 = pd.concat(rater1_frames, axis=0, ignore_index=True)
    rater2 = pd.concat(rater2_frames, axis=0, ignore_index=True)
    merged = rater1.merge(rater2, on=id_columns, how='left')

    for column in ("translated_response", "response_id", "respondent_id"):
        merged[column] = merged[column].astype(str)

    return merged

In [None]:
# Reading in files where encoding needs to be detected (change file paths as needed)
# Each file is first read as raw bytes so chardet can guess its encoding; the
# guessed encoding is then handed to pandas for the actual parse.
with open('/data/Nath-2024-LLM.csv', 'rb') as f:
    result = chardet.detect(f.read())
df1 = pd.read_csv('/data/Nath-2024-LLM.csv', encoding=result['encoding'])

with open('/data/Hubert-2024-LLM.csv', 'rb') as f:
    result = chardet.detect(f.read())
df2 = pd.read_csv('/data/Hubert-2024-LLM.csv', encoding=result['encoding'])

In [None]:
# Loading the rest of the data files (change file paths as needed)
df3 = pd.read_csv('/data/Nath-2024-human.csv')
df4 = pd.read_csv('/data/Stevenson-2022-human.csv')
df5 = read_files('/data/Stevenson-2020-human')
df6 = pd.read_excel('/data/additional-LLM.xlsx')

# Uniting all data files in one dataframe
# NOTE: rows are stacked without ignore_index, so each source keeps its own
# index labels and the combined frame contains repeated index values.
df = pd.concat([df1, df2, df3, df4, df5, df6], axis=0)

## Cleaning Data

In [None]:
# Function for dropping invalid answers
def drop_invalid(df):
    """
    Drops all answers that were either empty, had a rating of 0 for at least one score,
    or of which the respondent number was 9999 (indicating an invalid respondent)

    The respondent check compares ids as text, so the placeholder id is caught
    whether it is stored as the integer 9999, the float 9999.0 or the string
    '9999' (ids loaded through read_files are strings). The input frame is not
    modified.

    :param df: dataset with all columns needed for further steps
    :return dataset, dropped_data:  dataset without invalid data,
                dataset of invalid data (still carrying the helper column 'valid')
    """
    rating_columns = ['utility_rater01', 'utility_rater02', 'originality_rater01', 'originality_rater02']

    # Work on a copy so the helper column does not leak into the caller's frame
    df = df.copy()

    # Dropping answers rated as 0 by at least one rater
    zero_rated = (df[rating_columns] == 0).any(axis=1).to_numpy()
    df['valid'] = np.where(zero_rated, 0, 1)
    df_invalid = df[df['valid'] == 0]
    df = df[df['valid'] != 0]

    # Dropping respondent_id that seems to belong to no one
    respondent_text = df['respondent_id'].astype(str).str.strip()
    is_placeholder = respondent_text.isin(['9999', '9999.0']).to_numpy()
    df_strange = df[is_placeholder]
    df = df[~is_placeholder]

    # Dropping empty answers (stored as the literal string 'nan')
    df_empty = df[df['original_response'] == 'nan']
    df = df[df['original_response'] != 'nan']
    df = df.drop(columns=['valid'])

    df_dropped = pd.concat([df_empty, df_strange, df_invalid], axis=0, ignore_index=True)

    return df, df_dropped

In [None]:
# Applying to dataset
df, df_dropped = drop_invalid(df)
num_dropped = len(df_dropped)

# Report how many rows were removed and how many remain
print(num_dropped)
print(len(df))

In [None]:
# Function for cleaning valid responses
def clean_response(dataset, col_response):
    """
    Normalise the text in one response column, in place.

    Each response is lowercased, every character that is neither a word
    character nor whitespace is replaced by a space, stand-alone single
    characters are removed, and runs of whitespace are collapsed to one space
    with no leading or trailing space.

    :param dataset: dataset which include column(s) of responses
    :param col_response: column name of responses to be cleaned
    :return dataset: the same dataset object with the cleaned column
    """
    def _tidy(text):
        without_signs = re.sub(r'[^\w\s]', ' ', text.lower())    # delete any signs
        without_loose = re.sub(r'\b\w\b', ' ', without_signs)    # delete loose letters
        return ' '.join(without_loose.split())                   # trim and collapse spaces

    dataset[col_response] = [_tidy(response) for response in dataset[col_response]]
    return dataset

In [None]:
# Applying to dataset: normalises the text of the 'translated_response' column
df = clean_response(df, 'translated_response')

In [None]:
# Transforming utility from 5-point scale to three categories
# Keep only rows that have both of rater 1's ratings
df = df.dropna(subset=["originality_rater01", "utility_rater01"])

# Function to categorize ratings
def categorize_rating(rating):
    """
    Map a rating on the 5-point scale to a coarse category.

    :param rating: rating value (1-5)
    :return: 'low' for 1-2, 'medium' for 3-4, 'high' for 5, and 'unknown' for
             anything else
    """
    bands = (
        ((1, 2), 'low'),
        ((3, 4), 'medium'),
        ((5,), 'high'),
    )
    for members, label in bands:
        if rating in members:
            return label
    return 'unknown'

# Categorise each rater's utility score; a missing rater-2 score stays NaN
# instead of being labelled 'unknown'.
df['category_rater01'] = df['utility_rater01'].apply(categorize_rating)
df['category_rater02'] = df['utility_rater02'].apply(lambda x: categorize_rating(x) if not pd.isna(x) else np.nan)

# Function that takes rater 1's (better rater's) scores if raters disagree on category
def determine_final_category(row):
    """
    Pick the final utility category for one response.

    Rater 1's category is used in every case: when rater 2 did not rate the
    response, when both raters agree, and, because rater 1 is treated as the
    more reliable rater, also when the two raters disagree. (The disagreement
    branch previously returned rater 2's category, contradicting that rule.)

    :param row: mapping/row with 'utility_rater02', 'category_rater01' and
                'category_rater02'
    :return: the final category label
    """
    if pd.isna(row['utility_rater02']):
        # Only rater 1 scored this response
        return row['category_rater01']
    if row['category_rater01'] == row['category_rater02']:
        # Raters agree
        return row['category_rater01']
    # Raters disagree: defer to rater 1
    return row['category_rater01']

# Resolve one category per response (row-wise)
df['final_category'] = df.apply(determine_final_category, axis=1)

# Drop intermediate categories
df = df.drop(columns=['category_rater01', 'category_rater02'])

## Universal Sentence Encoder (USE)

In [None]:
# Getting Universal Sentence Encoder embeddings
_USE_MODULE_URL = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3'
_use_module = None


def _load_use_module():
    """Load the USE module on first use and return the same instance afterwards."""
    global _use_module
    if _use_module is None:
        os.environ['TFHUB_CACHE_DIR'] = '/tf_cache'
        _use_module = hub.load(_USE_MODULE_URL)
    return _use_module


def get_embeddings(texts, batch_size=100):
    """
    Embed texts with the multilingual Universal Sentence Encoder.

    The encoder is loaded once and reused, rather than reloaded on every call.

    :param texts: sequence of strings; a single-column 2-D array is flattened
                  so each cell is embedded as one text
    :param batch_size: number of texts sent to the encoder per call
    :return: 2-D numpy array with one row per text, in input order
    """
    module = _load_use_module()

    # Flatten so both plain lists and (n, 1) arrays reach the encoder as a flat list of strings
    flat_texts = np.asarray(texts, dtype=object).ravel().tolist()

    embeddings = [
        module(flat_texts[start:start + batch_size]).numpy()
        for start in range(0, len(flat_texts), batch_size)
    ]
    return np.vstack(embeddings)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors (1 minus scipy's cosine distance)."""
    cosine_distance = spatial.distance.cosine(vec1, vec2)
    return 1 - cosine_distance

# Function for calculating semantic distance from USE embeddings
def sem_dis(dataset, batch_size=100):
    """
    Function to add word embeddings and calculate semantic distance.

    Relies on a module-level mapping `top_10s` (AUT object name -> list of
    common uses for that object) which must be defined before calling.

    :param dataset: dataset containing at least the columns 'translated_response'
                    and 'object'
    :param batch_size: number of texts embedded per encoder call
    :return dataset, emb: the input (index reset) with extra columns for the
                        embedding values (one column per value, named '0', '1', ...),
                        'sim_1'..'sim_k' holding the cosine similarity to each listed
                        use of the row's object (NaN where the object has no entry in
                        `top_10s`), and 'similarity' holding the cosine *distance*
                        between the response and its object (NaN for objects outside
                        the eight AUT objects); plus the response embeddings alone
    """
    aut_objects = ['belt', 'book', 'brick', 'can', 'fork', 'paperclip', 'stick', 'towel']

    # Responses and object names are embedded together; the object names go last
    texts = dataset['translated_response'].tolist() + aut_objects

    # Get embeddings for top 10 uses for each object
    top_use_embeddings = {obj: get_embeddings(uses) for obj, uses in top_10s.items()}

    # Get embeddings for the 'translated_response' column in the dataset
    emb = pd.DataFrame(get_embeddings(texts, batch_size=batch_size))
    emb.columns = emb.columns.astype(str)

    # Split off the embeddings of the AUT objects (the trailing rows)
    n_responses = len(emb) - len(aut_objects)
    object_vectors = {
        obj: emb.iloc[n_responses + offset, :].to_numpy()
        for offset, obj in enumerate(aut_objects)
    }
    emb = emb.iloc[:n_responses, :]
    dataset = pd.concat([dataset.reset_index(drop=True), emb], axis=1)

    response_vectors = emb.to_numpy()
    objects = dataset['object'].tolist()

    # Calculate semantic distance from AUT object for each response; unknown
    # objects get NaN so the column always lines up with the rows
    dist = [
        spatial.distance.cosine(response_vectors[i], object_vectors[obj])
        if obj in object_vectors else np.nan
        for i, obj in enumerate(objects)
    ]

    # Calculate similarity between response and top 10 uses for that object
    sim_rows = {
        i: [cosine_similarity(response_vectors[i], use_vector) for use_vector in top_use_embeddings[obj]]
        for i, obj in enumerate(objects)
        if obj in top_use_embeddings
    }
    if sim_rows:
        # Store similarity to each of the top uses in a separate column,
        # built in one block instead of cell-by-cell assignment
        width = max(len(values) for values in sim_rows.values())
        sims = np.full((len(dataset), width), np.nan)
        for i, values in sim_rows.items():
            sims[i, :len(values)] = values
        sims = pd.DataFrame(sims, columns=[f'sim_{j + 1}' for j in range(width)], index=dataset.index)
        dataset = pd.concat([dataset, sims], axis=1)

    dataset["similarity"] = dist

    return dataset, emb

In [None]:
# Adding semantic distance to df + creating embeddings
df, embeddings = sem_dis(df, batch_size=100)

# Descriptive statistics of semantic distance
# (the 'similarity' column is filled with cosine distances by sem_dis)
mean_sem_dis = df.similarity.mean().round(2)
sd_sem_dis = df.similarity.std().round(2)
print(mean_sem_dis)
print(sd_sem_dis)

# Response with the largest distance from its object
max_sem_dis = df.similarity.max().round(2)
max_sem_dis_resp = df['translated_response'][df.similarity.idxmax()]
max_sem_dis_obj = df['object'][df.similarity.idxmax()]
print(max_sem_dis)
print(max_sem_dis_resp)
print(max_sem_dis_obj)

# Response with the smallest distance from its object
min_sem_dis = df.similarity.min().round(2)
min_sem_dis_resp = df['translated_response'][df.similarity.idxmin()]
min_sem_dis_obj = df['object'][df.similarity.idxmin()]
print(min_sem_dis)
print(min_sem_dis_resp)
print(min_sem_dis_obj)

## GloVe Embeddings

In [None]:
# Downloading GloVe embeddings
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip = "glove.6B.zip"
glove_dir = "glove.6B"

if not os.path.exists(glove_zip):
    print(f"Downloading {glove_zip}...")
    # Stream to a temporary name and rename only on success, so an interrupted
    # or failed download is never mistaken for a complete archive on the next run.
    partial_zip = glove_zip + ".part"
    with requests.get(glove_url, stream=True, timeout=60) as response:
        # Fail loudly on HTTP errors instead of saving an error page as the zip
        response.raise_for_status()
        with open(partial_zip, 'wb') as f:
            # Write in chunks so the archive is never held in memory as a whole
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_zip, glove_zip)

# Extracting the embeddings
if not os.path.exists(glove_dir):
    print(f"Extracting {glove_zip}...")
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)

In [None]:
# Function for loading the GloVe embeddings
def load_glove_embeddings(glove_file):
    """
    Read a GloVe text file into a dictionary.

    Each line is split on whitespace: the first field is the word, the rest are
    its vector components.

    :param glove_file: path to a GloVe .txt file (UTF-8)
    :return: dict mapping word -> float32 numpy vector
    """
    vectors_by_word = {}
    with open(glove_file, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            vectors_by_word[fields[0]] = np.asarray(fields[1:], dtype='float32')
    print(f"Loaded {len(vectors_by_word)} word vectors.")
    return vectors_by_word

# Use the 100-dimensional vectors from the extracted GloVe archive
glove_file = "glove.6B/glove.6B.100d.txt"
embeddings_index = load_glove_embeddings(glove_file)

In [None]:
# Function for getting the GloVe embedding for each response
def get_glove_embeddings(df, embeddings_index, embedding_dim=100, column_prefix="glove_embedding_"):
    """
    Add a GloVe sentence embedding for every response.

    Each response is tokenized, reduced to lowercase alphabetic non-stop-word
    tokens, and represented by the mean of the vectors of the tokens found in
    `embeddings_index`. Responses with no known token get a zero vector.

    :param df: dataset with a 'translated_response' column
    :param embeddings_index: dict mapping word -> vector
    :param embedding_dim: length of the vectors in `embeddings_index`
    :param column_prefix: prefix of the added columns. It is model-specific so the
                          columns do not clash with those of the other embedding
                          sets added to the same dataframe.
    :return: dataset (index reset) with `embedding_dim` extra columns named
             f"{column_prefix}{i}"
    """
    stop_words = set(stopwords.words('english'))
    embeddings = []

    for text in df['translated_response'].to_numpy():
        tokens = [
            token.lower() for token in word_tokenize(text)
            if token.isalpha() and token.lower() not in stop_words
        ]
        word_embeddings = [embeddings_index[token] for token in tokens if token in embeddings_index]

        # Average the word vectors; fall back to zeros when nothing is known
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(embedding_dim))

    # Converting embeddings from list to df
    embeddings_df = pd.DataFrame(embeddings, columns=[f"{column_prefix}{i}" for i in range(embedding_dim)])

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds one column per GloVe embedding dimension to df
df = get_glove_embeddings(df, embeddings_index)

## BERT Embeddings

In [None]:
# Initializing BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The inputs are moved to `device` below, so the model has to live there too
model.to(device)
model.eval()

# Later cells rebind the global name `model`; keep a dedicated reference so the
# function below always uses the BERT model.
bert_tokenizer = tokenizer
bert_model = model

# Function for getting the BERT embedding for each response
def get_bert_embeddings(df, embedding_dim=768):
    """
    Add a BERT sentence embedding for every response.

    Each response is tokenized with NLTK, reduced to lowercase alphabetic
    non-stop-word tokens, re-joined and passed through BERT. The sentence
    embedding is the mean of the last hidden state over all token positions.

    :param df: dataset with a 'translated_response' column
    :param embedding_dim: hidden size of the BERT model
    :return: dataset (index reset) with `embedding_dim` extra columns named
             'bert_embedding_<i>'
    """
    stop_words = set(stopwords.words('english'))
    embeddings = []

    for text in df['translated_response'].tolist():
        words = [
            word.lower() for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop_words
        ]

        # Tokenizing text
        inputs = bert_tokenizer(" ".join(words), return_tensors="pt", max_length=512, truncation=True, padding=True)
        inputs = inputs.to(device)

        # Getting BERT embeddings
        with torch.no_grad():
            outputs = bert_model(**inputs)

        # Average across all token embeddings to get sentence embedding
        token_embeddings = outputs.last_hidden_state
        embeddings.append(torch.mean(token_embeddings, dim=1).squeeze().cpu().numpy())

    # Converting embeddings from list to df
    embeddings_df = pd.DataFrame(embeddings, columns=[f"bert_embedding_{i}" for i in range(embedding_dim)])

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds one column per BERT embedding dimension to df
df = get_bert_embeddings(df)

## ELMo Embeddings

In [None]:
# Importing necessary files for ELMo
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Initializing embedder
elmo = Elmo(options_file, weight_file, 1, dropout=0)

# Function for getting the ELMo embedding for each response
def get_elmo_embeddings(df, embedding_dim=1024, batch_size=32, column_prefix="elmo_embedding_"):
    """
    Add an ELMo sentence embedding for every response.

    Each response is tokenized, reduced to lowercase alphabetic non-stop-word
    tokens, and embedded on its own; the sentence embedding is the mean of the
    token representations. Responses with no remaining token get a zero vector.

    :param df: dataset with a 'translated_response' column
    :param embedding_dim: size of the ELMo representation
    :param batch_size: kept for backward compatibility; responses are embedded
                       one at a time, so this value does not affect the result
    :param column_prefix: prefix of the added columns. It is model-specific so the
                          columns do not clash with those of the other embedding
                          sets added to the same dataframe.
    :return: dataset (index reset) with `embedding_dim` extra columns named
             f"{column_prefix}{i}"
    """
    stop_words = set(stopwords.words('english'))
    embeddings = []

    for text in df['translated_response'].tolist():
        words = [
            word.lower() for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop_words
        ]

        if words:
            # Getting ELMo embeddings
            character_ids = batch_to_ids([words])
            with torch.no_grad():
                elmo_embeddings = elmo(character_ids)['elmo_representations'][0].numpy()

            # Average across all token embeddings to get sentence embedding
            sentence_embedding = np.mean(elmo_embeddings, axis=1).squeeze()
        else:
            sentence_embedding = np.zeros(embedding_dim)

        embeddings.append(sentence_embedding)

    # Converting embeddings from list to df
    embeddings_df = pd.DataFrame(embeddings, columns=[f"{column_prefix}{i}" for i in range(embedding_dim)])

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds one column per ELMo embedding dimension to df
df = get_elmo_embeddings(df)

## Word2Vec Embeddings

In [None]:
# Loading Word2Vec model
model = api.load('word2vec-google-news-300')

# Other cells rebind the global name `model`; keep a dedicated reference so the
# function below always uses the Word2Vec vectors.
word2vec_model = model

# Function for getting the Word2Vec embedding for each response
def get_word2vec_embeddings(df, embedding_dim=300, column_prefix="word2vec_embedding_"):
    """
    Add a Word2Vec sentence embedding for every response.

    Each response is tokenized, reduced to lowercase alphabetic non-stop-word
    tokens, and represented by the mean of the vectors of the tokens known to
    the Word2Vec model. Responses with no known token get a zero vector.

    :param df: dataset with a 'translated_response' column
    :param embedding_dim: length of the Word2Vec vectors
    :param column_prefix: prefix of the added columns. It is model-specific so the
                          columns do not clash with those of the other embedding
                          sets added to the same dataframe.
    :return: dataset (index reset) with `embedding_dim` extra columns named
             f"{column_prefix}{i}"
    """
    stop_words = set(stopwords.words('english'))
    embeddings = []

    for text in df['translated_response'].tolist():
        words = [
            word.lower() for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop_words
        ]

        # Getting Word2Vec embeddings
        word_embeddings = [word2vec_model[word] for word in words if word in word2vec_model]

        # Average across all token embeddings to get sentence embedding
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(embedding_dim))

    # Converting embeddings from list to df
    embeddings_df = pd.DataFrame(embeddings, columns=[f"{column_prefix}{i}" for i in range(embedding_dim)])

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds one column per Word2Vec embedding dimension to df
df = get_word2vec_embeddings(df)

## FastText Embeddings

In [None]:
# Loading FastText model
fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model('cc.en.300.bin')

# Other cells rebind the global name `model`; keep a dedicated reference so the
# function below always uses the FastText model.
fasttext_model = model

# Function for getting the FastText embedding for each response
def get_fasttext_embeddings(df, embedding_dim=300, column_prefix="fasttext_embedding_"):
    """
    Add a FastText sentence embedding for every response.

    Each response is tokenized, reduced to lowercase alphabetic non-stop-word
    tokens, and represented by the mean of the vectors of the tokens that are in
    the model's vocabulary. Responses with no such token get a zero vector.

    :param df: dataset with a 'translated_response' column
    :param embedding_dim: length of the FastText vectors
    :param column_prefix: prefix of the added columns. It is model-specific so the
                          columns do not clash with those of the other embedding
                          sets added to the same dataframe.
    :return: dataset (index reset) with `embedding_dim` extra columns named
             f"{column_prefix}{i}"
    """
    stop_words = set(stopwords.words('english'))
    # Build the vocabulary set once: testing membership against the raw word
    # list would scan the whole vocabulary for every token.
    vocabulary = set(fasttext_model.words)
    embeddings = []

    for text in df['translated_response'].tolist():
        words = [
            word.lower() for word in word_tokenize(text)
            if word.isalpha() and word.lower() not in stop_words
        ]

        # Getting FastText embeddings
        word_embeddings = [fasttext_model.get_word_vector(word) for word in words if word in vocabulary]

        # Average across all token embeddings to get sentence embedding
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(embedding_dim))

    # Converting embeddings from list to df
    embeddings_df = pd.DataFrame(embeddings, columns=[f"{column_prefix}{i}" for i in range(embedding_dim)])

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds one column per FastText embedding dimension to df
df = get_fasttext_embeddings(df)

## Modelling

In [None]:
# Creating training + validation and test sets
# train_test_split is not among the notebook's top-level imports, so import it here
from sklearn.model_selection import train_test_split

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate out the 'towel' responses (which is the hold-out AUT object)
towel_responses = df[df['object'] == 'towel']
non_towel_responses = df[df['object'] != 'towel']

# Perform stratified sampling on the non-towel responses
train_non_towel, test_non_towel = train_test_split(
    non_towel_responses,
    test_size=0.1,
    stratify=non_towel_responses['final_category'],
    random_state=42
)

# Form the final test set ('towel' responses + stratified sample of remaining)
test_set = pd.concat([towel_responses, test_non_towel])

# Form the final training + validation set: exactly the non-towel rows that did
# not go to the test set. (These rows were previously concatenated with
# themselves, so every training row appeared twice until de-duplication.)
train_set = train_non_towel

# Shuffle final training and test sets
train_set = train_set.sample(frac=1, random_state=42).reset_index(drop=True)
test_set = test_set.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop duplicates based on 'translated_response', keeping the first occurrence
train_set = train_set.groupby('object', group_keys=False).apply(lambda x: x.drop_duplicates(subset='translated_response', keep='first'))

# Reset the index
train_set = train_set.reset_index(drop=True)

In [None]:
# Set up for training Naive Bayes, Logistic Regression, k-nearest Neighbors and LightGBM models

def get_feature_set(dataset, after_col='final_category', last_col='511'):
    """
    Select the block of predictor columns by position.

    The defaults pick every column after 'final_category' up to and including
    '511'. Pass other column names to train on a different embedding set.

    :param dataset: dataframe containing both anchor columns
    :param after_col: column immediately before the first predictor column
    :param last_col: last predictor column (inclusive)
    :return: dataframe with only the predictor columns
    """
    # Adjust based on which embedding set you want to train models on
    i1 = dataset.columns.get_loc(after_col) + 1
    i2 = dataset.columns.get_loc(last_col) + 1
    # The selected frame must be returned: callers read `.columns` from it
    return dataset.iloc[:, i1:i2]

# Function for calculating AIC and BIC for model comparison
def calculate_aic_bic(log_likelihood, n_params, n_samples):
    """
    Compute the Akaike and Bayesian information criteria.

    :param log_likelihood: log-likelihood of the model
    :param n_params: number of parameters
    :param n_samples: number of samples (used by BIC only)
    :return aic, bic: 2k - 2lnL and ln(n)k - 2lnL
    """
    fit_term = -2 * log_likelihood
    aic_value = 2 * n_params + fit_term
    bic_value = np.log(n_samples) * n_params + fit_term
    return aic_value, bic_value

def _score_predictions(y_true, y_pred, y_prob, class_labels, n_params):
    """Return accuracy, weighted precision/recall/F1 and AIC/BIC for one set of predictions."""
    # AIC/BIC are defined on the total log-likelihood, so ask log_loss for the
    # summed loss rather than its per-sample mean
    log_likelihood = -log_loss(y_true, y_prob, labels=class_labels, normalize=False)
    aic, bic = calculate_aic_bic(log_likelihood, n_params, len(y_true))
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted', zero_division=0),
        'f1': f1_score(y_true, y_pred, average='weighted', zero_division=0),
        'aic': aic,
        'bic': bic,
    }


def train_models(train_set, test_set, print_results=False):
    """
    Train and evaluate Logistic Regression, Naive Bayes, KNN, LightGBM and a
    majority-class baseline with 5-fold stratified cross-validation.

    For every model and fold, the model is fitted on the training part of the
    fold and scored on both the validation part and the fixed test set. Both
    input frames get a 'final_category_encoded' column added in place.

    NOTE(review): n_params is the number of entries in the estimator's
    get_params(), i.e. hyperparameters rather than fitted parameters, so the
    AIC/BIC values are only a rough basis for comparison.

    :param train_set: training + validation data with 'final_category' and the
                      predictor columns selected by get_feature_set
    :param test_set: test data with the same columns
    :param print_results: if True, print each model's mean metrics across folds
    :return df_predictions, trained_models: one row per model, fold and set
                ('test'/'validation') with predictions and metrics; and a dict
                {'utility': {model name: estimator as fitted on the last fold}}
    """
    SEED = 1
    metric_names = ('accuracy', 'precision', 'recall', 'f1', 'aic', 'bic')

    # Transforming categorical labels into numbers for model
    le = LabelEncoder()
    train_set['final_category_encoded'] = le.fit_transform(train_set['final_category'])
    test_set['final_category_encoded'] = le.transform(test_set['final_category'])
    n_classes = len(le.classes_)
    class_labels = np.arange(n_classes)

    # Getting feature set and outcome variable
    feature_columns = get_feature_set(train_set).columns
    targetVar = 'final_category_encoded'
    X_trainval = train_set[feature_columns]
    y_trainval = train_set[targetVar]
    X_test = test_set[feature_columns]
    y_test = test_set[targetVar]

    # Setting up 5-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Setting up hyperparameters for each model (adjust as needed)
    params_log_reg = {
        'C': 1.0,
        'penalty': 'l2',
        'solver': 'liblinear',
        'random_state': SEED
    }

    params_knn = {
        'n_neighbors': 5,
        'weights': 'uniform',
        'algorithm': 'auto'
    }

    params_lgb = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'learning_rate': 0.1,
        'n_estimators': 100,
        'random_state': SEED
    }

    models = [
        ('Logistic Regression', LogisticRegression(**params_log_reg)),
        ('Naive Bayes', GaussianNB()),
        ('KNN', KNeighborsClassifier(**params_knn)),
        ('LightGBM', lgb.LGBMClassifier(**params_lgb)),
        ('Base Model', 'mode'),
    ]

    # Set up saving for trained models; two rows (test + validation) per model and fold
    trained_models = {"utility": dict()}
    df_predictions = pd.DataFrame(
        columns=['measure', 'model', 'set', 'actual', 'predictions', 'accuracy', 'precision', 'recall', 'f1_score', 'aic', 'bic'],
        index=range(2 * skf.get_n_splits() * len(models)),
    )
    row = 0

    # Training and validating each model using 5-fold cross-validation
    for ml_name, ml in models:
        is_baseline = ml_name == 'Base Model'
        n_params = 0 if is_baseline else len(ml.get_params())

        # Fold scores are collected per model so the means below never mix models
        fold_scores = {'test': [], 'validation': []}

        for train_index, val_index in skf.split(X_trainval, y_trainval):
            X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[val_index]
            y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[val_index]

            if is_baseline:
                # Always predict the most frequent class of this fold's training part
                mode_val = y_train.mode()[0]
                y_pred_val = np.repeat(mode_val, len(X_val))
                y_pred_test = np.repeat(mode_val, len(X_test))
                y_pred_prob_val = np.ones((len(X_val), n_classes)) / n_classes
                y_pred_prob_test = np.ones((len(X_test), n_classes)) / n_classes
            else:
                ml.fit(X_train, y_train)
                y_pred_val = ml.predict(X_val)
                y_pred_test = ml.predict(X_test)
                y_pred_prob_val = ml.predict_proba(X_val)
                y_pred_prob_test = ml.predict_proba(X_test)

            test_scores = _score_predictions(y_test, y_pred_test, y_pred_prob_test, class_labels, n_params)
            val_scores = _score_predictions(y_val, y_pred_val, y_pred_prob_val, class_labels, n_params)
            fold_scores['test'].append(test_scores)
            fold_scores['validation'].append(val_scores)

            # Adding results to df_predictions (test row first, then validation row)
            for set_name, actual, predicted, scores in (
                ('test', y_test, y_pred_test, test_scores),
                ('validation', y_val, y_pred_val, val_scores),
            ):
                df_predictions.loc[row] = np.array(
                    ['utility', ml_name, set_name, actual, predicted,
                     scores['accuracy'], scores['precision'], scores['recall'], scores['f1'],
                     scores['aic'], scores['bic']],
                    dtype=object,
                )
                row += 1

        trained_models['utility'].update({ml_name: ml})

        # Printing mean metrics across this model's folds
        if print_results:
            print(y_pred_test)
            print("utility \n")
            for set_label, set_name in (('test set', 'test'), ('validation set', 'validation')):
                means = {
                    metric: np.mean([scores[metric] for scores in fold_scores[set_name]])
                    for metric in metric_names
                }
                print("{:s} \n Accuracy: {:.3f}, \n Precision: {:.3f}, \n Recall: {:.3f} \n F1: {:.3f}, AIC: {:.3f}, BIC: {:.3f} \n".format(
                    "{:s} ({:s})".format(ml_name, set_label),
                    means['accuracy'], means['precision'], means['recall'],
                    means['f1'], means['aic'], means['bic']))
                print("\n")

    return df_predictions, trained_models

In [None]:
# Set up for training random forest classifier and XGBoost models

def get_feature_set(dataset, after_col='final_category', last_col='511'):
    """
    Select the block of predictor columns by position.

    The defaults pick every column after 'final_category' up to and including
    '511'. Pass other column names to train on a different embedding set.

    :param dataset: dataframe containing both anchor columns
    :param after_col: column immediately before the first predictor column
    :param last_col: last predictor column (inclusive)
    :return: dataframe with only the predictor columns
    """
    # Adjust based on which embedding set you want to train models on
    i1 = dataset.columns.get_loc(after_col) + 1
    i2 = dataset.columns.get_loc(last_col) + 1
    return dataset.iloc[:, i1:i2]

# Function for calculating AIC and BIC for model comparison
def calculate_aic_bic(log_likelihood, n_params, n_samples):
    """
    Compute the Akaike and Bayesian information criteria.

    :param log_likelihood: log-likelihood of the model
    :param n_params: number of parameters
    :param n_samples: number of samples (used by BIC only)
    :return aic, bic: 2k - 2lnL and ln(n)k - 2lnL
    """
    fit_term = -2 * log_likelihood
    penalties = (2 * n_params, np.log(n_samples) * n_params)
    aic_value, bic_value = (penalty + fit_term for penalty in penalties)
    return aic_value, bic_value

def _evaluate_split(y_true, y_pred, y_prob, class_labels, n_params):
    """Score one set of predictions and return a dict of metrics.

    The dict has the keys 'accuracy', 'precision', 'recall', 'f1_score',
    'aic' and 'bic'. Precision, recall and F1 are weighted averages.
    """
    # normalize=False yields the summed loss, so its negation is the total
    # log-likelihood that the AIC/BIC formulas are defined on.
    log_likelihood = -log_loss(y_true, y_prob, labels=class_labels, normalize=False)
    aic, bic = calculate_aic_bic(log_likelihood, n_params, len(y_true))
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1_score': f1_score(y_true, y_pred, average='weighted'),
        'aic': aic,
        'bic': bic,
    }


def train_models(train_set, test_set, print_results=False):
    """Cross-validate a random forest, an XGBoost model and a mode baseline.

    Each model is fit on every fold of a stratified 5-fold split of
    ``train_set`` and scored on both the fold's validation part and the
    full ``test_set``.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Must contain a 'final_category' column and the feature columns
        selected by ``get_feature_set``. Both frames are modified in place:
        a 'final_category_encoded' column is added.
    print_results : bool, default False
        If true, print the last fold's test predictions and the per-model
        mean metrics across folds.

    Returns
    -------
    (df_predictions, trained_models)
        ``df_predictions`` has one row per model, fold and split
        ('test' / 'validation') with the true labels, predictions and metrics.
        ``trained_models`` is ``{'utility': {model_name: estimator}}`` holding
        the estimator as left after the last fold (the string 'mode' for the
        baseline).
    """
    SEED = 1
    N_SPLITS = 5
    metric_names = ['accuracy', 'precision', 'recall', 'f1_score', 'aic', 'bic']

    # Transforming categorical labels into numbers for model
    le = LabelEncoder()
    train_set['final_category_encoded'] = le.fit_transform(train_set['final_category'])
    test_set['final_category_encoded'] = le.transform(test_set['final_category'])
    n_classes = len(le.classes_)
    class_labels = np.arange(n_classes)

    # Getting feature set and outcome variable
    feature_columns = get_feature_set(train_set).columns
    targetVar = 'final_category_encoded'
    X_trainval = train_set[feature_columns]
    y_trainval = train_set[targetVar]
    X_test = test_set[feature_columns]
    y_test = test_set[targetVar]

    # Setting up stratified k-fold cross-validation
    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

    # Setting up hyperparameters for each model (adjust as needed)
    params_rf = {
        'max_depth': 20,
        'min_samples_leaf': 3,
        'min_samples_split': 8,
        'n_estimators': 200,
        'class_weight': 'balanced'
    }

    # NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`,
    # so the two entries below may conflict - confirm which value is intended.
    # NOTE(review): `scale_pos_weight` is documented for binary class balance
    # and receives an array here - confirm it has the intended effect.
    params_xgb = {
        'learning_rate': 0.2,
        'max_depth': 10,
        'n_estimators': 200,
        'subsample': 0.6,
        'eta': 0.1,
        'objective': 'multi:softmax',  # for classification
        'num_class': 3,
        'scale_pos_weight': len(y_trainval) / (2 * np.bincount(y_trainval))
    }

    rf = RandomForestClassifier(random_state=SEED)
    rf.set_params(**params_rf)
    # NOTE(review): `xgb` must already be bound to the xgboost package; no
    # import for it is visible in this part of the notebook.
    xgb_model = xgb.XGBClassifier(**params_xgb)

    models = [('Random Forest', rf), ('XGBoost', xgb_model), ('Base Model', 'mode')]

    # Parameter count used as the AIC/BIC penalty: the number of configured
    # hyperparameters per model; the mode baseline has none.
    n_params_by_model = {'Random Forest': len(params_rf), 'XGBoost': len(params_xgb)}

    # Set up saving for trained models; two rows (test + validation) per fold
    trained_models = {"utility": dict()}
    df_predictions = pd.DataFrame(
        columns=['measure', 'model', 'set', 'actual', 'predictions'] + metric_names,
        index=range(2 * N_SPLITS * len(models)),
    )

    row = 0
    summary_template = ("{:s} \n Accuracy: {:.3f}, \n Precision: {:.3f}, \n Recall: {:.3f} \n "
                        "F1: {:.3f}, AIC: {:.3f}, BIC: {:.3f} \n")

    # Training and validating each model using stratified cross-validation
    for ml_name, ml in models:
        # Fresh accumulators per model so the means never mix models
        fold_scores = {split: {name: [] for name in metric_names}
                       for split in ('test', 'validation')}
        n_params = n_params_by_model.get(ml_name, 0)

        for train_index, val_index in skf.split(X_trainval, y_trainval):
            X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[val_index]
            y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[val_index]

            if ml_name == 'Base Model':
                # Baseline: always predict the most frequent class of the
                # fold's training labels, with uniform class probabilities.
                mode_val = y_train.mode()[0]
                y_pred_val = np.repeat(mode_val, len(X_val))
                y_pred_test = np.repeat(mode_val, len(X_test))
                y_pred_prob_val = np.ones((len(X_val), n_classes)) / n_classes
                y_pred_prob_test = np.ones((len(X_test), n_classes)) / n_classes
            else:
                ml.fit(X_train, y_train)
                y_pred_val = ml.predict(X_val)
                y_pred_test = ml.predict(X_test)
                y_pred_prob_val = ml.predict_proba(X_val)
                y_pred_prob_test = ml.predict_proba(X_test)

            trained_models['utility'][ml_name] = ml

            splits = (
                ('test', y_test, y_pred_test, y_pred_prob_test),
                ('validation', y_val, y_pred_val, y_pred_prob_val),
            )
            for split, y_true, y_pred, y_prob in splits:
                scores = _evaluate_split(y_true, y_pred, y_prob, class_labels, n_params)
                for name in metric_names:
                    fold_scores[split][name].append(scores[name])

                # Adding results to df_predictions; dtype=object keeps the
                # label/prediction sequences as single cells.
                df_predictions.loc[row] = np.array(
                    ['utility', ml_name, split, y_true, y_pred]
                    + [scores[name] for name in metric_names],
                    dtype=object,
                )
                row += 1

        # Printing mean metrics across folds for this model
        if print_results:
            print(y_pred_test)
            print("utility \n")
            for split, label in (('test', 'test set'), ('validation', 'validation set')):
                means = [np.mean(fold_scores[split][name]) for name in metric_names]
                print(summary_template.format("{} - {}".format(ml_name, label), *means))
                print("\n")

    return df_predictions, trained_models

In [None]:
# Train models
# train_models returns (df_predictions, trained_models), in that order
predictions, models = train_models(train_set, test_set, print_results=True)