# Training Non-Generative Machine Learning Models to Predict Response Utility Ratings on the Alternate Uses Task

The purpose of this notebook is to take data from a number of previous papers on the Alternate Uses Task (namely [Stevenson, 2020](http://modelingcreativity.org/blog/wp-content/uploads/2020/07/ABBAS_report_200711_final.pdf); [Stevenson, 2022](https://arxiv.org/pdf/2206.08932); [Nath, 2024](https://arxiv.org/pdf/2405.00899) and [Hubert, 2024](https://www.nature.com/articles/s41598-024-53303-w.pdf)), create several features that might be predictive of response utility and then train a range of non-generative machine learning models on this task to establish the best predictor-model combination for predicting human rater utility scores.

## Set Up

### Importing Packages

Before running this notebook, make sure the `utils.py` file has been downloaded from [here](https://github.com/allenai/comet-atomic-2020/tree/master/models/comet_atomic2020_bart) and can be imported by the notebook (the imports below use `from utils import ...`).

In [None]:
import pandas as pd
import numpy as np
import glob
import re
import os
import json
import torch
import zipfile
import argparse
import requests
import time
import seaborn as sns
import chardet
from tqdm import tqdm
from pathlib import Path
import joblib

from matplotlib import pyplot as plt
from IPython.display import display, HTML

import tensorflow_text
import tensorflow_hub as hub
import nltk
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from fuzzywuzzy import fuzz

from scipy import spatial
from scipy.stats import pearsonr
from utils import calculate_rouge, use_task_specific_params, calculate_bleu_score, trim_batch

from sklearn.preprocessing import OrdinalEncoder, RobustScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

ModuleNotFoundError: No module named 'tensorflow_text'

In [None]:
# Fetch the NLTK resources needed by the tokenizer, the WordNet lookups and the
# stopword filter used further down, then cache the English stopword set.
for nltk_resource in ('wordnet', 'punkt', 'stopwords'):
    nltk.download(nltk_resource)
stop_words = set(stopwords.words('english'))

### Importing Data

In [None]:
# Function for reading in Stevenson, 2020 data files and uniting rater_01 and rater_02 files
def read_files(path):
    """
    Read the csv files from ./data/Stevenson-2020-human and merge both raters' scores.

    Files whose *file name* contains '_rater01' are kept with all their columns;
    from every other csv only the response/respondent ids and rater 2's two
    scores are kept. Rater 2's scores are left-joined onto rater 1's rows.

    :param path: string with path to the directory holding the csv files
    :return dataset: merged dataset with 'translated_response', 'response_id'
                     and 'respondent_id' cast to str
    :raises ValueError: if no rater 1 file or no rater 2 file is found in path
    """
    rater2_columns = ['response_id', 'respondent_id', 'originality_rater02', 'utility_rater02']
    rater1_frames = []
    rater2_frames = []

    # sorted(): glob returns files in filesystem order, which would make the row
    # order of the result (and anything seeded on it later) machine-dependent.
    for filename in sorted(glob.glob(os.path.join(path, "*.csv"))):
        # Sniff the delimiter: a ';'-separated file parses as a single column
        # when read with the default ',' separator.
        probe = pd.read_csv(filename, index_col=None, header=0, nrows=1, encoding='latin1')
        separator = ';' if len(probe.columns) == 1 else ','
        df = pd.read_csv(filename, index_col=None, header=0, encoding='latin1', sep=separator)

        # Match on the file name only, so a parent directory that happens to
        # contain '_rater01' does not turn every file into a rater 1 file.
        if '_rater01' in os.path.basename(filename):
            rater1_frames.append(df)
        else:
            rater2_frames.append(df.loc[:, rater2_columns])

    if not rater1_frames or not rater2_frames:
        raise ValueError(
            f"Expected both '_rater01' and rater 2 csv files in {path!r}; "
            f"found {len(rater1_frames)} rater 1 and {len(rater2_frames)} rater 2 file(s)."
        )

    frame_r1 = pd.concat(rater1_frames, axis=0, ignore_index=True)
    frame_r2 = pd.concat(rater2_frames, axis=0, ignore_index=True)

    df = frame_r1.merge(frame_r2, on=['response_id', 'respondent_id'], how='left')

    df["translated_response"] = df["translated_response"].astype(str)
    df["response_id"] = df["response_id"].astype(str)
    df["respondent_id"] = df["respondent_id"].astype(str)

    return df

In [None]:
# Reading in files where encoding needs to be detected (change file paths as needed).
# Each file is first read completely in binary mode so chardet can guess its
# encoding; that guess is then passed to pandas for the actual parse.
with open('/data/Nath-2024-LLM.csv', 'rb') as f:
    result = chardet.detect(f.read())
df1 = pd.read_csv('/data/Nath-2024-LLM.csv', encoding=result['encoding'])

# Same two-step read for the second file (`result` is overwritten here).
with open('/data/Hubert-2024-LLM.csv', 'rb') as f:
    result = chardet.detect(f.read())
df2 = pd.read_csv('/data/Hubert-2024-LLM.csv', encoding=result['encoding'])

In [None]:
# Loading the rest of the data files (change file paths as needed)
df3 = pd.read_csv('/data/Nath-2024-human.csv')
df4 = pd.read_csv('/data/Stevenson-2022-LLM.csv')
df5 = read_files('/data/Stevenson-2020-human')
df6 = pd.read_excel('/data/additional-LLM.xlsx')

# Uniting all data files in one dataframe.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source contributes its own 0..k labels, and the duplicated labels make
# later label-based writes (e.g. `.loc[labels, col] = value` in `frequency`)
# land on unrelated rows as well.
df = pd.concat([df1, df2, df3, df4, df5, df6], axis=0, ignore_index=True)

In [None]:
# Crowdsourcing adjectives from invalid answers to AUT
df['translated_response'] = df['translated_response'].astype(str)


def is_adjective_and_long_enough(word):
    """Return True when `word` has 3+ characters and WordNet lists an adjective sense for it."""
    if len(word) < 3:
        return False
    return bool(wn.synsets(word, pos=wn.ADJ))


# Rows where at least one of the four rating columns is exactly 0 ...
rating_columns = ['utility_rater01', 'utility_rater02', 'originality_rater01', 'originality_rater02']
has_zero_rating = (df[rating_columns] == 0).any(axis=1)

# ... and whose response is a single word that passes the adjective check
is_single_adjective = df['translated_response'].str.split().apply(
    lambda tokens: len(tokens) == 1 and is_adjective_and_long_enough(tokens[0])
)

filtered_df = df[has_zero_rating & is_single_adjective]

# Per object, keep the 4 most frequent of those adjectives
properties = filtered_df.groupby('object')['translated_response'].apply(
    lambda responses: responses.value_counts().index.tolist()[:4]
).to_dict()

## Cleaning Data

In [None]:
# Function for dropping invalid answers
def drop_invalid(df):
    """
    Drops all answers that were either empty, had a rating of 0 for at least one score,
    or of which the respondent number was 9999 (indicating an invalid respondent).

    The input frame is not modified. The filters are applied in sequence
    (zero rating, then respondent 9999, then empty answer), so each dropped row
    is reported exactly once.

    :param df: dataset with the four rating columns, 'respondent_id' and
               'original_response'
    :return dataset, dropped_data:  dataset without invalid data,
                dataset of invalid data (carries a helper 'valid' column that
                is 0 for zero-rated rows and 1 for the other dropped rows)
    """
    rating_cols = ['utility_rater01', 'utility_rater02', 'originality_rater01', 'originality_rater02']

    # Work on a copy so the caller's frame does not silently gain a 'valid' column
    df = df.copy()

    # Dropping answers rated as 0 by at least one rater
    zero_rated = (df[rating_cols] == 0).any(axis=1).to_numpy()
    df['valid'] = (~zero_rated).astype(int)
    df_invalid = df[zero_rated]
    df = df[~zero_rated]

    # Dropping respondent_id that seems to belong to no one.
    # Ids are compared as text because `read_files` casts respondent_id to str,
    # so an integer comparison would never match those rows; a trailing '.0'
    # (id stored as float) is tolerated.
    respondent = df['respondent_id'].astype(str).str.strip().str.replace(r'\.0$', '', regex=True)
    is_strange = (respondent == '9999').to_numpy()
    df_strange = df[is_strange]
    df = df[~is_strange]

    # Dropping empty answers: real missing values, blank strings, and the
    # literal text 'nan' left behind by an earlier astype(str)
    responses = df['original_response']
    is_empty = (
        responses.isna()
        | responses.astype(str).str.strip().str.lower().isin(['', 'nan'])
    ).to_numpy()
    df_empty = df[is_empty]
    df = df[~is_empty].drop(columns=['valid'])

    df_dropped = pd.concat([df_empty, df_strange, df_invalid], axis=0, ignore_index=True)

    return df, df_dropped

In [None]:
# Applying to dataset: `df` keeps only the valid rows, `df_dropped` collects the rest
df, df_dropped = drop_invalid(df)
num_dropped = len(df_dropped)

# Report how many rows were dropped and how many remain
print(num_dropped)
print(len(df))

In [None]:
# Function for cleaning valid responses
def clean_response(dataset, col_response):
    """
    Function cleans the responses

    Each value is lower-cased, every character that is neither a word character
    nor whitespace is replaced by a space, stand-alone single characters are
    removed, and runs of whitespace are collapsed to one space. Missing values
    (NaN/None) become the empty string instead of raising; other non-string
    values are converted with str() first.

    :param dataset: dataset which include column(s) of responses
    :param col_response: column name of responses to be cleaned
    :return dataset: input dataset (modified in place) with the column cleaned
    """
    def _clean(value):
        if pd.isna(value):
            return ''
        text = str(value).lower()
        text = re.sub(r'[^\w\s]', ' ', text)  # delete any signs
        text = re.sub(r'\b\w\b', ' ', text)  # delete loose letters
        # split/join trims the ends and collapses inner whitespace in one go
        return ' '.join(text.split())

    dataset[col_response] = [_clean(x) for x in dataset[col_response]]
    return dataset

In [None]:
# Applying to dataset: clean both response columns used by the feature code below
# ('translated_response' for counts/USE embeddings, 'final_response' for GloVe)
df = clean_response(df, 'translated_response')
df = clean_response(df, 'final_response')

In [None]:
# Remove stopwords from each use
def remove_stopwords(phrase):
    """Tokenize `phrase` and return it re-joined by single spaces without English stopwords.

    The stopword test is case-insensitive; kept tokens retain their original case.
    """
    kept_tokens = []
    for token in word_tokenize(phrase):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

In [None]:
# Strip stopwords from every translated response
df['translated_response'] = df['translated_response'].apply(remove_stopwords)

In [None]:
# Transforming utility from 5-point scale to three categories
# First discard rows that lack rater 1's originality or utility score
df = df.dropna(subset=["originality_rater01", "utility_rater01"])

# Function to categorize ratings
def categorize_rating(rating):
    """Map a 1-5 rating to 'low' (1-2), 'medium' (3-4) or 'high' (5); anything else is 'unknown'."""
    bands = (
        ('low', (1, 2)),
        ('medium', (3, 4)),
        ('high', (5,)),
    )
    for label, values in bands:
        if rating in values:
            return label
    return 'unknown'

# Categorise both raters' utility scores; a missing rater 2 score stays NaN
# instead of being mapped to 'unknown'
df['category_rater01'] = df['utility_rater01'].apply(categorize_rating)
df['category_rater02'] = df['utility_rater02'].apply(lambda x: categorize_rating(x) if not pd.isna(x) else np.nan)

# Function that takes rater 1's (better rater's) scores if raters disagree on category
def determine_final_category(row):
    """
    Pick the final utility category for one row.

    Rater 1's category is used in all three situations: rater 2 gave no score,
    both raters agree, or the raters disagree (rater 1 is treated as the better
    rater). Previously the disagreement branch returned rater 2's category,
    contradicting the stated rule.

    NOTE(review): if rater 2 was actually meant to win on disagreement, restore
    that branch and correct the comment above instead.

    :param row: mapping/Series with 'utility_rater02', 'category_rater01' and
                'category_rater02'
    :return: the category label chosen for the row
    """
    if pd.isna(row['utility_rater02']):
        # Only rater 1 scored this response
        return row['category_rater01']
    # Agreement and disagreement both resolve to rater 1
    return row['category_rater01']

# Resolve one category per row; this column is the target used for stratification later
df['final_category'] = df.apply(determine_final_category, axis=1)

# Drop intermediate categories
df = df.drop(columns=['category_rater01', 'category_rater02'])

## Feature Engineering

### Elaboration

In [None]:
# Function for counting number of words in response (elaboration)
def num_words(dataset, col_response):
    """
    Add a 'number_words' column holding the whitespace-separated word count of each response.

    :param dataset: dataset containing the column named by col_response
    :param col_response: column name of the responses whose words are counted
    :return dataset: the same dataset object with 'number_words' added
    """
    dataset["number_words"] = [len(text.split()) for text in dataset[col_response]]
    return dataset

In [None]:
# Applying to dataset (word counts are taken after stopword removal above)
df = num_words(df, 'translated_response')

# Descriptive statistics of elaboration (mean and SD, rounded to 1 decimal)
mean_words = df.number_words.mean().round(1)
sd_words = df.number_words.std().round(1)
print(mean_words)
print(sd_words)

### (Inverse) Frequency

In [None]:
# Function to calculate the frequency and inverse frequency of a response
def frequency(dataset, col_response):
    """
    Function to calculate the (inverse) frequency of each response in the data given the object

    Counts are taken per object on the column named by `col_response`. The
    result is regrouped by object (in order of first appearance) and gets a
    fresh 0..n-1 index; rows whose 'object' is missing are not part of any
    group and are therefore left out.

    :param dataset: dataset containing an 'object' column and col_response
    :param col_response: column name of the responses
    :return dataset, top_10s: dataset including 'frequency_answer' and
                'frequency_answer_inverse', and a dict mapping each object
                (as str) to its up-to-10 most frequent responses
    """
    per_object = []
    top_10s = {}

    # sort=False keeps objects in order of first appearance, so the output row
    # order is reproducible from run to run
    for obj, df_obj in dataset.groupby('object', sort=False):
        df_obj = df_obj.copy()

        # Frequency of each answer within object
        counts = df_obj[col_response].value_counts()
        # Creates dictionary of 10 most frequent answers for each object
        top_10s[f"{obj}"] = counts.head(10).index.to_list()

        # Look the count up per row by value rather than writing through index
        # labels, which would hit unrelated rows when labels are duplicated.
        # Missing responses are not counted by value_counts and keep 0.
        freq = df_obj[col_response].map(counts).fillna(0)
        df_obj['frequency_answer'] = freq.astype(int).to_numpy()
        df_obj['frequency_answer_inverse'] = (1 / freq).where(freq > 0, 0.0).to_numpy()
        per_object.append(df_obj)

    if not per_object:
        # Nothing to group: return an empty frame that still has both columns
        empty = dataset.iloc[0:0].reset_index(drop=True)
        empty = empty.assign(
            frequency_answer=pd.Series(dtype='int64'),
            frequency_answer_inverse=pd.Series(dtype='float64'),
        )
        return empty, top_10s

    dataset = pd.concat(per_object, axis=0, ignore_index=True)

    return dataset, top_10s

In [None]:
# Add frequency of each response to df + produce dictionary of top uses
# (`top_10s` is read again later by `sem_dis`)
df, top_10s = frequency(df, 'translated_response')

# Descriptive statistics of (inverse) frequency
mean_frequency = df.frequency_answer.mean().round(1)
sd_frequency = df.frequency_answer.std().round(1)
mean_inverse = df.frequency_answer_inverse.mean().round(2)
sd_inverse = df.frequency_answer_inverse.std().round(2)

print(mean_frequency)
print(sd_frequency)
print(mean_inverse)
print(sd_inverse)

### Embeddings

In [None]:
# Downloading GloVe embeddings
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip = "glove.6B.zip"
glove_dir = "glove.6B"

if not os.path.exists(glove_zip):
    print(f"Downloading {glove_zip}...")
    # Stream to a temporary name and rename only on success: an HTTP error or
    # an interrupted transfer must not leave a file called glove.6B.zip behind,
    # because the existence check above would then skip the download forever.
    partial_zip = glove_zip + ".part"
    with requests.get(glove_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial_zip, 'wb') as f:
            # Chunked writes keep the large archive out of memory
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_zip, glove_zip)

# Extracting the embeddings
if not os.path.exists(glove_dir):
    print(f"Extracting {glove_zip}...")
    with zipfile.ZipFile(glove_zip, 'r') as zip_ref:
        zip_ref.extractall(glove_dir)

In [None]:
# Function for loading the GloVe embeddings
def load_glove_embeddings(glove_file):
    """Parse a GloVe text file into a dict mapping each word to a float32 numpy vector.

    Every line is split on whitespace: the first field is the word, the rest
    are the vector components. Prints how many vectors were loaded.
    """
    vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as handle:
        for row in handle:
            fields = row.split()
            vectors[fields[0]] = np.asarray(fields[1:], dtype='float32')
    print(f"Loaded {len(vectors)} word vectors.")
    return vectors

# Use the 100-dimensional vectors from the extracted archive
# (matches the default embedding_dim=100 of get_glove_embeddings)
glove_file = "glove.6B/glove.6B.100d.txt"
embeddings_index = load_glove_embeddings(glove_file)

In [None]:
# Function for getting the GloVe embedding for each response
def get_glove_embeddings(df, embeddings_index, embedding_dim=100):
    """
    Append one averaged GloVe vector per row of df['final_response'].

    A response is tokenized, lower-cased, and reduced to alphabetic
    non-stopword tokens; the vectors of the tokens found in `embeddings_index`
    are averaged. Responses with no known token get an all-zero vector.

    :param df: dataset containing a 'final_response' column
    :param embeddings_index: dict mapping word -> vector
    :param embedding_dim: length of the vectors (number of columns added)
    :return: df with its index reset and columns embedding_0..embedding_{dim-1} appended
    """
    english_stopwords = set(stopwords.words('english'))

    def embed(text):
        tokens = [
            token.lower()
            for token in word_tokenize(text)
            if token.isalpha() and token.lower() not in english_stopwords
        ]
        known_vectors = [embeddings_index[token] for token in tokens if token in embeddings_index]
        if not known_vectors:
            return np.zeros(embedding_dim)
        return np.mean(known_vectors, axis=0)

    sentence_vectors = [embed(text) for text in df['final_response'].to_numpy()]

    # Converting embeddings from list to df
    column_names = [f"embedding_{i}" for i in range(embedding_dim)]
    embeddings_df = pd.DataFrame(sentence_vectors, columns=column_names)

    # Adding to the original df
    return pd.concat([df.reset_index(drop=True), embeddings_df], axis=1)

In [None]:
# Applying to the dataset: adds columns embedding_0 .. embedding_99 and resets the index
df = get_glove_embeddings(df, embeddings_index)

### Semantic Distance

In [None]:
# Getting Universal Sentence Encoder embeddings
_USE_MODULE = None


def _load_use_module():
    """Load the USE module from TF-Hub once and reuse it on later calls."""
    global _USE_MODULE
    if _USE_MODULE is None:
        os.environ['TFHUB_CACHE_DIR'] = '/tf_cache'
        _USE_MODULE = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3')
    return _USE_MODULE


def get_embeddings(texts, batch_size=100):
    """
    Embed `texts` with the multilingual Universal Sentence Encoder.

    The module is loaded on first use only; this function is called once per
    object by the similarity code, and reloading the model each time is
    needlessly slow.

    :param texts: sliceable sequence of texts (list or numpy array)
    :param batch_size: number of texts passed to the encoder per call
    :return: numpy array with the per-batch outputs stacked vertically
    :raises ValueError: from np.vstack when `texts` is empty
    """
    module = _load_use_module()
    embeddings = [
        module(texts[start:start + batch_size]).numpy()
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(embeddings)

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors (1 minus SciPy's cosine distance)."""
    distance = spatial.distance.cosine(vec1, vec2)
    return 1 - distance

# Function for calculating semantic distance from USE embeddings
def sem_dis(dataset, batch_size=100, top_uses=None):
    """
    Function to add word embeddings and calculate semantic distance.

    For every row this adds
      * 'similarity': the cosine *distance* (not similarity) between the
        response embedding and the embedding of the row's object name, and
      * 'sim_1' .. 'sim_k': the cosine similarity between the response and each
        of the top uses listed for the row's object.
    Rows are addressed by position, so `dataset` does not need a 0..n-1 index.
    The object names are taken from the dataset itself, so no object is skipped.

    :param dataset: dataset containing at least 'translated_response' and 'object';
                    it is modified in place
    :param batch_size: batch size handed to get_embeddings for the responses
    :param top_uses: dict mapping object -> list of frequent uses; defaults to
                     the module-level `top_10s` produced by `frequency`
    :return dataset, emb: the dataset with the extra columns, and a DataFrame
                    with one embedding row per response (string column names)
    """
    if top_uses is None:
        top_uses = top_10s

    object_col = dataset['object'].to_numpy()
    object_names = list(dataset['object'].dropna().unique())
    n_rows = len(dataset)

    # Responses followed by the object names, embedded in a single pass
    responses = dataset[['translated_response']].copy()
    objs = pd.DataFrame({'translated_response': object_names})
    responses = pd.concat([responses, objs], axis=0, ignore_index=True)
    responses = responses.to_numpy()

    # Get embeddings for top 10 uses for each object
    top_use_embeddings = {obj: get_embeddings(uses) for obj, uses in top_uses.items()}

    # Get embeddings for the 'translated_response' column in the dataset
    emb = pd.DataFrame(get_embeddings(responses, batch_size=batch_size))
    emb.columns = emb.columns.astype(str)

    # The trailing rows are the embeddings of the AUT objects themselves
    object_vectors = {name: emb.iloc[n_rows + k, :] for k, name in enumerate(object_names)}
    emb = emb.iloc[:n_rows, :]

    dist = []
    sim_rows = []
    for pos, obj in enumerate(object_col):
        response_vector = emb.iloc[pos, :]

        # Semantic distance from the AUT object; NaN when the object is missing
        if pd.isna(obj):
            dist.append(np.nan)
        else:
            dist.append(spatial.distance.cosine(response_vector, object_vectors[obj]))

        # Similarity between response and each of the top uses for that object
        if obj in top_use_embeddings:
            sim_rows.append({
                f'sim_{j}': cosine_similarity(response_vector, use_vector)
                for j, use_vector in enumerate(top_use_embeddings[obj], start=1)
            })
        else:
            sim_rows.append({})

    # Write each sim_* column in one assignment instead of cell by cell
    sims = pd.DataFrame(sim_rows)
    for column in sims.columns:
        dataset[column] = sims[column].to_numpy()

    dataset["similarity"] = dist

    return dataset, emb

In [None]:
# Adding semantic distance to df + creating embeddings
# Note: the 'similarity' column filled by sem_dis holds cosine *distance* to the
# object name, so larger values mean the response is semantically further away.
df, embeddings = sem_dis(df, batch_size=100)

# Descriptive statistics of semantic distance
mean_sem_dis = df.similarity.mean().round(2)
sd_sem_dis = df.similarity.std().round(2)
print(mean_sem_dis)
print(sd_sem_dis)

# Response/object pair with the largest semantic distance
max_sem_dis = df.similarity.max().round(2)
max_sem_dis_resp = df['translated_response'][df.similarity.idxmax()]
max_sem_dis_obj = df['object'][df.similarity.idxmax()]
print(max_sem_dis)
print(max_sem_dis_resp)
print(max_sem_dis_obj)

# Response/object pair with the smallest semantic distance
min_sem_dis = df.similarity.min().round(2)
min_sem_dis_resp = df['translated_response'][df.similarity.idxmin()]
min_sem_dis_obj = df['object'][df.similarity.idxmin()]
print(min_sem_dis)
print(min_sem_dis_resp)
print(min_sem_dis_obj)

### Knowledge Graph Predictors

#### COMET Top 10 Uses

To load the COMET knowledge graph, first download the `download_model.sh` file from [here](https://github.com/allenai/comet-atomic-2020/tree/master/models/comet_atomic2020_bart). Then run the following command in your terminal (the code below expects the downloaded model at `/content/comet-atomic_2020_BART_aaai`; adjust that path if your model is stored elsewhere):

bash download_model.sh

In [None]:
# Set up for COMET knowledge graph
def chunks(lst, n):
    """Lazily yield consecutive slices of `lst`, each at most `n` items long."""
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)


class Comet:
    """Thin wrapper around a seq2seq model and tokenizer loaded from `model_path`."""

    def __init__(self, model_path):
        """Load model and tokenizer, move the model to GPU when available, apply task params."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        use_task_specific_params(self.model, "summarization")
        self.batch_size = 1
        self.decoder_start_token_id = None

    def generate(
            self,
            queries,
            decode_method="beam",
            num_generate=5,
            ):
        """
        Generate `num_generate` sequences for each batch of queries.

        `num_generate` is used both as the beam width and as the number of
        returned sequences. `decode_method` is accepted but never read.

        :param queries: list of query strings
        :return: list with one entry per batch (batches hold `self.batch_size`
                 queries), each entry being the list of decoded strings
        """
        decoded_batches = []
        with torch.no_grad():
            for query_batch in chunks(queries, self.batch_size):
                encoded = self.tokenizer(
                    query_batch, return_tensors="pt", truncation=True, padding="max_length"
                ).to(self.device)
                input_ids, attention_mask = trim_batch(**encoded, pad_token_id=self.tokenizer.pad_token_id)

                generated = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_start_token_id=self.decoder_start_token_id,
                    num_beams=num_generate,
                    num_return_sequences=num_generate,
                    )

                decoded_batches.append(
                    self.tokenizer.batch_decode(
                        generated, skip_special_tokens=True, clean_up_tokenization_spaces=False
                    )
                )

        return decoded_batches


# List of relation names (presumably the relation vocabulary of the COMET model
# loaded above — verify against the model repository). It is not read by any
# code visible in this notebook section.
# NOTE(review): "HasSubEvent" and "HasSubevent" both appear, and the relation
# "xUsedFor" queried by `generate_and_filter_phrases` is not in this list.
all_relations = [
    "AtLocation",
    "CapableOf",
    "Causes",
    "CausesDesire",
    "CreatedBy",
    "DefinedAs",
    "DesireOf",
    "Desires",
    "HasA",
    "HasFirstSubevent",
    "HasLastSubevent",
    "HasPainCharacter",
    "HasPainIntensity",
    "HasPrerequisite",
    "HasProperty",
    "HasSubEvent",
    "HasSubevent",
    "HinderedBy",
    "InheritsFrom",
    "InstanceOf",
    "IsA",
    "LocatedNear",
    "LocationOfAction",
    "MadeOf",
    "MadeUpOf",
    "MotivatedByGoal",
    "NotCapableOf",
    "NotDesires",
    "NotHasA",
    "NotHasProperty",
    "NotIsA",
    "NotMadeOf",
    "ObjectUse",
    "PartOf",
    "ReceivesAction",
    "RelatedTo",
    "SymbolOf",
    "UsedFor",
    "isAfter",
    "isBefore",
    "isFilledBy",
    "oEffect",
    "oReact",
    "oWant",
    "xAttr",
    "xEffect",
    "xIntent",
    "xNeed",
    "xReact",
    "xReason",
    "xWant",
    ]

In [None]:
# Calculating semantic distance between object and 10 uses generated by COMET

# Remove stopwords from each use
def remove_stopwords(phrase):
    """Return `phrase` tokenized and re-joined by spaces, minus English stopwords (case-insensitive test)."""
    tokens = word_tokenize(phrase)
    return ' '.join(token for token in tokens if token.lower() not in stop_words)

# Remove uses that are too similar
def remove_similar_phrases(key, phrases, threshold=20):
    """
    Keep up to 10 phrases that are mutually dissimilar and dissimilar to `key`.

    Phrases are compared after stopword removal with fuzz.ratio; a phrase is
    kept only if its score against the key and against every phrase kept so far
    is below `threshold`. Each kept phrase is returned once, in its original
    wording and input order — exact duplicates in `phrases` no longer slip
    through and count towards the 10.

    :param key: the object name the phrases were generated for
    :param phrases: candidate phrases, in priority order
    :param threshold: fuzz.ratio score at or above which two phrases count as similar
    :return: list of at most 10 kept phrases
    """
    # Seed the comparison set with the key so uses that merely restate the
    # object are filtered out too
    seen_stripped = [remove_stopwords(key)]
    kept = []

    for phrase in phrases:
        stripped = remove_stopwords(phrase)
        if all(fuzz.ratio(stripped, seen) < threshold for seen in seen_stripped):
            seen_stripped.append(stripped)
            kept.append(phrase)
            if len(kept) == 10:
                break

    return kept

# Use the xUsedFor relationship in COMET to generate uses for each object
def generate_and_filter_phrases(objects, threshold=20, max_rounds=3):
    """
    Generate up to 10 mutually dissimilar uses per object with COMET.

    Generation is repeated with a doubled beam width (20, 40, ...) while fewer
    than 10 distinct uses survive filtering, for at most `max_rounds` rounds.
    Re-running the same query with the same beam width gives the loop no stop
    condition other than reaching 10 results, so the rounds are bounded and
    widen the search instead. If the limit is reached a message is printed and
    the uses found so far are kept.

    NOTE(review): the query is "<object> xUsedFor"; "xUsedFor" is not among the
    names in `all_relations` — confirm this is the intended relation and query
    format for the model.

    :param objects: iterable of object names
    :param threshold: similarity threshold passed to remove_similar_phrases
    :param max_rounds: maximum number of generation rounds per object
    :return: dict mapping each object to a flat list of at most 10 uses
    """
    top10_uses = {}
    comet = Comet("/content/comet-atomic_2020_BART_aaai")
    comet.model.zero_grad()

    for obj in objects:
        query = "{} {}".format(obj, "xUsedFor")
        num_generate = 20
        filtered_results = []

        for _ in range(max_rounds):
            results = comet.generate([query], decode_method="greedy", num_generate=num_generate)
            filtered_results = remove_similar_phrases(obj, results[0], threshold)
            if len(filtered_results) >= 10:
                break
            num_generate *= 2
        else:
            print(f"Only {len(filtered_results)} distinct uses found for {obj}.")

        # Stored as a flat list of strings, the same shape the other
        # use-dictionaries in this notebook hand to get_embeddings
        top10_uses[obj] = filtered_results[:10]

    return top10_uses

def calculate_similarities(dataset, top10_uses, emb):
    """
    Add 'comet_sim_<j>' columns: similarity of each response to each COMET use of its object.

    :param dataset: dataset with an 'object' column; row labels are also used
                    as positions into `emb`
    :param top10_uses: dict mapping object -> uses to embed with get_embeddings
    :param emb: DataFrame of response embeddings, one row per dataset row
    :return: the same dataset object with the similarity columns filled in
    """
    # Calculate embeddings for 10 uses from COMET
    use_vectors_by_object = {obj: get_embeddings(uses) for obj, uses in top10_uses.items()}

    for i, row in dataset.iterrows():
        response_vector = emb.iloc[i, :].values
        obj = row['object']
        if obj not in use_vectors_by_object:
            continue

        # Similarity between the response and each use for that object
        similarities = [
            cosine_similarity(response_vector, use_vector)
            for use_vector in use_vectors_by_object[obj]
        ]
        # Store similarity for each use in a separate column
        for j, sim in enumerate(similarities, start=1):
            dataset.loc[i, f'comet_sim_{j}'] = sim

    return dataset

if __name__ == "__main__":
    # Apply for AUT objects in dataset
    objects = ['belt', 'book', 'brick', 'can', 'fork', 'paperclip', 'stick', 'towel']
    # threshold=70 overrides the function default of 20
    filtered_dict = generate_and_filter_phrases(objects, threshold=70)
    # calculate_similarities writes the comet_sim_* columns into df itself and returns it
    updated_dataset = calculate_similarities(df, filtered_dict, embeddings)

#### Related Uses

In [None]:
# Generating properties for AUT objects (which don't already have them from invalid AUT responses) using LLM to then be able to generate related uses based on these

# List of objects that don't have adjectives from invalid AUT responses
objects = ['belt', 'book', 'stick']

objects_with_properties = {}

# Headers for the API request from Together.ai
url = "https://api.together.xyz/v1/chat/completions"
# The key is read from the TOGETHER_API_KEY environment variable so the cell is
# valid Python as written and no secret has to be pasted into the notebook.
api_key = os.environ.get("TOGETHER_API_KEY", "")
headers = {
    "accept": "application/json",
    "content-type": "application/json",
    "Authorization": f"Bearer {api_key}"
}

# Function to get 4 characteristics for each object from Llama-3
def get_characteristics(obj):
    """
    Ask the chat-completions endpoint for adjectives describing `obj`.

    The reply text is scanned for numbered list items ("1. word") and the word
    after each number is returned. Any failure — a network error, a timeout or
    a non-200 status — is printed and yields an empty list, so one bad request
    does not abort the surrounding loop.

    :param obj: name of the object to describe
    :return: list of extracted characteristics (empty on failure or when the
             reply contains no numbered items)
    """
    payload = {
        "messages": [
            {
                "role": "system",
                "content": f"What are the top 4 characteristics of a {obj}? I would like simply a list of 4 adjectives describing the object."
            }
        ],
        "model": "meta-llama/Llama-3-8b-chat-hf"
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook
        response = requests.post(url, headers=headers, json=payload, timeout=60)
    except requests.RequestException as exc:
        print(f"Failed to get characteristics for {obj}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to get characteristics for {obj}: {response.status_code}")
        return []

    result = response.json()
    characteristics_text = result['choices'][0]['message']['content']
    # Extract characteristics from the response text
    return re.findall(r'\d+\.\s+([\w\-]+)', characteristics_text)

# Get each object's characteristics
for obj in objects:
    characteristics = get_characteristics(obj)
    objects_with_properties[obj] = characteristics
    # Short pause between requests (presumably to stay under the API rate limit)
    time.sleep(0.7)

print(objects_with_properties)


In [None]:
# Merge with previously gathered characteristics (from invalid AUT responses).
# For an object present in both dicts, the entry from `properties` replaces the LLM one.
objects_with_properties.update(properties)

In [None]:
# Generating related uses using COMET

lemmatizer = WordNetLemmatizer()

# Function to remove stopwords and lemmatize
def preprocess_text(text):
    """Tokenize `text`, keep alphabetic non-stopword tokens, and return their lower-cased lemmas joined by spaces."""
    lemmas = []
    for token in word_tokenize(text):
        if not token.isalpha() or token.lower() in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    return ' '.join(lemmas)

# Function to remove similar phrases
def remove_similar_phrases(phrases, threshold=70):
    """Return the phrases, in order, whose fuzz.ratio against every earlier kept phrase is below `threshold`."""
    kept = []
    for candidate in phrases:
        too_close = any(fuzz.ratio(candidate, earlier) >= threshold for earlier in kept)
        if not too_close:
            kept.append(candidate)
    return kept

# Querying COMET for related uses for previously established characteristics using RelatedTo relationship
def generate_and_filter_phrases(objects_with_characteristics, threshold=20):
    """
    Build up to 8 related concepts per object from its characteristics.

    For every characteristic one "<characteristic> RelatedTo" query is sent to
    COMET (a single pass with 20 beams). The outputs are preprocessed,
    de-duplicated with `threshold`, and at most 5 are kept. The concepts of all
    characteristics of an object are then de-duplicated again with a threshold
    of 80 and cut to the first 8.

    :param objects_with_characteristics: dict mapping object -> list of characteristics
    :param threshold: fuzz threshold for the per-characteristic filtering
    :return: dict mapping object -> list of at most 8 related concepts
    """
    top5_uses = {}
    comet = Comet("/content/comet-atomic_2020_BART_aaai")
    comet.model.zero_grad()

    for obj, characteristics in objects_with_characteristics.items():
        all_results = []

        for characteristic in characteristics:
            query = "{} {}".format(characteristic, "RelatedTo")

            generated = comet.generate([query], decode_method="greedy", num_generate=20)[0]
            processed_results = [preprocess_text(result) for result in generated]
            distinct_results = remove_similar_phrases(processed_results, threshold)

            # At most 5 concepts per characteristic
            all_results.extend(distinct_results[:5])

        # Remove similar concepts across characteristics
        all_results = remove_similar_phrases(all_results, 80)
        top5_uses[obj] = all_results[:8]

    return top5_uses

if __name__ == '__main__':
    # Applying to previously generated characteristics of AUT objects
    # (no threshold is passed, so the function default of 20 is used)
    top5_uses = generate_and_filter_phrases(objects_with_properties)

In [None]:
# Calculating semantic distance between response and the generated related uses

def calculate_similarities(dataset, final_results, emb):
    """
    Add 'related_obj_sim_<j>' columns: similarity of each response to each related use of its object.

    Prints "Nothing for <object>." for rows whose object has no embedded uses.

    :param dataset: dataset with an 'object' column; row labels are also used
                    as positions into `emb`
    :param final_results: dict mapping object -> related uses to embed
    :param emb: DataFrame of response embeddings, one row per dataset row
    :return: the same dataset object with the similarity columns filled in
    """
    # Generate embeddings for related uses
    top_use_embeddings = {obj: get_embeddings(uses) for obj, uses in final_results.items()}

    for i, row in dataset.iterrows():
        obj = row['object']
        response_vector = emb.iloc[i, :].values

        use_vectors = top_use_embeddings.get(obj)
        if use_vectors is None:
            print(f"Nothing for {obj}.")
            continue

        # Similarity between the response and each related use for that object
        similarities = [cosine_similarity(response_vector, use_vector) for use_vector in use_vectors]
        # One column per related use
        for j, sim in enumerate(similarities, start=1):
            dataset.loc[i, f'related_obj_sim_{j}'] = sim

    return dataset

# Applying to dataset: adds the related_obj_sim_* columns
# (low_var/high_cor below use 'related_obj_sim_8' as the last feature column)
df = calculate_similarities(df, top5_uses, embeddings)

## Cleaning Up Predictors

In [None]:
# Function to identify low variance predictors in the feature set
def low_var(dataset, variance_threshold):
    """
    List the feature columns that fail a variance threshold.

    The feature set is every column positioned after 'final_category' up to and
    including 'related_obj_sim_8'. The features are ordinal-encoded and then
    checked with sklearn's VarianceThreshold.

    :param dataset: dataset containing at least the features
    :param variance_threshold: value passed to VarianceThreshold(threshold=...)
    :return: list of feature column names not retained by the selector
    """
    # Define feature set
    first = dataset.columns.get_loc('final_category') + 1
    last = dataset.columns.get_loc('related_obj_sim_8') + 1
    features = dataset.iloc[:, first:last].copy()

    feature_names = features.columns.values.tolist()
    features[feature_names] = OrdinalEncoder().fit_transform(features[feature_names])

    # Compare to variance threshold
    selector = VarianceThreshold(threshold=variance_threshold)
    selector.fit(features)

    # Everything the selector did not keep is a low variance column
    retained = set(features.columns[selector.get_support()])
    return [name for name in features.columns if name not in retained]

In [None]:
# Function to transform list into string (for listing dropped columns)
def list_to_string(listed_words):
    """
    Join the items into one written enumeration.

    Items are converted with str(), separated by ", ", and the last item of a
    list with two or more items is introduced by ", and ".

    :param listed_words: iterable of items to list
    :return: one string, e.g. "a, b, and c"; "" for an empty input
    """
    words = [str(item) for item in listed_words]
    if len(words) <= 1:
        return "".join(words)
    return ", ".join(words[:-1]) + ", and " + words[-1]

In [None]:
# Applying to dataset
low_var_cols = low_var(df, 0.3)

# Strip the 'type_' prefix for display and list the cleaned names. The summary
# used to be built from the raw column list, which threw the cleaned names away.
dropped_features_var = [re.sub('type_', '', x) for x in low_var_cols]
dropped_features_var = list_to_string(dropped_features_var)
print(dropped_features_var)

# Dropping low variance columns (by their real, unstripped names)
df = df.drop(low_var_cols, axis=1)

In [None]:
# Function for removing highly correlated predictors in the feature set
def high_cor(dataset, correlation_threshold):
    """
    Find feature pairs whose absolute correlation exceeds a threshold

    The features are the columns located after 'final_category' up to and
    including 'related_obj_sim_8'. Nothing is removed from the dataset here.

    :param dataset: dataset containing at least the features
    :param correlation_threshold: absolute correlation above which a pair of features is reported
    :return: part of the upper-triangular absolute correlation matrix, restricted to the rows
             and columns that take part in at least one pair above the threshold
    """
    # Locate the feature block by column position
    first = dataset.columns.get_loc('final_category') + 1
    last = dataset.columns.get_loc('related_obj_sim_8') + 1
    features = dataset.iloc[:, first:last]

    # Absolute correlations; only entries strictly above the diagonal are kept so that
    # each pair is looked at once and self-correlations are ignored
    abs_corr = features.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(np.bool_)
    upper = abs_corr.where(above_diagonal)

    # Row and column positions of the entries above the threshold
    row_pos, col_pos = np.where(upper > correlation_threshold)

    return upper.iloc[list(np.unique(row_pos)), list(np.unique(col_pos))]

In [None]:
# Collect the feature pairs with an absolute correlation above .8
high_cor_cols = high_cor(df, .8)

# The column labels of the returned correlation slice are the features to drop
dropped_features_cor = list_to_string(high_cor_cols.columns)
print(dropped_features_cor)

# Remove those columns from the dataset
df = df.drop(columns=high_cor_cols.columns)

## Training Models

In [None]:
# Creating training + validation and test sets
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Separate out the 'towel' responses (which is the hold-out AUT object)
towel_responses = df[df['object'] == 'towel']
non_towel_responses = df[df['object'] != 'towel']

# Perform stratified sampling on the non-towel responses
train_non_towel, test_non_towel = train_test_split(
    non_towel_responses,
    test_size=0.1,
    stratify=non_towel_responses['final_category'],
    random_state=42
)

# Form the final test set ('towel' responses + stratified sample of remaining)
test_set = pd.concat([towel_responses, test_non_towel])

# Form the final training + validation set. train_non_towel already contains every
# non-towel response that is not part of test_non_towel, so it is used as-is;
# concatenating it with the same rows again would duplicate the whole training set.
train_set = train_non_towel

# Shuffle final training and test sets
train_set = train_set.sample(frac=1, random_state=42).reset_index(drop=True)
test_set = test_set.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop duplicated responses within each object, keeping the first occurrence.
# Deduplicating on the ('object', 'translated_response') pair keeps the same rows as a
# per-object groupby would, but leaves them in their shuffled order and keeps the
# 'object' column regardless of how groupby.apply treats grouping columns.
train_set = train_set.drop_duplicates(subset=['object', 'translated_response'], keep='first')

# Reset the index
train_set = train_set.reset_index(drop=True)

In [None]:
# Set up for training Naive Bayes, Logistic Regression, k-nearest Neighbors and LightGBM models

def get_feature_set(dataset):
    """
    Select the predictor columns the models are trained on

    Adjust the two anchor columns below to change which features are used.

    :param dataset: dataframe containing the 'final_category' and 'similarity' columns
    :return: dataframe with the columns located after 'final_category' up to and
             including 'similarity'
    """
    i1 = dataset.columns.get_loc('final_category') + 1
    i2 = dataset.columns.get_loc('similarity') + 1
    feature_set = dataset.iloc[:, i1:i2]

    # The selection has to be returned: callers read `.columns` from the result
    return feature_set

# Function for calculating AIC and BIC for model comparison
def calculate_aic_bic(log_likelihood, n_params, n_samples):
    """
    Compute the Akaike and Bayesian information criteria

    :param log_likelihood: log-likelihood of the model
    :param n_params: number of parameters counted in the complexity penalty
    :param n_samples: number of observations (only used for the BIC penalty)
    :return: tuple (aic, bic)
    """
    # Both criteria share the same goodness-of-fit term and differ in the penalty
    twice_log_likelihood = 2 * log_likelihood
    aic = 2 * n_params - twice_log_likelihood
    bic = np.log(n_samples) * n_params - twice_log_likelihood
    return aic, bic

def train_models(train_set, test_set, print_results=False):
    """
    Train and evaluate Logistic Regression, Naive Bayes, KNN and LightGBM classifiers,
    plus a majority-class base model, using 5-fold stratified cross-validation

    In every fold each model is fitted on the training part of the fold and scored on
    the validation part of the fold as well as on the fixed test set. A
    'final_category_encoded' column is added in place to both input dataframes.

    :param train_set: training + validation data with 'final_category' and the feature columns
    :param test_set: test data with the same columns as train_set
    :param print_results: whether to print the fold-averaged metrics of each model
    :return: tuple (df_predictions, trained_models): df_predictions has one row per model,
             fold and evaluation set; trained_models maps 'utility' to a dict of model
             name -> estimator as fitted in the last fold ('mode' for the base model)
    """
    SEED = 1
    metric_names = ['accuracy', 'precision', 'recall', 'f1_score', 'aic', 'bic']
    report = "{:s} \n Accuracy: {:.3f}, \n Precision: {:.3f}, \n Recall: {:.3f} \n F1: {:.3f}, AIC: {:.3f}, BIC: {:.3f} \n"

    # Transforming categorical labels into numbers for model
    # (the encoded column is written to the dataframes passed in by the caller)
    le = LabelEncoder()
    train_set['final_category_encoded'] = le.fit_transform(train_set['final_category'])
    test_set['final_category_encoded'] = le.transform(test_set['final_category'])
    n_classes = len(le.classes_)
    class_labels = np.arange(n_classes)

    # Getting feature set and outcome variable
    predictorVar = get_feature_set(train_set)
    targetVar = 'final_category_encoded'
    X_trainval = train_set[predictorVar.columns]
    y_trainval = train_set[targetVar]
    X_test = test_set[predictorVar.columns]
    y_test = test_set[targetVar]

    # Setting up 5-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Setting up hyperparameters for each model (adjust as needed)
    params_log_reg = {
        'C': 1.0,
        'penalty': 'l2',
        'solver': 'liblinear',
        'random_state': SEED
    }

    params_knn = {
        'n_neighbors': 5,
        'weights': 'uniform',
        'algorithm': 'auto'
    }

    params_lgb = {
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'num_class': 3,
        'metric': 'multi_logloss',
        'learning_rate': 0.1,
        'n_estimators': 100,
        'random_state': SEED
    }

    log_reg = LogisticRegression(**params_log_reg)
    nb = GaussianNB()
    knn = KNeighborsClassifier(**params_knn)
    lgb_model = lgb.LGBMClassifier(**params_lgb)

    models = [('Logistic Regression', log_reg), ('Naive Bayes', nb), ('KNN', knn), ('LightGBM', lgb_model), ('Base Model', 'mode')]

    # Set up saving for trained models; two rows (test + validation) are written
    # per model and fold
    trained_models = {"utility": dict()}
    df_predictions = pd.DataFrame(columns=['measure', 'model', 'set', 'actual', 'predictions'] + metric_names,
                                  index=range(2 * skf.get_n_splits() * len(models)))

    def score_predictions(y_true, y_pred, y_prob, n_params):
        """Return all reported metrics for one evaluation set as a dict keyed by metric name."""
        # AIC/BIC are defined on the total log-likelihood, so the summed
        # (normalize=False) rather than the per-sample mean log loss is used
        log_likelihood = -log_loss(y_true, y_prob, labels=class_labels, normalize=False)
        aic, bic = calculate_aic_bic(log_likelihood, n_params, len(y_true))
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1_score': f1_score(y_true, y_pred, average='weighted'),
            'aic': aic,
            'bic': bic,
        }

    def mean_scores(fold_scores):
        """Average every metric over the folds, in the order of metric_names."""
        return [np.mean([scores[name] for scores in fold_scores]) for name in metric_names]

    i = 0

    # Training and validating each model using 5-fold cross-validation
    for ml_name, ml in models:
        is_base_model = ml_name == 'Base Model'

        # NOTE(review): this counts the hyperparameters reported by get_params(), not
        # the fitted parameters, so AIC/BIC are only a rough complexity-adjusted score
        n_params = 0 if is_base_model else len(ml.get_params())

        # Per-model accumulators, so the printed means only cover this model's folds
        fold_scores = {'test': [], 'validation': []}

        for train_index, val_index in skf.split(X_trainval, y_trainval):
            X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[val_index]
            y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[val_index]

            if is_base_model:
                # Majority class of the training part of the fold only, so the
                # validation part does not influence the baseline prediction
                mode_val = y_train.mode().iloc[0]
                y_pred_val = np.repeat(mode_val, len(X_val))
                y_pred_test = np.repeat(mode_val, len(X_test))
                # The baseline has no probability estimate; use a uniform distribution
                y_pred_prob_val = np.ones((len(X_val), n_classes)) / n_classes
                y_pred_prob_test = np.ones((len(X_test), n_classes)) / n_classes
            else:
                ml.fit(X_train, y_train)
                y_pred_val = ml.predict(X_val)
                y_pred_test = ml.predict(X_test)
                y_pred_prob_val = ml.predict_proba(X_val)
                y_pred_prob_test = ml.predict_proba(X_test)

            evaluations = (
                ('test', y_test, y_pred_test, y_pred_prob_test),
                ('validation', y_val, y_pred_val, y_pred_prob_val),
            )
            for set_name, y_true, y_pred, y_prob in evaluations:
                scores = score_predictions(y_true, y_pred, y_prob, n_params)
                fold_scores[set_name].append(scores)

                # Adding results to df_predictions
                df_predictions.loc[i] = np.array(['utility', ml_name, set_name, y_true, y_pred]
                                                 + [scores[name] for name in metric_names], dtype=object)
                i += 1

        # The stored estimator is in the state left by the last fold
        trained_models['utility'].update({ml_name: ml})

        # Printing mean metrics across folds
        if print_results:
            print(y_pred_test)
            print("utility \n")
            print(report.format(ml_name + ' (test set)', *mean_scores(fold_scores['test'])))
            print("\n")
            print(report.format(ml_name + ' (validation set)', *mean_scores(fold_scores['validation'])))
            print("\n")

    return df_predictions, trained_models

In [None]:
# Set up for training random forest classifier and XGBoost models

def get_feature_set(dataset):
    """
    Pick the predictor columns used for model training

    Change the anchor column names to train on a different range of features.

    :param dataset: dataframe with a 'final_category' and a 'similarity' column
    :return: the columns that follow 'final_category', up to and including 'similarity'
    """
    columns = dataset.columns
    start = columns.get_loc('final_category') + 1
    stop = columns.get_loc('similarity') + 1
    return dataset.iloc[:, start:stop]

# Function for calculating AIC and BIC for model comparison
def calculate_aic_bic(log_likelihood, n_params, n_samples):
    """
    Return the AIC and BIC of a model for model comparison

    :param log_likelihood: log-likelihood of the model
    :param n_params: number of parameters used in the penalty terms
    :param n_samples: number of observations, which scales the BIC penalty
    :return: tuple (aic, bic)
    """
    fit_term = -2 * log_likelihood
    aic_penalty = 2 * n_params
    bic_penalty = n_params * np.log(n_samples)
    return aic_penalty + fit_term, bic_penalty + fit_term

def train_models(train_set, test_set, print_results=False):
    """
    Train and evaluate Random Forest and XGBoost classifiers, plus a majority-class
    base model, using 5-fold stratified cross-validation

    In every fold each model is fitted on the training part of the fold and scored on
    the validation part of the fold as well as on the fixed test set. A
    'final_category_encoded' column is added in place to both input dataframes, and
    the XGBoost estimator is written to 'xgb_model.pkl' once its folds are done.

    :param train_set: training + validation data with 'final_category' and the feature columns
    :param test_set: test data with the same columns as train_set
    :param print_results: whether to print the fold-averaged metrics of each model
    :return: tuple (df_predictions, trained_models): df_predictions has one row per model,
             fold and evaluation set; trained_models maps 'utility' to a dict of model
             name -> estimator as fitted in the last fold ('mode' for the base model)
    """
    SEED = 1
    metric_names = ['accuracy', 'precision', 'recall', 'f1_score', 'aic', 'bic']
    report = "{:s} \n Accuracy: {:.3f}, \n Precision: {:.3f}, \n Recall: {:.3f} \n F1: {:.3f}, AIC: {:.3f}, BIC: {:.3f} \n"

    # Transforming categorical labels into numbers for model
    # (the encoded column is written to the dataframes passed in by the caller)
    le = LabelEncoder()
    train_set['final_category_encoded'] = le.fit_transform(train_set['final_category'])
    test_set['final_category_encoded'] = le.transform(test_set['final_category'])
    n_classes = len(le.classes_)
    class_labels = np.arange(n_classes)

    # Getting feature set and outcome variable
    predictorVar = get_feature_set(train_set)
    targetVar = 'final_category_encoded'
    X_trainval = train_set[predictorVar.columns]
    y_trainval = train_set[targetVar]
    X_test = test_set[predictorVar.columns]
    y_test = test_set[targetVar]

    # Setting up 5-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Setting up hyperparameters for each model (adjust as needed)
    params_rf = {
        'max_depth': 20,
        'min_samples_leaf': 3,
        'min_samples_split': 8,
        'n_estimators': 200,
        'class_weight': 'balanced'
    }

    params_xgb = {
        # NOTE(review): 'learning_rate' and 'eta' look like two names for the same
        # XGBoost setting with different values — confirm which one is intended
        'learning_rate': 0.2,
        'max_depth': 10,
        'n_estimators': 200,
        'subsample': 0.6,
        'eta': 0.1,
        'objective': 'multi:softmax',  # for classification
        'num_class': 3,
        # NOTE(review): this is an array with one weight per class; XGBoost documents
        # scale_pos_weight as a single value for binary problems, so it presumably has
        # no effect here — confirm, and consider per-sample weights in fit() instead
        'scale_pos_weight': len(y_trainval) / (2 * np.bincount(y_trainval))
    }

    rf = RandomForestClassifier(random_state=SEED)
    rf.set_params(**params_rf)
    xgb_model = xgb.XGBClassifier(**params_xgb)

    models = [('Random Forest', rf), ('XGBoost', xgb_model), ('Base Model', 'mode')]

    # NOTE(review): the complexity penalty counts the hyperparameters set above, not
    # fitted parameters, so AIC/BIC are only a rough complexity-adjusted score.
    # The base model has nothing to tune and is charged no parameters.
    n_params_by_model = {
        'Random Forest': len(params_rf),
        'XGBoost': len(params_xgb),
        'Base Model': 0,
    }

    # Set up saving for trained models; two rows (test + validation) are written
    # per model and fold
    trained_models = {"utility": dict()}
    df_predictions = pd.DataFrame(columns=['measure', 'model', 'set', 'actual', 'predictions'] + metric_names,
                                  index=range(2 * skf.get_n_splits() * len(models)))

    def score_predictions(y_true, y_pred, y_prob, n_params):
        """Return all reported metrics for one evaluation set as a dict keyed by metric name."""
        # AIC/BIC are defined on the total log-likelihood, so the summed
        # (normalize=False) rather than the per-sample mean log loss is used
        log_likelihood = -log_loss(y_true, y_prob, labels=class_labels, normalize=False)
        aic, bic = calculate_aic_bic(log_likelihood, n_params, len(y_true))
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision_score(y_true, y_pred, average='weighted', zero_division=0),
            'recall': recall_score(y_true, y_pred, average='weighted'),
            'f1_score': f1_score(y_true, y_pred, average='weighted'),
            'aic': aic,
            'bic': bic,
        }

    def mean_scores(fold_scores):
        """Average every metric over the folds, in the order of metric_names."""
        return [np.mean([scores[name] for scores in fold_scores]) for name in metric_names]

    i = 0

    # Training and validating each model using 5-fold cross-validation
    for ml_name, ml in models:
        is_base_model = ml_name == 'Base Model'
        n_params = n_params_by_model[ml_name]

        # Per-model accumulators, so the printed means only cover this model's folds
        fold_scores = {'test': [], 'validation': []}

        for train_index, val_index in skf.split(X_trainval, y_trainval):
            X_train, X_val = X_trainval.iloc[train_index], X_trainval.iloc[val_index]
            y_train, y_val = y_trainval.iloc[train_index], y_trainval.iloc[val_index]

            if is_base_model:
                # Majority class of the training part of the fold only, so the
                # validation part does not influence the baseline prediction
                mode_val = y_train.mode().iloc[0]
                y_pred_val = np.repeat(mode_val, len(X_val))
                y_pred_test = np.repeat(mode_val, len(X_test))
                # The baseline has no probability estimate; use a uniform distribution
                y_pred_prob_val = np.ones((len(X_val), n_classes)) / n_classes
                y_pred_prob_test = np.ones((len(X_test), n_classes)) / n_classes
            else:
                ml.fit(X_train, y_train)
                y_pred_val = ml.predict(X_val)
                y_pred_test = ml.predict(X_test)
                y_pred_prob_val = ml.predict_proba(X_val)
                y_pred_prob_test = ml.predict_proba(X_test)

            evaluations = (
                ('test', y_test, y_pred_test, y_pred_prob_test),
                ('validation', y_val, y_pred_val, y_pred_prob_val),
            )
            for set_name, y_true, y_pred, y_prob in evaluations:
                scores = score_predictions(y_true, y_pred, y_prob, n_params)
                fold_scores[set_name].append(scores)

                # Adding results to df_predictions
                df_predictions.loc[i] = np.array(['utility', ml_name, set_name, y_true, y_pred]
                                                 + [scores[name] for name in metric_names], dtype=object)
                i += 1

        # The stored estimator is in the state left by the last fold
        trained_models['utility'].update({ml_name: ml})

        # Printing mean metrics across folds
        if print_results:
            print(y_pred_test)
            print("utility \n")
            print(report.format(ml_name + ' (test set)', *mean_scores(fold_scores['test'])))
            print("\n")
            print(report.format(ml_name + ' (validation set)', *mean_scores(fold_scores['validation'])))
            print("\n")

        # Save XGBoost model if ml_name is XGBoost (for feature importance analysis)
        if ml_name == 'XGBoost':
            joblib.dump(ml, 'xgb_model.pkl')
            print("XGBoost model saved successfully.")

    return df_predictions, trained_models

In [None]:
# Train models
# train_models returns (df_predictions, trained_models) in that order, so the
# prediction table has to be unpacked first
df_predictions, models = train_models(train_set, test_set, print_results = True)

## Model Evaluation

### Feature Importance

In [None]:
# Loading the XGBoost model
# NOTE: joblib.load unpickles the file; only load an 'xgb_model.pkl' written by this notebook
model = joblib.load('xgb_model.pkl')

# Extracting feature importances
if hasattr(model, 'feature_importances_'):
    # Estimator exposing importances and the feature names it was fitted with
    feature_importances = model.feature_importances_
    feature_names = model.feature_names_in_
elif hasattr(model, 'get_score'):
    # Object exposing get_score(), which returns a mapping of feature name -> score
    feature_importances = model.get_score(importance_type='weight')
    feature_names = list(feature_importances.keys())
    feature_importances = list(feature_importances.values())
else:
    # Fail here with a clear message instead of a NameError further down
    raise TypeError("Loaded model exposes neither 'feature_importances_' nor 'get_score'")

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})

# Sort features by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Select top 15 features
top_features = importance_df.head(15)

# Visualize in a plot
plt.figure(figsize=(10, 8))
plt.barh(top_features['Feature'], top_features['Importance'], color='#204D74')
plt.xlabel('Importance', fontsize = 14)
plt.yticks(fontsize = 12)
plt.xticks(fontsize = 12)
# Most important feature at the top of the horizontal bar chart
plt.gca().invert_yaxis()
plt.show()