In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip


 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sns.set_theme(style="whitegrid")

## Modelling Task B

For this analysis we will be creating a linear regression model to predict the hardness score. The hardness score represent the difficulty of the question being asked. Therefore, it is independent of the main dataset (conversation data) or the response embedding data in the auxilliary dataset. To predict the hardness score we will make the assumption that each row of embedding data from the prompt embeddings corresponds to each row of the topic_and_hardness dataset.

The problem statement for task B states that we must use linear regression to determine the hardness score. Therefore, any linear model from the sklearn library would meet this criteria. Therefore, we will perform an analysis and return the results of the best performing models. We will then select the top two models for hyperparameter tuning to create our final models, then the best model of the tuned models will be the final model.

In [114]:
# Auxiliary Datasets

# Prompt embedding Data -- we will use this data in the "Embedding Data" section
prompt_embeddings = np.load(
    "../training_data/chatbot-arena-prompts-embeddings.npy"
)

# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "../training_data/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)


In [70]:
# Turn the prompt embeddings data into a pandas dataframe
num_features = prompt_embeddings.shape[1]
column_names = [f"feature_{i+1}" for i in range(num_features)]
df_prompt = pd.DataFrame(prompt_embeddings, columns = column_names)
df_prompt.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,-0.024708,-0.114236,0.034814,0.006923,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807


In [71]:
topic_and_hardness.head(1)

Unnamed: 0,question_id,prompt,openai_scores_raw_choices_nested,topic_modeling_1,score_reason_1,score_value_1,topic_modeling_2,score_reason_2,score_value_2,topic_modeling_3,score_reason_3,score_value_3
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...",Technical Comparison,This prompt requires the AI to accurately comp...,9,Software Comparison,This prompt assesses the AI's factual accuracy...,8,"Comparison, Technology",This prompt requires the AI to demonstrate kno...,9


In [None]:
topic_and_hardness = topic_and_hardness.drop(columns = ["score_reason_1", 
                                                        "score_reason_2", 
                                                        "score_reason_3",
                                                        "openai_scores_raw_choices_nested",
                                                        "question_id",
                                                        "topic_modeling_1",
                                                        "topic_modeling_2",
                                                        "topic_modeling_3"], axis = 1)
topic_and_hardness.columns

Index(['question_id', 'prompt', 'topic_modeling_1', 'score_value_1',
       'topic_modeling_2', 'score_value_2', 'topic_modeling_3',
       'score_value_3'],
      dtype='object')

In [73]:
# Add a column for prompt length
topic_and_hardness["prompt_length"] = topic_and_hardness["prompt"].apply(len)

In [74]:
# Clean the score data
for i in range(3):
    topic_and_hardness[f"score_value_{i+1}"] = topic_and_hardness[f"score_value_{i+1}"].apply(
        # Clean nested list element into an int
        lambda x: x[0][0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], list) and len(x[0]) == 1 else (
            # Else clean the list element into an int
            x[0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], (int, float)) 
            # Else leave it alone
            else x
        )
    )

In [75]:
topic_and_hardness["score_value_1"][
    topic_and_hardness["score_value_1"].apply(lambda x: isinstance(x, list))
]

Series([], Name: score_value_1, dtype: float64)

In [76]:
# Clean the topic modeling data
for i in range(3):
    topic_and_hardness[f"topic_modeling_{i+1}"] = topic_and_hardness[f"topic_modeling_{i+1}"].apply(
        # Clean list element into an string
        lambda x: ", ".join(x) if isinstance(x, list) else x
    )

In [77]:
topic_and_hardness["topic_modeling_3"][
    topic_and_hardness["topic_modeling_3"].apply(lambda x: isinstance(x, list))
]

Series([], Name: topic_modeling_3, dtype: object)

In [78]:
group_1 = topic_and_hardness.groupby(["topic_modeling_1", "prompt_length"])["score_value_1"].mean().reset_index()
group_1

Unnamed: 0,topic_modeling_1,prompt_length,score_value_1
0,3D Modeling,42,7.333333
1,3D Modeling,57,7.000000
2,3D Modeling,102,8.000000
3,3D Modeling,103,9.000000
4,3D Printing,158,7.000000
...,...,...,...
21263,"troubleshooting, technology",39,7.000000
21264,urban forestry,71,8.000000
21265,"videogames, recommendation",54,7.000000
21266,weather forecast,122,7.000000


In [79]:
topic_and_hardness.head(1)

Unnamed: 0,question_id,prompt,topic_modeling_1,score_value_1,topic_modeling_2,score_value_2,topic_modeling_3,score_value_3,prompt_length
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,Technical Comparison,9.0,Software Comparison,8.0,"Comparison, Technology",9.0,47


In [None]:
df_train = topic_and_hardness.drop(columns = ["question_id", "topic_modeling_1", "topic_modeling_2", "topic_modeling_3"])

In [81]:
df_train.head(1)

Unnamed: 0,score_value_1,score_value_2,score_value_3,prompt_length
0,9.0,8.0,9.0,47


In [82]:
# Concatenate the training data with the prompt embeddings
# Create the modelling data
df_train = pd.concat([df_prompt, df_train], axis = 1)
df_train.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,score_value_1,score_value_2,score_value_3,prompt_length
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807,9.0,8.0,9.0,47


In [83]:
df_train.isnull().sum()

feature_1         0
feature_2         0
feature_3         0
feature_4         0
feature_5         0
                 ..
feature_256       0
score_value_1    26
score_value_2    26
score_value_3    26
prompt_length     0
Length: 260, dtype: int64

In [84]:
# Drop rows with missing values
df_train.drop(df_train[df_train["score_value_1"].isnull() == True].index, inplace = True)
df_train.shape

(25256, 260)

In [85]:
df_train.isnull().sum()

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
                ..
feature_256      0
score_value_1    0
score_value_2    0
score_value_3    0
prompt_length    0
Length: 260, dtype: int64

In [86]:
# Extract the valid prompt embeddings back into an array
embedding_columns = [col for col in df_train.columns if col.startswith("feature_")]
valid_prompt_embeddings = df_train[embedding_columns].to_numpy()

In [87]:
df_train.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,score_value_1,score_value_2,score_value_3,prompt_length
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807,9.0,8.0,9.0,47


## Fold based feature engineering

In [88]:
from sklearn.model_selection import KFold

# Split the data into folds
kfolds = KFold(n_splits = 5, shuffle = True, random_state = 42)
folds = list(kfolds.split(df_train))
folds

[(array([    0,     1,     2, ..., 25253, 25254, 25255]),
  array([   17,    29,    30, ..., 25226, 25239, 25240])),
 (array([    1,     2,     5, ..., 25252, 25254, 25255]),
  array([    0,     3,     4, ..., 25247, 25251, 25253])),
 (array([    0,     1,     2, ..., 25252, 25253, 25254]),
  array([    5,     7,     8, ..., 25249, 25250, 25255])),
 (array([    0,     3,     4, ..., 25253, 25254, 25255]),
  array([    1,     2,    10, ..., 25205, 25210, 25229])),
 (array([    0,     1,     2, ..., 25251, 25253, 25255]),
  array([    9,    11,    13, ..., 25245, 25252, 25254]))]

In [89]:
for i in range(5):
    df_train[f"similarity_{i+1}"] = None
    for j in range(3):
        df_train[f"sim_{i+1}_score_{j+1}"] = None

In [90]:
df_train.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,sim_3_score_2,sim_3_score_3,similarity_4,sim_4_score_1,sim_4_score_2,sim_4_score_3,similarity_5,sim_5_score_1,sim_5_score_2,sim_5_score_3
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,,,,,,,,,,


In [91]:
from sklearn.metrics.pairwise import cosine_similarity

# Loop through the folds
for fold_idx, (train_idx, val_idx) in enumerate(folds):
    # Index the dataset (Note: the indices of training data / val data match df_train)
    training_data = df_train.iloc[train_idx]
    val_data = df_train.iloc[val_idx]

    # Obtain prompt embeddings
    training_prompt_indices = [col for col in training_data.columns if col.startswith("feature_")]
    val_prompt_indices = [col for col in val_data.columns if col.startswith("feature_")]
    training_prompt_embeddings = training_data[training_prompt_indices].to_numpy()
    val_prompt_embeddings = val_data[val_prompt_indices].to_numpy()

    # Obtain the similarity of every prompt in the validation data to every prompt in the training data
    for idx, prompt_embedding in zip(val_idx, val_prompt_embeddings):
        prompt_embedding = prompt_embedding.reshape(1, -1)
        similarity = cosine_similarity(prompt_embedding, training_prompt_embeddings).flatten()

        # Obtain the similarity scores and the index of the top five most similar prompts
        top_five_indices = np.argsort(similarity)[-5:][::-1]
        top_five_similarities = similarity[top_five_indices]

        # Add this data to the dataframe
        for order, (idy, sim_score) in enumerate(zip(top_five_indices, top_five_similarities), start = 1):
            # Add the similarity scores for the top five most similar prompts to the dataframe
            df_train.loc[idx, f"similarity_{order}"] = sim_score

            # Add the scores from the similar prompts to the data
            for j in range(3):
                df_train.loc[idx, f"sim_{order}_score_{j+1}"] = training_data.iloc[idy][f"score_value_{j+1}"]

In [92]:
df_train.isnull().sum()

feature_1        26
feature_2        26
feature_3        26
feature_4        26
feature_5        26
                 ..
sim_4_score_3    26
similarity_5     26
sim_5_score_1    26
sim_5_score_2    26
sim_5_score_3    26
Length: 280, dtype: int64

In [93]:
# Drop rows with missing values
df_train.drop(df_train[df_train["score_value_1"].isnull() == True].index, inplace = True)
df_train.shape

(25256, 280)

In [94]:
df_train.isnull().sum()

feature_1         0
feature_2         0
feature_3         0
feature_4         0
feature_5         0
                 ..
sim_4_score_3    26
similarity_5     26
sim_5_score_1    26
sim_5_score_2    26
sim_5_score_3    26
Length: 280, dtype: int64

In [95]:
# Drop rows with missing values
df_train.drop(df_train[df_train["sim_5_score_3"].isnull() == True].index, inplace = True)
df_train.shape

(25230, 280)

In [30]:
df_train.isnull().sum()

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
                ..
sim_4_score_2    0
sim_4_score_3    0
sim_5_score_1    0
sim_5_score_2    0
sim_5_score_3    0
Length: 295, dtype: int64

In [96]:
df_train.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,sim_3_score_2,sim_3_score_3,similarity_4,sim_4_score_1,sim_4_score_2,sim_4_score_3,similarity_5,sim_5_score_1,sim_5_score_2,sim_5_score_3
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,7.0,7.0,0.487742,9.0,9.0,9.0,0.475612,9.0,8.0,8.0


In [116]:
df_train["score_value_1"] = df_train["score_value_1"].fillna(0).astype(int)
df_train["score_value_2"] = df_train["score_value_2"].fillna(0).astype(int)
df_train["score_value_3"] = df_train["score_value_3"].fillna(0).astype(int)
df_train["score_value_1"] = df_train["score_value_1"].astype(int)
df_train["score_value_2"] = df_train["score_value_2"].astype(int)
df_train["score_value_3"] = df_train["score_value_3"].astype(int)

## Model Building

In [115]:
# Data Processing Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Model Building Libraries
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import (
    LinearRegression,
    Ridge, 
    Lasso,
    ElasticNet,
    SGDRegressor,
    BayesianRidge,
    ARDRegression
)

from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

In [117]:
# Split the data into X and y
X = df_train.drop(columns = ["score_value_1", 
                             "score_value_2", 
                             "score_value_3", ])
y = df_train[["score_value_1", "score_value_2", "score_value_3"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Scale the data.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [126]:
# Create an empty list to store all of the models for testing
models = []

# Append models into the list\
models.append(("Adaboost", AdaBoostClassifier()))
models.append(("GradientBoosting", GradientBoostingClassifier()))
models.append(("RandomForest", RandomForestClassifier(n_jobs = -1)))
models.append(("Bagging",BaggingClassifier(n_jobs = -1)))
models.append(("XGBClassifier", XGBClassifier(eval_metric = "logloss", n_jobs = -1)))

# Create lists to store the output of the training loop
model_names = []
train_MSE = []
test_MSE = []
trained_models = []

# Loop through the models to obtain mean cross-validated MSE scores
for name, model in models:

    # Add the model name to the list for this iteration
    model_names.append(name)

    # Set training parameters
    scoring = "neg_mean_squared_error"
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

    # Get the mean cross-validated MSE score on the training data
    #cv_result = cross_val_score(estimator = model, 
                                      #X = X_train_s, 
                                      #y = y_train["score_value_1"], 
                                      #cv = kfold, 
                                      #scoring = scoring
                                      #)
    
    # Evaluate the model on MSE
    model.fit(X_train_s, y_train["score_value_1"])
    y_pred_train = model.predict(X_train_s)
    y_pred_test = model.predict(X_test_s)

    train_mse = mean_squared_error(y_train["score_value_1"], y_pred_train)
    test_mse = mean_squared_error(y_test["score_value_1"], y_pred_test)

    train_MSE.append(train_mse)
    test_MSE.append(test_mse)

# Print Results
print("\n" "Cross-Validation MSE on Training Data:")
for i in range(len(model_names)):
        print("{}: {}".format(model_names[i], train_MSE[i]))

print("\n" "MSE on Testing Data:")
for i in range(len(model_names)):
    print("{}: {}".format(model_names[i], test_MSE[i]))

KeyboardInterrupt: 

In [None]:
# Create an empty list to store all of the models for testing
models = []

# Append models into the list\
models.append(("LinearRegression", LinearRegression()))
models.append(("Ridge", Ridge()))
models.append(("Lasso", Lasso()))
models.append(("ElasticNet",ElasticNet()))
models.append(("SGDRegressor", SGDRegressor()))
models.append(("BayesianRidge", BayesianRidge()))
models.append(("ARDRegression", ARDRegression()))

# Create lists to store the output of the training loop
model_names = []
train_MSE = []
test_MSE = []
trained_models = []

# Loop through the models to obtain mean cross-validated MSE scores
for name, model in models:

    # Add the model name to the list for this iteration
    model_names.append(name)

    # Set training parameters
    scoring = "neg_mean_squared_error"
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

    # Get the mean cross-validated MSE score on the training data
    train_cv_result = cross_validate(estimator = model, 
                                      X = X_train_s, 
                                      y = y_train, 
                                      cv = kfold, 
                                      scoring = scoring,
                                      return_train_score = True,
                                      return_estimator = True)
    avg_train_MSE = -train_cv_result['train_score']
    min_score_index = np.argmin(avg_train_MSE)

    train_MSE.append(avg_train_MSE[min_score_index])
    cv_model = train_cv_result['estimator'][min_score_index]
    trained_models.append(cv_model)

    # Get the MSE score on the test data
    y_pred = cv_model.predict(X_test_s)
    y_pred_int = np.round(y_pred).astype(int) # Round predictions to nearest integer
    comp_MSE = mean_squared_error(y_test, y_pred_int)
    test_MSE.append(comp_MSE)

# Print Results
print("\n" "Cross-Validation MSE on Training Data:")
for i in range(len(model_names)):
        print("{}: {}".format(model_names[i], train_MSE[i]))

print("\n" "MSE on Testing Data:")
for i in range(len(model_names)):
    print("{}: {}".format(model_names[i], test_MSE[i]))


Cross-Validation MSE on Training Data:
LinearRegression: 2.5262352808967656
Ridge: 2.526235288661006
Lasso: 3.627914308214588
ElasticNet: 3.627914308214588
SGDRegressor: 2.621268220513191
BayesianRidge: 2.531605310033661
ARDRegression: 2.540252394038225

MSE on Testing Data:
LinearRegression: 2.7758840886070373
Ridge: 2.7758400493239974
Lasso: 3.6570220636808037
ElasticNet: 3.6570220636808037
SGDRegressor: 2.850574712643678
BayesianRidge: 2.7739023208702167
ARDRegression: 2.782489981063108


From the above analysis you can see that the models perform similarly on the testing and training data, but there are some slight differences. 

The Lasso and ElasticNet models have the highest MSE at ~3.50 and ~2.61. The SGDRegressor model is clearly the fifth place candidate based on its MSE of ~2.02 on the testing data.

The remaining models have similar MSE scores on the test data. By obersving the Ridge and Linear Regression models and applying some critical thinking, we can conclude that when using mean_squared_error as the loss metric for the Linear Regression model it effectively becomes a Ridge model. Therefore the top three candidates are: Ridge, ARDRegression, and BayesianRidge.

We will choose the Ridge model and the ARDRegression model as our top two models to perform hyperparameter tuning on.

Note: The Ridge model only has one parameter for hyperparameter tuning, alpha. The BayesianRidge model has four parameters for tuning: alpha_1, alpha_2, lambda_1, lambda_2.

## Hyperparameter Tuning

In [105]:
# Hyperparameter tuning for Ridge model

# Set up the parameter grid
param_grid = {
    "alpha" : [1E-10, 1E-9, 1E-8, 1E-7, 1E-6, 1E-5, 1E-4, 1E-3]}
scoring = 'neg_mean_squared_error'

# Perform GridSearchCV across the parameter grid
grid_search = GridSearchCV(estimator = Ridge(), 
                           param_grid = param_grid, 
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True # Return the MSE for each alpha in .cv_results_
                           )
grid_search.fit(X_train_s, y_train)

# Obtain the best model from grid search
best_ridge_model = grid_search.best_estimator_
best_ridge_param = grid_search.best_params_

# Evaluate the model on the test data
y_pred = best_ridge_model.predict(X_test_s)
test_MSE = mean_squared_error(y_test, y_pred)

# Print the results
print(f"The best alpha is {best_ridge_param}")
print(f"The cross-validated MSE for the best Ridge model is {-grid_search.best_score_}")
print(f"The MSE of the best Ridge model versus the test data is {test_MSE}")

The best alpha is {'alpha': 0.001}
The cross-validated MSE for the best Ridge model is 2.6664906606232335
The MSE of the best Ridge model versus the test data is 2.6720156459706694


In [112]:
# Hyperparameter tuning for BayesianRidge model

from sklearn.model_selection import RandomizedSearchCV

# Set up the parameter grid
param_grid = {
    "alpha_1" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "alpha_2" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "lambda_1" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "lambda_2" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    
}
scoring = 'neg_mean_squared_error'

# Perform RandomizedSearchCV across the parameter grid
rand_search = RandomizedSearchCV(estimator = BayesianRidge(), 
                           param_distributions = param_grid, 
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True, # Return the MSE for each alpha in .cv_results_
                           n_iter = 100,
                           random_state = 42, 
                           n_jobs = -1
                           )
rand_search.fit(X_train_s, y_train["score_value_1"])

# Obtain the best model and parameters
best_BAY_model = rand_search.best_estimator_
best_params = rand_search.best_params_

# Evaluate the model on the test data
y_pred = best_BAY_model.predict(X_test_s)
test_MSE = mean_squared_error(y_test["score_value_1"], y_pred)

# Print the results
print(f"The cross-validated MSE for the best BayesianRidge model is {-rand_search.best_score_}")
print(f"The MSE of the best BayesianRidge versus the test data is {test_MSE}")

The cross-validated MSE for the best BayesianRidge model is 2.6888014314823026
The MSE of the best BayesianRidge versus the test data is 2.613380166701588


In [110]:
X.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,prompt_length,assumed_score_2,assumed_score_3
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,0.006923,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807,47,6.75,8.666667


## Evaluating against the Test Data

In [54]:
# Auxiliary Datasets

# Embedding Data -- we will use this data in the "Embedding Data" section
test_embeddings = np.load(
    "../testing_data/arena-test-set-prompts-embeddings.npy"
)

# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
test_hardness = pd.read_json(
    "../testing_data/arena-test-set-topic-modeling.jsonl.gz",
    lines=True,
    compression="gzip"
)

In [55]:
# Turn the prompt embeddings data into a pandas dataframe
num_features = test_embeddings.shape[1]
column_names = [f"feature_{i+1}" for i in range(num_features)]
df_test_prompt = pd.DataFrame(test_embeddings, columns = column_names)
df_test_prompt.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-0.055131,-0.115709,0.055225,0.050576,0.010953,-0.004206,0.062269,0.064194,0.06936,0.016847,...,0.035079,-0.044518,0.043931,0.000368,-0.076169,0.002721,0.008611,-0.01545,-0.033905,-0.057479


In [138]:
df_test_prompt.shape

(3200, 256)

In [57]:
# Add a column for prompt length
test_hardness["prompt_length"] = test_hardness["prompt"].apply(len)

In [62]:
# Clean the topic modeling data
for i in range(3):
    test_hardness[f"topic_modeling_{i+1}"] = test_hardness[f"topic_modeling_{i+1}"].apply(
        # Clean list element into an string
        lambda x: ", ".join(x) if isinstance(x, list) else x
    )

In [58]:
test_hardness.head(1)

Unnamed: 0,question_id,prompt,topic_modeling_1,topic_modeling_2,topic_modeling_3,prompt_length
0,4f332ebd8cdc4ff2be74aa8828ff20d5,what do you think about the future of iran?,Future Prediction,Future Prediction,Future Prediction,43


In [70]:
counter_in = 0
counter_out = 0
for topic in test_hardness["topic_modeling_1"].unique():
    if topic in (topic_and_hardness["topic_modeling_1"].unique()):
        counter_in += 1
    else:
        counter_out += 1

print(counter_in, counter_out)

895 1130


You can see that out of the unique topics in the test data, only 895 of them are in the training data. The majority of topics in the test data are not in the training data. How can we find the assumed score? First, we can determine the cosine similarity between the embedding for the test prompt versus all of the training prompts. Then we can assume that the topic of the test prompt is the same topic as the most similar training prompt. Once we have a new assumed topic we can assume the hardness score is the average hardness score for that topic. This will allow us to have an assumed hardness score for every prompt embedding.

In [127]:
from sklearn.metrics.pairwise import cosine_similarity

# Create a function to determine the topic of the most similar training prompt.
def assumed_topic(test_prompt_embedding, all_training_embeddings, topic_modeling):
    # Reshape the test prompt embedding
    test_prompt_embedding = np.array(test_prompt_embedding).reshape(1, -1)
    # Determine the cosine similarity between the test prompt and all training prompts
    test_similarity = cosine_similarity(test_prompt_embedding, all_training_embeddings).flatten()

    # Determine the index of the most similar training prompt
    index = np.argmax(test_similarity)

    # Return the most similar training prompt
    return topic_modeling.iloc[index]

In [112]:
test_hardness.columns

Index(['question_id', 'prompt', 'topic_modeling_1', 'topic_modeling_2',
       'topic_modeling_3', 'prompt_length'],
      dtype='object')

In [113]:
test_hardness = test_hardness.drop(columns = ['question_id', 'prompt'])
test_hardness.head(1)

Unnamed: 0,topic_modeling_1,topic_modeling_2,topic_modeling_3,prompt_length
0,Future Prediction,Future Prediction,Future Prediction,43


In [114]:
# Concatenate the training data with the prompt embeddings
df_test = pd.concat([df_test_prompt, test_hardness], axis = 1)
df_test.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,topic_modeling_1,topic_modeling_2,topic_modeling_3,prompt_length
0,-0.055131,-0.115709,0.055225,0.050576,0.010953,-0.004206,0.062269,0.064194,0.06936,0.016847,...,-0.076169,0.002721,0.008611,-0.01545,-0.033905,-0.057479,Future Prediction,Future Prediction,Future Prediction,43


In [170]:
# Introduce the prediction heuristic for topic_modeling_2
assumed_scores_2 = []
assumed_scores_3 = []

for i in range(test_hardness.shape[0]):
    topic_2 = test_hardness["topic_modeling_2"].iloc[i]
    topic_3 = test_hardness["topic_modeling_3"].iloc[i]
    test_embedding = test_embeddings[i]

    if topic_2 in topic_2_avg_scores:
        # If topic is in the training data, use the average score for the topic
        assumed_score_2 = topic_2_avg_scores[topic_2]
    else:
        # If the topic is not in the training data, use the average score for the topic of the most similar prompt.
        most_similar_topic = assumed_topic(test_embedding,
                                           valid_prompt_embeddings,
                                           df_train["topic_modeling_2"])
        assumed_score_2 = topic_2_avg_scores[most_similar_topic]
    assumed_scores_2.append(assumed_score_2)

    if topic_3 in topic_3_avg_scores:
        # If topic is in the training data, use the average score for the topic
        assumed_score_3 = topic_3_avg_scores[topic_3]
    else:
        # If the topic is not in the training data, use the average score for the topic of the most similar prompt.
        most_similar_topic = assumed_topic(test_embedding,
                                           valid_prompt_embeddings,
                                           df_train["topic_modeling_3"])
        assumed_score_3 = topic_3_avg_scores[most_similar_topic]
    
    assumed_scores_3.append(assumed_score_3)

# Create the test dataframe
df_test = pd.DataFrame()
df_test["prompt_length"] = test_hardness["prompt_length"]
df_test["assumed_score_2"] = assumed_score_2
df_test["assumed_score_3"] = assumed_score_3
# Concatenate the training data with the prompt embeddings
df_test = pd.concat([df_test_prompt, df_test], axis = 1)
df_test.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,prompt_length,assumed_score_2,assumed_score_3
0,-0.055131,-0.115709,0.055225,0.050576,0.010953,-0.004206,0.062269,0.064194,0.06936,0.016847,...,0.000368,-0.076169,0.002721,0.008611,-0.01545,-0.033905,-0.057479,43,7.5,6.833333


In [172]:
# Scale the data before making predictions
scaler = StandardScaler()
X_train_s = scaler.fit_transform(df_test)

# Make Predictions
y_pred = best_BAY_model.predict(X_train_s)
y_pred.astype(int)

array([7, 6, 7, ..., 7, 6, 7])

In [173]:
submission_df = pd.read_csv("submission_20241129_084701.csv")
submission_df

Unnamed: 0,question_id,winner,hardness_score
0,4f332ebd8cdc4ff2be74aa8828ff20d5,model_b,9
1,f2be6f13e5ed40e5b81443223996494c,model_b,9
2,5fafefb8a0c54243afb52d2892946cea,model_b,9
3,7834f572267f40709ecebb273a2b346b,model_a,9
4,1ccc7e58290245c4bd5457fce45f8640,model_a,9
...,...,...,...
3195,eb08f8a7f20840c99efe9fc8c03f1c13,model_a,9
3196,4baca918f1f5440599ae9edb3bfa8cc1,model_b,9
3197,a787ce60dc1440f39455ab20e3bffe33,model_b,9
3198,3dc09f20eedb405ab3dc980cf7bff5d0,model_a,9


In [175]:
submission_df["hardness_score"] = y_pred.astype(int)
submission_df

Unnamed: 0,question_id,winner,hardness_score
0,4f332ebd8cdc4ff2be74aa8828ff20d5,model_b,7
1,f2be6f13e5ed40e5b81443223996494c,model_b,6
2,5fafefb8a0c54243afb52d2892946cea,model_b,7
3,7834f572267f40709ecebb273a2b346b,model_a,6
4,1ccc7e58290245c4bd5457fce45f8640,model_a,7
...,...,...,...
3195,eb08f8a7f20840c99efe9fc8c03f1c13,model_a,7
3196,4baca918f1f5440599ae9edb3bfa8cc1,model_b,7
3197,a787ce60dc1440f39455ab20e3bffe33,model_b,7
3198,3dc09f20eedb405ab3dc980cf7bff5d0,model_a,6


In [176]:
submission_df.to_csv("submission.csv", index = False)

In [214]:
test_prompt_embedding = test_embeddings[25]

similarity = cosine_similarity(np.array(test_prompt_embedding).reshape(1, -1), valid_prompt_embeddings)
similarity

array([[ 0.17592709, -0.03469267,  0.15824214, ...,  0.25422472,
         0.17422746,  0.24273388]], dtype=float32)

In [215]:
top_five_indices = np.argsort(similarity)[-5:][::-1]
top_five_indices

array([[ 2308,  8667, 18111, ..., 19321, 10712, 13465]])