In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sns.set_theme(style="whitegrid")

## Modelling Task B

For this analysis we will be creating a linear regression model to predict the hardness score. The hardness score represent the difficulty of the question being asked. Therefore, it is independent of the main dataset (conversation data) or the response embedding data in the auxilliary dataset. To predict the hardness score we will make the assumption that each row of embedding data from the prompt embeddings corresponds to each row of the topic_and_hardness dataset.

The problem statement for task B states that we must use linear regression to determine the hardness score. Therefore, any linear model from the sklearn library would meet this criteria. Therefore, we will perform an analysis and return the results of the best performing models. We will then select the top two models for hyperparameter tuning to create our final models, then the best model of the tuned models will be the final model.

In [62]:
# Auxiliary Datasets

# Prompt embedding Data -- we will use this data in the "Embedding Data" section
prompt_embeddings = np.load(
    "../training_data/chatbot-arena-prompts-embeddings.npy"
)

# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "../training_data/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)

# Response embedding Data -- we will use this data in the "Embedding Data" section
response_embeddings = np.load(
    "../training_data/chatbot-arena-prompts-embeddings.npy"
)

In [63]:
prompt_embeddings.shape

(25282, 256)

In [64]:
response_embeddings.shape

(25282, 256)

In [65]:
topic_and_hardness.shape

(25282, 12)

In [66]:
topic_and_hardness.head(1)

Unnamed: 0,question_id,prompt,openai_scores_raw_choices_nested,topic_modeling_1,score_reason_1,score_value_1,topic_modeling_2,score_reason_2,score_value_2,topic_modeling_3,score_reason_3,score_value_3
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,"[{'finish_reason': 'stop', 'index': 0, 'logpro...",Technical Comparison,This prompt requires the AI to accurately comp...,9,Software Comparison,This prompt assesses the AI's factual accuracy...,8,"Comparison, Technology",This prompt requires the AI to demonstrate kno...,9


In [67]:
topic_and_hardness["topic_modeling_1"].unique

<bound method Series.unique of 0             Technical Comparison
1               Reasoning, Emotion
2                Camera comparison
3                    Chatbot Arena
4                       Time Query
                   ...            
25277     Mathematics, Measurement
25278        Information Retrieval
25279    Training, Hyperparameters
25280            Language Modeling
25281          Workflow Automation
Name: topic_modeling_1, Length: 25282, dtype: object>

Based on the above analysis, majority of the data in the topic_and_hardness dataframe is not useful for analysis. Therefore our method will be to create an ensemble type approach where we train a model to target each of the three score values, then average the result from the three models to obtain the final predicted hardness score. But first we must find out which model performs best on our data.

Let us first create our data.

In [68]:
topic_and_hardness.columns

Index(['question_id', 'prompt', 'openai_scores_raw_choices_nested',
       'topic_modeling_1', 'score_reason_1', 'score_value_1',
       'topic_modeling_2', 'score_reason_2', 'score_value_2',
       'topic_modeling_3', 'score_reason_3', 'score_value_3'],
      dtype='object')

In [69]:
# Extract the score value columns
data = topic_and_hardness[["score_value_1", "score_value_2", "score_value_3"]].copy()
data["prompt_length"] = topic_and_hardness["prompt"].apply(len)
data.head(1)

Unnamed: 0,score_value_1,score_value_2,score_value_3,prompt_length
0,9,8,9,47


In [70]:
data.isnull().sum()

score_value_1    26
score_value_2    26
score_value_3    26
prompt_length     0
dtype: int64

In [71]:
# Turn the prompt embeddings data into a pandas dataframe
num_features = response_embeddings.shape[1]
column_names = [f"feature_{i+1}" for i in range(num_features)]
df_response = pd.DataFrame(response_embeddings, columns = column_names)
df_response.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,-0.024708,-0.114236,0.034814,0.006923,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807


In [72]:
# Turn the prompt embeddings data into a pandas dataframe
num_features = prompt_embeddings.shape[1]
column_names = [f"feature_{i+1}" for i in range(num_features)]
df_prompt = pd.DataFrame(prompt_embeddings, columns = column_names)
df_prompt.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,-0.024708,-0.114236,0.034814,0.006923,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807


In [85]:
# Create the modelling data
m_data = pd.concat([df_response, data], axis = 1)
#m_data = pd.concat([m_data, data], axis = 1)
m_data.head(5)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,score_value_1,score_value_2,score_value_3,prompt_length
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807,9,8,9,47
1,0.006028,0.028436,-0.091022,0.039573,-0.080445,-0.0536,-0.046251,-0.026352,-0.081835,0.04504,...,-0.022912,-0.082866,0.055752,0.085062,-0.053332,0.001854,9,8,8,49
2,-0.035222,-0.109402,-0.022247,-0.037604,0.037931,-0.049936,-0.011818,0.03361,0.032769,0.022925,...,0.032092,0.055308,-0.035479,-0.141167,0.004774,0.004169,2,6,2,32
3,-0.05053,-0.004413,0.090092,0.029821,-0.037979,-0.095112,-0.016179,0.006698,-0.06379,0.046847,...,-0.013089,0.060516,0.032741,-0.034432,0.045946,-0.063517,8,8,8,35
4,-0.038406,0.045207,0.061096,0.051551,0.046493,-0.016303,0.058638,0.054352,-0.065154,-0.023475,...,0.070126,-0.039035,-0.083557,-0.045493,0.012152,-0.010252,2,2,2,17


In [86]:
# Drop rows with missing values
m_data.drop(list(m_data[m_data["score_value_1"].isnull() == True].index), inplace = True)

In [87]:
m_data.isnull().sum()

feature_1        0
feature_2        0
feature_3        0
feature_4        0
feature_5        0
                ..
feature_256      0
score_value_1    0
score_value_2    0
score_value_3    0
prompt_length    0
Length: 260, dtype: int64

In [88]:
# Clean the data
for i in range(3):
    m_data[f"score_value_{i+1}"] = m_data[f"score_value_{i+1}"].apply(
        # Clean nested list element into an int
        lambda x: x[0][0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], list) and len(x[0]) == 1 else (
            # Else clean the list element into an int
            x[0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], (int, float)) 
            # Else leave it alone
            else x
        )
    )

In [89]:
m_data.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,score_value_1,score_value_2,score_value_3,prompt_length
0,-0.123763,-0.117352,0.045677,0.015849,0.085833,-0.027624,0.003787,-0.08236,0.088994,-0.00169,...,0.015938,0.059344,-0.162139,-0.024396,-0.03724,-0.043807,9.0,8,9.0,47
1,0.006028,0.028436,-0.091022,0.039573,-0.080445,-0.0536,-0.046251,-0.026352,-0.081835,0.04504,...,-0.022912,-0.082866,0.055752,0.085062,-0.053332,0.001854,9.0,8,8.0,49
2,-0.035222,-0.109402,-0.022247,-0.037604,0.037931,-0.049936,-0.011818,0.03361,0.032769,0.022925,...,0.032092,0.055308,-0.035479,-0.141167,0.004774,0.004169,2.0,6,2.0,32
3,-0.05053,-0.004413,0.090092,0.029821,-0.037979,-0.095112,-0.016179,0.006698,-0.06379,0.046847,...,-0.013089,0.060516,0.032741,-0.034432,0.045946,-0.063517,8.0,8,8.0,35
4,-0.038406,0.045207,0.061096,0.051551,0.046493,-0.016303,0.058638,0.054352,-0.065154,-0.023475,...,0.070126,-0.039035,-0.083557,-0.045493,0.012152,-0.010252,2.0,2,2.0,17


In [79]:
# Data Processing Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Model Building Libraries
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import (
    LinearRegression,
    Ridge, 
    Lasso,
    ElasticNet,
    SGDRegressor,
    BayesianRidge,
    ARDRegression
)

In [90]:
# Data Preprocessing

# Split the data into X and y
X = m_data.drop(columns = ["score_value_1", "score_value_2", "score_value_3"])
y_1 = m_data["score_value_1"]
y_2 = m_data["score_value_2"]
y_3 = m_data["score_value_3"]

X_train, X_test, y_train_1, y_test_1 = train_test_split(X, y_1, test_size = 0.3, random_state = 42)
_, _, y_train_2, y_test_2 = train_test_split(X, y_2, test_size = 0.3, random_state = 42)
_, _, y_train_3, y_test_3 = train_test_split(X, y_3, test_size = 0.3, random_state = 42)

# Scale the data.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


In [91]:
y_train_1

8684     8.0
3273     7.0
12870    7.0
25162    9.0
17897    8.0
        ... 
21589    8.0
5392     8.0
861      8.0
15808    8.0
23668    7.0
Name: score_value_1, Length: 17679, dtype: float64

In [92]:
# Create an empty list to store all of the models for testing
models = []

# Append models into the list\
models.append(("LinearRegression", LinearRegression()))
models.append(("Ridge", Ridge()))
models.append(("Lasso", Lasso()))
models.append(("ElasticNet",ElasticNet()))
models.append(("SGDRegressor", SGDRegressor()))
models.append(("BayesianRidge", BayesianRidge()))
models.append(("ARDRegression", ARDRegression()))

# Create lists to store the output of the training loop
model_names = []
train_MSE = []
test_MSE = []

# Loop through the models to obtain mean cross-validated MSE scores
for name, model in models:

    # Add the model name to the list for this iteration
    model_names.append(name)

    # Set training parameters
    scoring = "neg_mean_squared_error"
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

    # Get the mean cross-validated MSE score on the training data
    train_cv_result = cross_val_score(estimator = model, X = X_train_s, y = y_train_1, cv = kfold, scoring = scoring)
    avg_train_MSE = -train_cv_result.mean()
    train_MSE.append(avg_train_MSE)
    

    # Get the MSE score on the test data
    model.fit(X_train_s, y_train_1)
    y_pred = model.predict(X_test_s)
    y_pred_int = np.round(y_pred).astype(int) # Round predictions to nearest integer
    comp_MSE = mean_squared_error(y_test_1, y_pred_int)
    test_MSE.append(comp_MSE)

# Print Results
print("\n" "Cross-Validation MSE on Training Data:")
for i in range(len(model_names)):
    print("{}: {}".format(model_names[i], train_MSE[i]))

print("\n" "MSE on Testing Data:")
for i in range(len(model_names)):
    print("{}: {}".format(model_names[i], test_MSE[i]))



Cross-Validation MSE on Training Data:
LinearRegression: 2.6539774077052627
Ridge: 2.653964388762386
Lasso: 3.614491152408501
ElasticNet: 3.614491152408501
SGDRegressor: 2.7568733667495446
BayesianRidge: 2.6481119432862767
ARDRegression: 2.6570359163228963

MSE on Testing Data:
LinearRegression: 2.7925353042101095
Ridge: 2.7925353042101095
Lasso: 3.7500910650653294
ElasticNet: 3.7500910650653294
SGDRegressor: 2.842766266332321
BayesianRidge: 2.7925353042101095
ARDRegression: 2.807184901676125


From the above analysis you can see that the models perform similarly on the testing and training data, but there are some slight differences. 

The Lasso and ElasticNet models have the highest MSE at ~1.93 and ~1.59. The SGDRegressor model is clearly the fifth place candidate based on its MSE of ~0.8 on the testing data.

The remaining models have similar MSE scores on the test data. By obersving the Ridge and Linear Regression models and applying some critical thinking, we can conclude that when using mean_squared_error as the loss metric for the Linear Regression model it effectively becomes a Ridge model. Therefore the top three candidates are: Ridge, ARDRegression, and BayesianRidge.

We will choose the Ridge model and the ARDRegression model as our top two models to perform hyperparameter tuning on.

Note: The Ridge model only has one parameter for hyperparameter tuning, alpha. The ARDRegression model has four parameters for tuning: alpha_1, alpha_2, lambda_1, lambda_2.

## Hyperparameter Tuning


In [93]:
# Hyperparameter tuning for Ridge model

# Set up the parameter grid
param_grid = {
    "alpha" : [1E-10, 1E-9, 1E-8, 1E-7, 1E-6, 1E-5, 1E-4, 1E-3]}
scoring = 'neg_mean_squared_error'

# Perform GridSearchCV across the parameter grid
grid_search = GridSearchCV(estimator = Ridge(), 
                           param_grid = param_grid, 
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True # Return the MSE for each alpha in .cv_results_
                           )
grid_search.fit(X_train_s, y_train_1)

# Create empty variables to store the best model
best_ridge_model = []
best_alpha = []
best_ridge_train_MSE = None
best_ridge_test_MSE = float("inf")

# Loop through each alpha in the parameter grid
for i, param in enumerate(grid_search.cv_results_["params"]):

    # Obtain the train_MSE for the iteration
    train_MSE = -grid_search.cv_results_["mean_test_score"][i]

    # Train the Ridge model on the alpha for this iteration
    model = Ridge(alpha = param["alpha"])
    model.fit(X_train_s, y_train_1)
    y_pred = model.predict(X_test_s)
    y_pred_int = np.round(y_pred).astype(int)

    # Obtain the test_MSE for the iteration
    test_MSE = mean_squared_error(y_test_1, y_pred_int)

    if test_MSE < best_ridge_test_MSE:
        best_ridge_model = model
        best_alpha = param["alpha"]
        best_ridge_train_MSE = train_MSE
        best_ridge_test_MSE = test_MSE

# Print the results
print(f"The best alpha is {best_alpha}")
print(f"The cross-validated MSE for the best Ridge model is {best_ridge_train_MSE}")
print(f"The MSE of the best Ridge model versus the test data is {best_ridge_test_MSE}")


The best alpha is 1e-10
The cross-validated MSE for the best Ridge model is 2.650754617963383
The MSE of the best Ridge model versus the test data is 2.7925353042101095


In [94]:
# Hyperparameter tuning for BayesianRidge model

from sklearn.model_selection import RandomizedSearchCV

# Set up the parameter grid
param_grid = {
    "alpha_1" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "alpha_2" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "lambda_1" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    "lambda_2" : [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    
}
scoring = 'neg_mean_squared_error'

# Perform GridSearchCV across the parameter grid
rand_search = RandomizedSearchCV(estimator = ARDRegression(), 
                           param_distributions = param_grid, 
                           cv = 5,
                           scoring = 'neg_mean_squared_error',
                           return_train_score = True, # Return the MSE for each alpha in .cv_results_
                           n_iter = 100,
                           random_state = 42, 
                           n_jobs = -1
                           )
rand_search.fit(X_train_s, y_train_1)

# Create empty variables to store the best model
best_params = []
best_ARD_train_MSE = None
best_ARD_test_MSE = float("inf")

# Loop through each alpha in the parameter grid
for i, param in enumerate(rand_search.cv_results_["params"]):

    # Obtain the train_MSE for the iteration
    train_MSE = -rand_search.cv_results_["mean_test_score"][i]

    # Train the Ridge model on the alpha for this iteration
    model = ARDRegression(**param)  # Use ** for every combination of possible parameters
    model.fit(X_train_s, y_train_1)
    y_pred = model.predict(X_test_s)
    y_pred_int = np.round(y_pred).astype(int)

    # Obtain the test_MSE for the iteration
    test_MSE = mean_squared_error(y_test_1, y_pred_int)

    if test_MSE < best_ARD_test_MSE:
        best_ARD_model = model
        best_params = param
        best_ARD_train_MSE = train_MSE
        best_ARD_test_MSE = test_MSE

# Print the results
print(f"The cross-validated MSE for the best ARDRegression model is {best_ARD_train_MSE}")
print(f"The MSE of the best ARDRegression model versus the test data is {best_ARD_test_MSE}")

The cross-validated MSE for the best ARDRegression model is 2.6478653160508805
The MSE of the best ARDRegression model versus the test data is 2.790687607232414


In [21]:
# Obtain the best parameters for the final ARDRegression model
best_params

{'lambda_2': 0.0001, 'lambda_1': 1, 'alpha_2': 1e-06, 'alpha_1': 1e-05}

Now that we have determined our best model, let us employ our ensemble method to obtain the most accurate prediction given the probabalistic responses of GPT3.5. To do this we are going to train three models on each of the score values generated by GPT3.5 and then average the results of the models to obtain our final prediction.

In [34]:
X_train_s.shape

(17679, 259)

In [23]:
# Split the data for y2 and y3 on indices to align with y
y2 = m_data["score_value_2"]
y3 = m_data["score_value_3"]

_, _, y2_train, y2_test = train_test_split(X, y2, test_size = 0.3, random_state = 42)
_, _, y3_train, y3_test = train_test_split(X, y3, test_size = 0.3, random_state = 42)

# Initialize the models
model1 = ARDRegression(alpha_1 = 1E-5, alpha_2 = 1E-6, lambda_1 = 1, lambda_2 = 0.0001)
model2 = ARDRegression(alpha_1 = 1E-5, alpha_2 = 1E-6, lambda_1 = 1, lambda_2 = 0.0001)
model3 = ARDRegression(alpha_1 = 1E-5, alpha_2 = 1E-6, lambda_1 = 1, lambda_2 = 0.0001)

# Set training parameters
scoring = "neg_mean_squared_error"
kfold1 = KFold(n_splits = 5, shuffle = True, random_state = 42)
kfold2 = KFold(n_splits = 5, shuffle = True, random_state = 16)
kfold3 = KFold(n_splits = 5, shuffle = True, random_state = 6)

# Get the mean cross-validated MSE score on the training data
train_cv_result_1 = cross_val_score(estimator = model1, X = X_train_s, y = y_train, cv = kfold1, scoring = scoring)
train_cv_result_2 = cross_val_score(estimator = model2, X = X_train_s, y = y2_train, cv = kfold2, scoring = scoring)
train_cv_result_3 = cross_val_score(estimator = model3, X = X_train_s, y = y3_train, cv = kfold3, scoring = scoring)

avg_train_MSE_1 = -train_cv_result_1.mean()
avg_train_MSE_2 = -train_cv_result_2.mean()
avg_train_MSE_3 = -train_cv_result_3.mean()    

# Fit the models and obtain predictions
model1.fit(X_train_s, y_train)
y1_pred = model.predict(X_test_s)
y1_pred_int = np.round(y1_pred).astype(int) # Round predictions to nearest integer

model2.fit(X_train_s, y2_train)
y2_pred = model.predict(X_test_s)
y2_pred_int = np.round(y2_pred).astype(int) # Round predictions to nearest integer

model3.fit(X_train_s, y3_train)
y3_pred = model.predict(X_test_s)
y3_pred_int = np.round(y3_pred).astype(int) # Round predictions to nearest integer

# Print the MSE of the training data
print(f"The MSE of the ARDRegression model predicting 'score_value_1' is {avg_train_MSE_1}.")
print(f"The MSE of the ARDRegression model predicting 'score_value_2' is {avg_train_MSE_2}.")
print(f"The MSE of the ARDRegression model predicting 'score_value_3' is {avg_train_MSE_3}.")

# Add results to a dataframe and obtain the average prediction
df_final = pd.DataFrame()
df_final["model_1_pred"] = y1_pred_int
df_final["model_2_pred"] = y2_pred_int
df_final["model_3_pred"] = y3_pred_int
df_final["avg_pred"] = np.round(df_final[["model_1_pred", "model_2_pred", "model_3_pred"]].mean(axis = 1)).astype(int)

# Obtain the MSE on the test data using the average prediction
test_MSE_1  = mean_squared_error(y_test, df_final["avg_pred"])
test_MSE_2  = mean_squared_error(y2_test, df_final["avg_pred"])
test_MSE_3  = mean_squared_error(y3_test, df_final["avg_pred"])
print(f"The MSE of the ARDRegression Ensemble agains the test data for 'score_value_1' is: {test_MSE_1}.")
print(f"The MSE of the ARDRegression Ensemble agains the test data for 'score_value_2' is: {test_MSE_2}.")
print(f"The MSE of the ARDRegression Ensemble agains the test data for 'score_value_3' is: {test_MSE_3}.")


The MSE of the ARDRegression model predicting 'score_value_1' is 0.7035689059689595.
The MSE of the ARDRegression model predicting 'score_value_2' is 0.7247891585771813.
The MSE of the ARDRegression model predicting 'score_value_3' is 0.7101071992500068.
The MSE of the ARDRegression Ensemble agains the test data for 'score_value_1' is: 0.7762755708063878.
The MSE of the ARDRegression Ensemble agains the test data for 'score_value_2' is: 0.8230170252078659.
The MSE of the ARDRegression Ensemble agains the test data for 'score_value_3' is: 0.821471558664379.


## Evaluating Against the Test Data

In [24]:
# Auxiliary Datasets

# Embedding Data -- we will use this data in the "Embedding Data" section
test_prompt_embeddings = np.load(
    "../testing_data/arena-test-set-prompts-embeddings.npy"
)

# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "../testing_data/arena-test-set-topic-modeling.jsonl.gz",
    lines=True,
    compression="gzip"
)

In [25]:
test_prompt_embeddings

array([[-0.05513094, -0.11570924,  0.05522487, ..., -0.01544981,
        -0.03390506, -0.05747894],
       [-0.02218126, -0.03471164, -0.05304605, ...,  0.01184425,
        -0.05673543,  0.10960151],
       [ 0.0456381 , -0.08502936,  0.00432697, ..., -0.06711485,
         0.07976342, -0.02999518],
       ...,
       [-0.01514673, -0.06583532,  0.09111056, ..., -0.02442352,
        -0.07683857, -0.05174748],
       [ 0.02692666, -0.04334887, -0.03203418, ..., -0.04427489,
         0.11957113, -0.10678061],
       [ 0.1114219 , -0.00910184,  0.03681982, ...,  0.01050424,
        -0.02826794,  0.09706794]], dtype=float32)

In [26]:
topic_and_hardness.head()

Unnamed: 0,question_id,prompt,topic_modeling_1,topic_modeling_2,topic_modeling_3
0,4f332ebd8cdc4ff2be74aa8828ff20d5,what do you think about the future of iran?,Future Prediction,Future Prediction,Future Prediction
1,f2be6f13e5ed40e5b81443223996494c,Salut ! Tu es un méchant chatbot !,"Role-playing, Evaluation","Role-play, Evaluation","Creativity, Factual Accuracy"
2,5fafefb8a0c54243afb52d2892946cea,⚔️ Chatbot Arena ⚔️\nRules:\n Chat with two...,Chatbot Evaluation,Chatbot Evaluation,Chatbot Evaluation
3,7834f572267f40709ecebb273a2b346b,Guess the word that i have in my mind,Guessing Game,Word Guessing,Word Guessing
4,1ccc7e58290245c4bd5457fce45f8640,You are a peasant living in the village. But s...,"Problem-Solving, Creativity",Problem Solving,"Problem-solving, Creativity"


In [28]:
topic_and_hardness.shape

(3200, 5)

In [29]:
test_prompt_embeddings.shape

(3200, 256)

In [30]:
# Perform some data manipulation
test_df = pd.DataFrame()
test_df["prompt_length"] = topic_and_hardness["prompt"].apply(len)
test_df.head(1)

Unnamed: 0,prompt_length
0,43


In [32]:
# Turn the prompt embeddings data into a pandas dataframe
num_features = test_prompt_embeddings.shape[1]
column_names = [f"feature_{i+1}" for i in range(num_features)]
embeddings = pd.DataFrame(test_prompt_embeddings, columns = column_names)
embeddings.head(1)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_247,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256
0,-0.055131,-0.115709,0.055225,0.050576,0.010953,-0.004206,0.062269,0.064194,0.06936,0.016847,...,0.035079,-0.044518,0.043931,0.000368,-0.076169,0.002721,0.008611,-0.01545,-0.033905,-0.057479


In [33]:
# Create the modelling data
test_data = pd.concat([embeddings, test_df], axis = 1)
test_data.head(5)

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_248,feature_249,feature_250,feature_251,feature_252,feature_253,feature_254,feature_255,feature_256,prompt_length
0,-0.055131,-0.115709,0.055225,0.050576,0.010953,-0.004206,0.062269,0.064194,0.06936,0.016847,...,-0.044518,0.043931,0.000368,-0.076169,0.002721,0.008611,-0.01545,-0.033905,-0.057479,43
1,-0.022181,-0.034712,-0.053046,-0.064429,-0.00079,-0.065599,-0.029762,0.032057,-0.108342,0.009685,...,-0.004786,0.011192,0.076667,0.1213,0.071898,0.045037,0.011844,-0.056735,0.109602,34
2,0.045638,-0.085029,0.004327,-0.047471,-0.024484,-0.02682,-0.066908,-0.002554,0.008854,0.065876,...,-0.115438,0.019166,0.001164,0.026768,0.085855,-0.040475,-0.067115,0.079763,-0.029995,325
3,0.007901,-0.037956,-0.083973,-0.018601,-0.066053,-0.065125,-0.009083,0.082465,-0.024516,-0.046597,...,-0.00255,0.045959,0.003057,0.098007,-0.02859,0.040015,-0.062342,-0.048916,-0.007213,37
4,0.051753,0.016543,0.055762,0.017472,-0.106851,-0.083537,0.059957,0.027934,-0.01556,0.034891,...,-0.059957,-0.005297,0.061444,-0.047929,0.111843,0.1094,0.096973,-0.04299,-0.052841,166


In [None]:
# Scale the data.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)