In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gzip


 
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

sns.set_theme(style="whitegrid")

## Modelling Task B

For this analysis we will be creating a linear regression model to predict the hardness score. The hardness score represent the difficulty of the question being asked. Therefore, it is independent of the main dataset (conversation data) or the response embedding data in the auxilliary dataset. To predict the hardness score we will make the assumption that each row of embedding data from the prompt embeddings corresponds to each row of the topic_and_hardness dataset.

The problem statement for task B states that we must use linear regression to determine the hardness score. Therefore, any linear model from the sklearn library would meet this criteria. Therefore, we will perform an analysis and return the results of the best performing models. We will then select the top two models for hyperparameter tuning to create our final models, then the best model of the tuned models will be the final model.

In [2]:
# Auxiliary Datasets

# Prompt embedding Data -- we will use this data in the "Embedding Data" section
prompt_embeddings = np.load(
    "../data/training_data/chatbot-arena-prompts-embeddings.npy"
)

# Topic Modeling and Hardness Score Data -- we will use this data in the "Topic Modeling and Hardness Score Data" section
topic_and_hardness = pd.read_json(
    "../data/training_data/chatbot-arena-gpt3-scores.jsonl.gz",
    lines=True,
    compression="gzip"
)

# Test Data
# Test prompt embeddings
test_prompt_embeddings = np.load(
    "../data/testing_data/arena-test-set-prompts-embeddings.npy"
)

# Test topic modeling data
test_topic_and_hardness = pd.read_json(
    "../data/testing_data/arena-test-set-topic-modeling.jsonl.gz",
    lines=True,
    compression="gzip"
)



In [3]:
# Check the data topic and modeling data for null values
null_indices = list(topic_and_hardness[topic_and_hardness["topic_modeling_1"].isnull()].index)
null_indices

[584,
 5060,
 5458,
 5595,
 6260,
 7807,
 7808,
 7809,
 8529,
 10755,
 10857,
 14269,
 15368,
 20363,
 23775,
 23776,
 23778,
 23779,
 23780,
 23781,
 23782,
 23783,
 24962,
 24963,
 24965,
 24966]

In [4]:
prompt_embeddings = np.delete(prompt_embeddings, null_indices, axis = 0)

In [5]:
# Drop rows with missing values
topic_and_hardness.drop(
    topic_and_hardness[topic_and_hardness["score_value_1"].isnull() == True].index,
    inplace = True
)
# Reset the index for continuity
topic_and_hardness.reset_index(drop = True, inplace = True)
topic_and_hardness.shape

(25256, 12)

In [6]:
topic_and_hardness = topic_and_hardness.drop(columns = ["score_reason_1", 
                                                        "score_reason_2", 
                                                        "score_reason_3",
                                                        "openai_scores_raw_choices_nested",
                                                        "question_id"
                                                        ])
test_topic_and_hardness = test_topic_and_hardness.drop(columns = ["question_id"])
topic_and_hardness.shape

(25256, 7)

In [7]:
# Clean the score data
for i in range(3):
    topic_and_hardness[f"score_value_{i+1}"] = topic_and_hardness[f"score_value_{i+1}"].apply(
        # Clean nested list element into an int
        lambda x: x[0][0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], list) and len(x[0]) == 1 else (
            # Else clean the list element into an int
            x[0] if isinstance(x, list) and len(x) == 1 and isinstance(x[0], (int, float)) 
            # Else leave it alone
            else x
        )
    )

In [8]:
list(topic_and_hardness["score_value_1"].unique())

[np.float64(9.0),
 np.float64(2.0),
 np.float64(8.0),
 np.float64(7.0),
 np.float64(6.0),
 np.float64(3.0),
 np.float64(5.0),
 np.float64(1.0),
 np.float64(4.0),
 np.float64(0.8)]

In [9]:
topic_and_hardness["score_value_1"] = topic_and_hardness["score_value_1"].fillna(0).astype(int)
topic_and_hardness["score_value_2"] = topic_and_hardness["score_value_2"].fillna(0).astype(int)
topic_and_hardness["score_value_3"] = topic_and_hardness["score_value_3"].fillna(0).astype(int)
topic_and_hardness["score_value_1"] = topic_and_hardness["score_value_1"].astype(int)
topic_and_hardness["score_value_2"] = topic_and_hardness["score_value_2"].astype(int)
topic_and_hardness["score_value_3"] = topic_and_hardness["score_value_3"].astype(int)

In [10]:
topic_and_hardness["score_value_1"].value_counts()

score_value_1
8    10918
7     7161
9     3381
2     2159
6      828
3      512
4      177
1       95
5       24
0        1
Name: count, dtype: int64

In [11]:
topic_and_hardness["score_value_2"].value_counts()

score_value_2
8    10778
7     7146
9     3544
2     2215
6      838
3      476
4      155
1       84
5       20
Name: count, dtype: int64

In [12]:
topic_and_hardness["score_value_3"].value_counts()

score_value_3
8    10890
7     7105
9     3490
2     2126
6      837
3      482
4      184
1      114
5       27
0        1
Name: count, dtype: int64

In [13]:
with pd.option_context('display.max_rows', None):
    print(topic_and_hardness["topic_modeling_1"].value_counts())

topic_modeling_1
Creative Writing                                                                           565
Factual Accuracy                                                                           510
Problem-solving, Creativity                                                                398
Factual Knowledge                                                                          300
Problem Solving                                                                            283
Math Problem                                                                               209
Information Retrieval                                                                      188
Problem-solving, Factual accuracy                                                          176
Creativity, Humor                                                                          164
Problem-Solving, Creativity                                                                144
Problem-Solving                  

## Feature Engineering

In [14]:
# Libraries for computing statistics
from scipy.stats import skew, kurtosis

# Libraries for clustering, scaling, and PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Libraries for prompt similarity
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
# Compute statistics across embedding dimensions
def compute_statistics(embeddings):
    features = {
        'mean': np.mean(embeddings, axis = 1),
        'variance': np.var(embeddings, axis = 1),
        'skewness': skew(embeddings, axis = 1),
        'kurtosis': kurtosis(embeddings, axis = 1),
        'l2_norm': np.linalg.norm(embeddings, axis = 1)
    }
    return features

In [16]:
# Obtain textual features from the prompt
def compute_prompt_features(prompts):
    features = {
        'num_words': [len(p.split()) for p in prompts],
        'num_chars': [len(p) for p in prompts],
        'num_unique_words': [len(set(p.split())) for p in prompts]
    }
    return features

In [17]:
# Compute the distance for each prompt embedding to each cluster centroid
def compute_centroid_distances(pca_train_test, centroids):
    return np.linalg.norm(pca_train_test[:, np.newaxis] - centroids, axis = 2)

# Function to perform prompt clustering which returns the distance to each cluster centroid
def prompt_clustering(train_embeddings, test_embeddings):
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_embeddings)
    scaled_test = scaler.transform(test_embeddings)

    pca = PCA(n_components = 150)
    pca_train = pca.fit_transform(scaled_train)
    pca_test = pca.transform(scaled_test)

    kmeans = KMeans(n_clusters = 4, random_state = 42)
    kmeans.fit(pca_train)

    cluster_distances_train = compute_centroid_distances(pca_train, kmeans.cluster_centers_)
    cluster_distances_test = compute_centroid_distances(pca_test, kmeans.cluster_centers_)

    return cluster_distances_train, cluster_distances_test


In [18]:
# Create a function to compute the similarity between the embedding and the average embedding for a score
def compute_similarities(embeddings, hardness_mean_embeddings):      
    dataframes = {}
    
    for score_value, mean_embeddings in hardness_mean_embeddings.items():
        # Create an empty array to store the similarities
        similarities = []
        for emb in embeddings:
            similarity = [
                cosine_similarity(emb.reshape(1, -1), mean_emb.reshape(1, -1))[0, 0]
                for mean_emb in mean_embeddings.values()
            ]
            similarities.append(similarity)
        # Create the dataframe of similarities
        columns = [f"{score_value}_avg_sim_{j}" for j in range(10)]
        dataframes[f"df_sim_{score_value}"] = pd.DataFrame(similarities, columns = columns)

    return dataframes

In [19]:
# Data Creation pipeline
def pipeline(prompt_embeddings, topic_and_hardness, test_prompt_embeddings, test_topic_and_hardness):
    
    # Create statistics dataframes
    train_statistics = compute_statistics(prompt_embeddings)
    df_train_stats = pd.DataFrame(train_statistics)

    test_statistics = compute_statistics(test_prompt_embeddings)
    df_test_stats = pd.DataFrame(test_statistics)

    # Create prompt features dataframes
    train_prompt_features = compute_prompt_features(topic_and_hardness["prompt"])
    df_train_prompt_features = pd.DataFrame(train_prompt_features)

    test_prompt_features = compute_prompt_features(test_topic_and_hardness["prompt"])
    df_test_prompt_features = pd.DataFrame(test_prompt_features)

    # Create cluster centroid distance dataframes
    train_CD, test_CD = prompt_clustering(prompt_embeddings, test_prompt_embeddings)
    df_cluster_train = pd.DataFrame(train_CD, columns = ["CD_1", "CD_2", "CD_3", "CD_4"])
    df_cluster_test = pd.DataFrame(test_CD, columns = ["CD_1", "CD_2", "CD_3", "CD_4"])

    # Create dictionaries for each score value containing the indices of prompts for each score value
    hardness_indices = {
        f"score_value_{i}": {
            j: topic_and_hardness[topic_and_hardness["score_value_1"] == i].index
            for j in range(10)
        }
        for i in range(4)
    }
    
    # Define the labels for the hardness scores
    hardness_score_labels = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    
    # Compute the mean embeddings for each hardness score
    hardness_mean_embeddings = {}
    for score_value, index_dict in hardness_indices.items():
        hardness_mean_embeddings[score_value] = {
            hardness: np.mean(prompt_embeddings[index_dict[hardness]], axis = 0)
            for hardness in hardness_score_labels
        }

    # Create dataframes for cosine similarity of each prompt to the average prompt embedding to each score
    train_dataframes = compute_similarities(prompt_embeddings, hardness_mean_embeddings)
    test_dataframes = compute_similarities(test_prompt_embeddings, hardness_mean_embeddings)
    
    df_train_sim_1 = train_dataframes["df_sim_score_value_1"]
    df_train_sim_2 = train_dataframes["df_sim_score_value_2"]
    df_train_sim_3 = train_dataframes["df_sim_score_value_3"]

    df_test_sim_1 = test_dataframes["df_sim_score_value_1"]
    df_test_sim_2 = test_dataframes["df_sim_score_value_2"]
    df_test_sim_3 = test_dataframes["df_sim_score_value_3"]

    # Convert prompt embeddings into a dataframe
    # Turn the prompt embeddings data into a pandas dataframe
    num_features = prompt_embeddings.shape[1]
    column_names = [f"feature_{i+1}" for i in range(num_features)]
    df_prompt_train = pd.DataFrame(prompt_embeddings, columns = column_names)
    df_prompt_test = pd.DataFrame(test_prompt_embeddings, columns = column_names)
                                  
    # Drop the topic modeling data
    topic_and_hardness = topic_and_hardness.drop(columns = ["topic_modeling_1", "topic_modeling_2", "topic_modeling_3"])
    test_topic_and_hardness = test_topic_and_hardness.drop(columns = ["topic_modeling_1", "topic_modeling_2", "topic_modeling_3"])

    # Create the final dataframes to be used in modeling
    df_train = pd.concat([df_prompt_train,
                          df_train_sim_1,
                          df_train_sim_2,
                          df_train_sim_3,
                          df_cluster_train,
                          df_train_prompt_features,
                          df_train_stats
                          ], axis = 1)
    df_test = pd.concat([df_prompt_test,
                          df_test_sim_1,
                          df_test_sim_2,
                          df_test_sim_3,
                          df_cluster_test,
                          df_test_prompt_features,
                          df_test_stats
                          ], axis = 1)
    
    return df_train, df_test


In [20]:
df_train, df_test = pipeline(prompt_embeddings, topic_and_hardness, test_prompt_embeddings, test_topic_and_hardness)

## Model Building

In [21]:
# Data Processing Libraries
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate, KFold
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Model Building Libraries
from sklearn.linear_model import (
    LinearRegression,
    Ridge, 
    Lasso,
    ElasticNet,
    SGDRegressor,
    BayesianRidge,
    ARDRegression
)

In [22]:
# Split the data into X and y
X = df_train
y_1 = topic_and_hardness["score_value_1"]
y_2 = topic_and_hardness["score_value_2"]
y_3 = topic_and_hardness["score_value_3"]

X_train, X_test, y_train_1, y_test_1 = train_test_split(X, y_1, test_size = 0.3, random_state = 42)
_, _, y_train_2, y_test_2 = train_test_split(X, y_2, test_size = 0.3, random_state = 42)
_, _, y_train_3, y_test_3 = train_test_split(X, y_3, test_size = 0.3, random_state = 42)

# Scale the data.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

In [23]:
y_train_1.unique()

array([8, 7, 9, 3, 6, 2, 4, 1, 5])

In [24]:
# Create an empty list to store all of the models for testing
models = []

# Append models into the list\
models.append(("LinearRegression", LinearRegression()))
models.append(("Ridge", Ridge()))
models.append(("Lasso", Lasso()))
models.append(("ElasticNet",ElasticNet()))
models.append(("SGDRegressor", SGDRegressor()))
models.append(("BayesianRidge", BayesianRidge()))
models.append(("ARDRegression", ARDRegression()))

# Create lists to store the output of the training loop
model_names = []
train_MSE = []
test_MSE = []
trained_models = []

# Loop through the models to obtain mean cross-validated MSE scores
for name, model in models:

    # Add the model name to the list for this iteration
    model_names.append(name)

    # Set training parameters
    scoring = "neg_mean_squared_error"
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)

    # Get the mean cross-validated MSE score on the training data
    train_cv_result = cross_validate(estimator = model, 
                                      X = X_train_s, 
                                      y = y_train_1, 
                                      cv = kfold, 
                                      scoring = scoring,
                                      return_train_score = True,
                                      return_estimator = True)
    avg_train_MSE = -train_cv_result['train_score']
    min_score_index = np.argmin(avg_train_MSE)

    train_MSE.append(avg_train_MSE[min_score_index])
    cv_model = train_cv_result['estimator'][min_score_index]
    trained_models.append(cv_model)

    # Get the MSE score on the test data
    y_pred = cv_model.predict(X_test_s)
    y_pred_int = y_pred.astype(int) # Round predictions to nearest integer
    comp_MSE = mean_squared_error(y_test_1, y_pred_int)
    test_MSE.append(comp_MSE)

# Print Results
print("\n" "Cross-Validation MSE on Training Data:")
for i in range(len(model_names)):
        print("{}: {}".format(model_names[i], train_MSE[i]))

print("\n" "MSE on Testing Data:")
for i in range(len(model_names)):
    print("{}: {}".format(model_names[i], test_MSE[i]))


Cross-Validation MSE on Training Data:
LinearRegression: 2.489969424124836
Ridge: 2.4911694042400665
Lasso: 3.5715116705558327
ElasticNet: 3.5715116705558327
SGDRegressor: 2.586781217927764
BayesianRidge: 2.5047423713547268
ARDRegression: 2.5057994810556377

MSE on Testing Data:
LinearRegression: 2.997360432889006
Ridge: 2.9985482380889534
Lasso: 3.751484756499934
ElasticNet: 3.751484756499934
SGDRegressor: 3.121816022172364
BayesianRidge: 3.001583740266596
ARDRegression: 3.0163653160881614


In [25]:
# Best Performing model so far

df_test_s = scaler.transform(df_test)

# Make Predictions
y_pred = trained_models[6].predict(df_test_s)
submission_df = pd.read_csv("temp_output.csv")
submission_df["hardness_score"] = y_pred.astype(int)
submission_df.to_csv("final_submission.csv", index = False)
submission_df

Unnamed: 0,question_id,winner,hardness_score
0,4f332ebd8cdc4ff2be74aa8828ff20d5,model_b,6
1,f2be6f13e5ed40e5b81443223996494c,model_b,5
2,5fafefb8a0c54243afb52d2892946cea,model_b,7
3,7834f572267f40709ecebb273a2b346b,tie (bothbad),6
4,1ccc7e58290245c4bd5457fce45f8640,model_a,7
...,...,...,...
3195,eb08f8a7f20840c99efe9fc8c03f1c13,model_a,6
3196,4baca918f1f5440599ae9edb3bfa8cc1,model_b,8
3197,a787ce60dc1440f39455ab20e3bffe33,model_b,7
3198,3dc09f20eedb405ab3dc980cf7bff5d0,model_a,3
