In [25]:
%pip install textblob
%pip install textstat

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json, math
from tqdm import tqdm
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.linear_model import Lasso


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import normalize

from sklearn.linear_model import Lasso

import textstat

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [48]:
# Battles and their GPT-3 hardness scores live in two row-aligned JSONL files.
df = pd.read_json('./chatbot-arena-conversations.jsonl.gz', lines=True)
scores_df = pd.read_json('./chatbot-arena-gpt3-scores.jsonl.gz', lines=True)

# Join row-by-row on the positional index, then collapse the duplicated
# question_id columns produced by the merge suffixes into a single one.
df = (
    df.merge(scores_df, left_index=True, right_index=True)
    .drop(columns='question_id_y')
    .rename(columns={'question_id_x': 'question_id'})
)

In [49]:
# Each raw score may arrive wrapped as a nested list; unwrap to the inner value
# and leave anything that is not a list untouched.
for score_col in ('score_value_1', 'score_value_2', 'score_value_3'):
    df[score_col] = df[score_col].apply(lambda x: x[0][0] if isinstance(x, list) else x)

# The prompt score is the plain average of the three individual scores
# (NaN in any of them propagates to the average).
df['score'] = (df['score_value_1'] + df['score_value_2'] + df['score_value_3']) / 3

In [50]:
# Load the prompt embeddings and rescale them with sklearn's `normalize`.
embeddings = normalize(np.load('./chatbot-arena-prompts-embeddings.npy'))

In [51]:
# Keep only battles that have a score. The mask is computed from the
# still-unfiltered `df`, so the embeddings must be filtered first.
embeddings = embeddings[~df['score'].isna()]
# .copy() makes `df` an independent frame, so later column assignments
# (score rounding, cluster labels, engineered features) do not raise
# SettingWithCopyWarning.
df = df[~df['score'].isna()].copy()

In [104]:
# Hold out 20% of the battles for validation. The seed is fixed so the split
# (and the metrics reported below) are reproducible across kernel restarts.
df_train, df_val, embeddings_train, embeddings_val = train_test_split(df, embeddings, test_size=0.2, random_state=42)

In [109]:
def score_feature_engineering(df, embeddings, test=False, scaler=None):
    """Build the standardized feature matrix used by the prompt-score model.

    The features are the log character length and log word count of each
    prompt, followed by the embedding dimensions.

    Parameters
    ----------
    df : DataFrame with a ``prompt`` column (and ``score`` when ``test`` is False).
        It is not modified.
    embeddings : 2-D array with one row per row of ``df``, in the same order.
    test : when True only ``X`` is returned; otherwise ``(X, y)``.
    scaler : optional ``StandardScaler``. ``None`` (default) fits a fresh scaler
        on this call's data, as before. An unfitted scaler is fitted here, and
        an already-fitted one is only applied, so the statistics learned on the
        training split can be reused for validation/test data.

    Returns
    -------
    ``X`` (numpy array) when ``test`` is True, else ``(X, y)`` with ``y = df['score']``.
    """
    prompts = df['prompt']
    # clip(lower=1) avoids log(0) = -inf for empty or whitespace-only prompts,
    # which would otherwise poison the scaler and the regression.
    text_features = pd.DataFrame(
        {
            'log_prompt_length': np.log(prompts.str.len().clip(lower=1)),
            'log_word_count': np.log(prompts.str.split().str.len().clip(lower=1)),
        },
        index=df.index,
    )
    embedding_features = pd.DataFrame(embeddings, index=df.index)

    X = pd.concat([text_features, embedding_features], axis=1)
    # Embedding columns are integers; sklearn wants uniformly typed column names.
    X.columns = X.columns.astype(str)

    if scaler is None:
        scaler = StandardScaler()
    if hasattr(scaler, 'scale_'):
        # Already fitted: reuse its statistics instead of re-estimating them.
        X = scaler.transform(X)
    else:
        X = scaler.fit_transform(X)

    if test:
        return X
    return X, df['score']

In [110]:
# Build (features, target) pairs for the train and validation splits.
# NOTE(review): each call fits its own StandardScaler inside
# score_feature_engineering, so the two splits are standardized with
# different statistics.
X_train, y_train = score_feature_engineering(df_train, embeddings_train, test=False)
X_val, y_val = score_feature_engineering(df_val, embeddings_val, test=False)

In [111]:
# Linear regression from the engineered prompt features to the prompt score.
score_model = LinearRegression()
score_model.fit(X_train, y_train)

In [112]:
# MSE of the rounded, [1, 9]-clipped predictions: training split first, then validation.
for features, target in ((X_train, y_train), (X_val, y_val)):
    clipped_predictions = np.clip(score_model.predict(features).round(), 1, 9)
    print(mean_squared_error(target, clipped_predictions))

1.9273571852144693
1.86242340383475


In [13]:
def compute_mle_elo(df, impute=None, SCALE=400, BASE=10, INIT_RATING=1000):
    """Fit Elo-style ratings to pairwise battles with a logistic regression.

    ``df`` needs ``model_a``, ``model_b`` and ``winner`` columns. Every battle is
    duplicated so that a tie can be counted as one win for each side, while a
    decisive result is counted twice for its winner. ``impute`` is accepted but
    not used. Returns a Series of ratings indexed by model name, sorted from
    highest to lowest; if ``llama-2-70b-chat`` is present, ratings are shifted so
    that model sits at 1082.
    """
    from sklearn.linear_model import LogisticRegression

    # Column position of every model in the design matrix.
    model_names = pd.concat([df["model_a"], df["model_b"]]).unique()
    models = pd.Series(np.arange(len(model_names)), index=model_names)

    # Two copies of every battle (see docstring).
    doubled = pd.concat([df, df], ignore_index=True)
    n_rows = doubled.shape[0]
    n_models = len(models.index)
    row_ids = np.arange(n_rows)
    log_base = math.log(BASE)

    # +log(BASE) in model_a's column, -log(BASE) in model_b's column.
    X = np.zeros([n_rows, n_models])
    X[row_ids, models.loc[doubled["model_a"]].to_numpy()] = +log_base
    X[row_ids, models.loc[doubled["model_b"]].to_numpy()] = -log_base

    winner = doubled["winner"]
    a_won = (winner == "model_a").to_numpy()

    # Ties (of either kind) count as an A win in the first copy only; in the
    # second copy they stay 0, i.e. a B win.
    tie_first_copy = winner.isin(["tie", "tie (bothbad)"]).to_numpy().copy()
    tie_first_copy[n_rows // 2:] = False

    Y = np.zeros(n_rows)
    Y[a_won] = 1.0
    Y[tie_first_copy] = 1.0

    lr = LogisticRegression(fit_intercept=False)
    lr.fit(X, Y)

    elo_scores = SCALE * lr.coef_[0] + INIT_RATING

    # Anchor the scale: llama-2-70b-chat = 1082.
    anchor = "llama-2-70b-chat"
    if anchor in models.index:
        elo_scores = elo_scores + (1082 - elo_scores[models[anchor]])

    ratings = pd.Series(elo_scores, index=models.index)
    return ratings.sort_values(ascending=False)

In [14]:
def compute_percent_adversarial(df):
    """Per-cluster share of battles won by the lower-rated model.

    Ratings come from ``compute_mle_elo(df)`` truncated to integers. The expected
    winner is ``model_a`` when its rating is strictly higher, otherwise
    ``model_b``. Ties are never counted as upsets. Returns a Series indexed by
    cluster id (sorted); clusters with no upsets get 0 rather than NaN.
    """
    elo = compute_mle_elo(df).astype(int)
    a_is_favourite = df['model_a'].map(elo) > df['model_b'].map(elo)
    expected_winner = a_is_favourite.map({True: 'model_a', False: 'model_b'})

    is_upset = (df['winner'] != expected_winner) & ~(df['winner'].str.contains('tie'))

    battles_per_cluster = df['cluster'].value_counts().sort_index()
    # Reindex so clusters without a single upset contribute 0 instead of
    # disappearing (which used to surface as NaN after the division).
    upsets_per_cluster = (
        df.loc[is_upset, 'cluster']
        .value_counts()
        .reindex(battles_per_cluster.index, fill_value=0)
    )
    return upsets_per_cluster / battles_per_cluster

In [15]:
def compute_percent_tied(df):
    """Per-cluster share of battles whose ``winner`` contains ``'tie'``.

    Returns a Series indexed by cluster id (sorted). Clusters without any tie
    get 0 rather than NaN, and rows with a missing winner count as not tied.
    """
    is_tie = df['winner'].str.contains('tie', na=False)
    battles_per_cluster = df['cluster'].value_counts().sort_index()
    # Reindex so tie-free clusters yield 0 instead of NaN after the division.
    ties_per_cluster = (
        df.loc[is_tie, 'cluster']
        .value_counts()
        .reindex(battles_per_cluster.index, fill_value=0)
    )
    return ties_per_cluster / battles_per_cluster

In [16]:
def compute_percent_tied_bothbad(df):
    """Per-cluster share of battles whose ``winner`` contains ``'bothbad'``.

    Returns a Series indexed by cluster id (sorted). Clusters without any such
    battle get 0 rather than NaN, and rows with a missing winner are not counted.
    """
    is_bad_tie = df['winner'].str.contains('bothbad', na=False)
    battles_per_cluster = df['cluster'].value_counts().sort_index()
    # Reindex so clusters with no "both bad" ties yield 0 instead of NaN.
    bad_ties_per_cluster = (
        df.loc[is_bad_tie, 'cluster']
        .value_counts()
        .reindex(battles_per_cluster.index, fill_value=0)
    )
    return bad_ties_per_cluster / battles_per_cluster

In [17]:
def compute_average_lengths(df):
    """Average response length (in characters) per model.

    The response is taken as the ``content`` of the second message (index 1)
    of ``conversation_a`` / ``conversation_b``. Appearances as ``model_a`` and as
    ``model_b`` are pooled. Returns a Series indexed by model name (axis named
    ``model``). ``df`` is not modified.
    """
    length_a = df['conversation_a'].str[1].str['content'].str.len()
    length_b = df['conversation_b'].str[1].str['content'].str.len()

    # fill_value=0 keeps models that only ever appear on one side; a plain `+`
    # would align the two indexes and turn those models into NaN.
    total_length = length_a.groupby(df['model_a']).sum().add(
        length_b.groupby(df['model_b']).sum(), fill_value=0
    )
    appearances = df.groupby('model_a').size().add(
        df.groupby('model_b').size(), fill_value=0
    )
    return (total_length / appearances).rename_axis('model')

In [18]:
def compute_mean_scores(df):
    """Mean ``score`` of ``df`` grouped by the notebook-level ``all_labels``."""
    # NOTE(review): `all_labels` is not defined in any visible cell; it must
    # exist in the kernel namespace before this function is called.
    scores_by_label = df.groupby(all_labels)['score']
    return scores_by_label.mean()

In [19]:
def compute_head_to_head(df):
    """Pairwise win rates between models.

    Returns a square DataFrame over every model seen in ``model_a`` or
    ``model_b``: entry ``[row, col]`` is the fraction of battles between the two
    models (in either seating order) that ``row`` won outright. Ties count as a
    battle but not as a win. Pairs that never met (including the diagonal)
    are 0.
    """
    models = pd.Index(pd.concat([df['model_a'], df['model_b']]).unique()).sort_values()

    def _pair_counts(battles, row_key, col_key):
        # Square count matrix over *all* models, so matrices built from
        # different subsets or orientations can be added without NaN holes.
        counts = pd.DataFrame(0, index=models, columns=models)
        for (row_model, col_model), size in battles.groupby([row_key, col_key]).size().items():
            counts.at[row_model, col_model] = size
        return counts

    all_battles = _pair_counts(df, 'model_a', 'model_b') + _pair_counts(df, 'model_b', 'model_a')
    model_a_wins = _pair_counts(df[df['winner'] == 'model_a'], 'model_a', 'model_b')
    model_b_wins = _pair_counts(df[df['winner'] == 'model_b'], 'model_b', 'model_a')

    # 0 / 0 (pair never met) becomes NaN; report it as a 0 win rate.
    return ((model_a_wins + model_b_wins) / all_battles).fillna(0)

In [20]:
def create_features(input, input_embeddings, all_data, test=False):
    """Build the feature frame for the winner classifier.

    Parameters
    ----------
    input : DataFrame of battles to featurize; needs ``prompt``, ``model_a``,
        ``model_b`` and ``score`` (plus ``winner`` when ``test`` is False).
        It is modified in place: every engineered column is added to it.
    input_embeddings : prompt embeddings for ``input``, passed to the global
        ``kmeans`` to assign clusters.
    all_data : reference battles (with ``winner``, ``score`` and ``cluster``
        columns) from which all per-model and per-cluster statistics are derived.
    test : when True only ``X`` is returned; otherwise ``(X, y)`` where ``y``
        encodes the winner as 0 (model_a), 1 (model_b), 2 (tie), 3 (tie (bothbad)).

    Relies on the notebook globals ``kmeans`` and ``n_clusters``.
    """
    input['prompt_length'] = input['prompt'].str.len()

    # Overall ratings from every reference battle.
    elo = compute_mle_elo(all_data).astype(int)
    input['model_a_elo'] = input['model_a'].map(elo)
    input['model_b_elo'] = input['model_b'].map(elo)

    # Ratings restricted to the hardest prompts (score >= 8). A model that
    # never appears in that subset would map to NaN and break the classifier,
    # so fall back to its overall rating.
    hard_elo = compute_mle_elo(all_data[all_data['score'] >= 8]).astype(int)
    input['model_a_hard_elo'] = input['model_a'].map(hard_elo).fillna(input['model_a_elo'])
    input['model_b_hard_elo'] = input['model_b'].map(hard_elo).fillna(input['model_b_elo'])

    lengths = compute_average_lengths(all_data)
    input['model_a_length'] = input['model_a'].map(lengths)
    input['model_b_length'] = input['model_b'].map(lengths)

    head_to_head = compute_head_to_head(all_data)
    input['model_a_win_rate'] = input.apply(lambda x: head_to_head.loc[x['model_a']][x['model_b']], axis=1)
    input['model_b_win_rate'] = input.apply(lambda x: head_to_head.loc[x['model_b']][x['model_a']], axis=1)

    # Per-cluster outcome statistics, looked up through each row's cluster.
    percent_adversarial = compute_percent_adversarial(all_data)
    input['cluster'] = kmeans.predict(input_embeddings)
    input['percent_adversarial'] = input['cluster'].map(percent_adversarial)
    percent_tied = compute_percent_tied(all_data)
    input['percent_tied'] = input['cluster'].map(percent_tied)
    percent_tied_bothbad = compute_percent_tied_bothbad(all_data)
    input['percent_tied_bothbad'] = input['cluster'].map(percent_tied_bothbad)

    # Ratings fitted separately inside every cluster, plus how many battles
    # each model played there.
    cluster_elo = []
    cluster_model_count = []
    for i in range(n_clusters):
        cluster_battles = all_data[all_data['cluster'] == i]
        cluster_elo.append(compute_mle_elo(cluster_battles))
        # Count appearances on either side. Adding the two groupby sizes with
        # `+` produced NaN for models seen on only one side, which silently
        # disabled their cluster rating.
        cluster_model_count.append(
            pd.concat([cluster_battles['model_a'], cluster_battles['model_b']]).value_counts()
        )

    def compute_cluster_elo(x, model_a):
        # Use the cluster-specific rating only when it rests on at least 10
        # battles; otherwise fall back to the overall rating.
        m = x['model_a'] if model_a else x['model_b']
        c = x['cluster']
        return cluster_elo[c][m] if (m in cluster_model_count[c] and cluster_model_count[c][m] >= 10) else elo[m]

    input['model_a_cluster_elo'] = input.apply(lambda x: compute_cluster_elo(x, True), axis=1)
    input['model_b_cluster_elo'] = input.apply(lambda x: compute_cluster_elo(x, False), axis=1)

    features = ['prompt_length', 'score', 'model_a_elo', 'model_b_elo', 'model_a_cluster_elo', 'model_b_cluster_elo', 'model_a_hard_elo', 'model_b_hard_elo', 'cluster', 'model_a_length', 'model_b_length']
    X = input[features]
    if test:
        return X
    y = input['winner'].map({'model_a': 0, 'model_b': 1, 'tie': 2, 'tie (bothbad)': 3})
    return X, y

In [21]:
def build_cluster_models(X, y):
    """Fit one logistic-regression winner model per prompt cluster.

    ``X`` must contain a ``cluster`` column; it selects the rows for each model
    and is dropped from the features. Features are standardized per cluster.
    Returns a list of fitted ``LogisticRegression`` models indexed by cluster id
    (uses the global ``n_clusters``). Each model carries the scaler it was
    trained with as ``cluster_scaler_``, so the same standardization can be
    applied at prediction time.
    """
    cluster_models = []
    for n in range(n_clusters):
        in_cluster = X['cluster'] == n
        # A fresh scaler per cluster, kept alongside the model instead of being
        # overwritten on the next iteration and thrown away.
        scaler = StandardScaler()
        X_cluster = scaler.fit_transform(X[in_cluster].drop('cluster', axis=1))
        y_cluster = y[in_cluster]
        model = LogisticRegression(max_iter=5000)
        model.fit(X_cluster, y_cluster)
        model.cluster_scaler_ = scaler
        cluster_models.append(model)
    return cluster_models

In [22]:
def predict_cluster_models(X, cluster_models):
    """Predict a winner code for every row with its cluster's model.

    ``X`` must contain a ``cluster`` column; ``cluster_models[n]`` is used for the
    rows of cluster ``n`` (uses the global ``n_clusters``). Returns a float array
    aligned with the rows of ``X``. Rows are standardized with the model's
    ``cluster_scaler_`` attribute when it has one; otherwise a scaler is fitted
    on the rows being predicted, which is the previous behaviour.
    """
    y = np.zeros(len(X))
    for n in range(n_clusters):
        in_cluster = (X['cluster'] == n).to_numpy()
        if not in_cluster.any():
            # No rows in this cluster: scaling an empty frame would raise.
            continue
        features = X[in_cluster].drop('cluster', axis=1)
        model = cluster_models[n]
        scaler = getattr(model, 'cluster_scaler_', None)
        if scaler is not None:
            # Reuse the training-time statistics so train and prediction
            # features live on the same scale.
            X_cluster = scaler.transform(features)
        else:
            X_cluster = StandardScaler().fit_transform(features)
        y[in_cluster] = model.predict(X_cluster)
    return y

In [23]:
# Cluster the prompt embeddings. `n_clusters` and `kmeans` are read as globals
# by create_features, build_cluster_models and predict_cluster_models.
n_clusters = 25
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(embeddings)

In [76]:
# Round the averaged score to an integer and tag every battle with its prompt cluster.
df['score'] = df['score'].round().astype(int)
df['cluster'] = kmeans.predict(embeddings)
# `df` is both the frame being featurized and the source of the statistics (all_data).
# NOTE: X_train / y_train are rebound here; they previously held the score-model data.
X_train, y_train = create_features(df, embeddings, df)
cluster_models = build_cluster_models(X_train, y_train)

In [77]:
test = pd.read_json(
    './arena-validation-set-prompt-only.jsonl.gz',
    lines=True
)
# Apply the same `normalize` step as the training embeddings, so k-means and
# the score model see inputs on the same scale they were fitted on.
test_embeddings = normalize(np.load('./arena-validation-set-prompts-embeddings.npy'))

In [78]:
# Assign each validation prompt to one of the fitted k-means clusters.
test['cluster'] = kmeans.predict(test_embeddings)

In [79]:
# test=True: only the feature matrix X is returned (no target).
# NOTE(review): the scaler inside is fitted on the validation prompts themselves,
# not on the training data the score model was fitted with.
X_score = score_feature_engineering(test, test_embeddings, True)

In [80]:
# The score model is trained on raw scores (not log scores), so predictions
# are used directly: round, clamp to the valid 1-9 range, store as integers.
# This is the same post-processing used when evaluating the model above.
test['score'] = np.clip(score_model.predict(X_score).round(), 1, 9).astype(int)

In [81]:
# Distribution of the predicted scores on the validation set.
test['score'].value_counts()

score
8    994
7    878
9    515
6    378
3    200
5    154
4     85
2      2
Name: count, dtype: int64

In [82]:
# test=True: only the feature frame is returned; all per-model and per-cluster
# statistics are computed from the training frame `df`.
X_winner = create_features(test, test_embeddings, df, True)

In [83]:
# One numeric winner code per validation row, from that row's cluster model.
y_winner = predict_cluster_models(X_winner, cluster_models)

In [84]:
# Put ids, features and predicted winner codes side by side; concat aligns on
# the index, and pd.Series(y_winner) gets a default 0..n-1 index.
# NOTE(review): assumes `test` also has a default 0..n-1 index - confirm.
score_predictions = pd.concat([test[['question_id', 'model_a', 'model_b']], X_winner, pd.Series(y_winner).rename('winner')], axis=1)

In [85]:
# Decode the numeric class codes back to winner strings (the inverse of the
# mapping applied to the training labels in create_features).
score_predictions['winner'] = score_predictions['winner'].map({0.0: 'model_a', 1.0: 'model_b', 2.0: 'tie', 3.0: 'tie (bothbad)'})

In [86]:
# Keep only the three submission columns; the predicted score is exported as `hardness_score`.
score_predictions = score_predictions[['question_id', 'winner', 'score']].rename({'score': 'hardness_score'}, axis=1)

In [87]:
from datetime import datetime
from IPython.display import display, HTML

# Timestamped file name so repeated runs never overwrite an earlier submission.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"submission_{timestamp}.csv"
score_predictions.to_csv(filename, index=False)

print(f'Created a CSV file: {filename}.')
display(HTML(f"Download your test prediction <a href='{filename}' download>here</a>."))
print('You may now upload this CSV file to Gradescope for scoring.')

Created a CSV file: submission_20240510_130556.csv.


You may now upload this CSV file to Gradescope for scoring.
