In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the prepared parquet file and keep only rows whose `fk_score` is <= 15
# (fk_score is presumably a Flesch-Kincaid readability grade -- TODO confirm).
race_df = pd.read_parquet('./data/race_test_prepared.parquet')
# `.copy()` detaches the filtered rows from the frame they were sliced from, so
# the columns that later cells add are written to an independent DataFrame.
race_df = race_df[race_df['fk_score'] <= 15].copy()
race_df

In [None]:
def train_linear_regression_with_cv(df, embeddings_col, y_col, n_splits=7, random_state=369):
    """Fit one LinearRegression per K-fold split and return the best-scoring fit.

    Every embedding row is scaled to unit L2 norm before fitting, so anything
    passed to the returned model later must be scaled the same way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature and target columns.
    embeddings_col : str
        Column with one numeric vector per row (all of equal length).
    y_col : str
        Column with the regression target.
    n_splits : int, default 7
        Number of shuffled folds.
    random_state : int, default 369
        Seed for the fold shuffle.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        The fold model with the lowest validation MSE.

    Raises
    ------
    ValueError
        If any embedding row has zero norm and therefore cannot be normalised.

    Notes
    -----
    The returned model has only seen the training part of its own fold, and
    choosing it by validation error favours whichever fold happened to have the
    easiest validation split. Treat its fold error as optimistic.
    """
    X = np.array(df[embeddings_col].tolist())
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    if np.any(norms == 0):
        # Dividing by a zero norm would yield NaN features, which only fail
        # later inside `fit` with a much less helpful message.
        raise ValueError(f"Column '{embeddings_col}' contains zero-norm embeddings.")
    X = X / norms

    y = df[y_col].values

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    best_model = None
    best_error = float('inf')

    # KFold only needs the number of samples, so splitting on X matches
    # splitting on df.
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        model = LinearRegression()
        model.fit(X_train, y_train)

        predictions = model.predict(X_val)
        error = mean_squared_error(y_val, predictions)

        # Keep the fit with the lowest held-out MSE seen so far.
        if error < best_error:
            best_error = error
            best_model = model

    return best_model

In [None]:
# Fit the regressor on the `embeddings_mini_lm` column with `fk_score` as target.
clf = train_linear_regression_with_cv(
    df=race_df,
    embeddings_col='embeddings_mini_lm',
    y_col='fk_score',
)

In [None]:
def get_predictions(df, clf, embeddings_col, y_col):
    """Print MSE, MAE and R2 of `clf` on `df` and scatter-plot predicted vs. true values.

    The embeddings in `embeddings_col` are scaled row-wise to unit L2 norm
    before being passed to `clf.predict`. Nothing is returned.
    """
    features = np.array(df[embeddings_col].tolist())
    row_norms = np.linalg.norm(features, axis=1, keepdims=True)
    features = features / row_norms

    targets = df[y_col].values
    predicted = clf.predict(features)

    # Report the three regression metrics in a fixed order.
    metrics = (
        ('MSE:', mean_squared_error),
        ('MAE:', mean_absolute_error),
        ('R2:', r2_score),
    )
    for label, metric in metrics:
        print(label, metric(targets, predicted))

    # Predicted-vs-true scatter on an explicit Axes.
    _, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=targets, y=predicted, ax=ax)
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.grid(True)
    plt.show()

In [None]:
# Evaluate the fitted model on the same frame it was trained from.
get_predictions(
    df=race_df,
    clf=clf,
    embeddings_col='embeddings_mini_lm',
    y_col='fk_score',
)

In [None]:
# race_df['score_easy'] = clf.predict_proba(X=np.array(race_df['embeddings_mini_lm'].tolist()))[:,0]
# race_df['score_hard'] = clf.predict_proba(X=np.array(race_df['embeddings_mini_lm'].tolist()))[:,1]
# race_df['score_medium'] = clf.predict_proba(X=np.array(race_df['embeddings_mini_lm'].tolist()))[:,2]

# race_df['w_easy_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x * clf.coef_[0])
# race_df['w_hard_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x * clf.coef_[1])
# race_df['w_medium_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x * clf.coef_[2])
# race_df['iw_easy_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x / clf.coef_[0])
# race_df['iw_hard_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x / clf.coef_[1])
# race_df['iw_medium_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x / clf.coef_[2])

# Stack the per-row vectors once and apply the same row-wise L2 normalisation
# that train_linear_regression_with_cv and get_predictions use; predicting on
# the raw vectors would feed the model inputs on a different scale than it was
# fitted on.
embedding_matrix = np.array(race_df['embeddings_mini_lm'].tolist())
unit_embeddings = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)

# Predict once and reuse the result for both derived columns.
race_df['score'] = clf.predict(unit_embeddings)
race_df['absolute_error'] = np.abs(race_df['fk_score'].to_numpy() - race_df['score'].to_numpy())

# Element-wise re-weighting of the stored embeddings by the regression coefficients.
race_df['w_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x * clf.coef_)
# NOTE(review): dividing by coefficients close to zero inflates those
# dimensions without bound -- confirm this is the intended "inverse" weighting.
race_df['iw_embeddings'] = race_df['embeddings_mini_lm'].apply(lambda x: x / clf.coef_)

In [None]:
def get_cluster_stats(df, cluster_col, score_col):
    """Return one row per cluster with the mean and standard deviation of `score_col`.

    The grouping column is returned as a regular column named 'cluster_id',
    followed by 'score_mean' and 'score_std'.
    """
    # Named aggregation written as (column, function) pairs.
    aggregations = {
        'score_mean': (score_col, 'mean'),
        'score_std': (score_col, 'std'),
    }
    per_cluster = df.groupby(cluster_col).agg(**aggregations)
    per_cluster = per_cluster.reset_index()
    return per_cluster.rename(columns={cluster_col: 'cluster_id'})

n_clusters = 20
seed = 0

# Fit one KMeans per embedding variant (unweighted, weighted, inverse-weighted),
# all with the same cluster count and seed.
uw_kmeans, w_kmeans, iw_kmeans = (
    KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto").fit(race_df[column].tolist())
    for column in ('embeddings_mini_lm', 'w_embeddings', 'iw_embeddings')
)

# Store each row's cluster assignment next to its embeddings.
race_df['uw_kmeans'] = uw_kmeans.labels_
race_df['w_kmeans'] = w_kmeans.labels_
race_df['iw_kmeans'] = iw_kmeans.labels_

# cs_unweighted = get_cluster_stats(dd2, 'unweighted_kmeans', 'score')
# cs_weighted = get_cluster_stats(dd2, 'weighted_kmeans', 'score')
# cs_inverse_weighted = get_cluster_stats(dd2, 'inverse_weighted_kmeans', 'score')

In [None]:
# Reshape to one row per (weighting, row) pair, then summarise the score and
# absolute error of every cluster under each weighting scheme.
cluster_df = (
    race_df[['uw_kmeans', 'w_kmeans', 'iw_kmeans', 'score', 'absolute_error']]
    .melt(
        id_vars=['score', 'absolute_error'],
        var_name='weighting',
        value_name='cluster_id',
    )
    .groupby(['weighting', 'cluster_id'])
    .agg({'score': ['mean', 'var'], 'absolute_error': ['mean', 'var']})
    .reset_index()
    # Flatten the two-level column index produced by the multi-function agg.
    .set_axis(
        ['weighting', 'cluster_id', 'mean_score', 'var_score', 'mean_absolute_error', 'var_absolute_error'],
        axis=1,
    )
)

In [None]:
# Display the per-cluster summary: one row per (weighting, cluster_id) pair.
cluster_df

In [None]:
# Mean score per cluster: individual clusters in the top panel, box-plot
# summary in the bottom panel; both panels share the y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='mean_score',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='mean_score',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Mean Score by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()

In [None]:
# Variance of scores per cluster: individual clusters in the top panel,
# box-plot summary in the bottom panel; both panels share the y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='var_score',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='var_score',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Variance of Scores by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()

In [None]:
# Mean absolute error per cluster: individual clusters in the top panel,
# box-plot summary in the bottom panel; both panels share the y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='mean_absolute_error',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='mean_absolute_error',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Mean Absolute Error by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()

In [None]:
# Variance of absolute errors per cluster: individual clusters in the top
# panel, box-plot summary in the bottom panel; both panels share the y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='var_absolute_error',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='var_absolute_error',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Variance of Absolute Errors by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()

In [None]:
# Histogram of the regression target (20 bins) over the filtered rows.
race_df['fk_score'].hist(bins=20)

In [None]:
# Number of rows with fk_score <= 4 (the subset the next cell averages over).
int((race_df['fk_score'] <= 4).sum())

In [None]:
# Stack the per-row vectors into a 2-D (n_rows, dim) matrix so everything
# below is vectorised instead of looping row by row.
embedding_matrix = np.array(race_df['embeddings_mini_lm'].tolist())

# Aspect direction = mean embedding of the rows with fk_score <= 4.
low_fk_mask = (race_df['fk_score'] <= 4).to_numpy()
if not low_fk_mask.any():
    # An empty subset would give an all-NaN direction and poison every row below.
    raise ValueError('No rows with fk_score <= 4; cannot build the aspect embedding.')
aspect_embedding = embedding_matrix[low_fk_mask].mean(axis=0)

# optionally weight the aspect embeddings with the classifier / regression model coefficients
# (np.ravel keeps this 1-D even if coef_ is stored with a leading singleton axis)
aspect_embedding = aspect_embedding * np.ravel(clf.coef_)

aspect_embedding = aspect_embedding / np.linalg.norm(aspect_embedding)

alpha = 2

# Unit-normalise every row, then project it onto the unit aspect direction.
unit_embeddings = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
projections = (unit_embeddings @ aspect_embedding)[:, np.newaxis] * aspect_embedding

# Assign whole columns at once. The previous `race_df[col].iloc[i] = ...` form
# is chained assignment, which pandas does not guarantee to write back into
# race_df (and which never does under Copy-on-Write).
race_df['w_embeddings'] = pd.Series(list(unit_embeddings + alpha * projections), index=race_df.index)
race_df['iw_embeddings'] = pd.Series(list(unit_embeddings - alpha * projections), index=race_df.index)


In [None]:
n_clusters = 40
seed = 0

# Re-cluster the three embedding variants, now with the projection-based
# w_/iw_ embeddings and a larger cluster count.
uw_kmeans, w_kmeans, iw_kmeans = (
    KMeans(n_clusters=n_clusters, random_state=seed, n_init="auto").fit(race_df[column].tolist())
    for column in ('embeddings_mini_lm', 'w_embeddings', 'iw_embeddings')
)

race_df['uw_kmeans'] = uw_kmeans.labels_
race_df['w_kmeans'] = w_kmeans.labels_
race_df['iw_kmeans'] = iw_kmeans.labels_

# Rebuild the per-(weighting, cluster) summary from the new labels.
cluster_df = (
    race_df[['uw_kmeans', 'w_kmeans', 'iw_kmeans', 'score', 'absolute_error']]
    .melt(
        id_vars=['score', 'absolute_error'],
        var_name='weighting',
        value_name='cluster_id',
    )
    .groupby(['weighting', 'cluster_id'])
    .agg({'score': ['mean', 'var'], 'absolute_error': ['mean', 'var']})
    .reset_index()
    # Flatten the two-level column index produced by the multi-function agg.
    .set_axis(
        ['weighting', 'cluster_id', 'mean_score', 'var_score', 'mean_absolute_error', 'var_absolute_error'],
        axis=1,
    )
)

In [None]:
# Mean score per cluster for the current cluster_df: individual clusters in
# the top panel, box-plot summary in the bottom panel; shared y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='mean_score',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='mean_score',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Mean Score by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()

In [None]:
# Variance of scores per cluster for the current cluster_df: individual
# clusters in the top panel, box-plot summary in the bottom panel; shared y-axis.
sns.set(rc={'figure.figsize': (5, 3)})
fig, axes = plt.subplots(nrows=2, ncols=1, sharey=True)

sns.stripplot(
    data=cluster_df,
    x='var_score',
    y='weighting',
    hue='weighting',
    jitter=True,
    dodge=True,
    ax=axes[0],
)
axes[0].legend(bbox_to_anchor=(1.5, 1), loc='upper right')
# The top panel carries no axis labels or x tick labels.
axes[0].set(ylabel=None, xticklabels=[], xlabel=None)

sns.boxplot(
    data=cluster_df,
    x='var_score',
    y='weighting',
    hue='weighting',
    dodge=True,
    ax=axes[1],
)
axes[1].set(ylabel=None, xlabel='Variance of Scores by Cluster')
# Replace the bottom panel's legend with an empty, frameless one.
axes[1].legend([], [], frameon=False)

plt.show()