Classifier Experiment.
Using HGBM and MLP for Manual features, BERT embeddings (text only) and DeBERTa embeddings (text only)

In [9]:
import sys
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import pandas as pd
import ast
import re

sys.path.append('C:/Users/jp3g20/Desktop/m/summary-eval')
from summary_eval.data import summary_df, prompts_df
from summary_eval.settings import TRAIN_SIZE
from summary_eval.testing import cross_validate

def convert_to_list(string):
    # Use regular expression to extract numbers from the string
    numbers = re.findall(r'[-+]?\d*\.\d+e[-+]?\d+|\d+', string)
    # Convert the extracted numbers to floats
    numbers = [float(num) for num in numbers]
    return numbers

converters = {'text_embeddings': convert_to_list,
              'prompt_embeddings': convert_to_list,
              'prompt_question_embeddings': convert_to_list}

bert_embeddings = pd.read_csv('bert_embeddings.csv', converters=converters)
roberta_embeddings = pd.read_csv('roberta_embeddings.csv', converters=converters)
features_df = pd.read_csv('feature_df.csv') # Maybe append some of my features to this later on for improving results but for now use this as a base.

In [25]:
model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

In [26]:
X_text_embeddings = np.vstack(bert_embeddings['text_embeddings'].values)

y = summary_df[['content', 'wording']]
X_train, X_test, y_train, y_test = train_test_split(X_text_embeddings, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,5)

2024-05-12 12:26:51,554 - INFO - Using 5x5 cross validation


  0%|          | 0/25 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.483903,0.633806,0.558854,0.377755,0.499422,0.438588,0.783767,0.629315,0.706541
stdev,0.011953,0.01477,0.013362,0.008439,0.011006,0.009723,0.013819,0.016656,0.015237
n_trials,25.0,25.0,2.0,25.0,25.0,2.0,25.0,25.0,2.0


In [27]:
X_text_embeddings = np.vstack(roberta_embeddings['text_embeddings'].values)

X_train, X_test, y_train, y_test = train_test_split(X_text_embeddings, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,5)

2024-05-12 12:30:32,817 - INFO - Using 5x5 cross validation


  0%|          | 0/25 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.451093,0.613936,0.532515,0.343702,0.476911,0.410307,0.812135,0.652221,0.732178
stdev,0.012566,0.014743,0.013654,0.008777,0.010666,0.009722,0.011884,0.015148,0.013516
n_trials,25.0,25.0,2.0,25.0,25.0,2.0,25.0,25.0,2.0


In [29]:
features_df = features_df.drop(columns=['student_id', 'prompt_id', 'text', 'content', 'wording'])
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,5)

2024-05-12 12:35:01,670 - INFO - Using 5x5 cross validation


  0%|          | 0/25 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.479718,0.656005,0.567861,0.360438,0.501057,0.430747,0.787633,0.602898,0.695266
stdev,0.012821,0.016264,0.014542,0.007939,0.011102,0.00952,0.011592,0.018375,0.014984
n_trials,25.0,25.0,2.0,25.0,25.0,2.0,25.0,25.0,2.0


In [10]:
from sklearn.neural_network import MLPRegressor
model = MLPRegressor(learning_rate='adaptive', learning_rate_init=0.001, hidden_layer_sizes=(512))

In [3]:

from sklearn.model_selection import KFold, GridSearchCV

X_text_embeddings = np.vstack(bert_embeddings['text_embeddings'].values)
y = summary_df[['content', 'wording']]
X_train, X_test, y_train, y_test = train_test_split(X_text_embeddings, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)

cross_validate(model, X_train, y_train,5,2)


2024-05-12 16:29:03,167 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.51052,0.669686,0.590103,0.400007,0.523895,0.461951,0.759531,0.58615,0.672841
stdev,0.006398,0.013297,0.009847,0.005541,0.010924,0.008232,0.011794,0.017586,0.01469
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [4]:
X_text_embeddings_roberta = np.vstack(roberta_embeddings['text_embeddings'].values)

X_train, X_test, y_train, y_test = train_test_split(X_text_embeddings_roberta, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,2)

2024-05-12 16:38:03,497 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]



Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.501828,0.679494,0.590661,0.390941,0.531308,0.461125,0.767825,0.573988,0.670906
stdev,0.011451,0.010899,0.011175,0.011459,0.010195,0.010827,0.008555,0.015399,0.011977
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [13]:
#features_df = features_df.drop(columns=['student_id', 'prompt_id', 'text', 'content', 'wording'])
X_train, X_test, y_train, y_test = train_test_split(features_df, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
X_train.fillna(X_train.mean(), inplace=True)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,2)

2024-05-12 16:56:51,472 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.664758,0.827455,0.746107,0.482122,0.619712,0.550917,0.581952,0.35635,0.469151
stdev,0.12276,0.115594,0.119177,0.10675,0.078048,0.092399,0.166336,0.187894,0.177115
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


Now that it is apparent that HGBM performs better for this problem than and MLP, work can be done on combining features selected manually with embeddings with both early and late fusion.

First early fusion

In [14]:
model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

In [15]:
X_combined = np.hstack((X_text_embeddings, features_df))
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,2)

2024-05-12 16:58:08,467 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.429643,0.578604,0.504123,0.325255,0.444293,0.384774,0.82975,0.691213,0.760481
stdev,0.014034,0.01384,0.013937,0.009547,0.01024,0.009894,0.009708,0.010437,0.010073
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [16]:
X_combined_roberta = np.hstack((X_text_embeddings_roberta, features_df))
X_train, X_test, y_train, y_test = train_test_split(X_combined_roberta, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
cross_validate(model, X_train, y_train,5,2)

2024-05-12 16:59:39,206 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.431053,0.581322,0.506188,0.32516,0.445529,0.385345,0.828707,0.688292,0.7585
stdev,0.012916,0.017182,0.015049,0.008807,0.012685,0.010746,0.007592,0.013504,0.010548
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


The Late fusion - will begin with an average of embedding model output and features model output.

In [34]:
from sklearn.base import BaseEstimator, RegressorMixin

class EnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, model1, model2, num_embedding_features):
        self.model1 = model1
        self.model2 = model2
        self.num_embedding_features = num_embedding_features

    def fit(self, X, y):
        X_emb = X.iloc[:, :self.num_embedding_features]  # Select the first 'num_embedding_features' columns
        X_feat = X.iloc[:, self.num_embedding_features:]  # Select the remaining columns
        self.model1.fit(X_emb, y)
        self.model2.fit(X_feat, y)
        return self

    def predict(self, X):
        X_emb = X.iloc[:, :self.num_embedding_features]
        X_feat = X.iloc[:, self.num_embedding_features:]
        preds1 = self.model1.predict(X_emb)
        preds2 = self.model2.predict(X_feat)
        return (preds1 + preds2) / 2

    def score(self, X, y, sample_weight=None):
        predictions = self.predict(X)
        return np.mean((y - predictions) ** 2)





In [35]:
# Define the first MultiOutputRegressor model
model1 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Define the second MultiOutputRegressor model
model2 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Create the ensemble model using the EnsembleRegressor class
ensemble_model = EnsembleRegressor(model1, model2, len(X_text_embeddings[0]))

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)

cross_validate(ensemble_model, X_train, y_train,5,2)

2024-05-12 17:16:55,959 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.448137,0.601338,0.524737,0.343215,0.470637,0.406926,0.814817,0.666446,0.740631
stdev,0.011194,0.014189,0.012692,0.00791,0.009771,0.00884,0.008209,0.011931,0.01007
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_combined_roberta, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)

ensemble_model = EnsembleRegressor(model1, model2, len(X_text_embeddings_roberta[0]))

cross_validate(ensemble_model, X_train, y_train,5,2)

2024-05-12 17:18:30,144 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.441041,0.597212,0.519126,0.333585,0.462621,0.398103,0.820648,0.671008,0.745828
stdev,0.014289,0.016084,0.015187,0.010109,0.014384,0.012247,0.009119,0.013234,0.011176
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [48]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression

class EnsembleRegressor2(BaseEstimator, RegressorMixin):
    def __init__(self, model1, model2, num_embedding_features):
        self.model1 = model1
        self.model2 = model2
        self.num_embedding_features = num_embedding_features
        self.final_model = LinearRegression()  # Use Linear Regression as the final model

    def fit(self, X, y):
        X_emb = X.iloc[:, :self.num_embedding_features]  # Select the first 'num_embedding_features' columns
        X_feat = X.iloc[:, self.num_embedding_features:]  # Select the remaining columns
        self.model1.fit(X_emb, y)
        self.model2.fit(X_feat, y)
        preds1 = self.model1.predict(X_emb)
        preds2 = self.model2.predict(X_feat)
        preds_combined = np.hstack((preds1, preds2))  # Combine predictions
        
        self.final_model.fit(preds_combined, y)  # Fit the final model with combined predictions
        return self

    def predict(self, X):
        X_emb = X.iloc[:, :self.num_embedding_features]
        X_feat = X.iloc[:, self.num_embedding_features:]
        preds1 = self.model1.predict(X_emb)
        preds2 = self.model2.predict(X_feat)
        preds_combined = np.hstack((preds1, preds2))  # Combine predictions
        return self.final_model.predict(preds_combined)  # Predict using the final model

    def score(self, X, y, sample_weight=None):
        predictions = self.predict(X)
        return np.mean((y - predictions) ** 2)


In [49]:
model1 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Define the second MultiOutputRegressor model
model2 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Create the ensemble model using the EnsembleRegressor class
ensemble_model = EnsembleRegressor2(model1, model2, len(X_text_embeddings[0]))

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)

cross_validate(ensemble_model, X_train, y_train,5,2)

2024-05-12 17:33:00,164 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.492529,0.633591,0.56306,0.384735,0.49647,0.440603,0.776148,0.629674,0.702911
stdev,0.011932,0.014235,0.013083,0.008205,0.008323,0.008264,0.013492,0.013655,0.013573
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [50]:
model1 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Define the second MultiOutputRegressor model
model2 = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

# Create the ensemble model using the EnsembleRegressor class
ensemble_model = EnsembleRegressor2(model1, model2, len(X_text_embeddings_roberta[0]))

X_train, X_test, y_train, y_test = train_test_split(X_combined_roberta, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)

cross_validate(ensemble_model, X_train, y_train,5,2)

2024-05-12 17:36:23,808 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.463547,0.619969,0.541758,0.353277,0.479015,0.416146,0.801799,0.645461,0.72363
stdev,0.016868,0.012911,0.014889,0.012636,0.010505,0.01157,0.012465,0.011281,0.011873
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


Finally looking at using question and prompt embeddings aswell.

In [56]:
X_prompt_embeddings = np.vstack(bert_embeddings['prompt_embeddings'].values)
X_prompt_question_embeddings = np.vstack(bert_embeddings['prompt_question_embeddings'].values)


X_embeddings = np.hstack((X_text_embeddings, X_prompt_embeddings, X_prompt_question_embeddings, features_df))

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)
model = MultiOutputRegressor(HistGradientBoostingRegressor(random_state=0))

cross_validate(model, X_train, y_train,5,2)


2024-05-12 17:53:17,196 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.429643,0.578844,0.504243,0.325255,0.444519,0.384887,0.82975,0.690959,0.760354
stdev,0.014034,0.01555,0.014792,0.009547,0.011184,0.010365,0.009708,0.011661,0.010685
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0


In [57]:
X_prompt_embeddings = np.vstack(roberta_embeddings['prompt_embeddings'].values)
X_prompt_question_embeddings = np.vstack(roberta_embeddings['prompt_question_embeddings'].values)


X_embeddings = np.hstack((X_text_embeddings, X_prompt_embeddings, X_prompt_question_embeddings, features_df))

X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)
X_train = pd.DataFrame(X_train)

cross_validate(model, X_train, y_train,5,2)

2024-05-12 17:57:18,138 - INFO - Using 2x5 cross validation


  0%|          | 0/10 [00:00<?, ?it/s]

Metric,rmse,rmse,rmse,mae,mae,mae,r2,r2,r2
Target,content,wording,mean_columnwise,content,wording,mean_columnwise,content,wording,mean_columnwise
mean,0.429643,0.578844,0.504243,0.325255,0.444519,0.384887,0.82975,0.690959,0.760354
stdev,0.014034,0.01555,0.014792,0.009547,0.011184,0.010365,0.009708,0.011661,0.010685
n_trials,10.0,10.0,2.0,10.0,10.0,2.0,10.0,10.0,2.0
