In [1]:
import pandas as pd
import numpy as np

In [29]:
# test_size=0.2
test_size=0.5 # PoC that small datasize is a great limiting factor on performance, that further size reduction deteriotates even much more

In [6]:
# read data
df_filtered_active_ap = pd.read_pickle('outputs/filtered_events_active_prompt.pkl')

In [10]:
# transform the dataframe to be more easily processable for ML
new_rows = []
for _, row in df_filtered_active_ap.iterrows():
    for sqx, apu in zip(row['sample_question_x'], row['ap_uncertainty']):
        new_rows.append({'str_question': row['str_question'], 'sample_question_x': sqx, 'ap_uncertainty': apu})

df = pd.DataFrame(new_rows)
df['uncertainty_number'] = df['ap_uncertainty'].apply(lambda x: x[0])

In [30]:
## elastic net regression
import numpy as np
from sklearn.linear_model import ElasticNet
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize the TinyBERT model and tokenizer
model_name = 'huawei-noah/TinyBERT_General_6L_768D'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Get the string columns from the dataframe
questions = df['str_question'].tolist()
samples = df['sample_question_x'].tolist()

# Convert strings to embeddings
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

question_embeddings = np.array([get_embedding(question) for question in questions])
sample_embeddings = np.array([get_embedding(sample) for sample in samples])

# Combine the two embeddings
X = np.concatenate([question_embeddings, sample_embeddings], axis=1)

# Flatten the embeddings
X = X.reshape(X.shape[0], -1)

# Get the target variable
y = df['uncertainty_number'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Parameters to test
param_grid = {
    'alpha': [0.1, 0.5, 1],
    'l1_ratio': [0.25, 0.5, 0.75],
    'n_components': [20, 50, 100, 200]
}

# Store results
results = []

# Perform grid search
for alpha in param_grid['alpha']:
    for l1_ratio in param_grid['l1_ratio']:
        for n_components in param_grid['n_components']:
            # Apply PCA
            pca = PCA(n_components=n_components)
            X_train_transformed = pca.fit_transform(X_train)
            X_test_transformed = pca.transform(X_test)
            
            # Initialize and train the model
            elastic_net = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
            elastic_net.fit(X_train_transformed, y_train)
            
            # Evaluate the model
            training_score = elastic_net.score(X_train_transformed, y_train)
            testing_score = elastic_net.score(X_test_transformed, y_test)
            
            # Store results
            result = {
                'alpha': alpha,
                'l1_ratio': l1_ratio,
                'n_components': n_components,
                'training_score': training_score,
                'testing_score': testing_score
            }
            results.append(result)



Some weights of the model checkpoint at huawei-noah/TinyBERT_General_6L_768D were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'fit_denses.2.weight', 'fit_denses.2.bias', 'fit_denses.3.bias', 'fit_denses.0.bias', 'fit_denses.5.bias', 'fit_denses.6.weight', 'cls.seq_relationship.bias', 'fit_denses.6.bias', 'fit_denses.1.weight', 'fit_denses.5.weight', 'fit_denses.0.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'fit_denses.3.weight', 'fit_denses.4.bias', 'fit_denses.1.bias', 'fit_denses.4.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertMo

In [31]:
best_score = -np.inf
best_params = None

# Print results and find best parameters
for result in results:
    
    # Update best parameters if current model performs better
    if result['testing_score'] > best_score:
        best_score = result['testing_score']
        best_params = {
            'alpha': result['alpha'],
            'l1_ratio': result['l1_ratio'],
            'n_components': result['n_components']
        }

print("\nBest parameters: ", best_params)
print("\nBest score: ", best_score)


Best parameters:  {'alpha': 0.1, 'l1_ratio': 0.25, 'n_components': 200}

Best score:  0.04866081945105705


In [32]:
## support vector regression
from sklearn.svm import SVR

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Initialize and train the model
svr = SVR(kernel='rbf')  # tried 'linear' to start with
svr.fit(X_train, y_train)

# Evaluate the model
print('Training score:', svr.score(X_train, y_train))
print('Testing score:', svr.score(X_test, y_test))


Training score: 0.498547311950829
Testing score: 0.05820165773335062


In [33]:
## random forest
from sklearn.ensemble import RandomForestRegressor

# Combine the two embeddings
X = np.concatenate([question_embeddings, sample_embeddings], axis=1)

# Flatten the embeddings
X = X.reshape(X.shape[0], -1)

# Get the target variable
y = df['uncertainty_number'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

# Set up the parameter grid
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30, 40]
}

# Initialize the RF regressor
rf = RandomForestRegressor(random_state=42)

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model
best_rf = grid_search.best_estimator_

# Evaluate the model
print('Training score:', best_rf.score(X_train, y_train))
print('Testing score:', best_rf.score(X_test, y_test))


Fitting 3 folds for each of 20 candidates, totalling 60 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already

In [34]:
best_rf

In [37]:
## XGBosst
import xgboost as xgb
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

xg_reg_params_grid = {
    'objective': ['reg:squarederror'],
    'colsample_bytree': [0.3, 0.5, 0.7],  # fraction of columns to be randomly sampled for each tree.
    'learning_rate': [0.05, 0.1, 0.15],  # step size shrinkage used to prevent overfitting
    'max_depth': [4, 5, 6],  # maximum depth of a tree
    'alpha': [5, 10, 15],  # L1 regularization term on weight (analogous to Lasso regression)
    'n_estimators': [10, 50, 100],  # number of trees to fit.
}

# Initialize XGBoost regressor
xg_reg = xgb.XGBRegressor()

# Initialize Grid Search and fit the model
grid_search = GridSearchCV(estimator=xg_reg, param_grid=xg_reg_params_grid, 
                           scoring='neg_mean_squared_error', cv=3, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=10, objective=reg:squarederror; total time=   0.0s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=10, objective=reg:squarederror; total time=   0.0s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=10, objective=reg:squarederror; total time=   0.0s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=50, objective=reg:squarederror; total time=   0.2s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=50, objective=reg:squarederror; total time=   0.2s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=50, objective=reg:squarederror; total time=   0.2s
[CV] END alpha=5, colsample_bytree=0.3, learning_rate=0.05, max_depth=4, n_estimators=100, objective=reg:squarederr

In [38]:
# Get the best parameters
best_parameters = grid_search.best_params_
print(f"Best parameters: {best_parameters}")

# Get the best estimator and predict
best_estimator = grid_search.best_estimator_
preds = best_estimator.predict(X_test)

# Compute the RMSE of the prediction
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE: {rmse}")

Best parameters: {'alpha': 5, 'colsample_bytree': 0.3, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 50, 'objective': 'reg:squarederror'}
RMSE: 0.39582734237694606
