DA Lab Project 1 - Attempt 1 - LightGBM with Sample Weights

In [2]:
# --- Colab environment setup -------------------------------------------------
import os
import json

from google.colab import drive

# Mount Google Drive so the challenge files are reachable from this runtime.
drive.mount('/content/drive')

# Switch into the challenge folder; later cells open their files by bare name.
base_path = "/content/drive/MyDrive/da5401-2025-data-challenge/"
os.chdir(base_path)

print("Current working directory:", os.getcwd())
print("Files:", os.listdir())

# --- Third-party libraries used by the rest of the notebook -------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

print("Libraries imported successfully!")


# Load the training records and the list of metric names.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# runtime's locale-default encoding.
with open('train_data.json', 'r', encoding='utf-8') as f:
    train_data = json.load(f)

with open('metric_names.json', 'r', encoding='utf-8') as f:
    metric_names = json.load(f)

# Precomputed metric-name embeddings; entry i is paired with metric_names[i].
metric_embeddings_array = np.load('metric_name_embeddings.npy')

# zip() stops at the shorter input, so a length mismatch would silently drop
# names or embeddings instead of failing. Make the mismatch loud.
if len(metric_names) != len(metric_embeddings_array):
    raise ValueError(
        f"metric_names.json has {len(metric_names)} entries but "
        f"metric_name_embeddings.npy has {len(metric_embeddings_array)} rows"
    )

df_train = pd.DataFrame(train_data)

# Coerce the target column to a numeric dtype (raises on unparseable values).
df_train['score'] = pd.to_numeric(df_train['score'])

# Lookup table: metric name -> its embedding vector.
metric_embedding_map = dict(zip(metric_names, metric_embeddings_array))

# Fail fast on metric names that have no embedding. A silent None here would
# only surface later as an obscure numpy error when features are assembled.
unknown_metrics = set(df_train['metric_name']) - set(metric_embedding_map)
if unknown_metrics:
    raise KeyError(
        f"No embedding found for metric name(s): {sorted(unknown_metrics, key=str)}"
    )

df_train['metric_embedding'] = df_train['metric_name'].apply(lambda name: metric_embedding_map[name])

print("Data loaded and pre-processed.")
print(f"Training data shape: {df_train.shape}")



Mounted at /content/drive
Current working directory: /content/drive/MyDrive/da5401-2025-data-challenge
Files: ['metric_names.json', 'test_data.json', 'metric_name_embeddings.npy', 'train_data.json', 'sample_submission.csv', 'submission_dual_cnn_similarity_only.csv', 'submission_dual_cnn_similarity_augmented.csv', 'submission_dual_cnn_weighted_sampling.csv', 'submission_dual_cnn_simple_768d_weighted_sampling.csv']
Libraries imported successfully!
Data loaded and pre-processed.
Training data shape: (5000, 6)


In [3]:
def combine_text(row):
    """Build the single text string that is fed to the sentence encoder.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or dict)
        Must provide the keys 'user_prompt', 'response' and 'system_prompt'.

    Returns
    -------
    str
        "User: <user_prompt> \\nAssistant: <response> \\nSystem: <system_prompt>",
        where a missing system prompt is rendered as an empty string.
    """
    system_prompt = row['system_prompt']
    # A missing system prompt can show up either as None (JSON null) or as
    # pandas' missing marker (NaN/NA, e.g. when a record lacks the key).
    # Both must become "", otherwise the literal text "nan" gets embedded.
    # is_scalar() guards pd.isna() against array-like values.
    if system_prompt is None or (pd.api.types.is_scalar(system_prompt) and pd.isna(system_prompt)):
        system_prompt = ""
    return f"User: {row['user_prompt']} \nAssistant: {row['response']} \nSystem: {system_prompt}"


# One combined prompt/response/system string per training row.
df_train['combined_text'] = df_train.apply(combine_text, axis=1)

# Load the pretrained sentence encoder (downloaded on first use).
print("Loading sentence transformer model...")
model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
print("Model loaded.")

# Encode every combined string and store one embedding per row.
print("Generating text embeddings for training data...")
text_embeddings = model.encode(df_train['combined_text'].to_list())
df_train['text_embedding'] = [embedding for embedding in text_embeddings]
print("Text embeddings generated.")

Loading sentence transformer model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded.
Generating text embeddings for training data...
Text embeddings generated.


In [4]:
def create_feature_vector(row):
    """Assemble the model input for one row from its two embeddings.

    Reads row['metric_embedding'] and row['text_embedding'] and returns a 1-D
    array laid out as:
        [metric, text, |metric - text|, metric * text, ||metric - text||_2]
    The final entry is the Euclidean (L2) distance between the two vectors.
    """
    metric_vec = row['metric_embedding']
    text_vec = row['text_embedding']

    # The element-wise difference feeds both the |diff| block and the distance.
    delta = metric_vec - text_vec

    pieces = [
        np.concatenate([metric_vec, text_vec]),
        np.abs(delta),
        metric_vec * text_vec,
        [np.linalg.norm(delta)],
    ]
    return np.concatenate(pieces)

print("creating final feature vectors...")
# One feature vector per row, stacked into a 2-D matrix (rows = samples).
feature_rows = df_train.apply(create_feature_vector, axis=1)
X = np.array(feature_rows.tolist())
# Regression target.
y = df_train['score'].values
print(f"Shape of feature matrix X: {X.shape}")
print(f"Shape of target vector y: {y.shape}")

creating final feature vectors...
Shape of feature matrix X: (5000, 3073)
Shape of target vector y: (5000,)


In [5]:
# Sample weights: inverse frequency of the integer-rounded score, rescaled so
# the weights average to 1.
y_discrete = np.round(y).astype(int)
class_counts = pd.Series(y_discrete).value_counts().sort_index()

total_samples = len(y_discrete)
# rounded score -> total_samples / (number of rows with that rounded score)
weights_map = (total_samples / class_counts).to_dict()

raw_weights = np.array(list(map(weights_map.__getitem__, y_discrete)))
sample_weights = raw_weights / raw_weights.mean()

print("\nSample weights calculated based on inverse frequency of rounded scores.")
print(f"Sample weights array shape: {sample_weights.shape}")


Sample weights calculated based on inverse frequency of rounded scores.
Sample weights array shape: (5000,)


In [6]:

# Hold out 20% of the rows for validation; features, targets and sample
# weights are split together so they stay row-aligned.
(
    X_train, X_val,
    y_train, y_val,
    weights_train, weights_val,
) = train_test_split(X, y, sample_weights, random_state=42, test_size=0.2)

print(f"\nTraining set shape: {X_train.shape}, Validation set shape: {X_val.shape}")


Training set shape: (4000, 3073), Validation set shape: (1000, 3073)


In [7]:
# Hyperparameters for the gradient-boosted regressor, kept in one place.
lgb_params = dict(
    objective='rmse',
    metric='rmse',
    n_estimators=2000,
    learning_rate=0.0005,
    num_leaves=15,
    lambda_l1=0.5,
    lambda_l2=0.7,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    min_child_samples=50,
    bagging_freq=1,
    verbose=-1,
    n_jobs=-1,
    seed=42,
)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Train with per-row sample weights; training halts once the validation RMSE
# has not improved for 100 consecutive rounds.
early_stop = lgb.early_stopping(100, verbose=False)
lgb_model.fit(
    X_train,
    y_train,
    sample_weight=weights_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[early_stop],
)

# Unweighted RMSE on the held-out split.
y_pred = lgb_model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse:.4f}")

Validation RMSE: 2.3454




In [8]:
# Load the unlabelled test records. JSON is read explicitly as UTF-8 so the
# result does not depend on the runtime's locale-default encoding.
with open('test_data.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)
df_test = pd.DataFrame(test_data)

print(f"\nTest data loaded. Shape: {df_test.shape}")

# Submission IDs are 1-based row positions.
df_test['ID'] = range(1, len(df_test) + 1)

# Fail fast on metric names that have no embedding. A silent None here would
# only surface later as an obscure numpy error inside create_feature_vector.
unknown_test_metrics = set(df_test['metric_name']) - set(metric_embedding_map)
if unknown_test_metrics:
    raise KeyError(
        f"No embedding found for test metric name(s): {sorted(unknown_test_metrics, key=str)}"
    )
df_test['metric_embedding'] = df_test['metric_name'].apply(lambda name: metric_embedding_map[name])

# Same text construction and encoder as used for the training rows.
df_test['combined_text'] = df_test.apply(combine_text, axis=1)

test_text_embeddings = model.encode(df_test['combined_text'].tolist())
df_test['text_embedding'] = list(test_text_embeddings)

X_test = np.array(df_test.apply(create_feature_vector, axis=1).tolist())

test_predictions = lgb_model.predict(X_test)

# Clamp predictions to [0, 10] (presumably the valid score range; confirm
# against the challenge rules), then round to whole-number scores.
submission_df = pd.DataFrame({'ID': df_test['ID'], 'score': test_predictions})
submission_df['score'] = submission_df['score'].clip(0, 10)
submission_df['score'] = np.round(submission_df['score']).astype(int)
submission_df.to_csv('submission_weighted.csv', index=False)

print("\nSubmission file 'submission_weighted.csv' created")
print(submission_df.head())


Test data loaded. Shape: (3638, 4)





Submission file 'submission_weighted.csv' created
   ID  score
0   1      7
1   2      7
2   3      7
3   4      7
4   5      7
