### 🚀 Task 1: Load Dataset

In [None]:
import pandas as pd

# NOTE(review): ace_tools is not a dependency used anywhere else in this
# notebook and is presumably only importable inside a hosted sandbox —
# guard the import so the cell still runs where it is missing.
try:
    import ace_tools as tools
except ImportError:
    tools = None


def _show_dataframe(name, dataframe):
    """Display *dataframe* via ace_tools when available, else print a preview.

    Args:
        name: Human-readable title for the table.
        dataframe: The pandas DataFrame to show.
    """
    if tools is not None:
        tools.display_dataframe_to_user(name=name, dataframe=dataframe)
        return
    rows, cols = dataframe.shape
    print(f"--- {name} ({rows} rows x {cols} columns) ---")
    print(dataframe.head())


# Load the training dataset
train_path = "/mnt/data/train.csv"
df_train = pd.read_csv(train_path)

# Load the testing dataset
test_path = "/mnt/data/test.csv"
df_test = pd.read_csv(test_path)

# Display both datasets
_show_dataframe("TREC Training Dataset", df_train)
_show_dataframe("TREC Testing Dataset", df_test)

print("✅ Training and Testing datasets loaded successfully!")


### 🚀 Task 2: Train a Doc2Vec Model for Coarse Labels

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Prepare tagged documents (whitespace tokenisation, one unique tag per row).
train_tagged = [
    TaggedDocument(words=question.split(), tags=[str(i)])
    for i, question in enumerate(df_train["question"])
]
# Test documents keep their offset tags, but they are never passed to
# build_vocab()/train(), so the model holds no stored vector for them.
test_tagged = [
    TaggedDocument(words=question.split(), tags=[str(i + len(df_train))])
    for i, question in enumerate(df_test["question"])
]

# Train Doc2Vec model on the training questions only (no test leakage).
doc2vec_model = Doc2Vec(vector_size=300, window=4, min_count=2, workers=4, epochs=20)
doc2vec_model.build_vocab(train_tagged)
doc2vec_model.train(
    train_tagged,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# Extract embeddings.
# Training documents: look up the vector learned for each tag.
X_train = [doc2vec_model.dv[doc.tags[0]] for doc in train_tagged]
# Test documents: their tags were never trained, so `dv[tag]` would raise
# KeyError. Infer a vector for each unseen question instead.
X_test = [doc2vec_model.infer_vector(doc.words) for doc in test_tagged]

# Get labels
y_train = df_train["coarse_label"]
y_test = df_test["coarse_label"]

# Train logistic regression. Give the solver more headroom than the default
# max_iter=100 so it is less likely to stop before converging on the
# 300-dimensional dense features.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on test set
y_pred = lr_model.predict(X_test)

# Evaluate performance
f1 = f1_score(y_test, y_pred, average="macro")
print(f"✅ Macro F1 Score (Doc2Vec + Logistic Regression): {f1:.4f}")


### 🚀 Task 3: Train LoRA Adapter on BERT (Coarse Labels)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import torch
from datasets import Dataset
import adapters
from adapters import LoRAConfig

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the "question" field of a batch with the BERT tokenizer.

    Args:
        examples: A batch mapping that contains a "question" entry.

    Returns:
        The tokenizer output for the batch, padded with
        ``padding="max_length"`` and truncated.
    """
    # NOTE(review): no explicit max_length is given, so every question is
    # padded to the tokenizer's model maximum; an explicit, smaller
    # max_length would cut training cost — confirm the longest question first.
    return tokenizer(examples["question"], padding="max_length", truncation=True)


def _attach_coarse_labels(batch):
    """Copy "coarse_label" into the "labels" field that the Trainer feeds to the model."""
    return {"labels": batch["coarse_label"]}


train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Without a "labels" column the model never receives targets and returns no
# loss, so Trainer.train() cannot run. The original "coarse_label" column is
# kept alongside it.
train_dataset = train_dataset.map(_attach_coarse_labels, batched=True)

# Load BERT model with LoRA Adapter
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(df_train["coarse_label"].unique()),
)
# A plain transformers model must be prepared by the adapters library before
# the adapters-style add_adapter()/train_adapter() methods can be used.
adapters.init(model)
# The adapters LoRAConfig selects attention matrices via `attn_matrices`
# ("q"/"k"/"v"); `target_modules` is the PEFT spelling, not this library's.
config = LoRAConfig(attn_matrices=["q", "v"])
model.add_adapter("lora_adapter", config=config)
model.train_adapter("lora_adapter")

# Training settings
training_args = TrainingArguments(output_dir="./results", per_device_train_batch_size=8, num_train_epochs=3)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# Train model
trainer.train()

print("✅ LoRA Adapter trained successfully on BERT (Coarse Labels)!")


### 🚀 Task 4: Fine-tune BERT on Coarse Labels

In [None]:
# The Trainer only computes a loss when the dataset carries a "labels"
# column. Add it from "coarse_label" unless train_dataset already has one.
full_train_dataset = train_dataset
if "labels" not in full_train_dataset.column_names:
    full_train_dataset = full_train_dataset.map(
        lambda batch: {"labels": batch["coarse_label"]},
        batched=True,
    )

# Load BERT model (all weights trainable; one output per coarse class)
full_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(df_train["coarse_label"].unique()),
)

# Training settings
full_training_args = TrainingArguments(output_dir="./full_results", per_device_train_batch_size=8, num_train_epochs=3)
full_trainer = Trainer(model=full_model, args=full_training_args, train_dataset=full_train_dataset)

# Train model
full_trainer.train()

print("✅ Full BERT Fine-tuned on Coarse Labels!")


### 🚀 Task 5: Evaluate All Models on Test Set

In [None]:
import time


def _timed_predict(predict_fn, inputs):
    """Call ``predict_fn(inputs)`` and return ``(result, elapsed_seconds)``.

    Uses time.perf_counter(), a monotonic clock intended for measuring
    short intervals.
    """
    started = time.perf_counter()
    result = predict_fn(inputs)
    return result, time.perf_counter() - started


# Only the prediction call is timed; computing the F1 score is excluded so
# the three timings compare inference cost alone.

# Evaluate Doc2Vec Model
y_pred_doc2vec, doc2vec_time = _timed_predict(lr_model.predict, X_test)
f1_doc2vec = f1_score(y_test, y_pred_doc2vec, average="macro")

# Evaluate LoRA Adapter Model
lora_predictions, lora_time = _timed_predict(trainer.predict, test_dataset)
lora_f1 = f1_score(y_test, lora_predictions.predictions.argmax(axis=-1), average="macro")

# Evaluate Fully Fine-tuned BERT
bert_predictions, bert_time = _timed_predict(full_trainer.predict, test_dataset)
bert_f1 = f1_score(y_test, bert_predictions.predictions.argmax(axis=-1), average="macro")

# Compare Results. The durations are test-set prediction times, not training
# times, so they are labelled as inference time.
print(f"✅ Doc2Vec Model - Macro F1: {f1_doc2vec:.4f}, Inference Time: {doc2vec_time:.2f}s")
print(f"✅ LoRA Adapter Model - Macro F1: {lora_f1:.4f}, Inference Time: {lora_time:.2f}s")
print(f"✅ Fine-tuned BERT Model - Macro F1: {bert_f1:.4f}, Inference Time: {bert_time:.2f}s")


### 🚀 Task 6: Repeat for Fine Labels

In [None]:
import adapters

# Update labels for fine-grained classification
y_train_fine = df_train["fine_label"]
y_test_fine = df_test["fine_label"]

# Size the classification heads for the fine label set.
# NOTE(review): assumes fine_label holds integer class ids starting at 0
# (the argmax comparisons below already rely on that) — confirm against the CSV.
num_fine_labels = int(max(y_train_fine.max(), y_test_fine.max())) + 1

# Repeat Doc2Vec + Logistic Regression.
# A separate classifier is fitted so the coarse-label lr_model is not
# silently overwritten.
lr_model_fine = LogisticRegression(max_iter=1000)
lr_model_fine.fit(X_train, y_train_fine)
y_pred_fine_doc2vec = lr_model_fine.predict(X_test)
f1_fine_doc2vec = f1_score(y_test_fine, y_pred_fine_doc2vec, average="macro")

# Build a training set whose "labels" column holds the FINE labels.
# Re-running the coarse trainers would only retrain the coarse heads on
# coarse targets, so the fine-label task needs its own data and models.
train_dataset_fine = Dataset.from_pandas(df_train).map(tokenize_function, batched=True)
train_dataset_fine = train_dataset_fine.map(
    lambda batch: {"labels": batch["fine_label"]},
    batched=True,
)

# Repeat LoRA Adapter Training with a fresh model and a fine-sized head.
lora_model_fine = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_fine_labels,
)
adapters.init(lora_model_fine)  # enable adapters-library methods on the HF model
lora_model_fine.add_adapter("lora_adapter_fine", config=LoRAConfig(attn_matrices=["q", "v"]))
lora_model_fine.train_adapter("lora_adapter_fine")
lora_trainer_fine = Trainer(
    model=lora_model_fine,
    args=TrainingArguments(output_dir="./results_fine", per_device_train_batch_size=8, num_train_epochs=3),
    train_dataset=train_dataset_fine,
)
lora_trainer_fine.train()
# Prediction needs only the tokenised inputs, so test_dataset is reused.
lora_predictions_fine = lora_trainer_fine.predict(test_dataset)
lora_fine_f1 = f1_score(y_test_fine, lora_predictions_fine.predictions.argmax(axis=-1), average="macro")

# Repeat Full BERT Training with a fresh model and a fine-sized head.
full_model_fine = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_fine_labels,
)
full_trainer_fine = Trainer(
    model=full_model_fine,
    args=TrainingArguments(output_dir="./full_results_fine", per_device_train_batch_size=8, num_train_epochs=3),
    train_dataset=train_dataset_fine,
)
full_trainer_fine.train()
bert_predictions_fine = full_trainer_fine.predict(test_dataset)
bert_fine_f1 = f1_score(y_test_fine, bert_predictions_fine.predictions.argmax(axis=-1), average="macro")

# Compare Results
print(f"✅ Fine Labels - Doc2Vec F1: {f1_fine_doc2vec:.4f}")
print(f"✅ Fine Labels - LoRA Adapter F1: {lora_fine_f1:.4f}")
print(f"✅ Fine Labels - Fine-tuned BERT F1: {bert_fine_f1:.4f}")
