In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Fine-tune a SentenceTransformer on (Program Description, Program_Summary)
# pairs so that a program's description and its summary embed close together.

# Step 1: Load the dataset
# Ensure that 'programs_with_summary.csv' is in your working directory.
# Expected columns: Program, Program Link, Program Description, Program_Summary.
df = pd.read_csv("programs_with_summary.csv")

# Drop rows where either description or summary might be missing.
text_columns = ["Program Description", "Program_Summary"]
df = df.dropna(subset=text_columns)

# Drop exact duplicate pairs. The loss used below treats every other pair in a
# batch as a negative, so a duplicated pair landing in the same batch would be
# scored as a (false) negative of itself.
df = df.drop_duplicates(subset=text_columns)

# Step 2: Create Training Examples
# For each program, pair the "Program Description" with the "Program_Summary".
# Iterating the two columns directly avoids building a Series per row, which is
# what DataFrame.iterrows() does.
train_examples = [
    InputExample(texts=[description, summary])
    for description, summary in zip(df["Program Description"], df["Program_Summary"])
]

# Fail early with a clear message instead of attempting to train on nothing.
if not train_examples:
    raise ValueError(
        "No training examples found: 'programs_with_summary.csv' has no rows "
        "with both 'Program Description' and 'Program_Summary' filled in."
    )

# Step 3: Create a DataLoader for the training examples.
batch_size = 16
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)

# Step 4: Initialize the SentenceTransformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Step 5: Define the Loss Function.
# Using MultipleNegativesRankingLoss which treats other examples in the batch as negatives.
train_loss = losses.MultipleNegativesRankingLoss(model)

# Step 6: Fine-Tune the Model.
num_epochs = 1  # Increase the number of epochs for better performance.
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)  # 10% of total steps for warm-up

# Single source of truth for the save location, shared by fit() and the
# completion message so the two cannot drift apart.
output_path = './fine_tuned_academic_program_model'

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=output_path
)

print(f"Fine-tuning complete. The model is saved at '{output_path}'")


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss


Fine-tuning complete. The model is saved at './fine_tuned_academic_program_model'
