# Peer Recommendation System
Course: SI 670: Applied Machine Learning

Name : Yuganshi Agrawal  
uniqname: yuganshi

Name : Sai Sneha Siddapura Venkataramappa  
uniqname: saisneha

### Notebook 04: Baseline Models

This notebook trains baseline models for complementarity prediction.

**Inputs:**
- `data/processed/student_pairs_train.pkl`
- `data/processed/student_pairs_test.pkl`
- `data/features/feature_matrix_scaled.pkl`

**Outputs:**
- `models/checkpoints/xgboost_baseline.pkl`
- `models/checkpoints/logistic_regression_baseline.pkl`
- `results/metrics/baseline_classification_metrics.pkl`
- `results/metrics/baseline_ranking_metrics.pkl`

In [1]:
import os

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pickle
import json
import numpy as np
import pandas as pd
import random
import multiprocessing as mp
from pathlib import Path
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

import torch

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, classification_report,
    precision_recall_curve, roc_curve, confusion_matrix
)
import xgboost as xgb

RNG_SEED = 42
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

<torch._C.Generator at 0x1519bcbf9cd0>

In [2]:
N_CPU = mp.cpu_count()
DEVICE = 'cpu'
N_GPU = 0

print("Hardware Configuration:")
print(f"  CPUs available: {N_CPU}")

try:
    if torch.cuda.is_available():
        N_GPU = torch.cuda.device_count()
        if N_GPU > 0:
            torch.cuda.set_device(0)
            test_tensor = torch.zeros(1).cuda()
            del test_tensor
            torch.cuda.empty_cache()
            
            DEVICE = 'cuda'
            print(f"  Device: cuda")
            print(f"  GPUs accessible: {N_GPU}")
            
            for i in range(N_GPU):
                try:
                    name = torch.cuda.get_device_name(i)
                    mem = torch.cuda.get_device_properties(i).total_memory / 1e9
                    print(f"    GPU {i}: {name} ({mem:.1f} GB)")
                except Exception as e:
                    print(f"    GPU {i}: Error accessing - {str(e)[:50]}")
        else:
            print(f"  Device: cpu (CUDA available but no GPUs detected)")
    else:
        print(f"  Device: cpu (CUDA not available)")
        
except Exception as e:
    print(f"  Device: cpu (CUDA initialization failed: {str(e)[:80]})")
    DEVICE = 'cpu'
    N_GPU = 0

print()

if DEVICE == 'cuda':
    N_WORKERS = min(4, max(1, N_CPU // 4))
else:
    N_WORKERS = max(1, min(N_CPU - 1, 16))

print(f"Parallel Configuration:")
print(f"  Mode: {'GPU-accelerated' if DEVICE == 'cuda' else 'CPU multiprocessing'}")
print(f"  Workers: {N_WORKERS}")
print()

Hardware Configuration:
  CPUs available: 32
  Device: cpu (CUDA initialization failed: CUDA call failed lazily at initialization with error: device >= 0 && device < nu)

Parallel Configuration:
  Mode: CPU multiprocessing
  Workers: 16



## Directory Setup

In [3]:
BASE_DIR = Path('../670-Project')
DATA_PROCESSED_DIR = BASE_DIR / 'data' / 'processed'
DATA_FEATURES_DIR = BASE_DIR / 'data' / 'features'
MODELS_DIR = BASE_DIR / 'models' / 'checkpoints'
RESULTS_METRICS_DIR = BASE_DIR / 'results' / 'metrics'

MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_METRICS_DIR.mkdir(parents=True, exist_ok=True)

print("Directory structure:")
print(f"  Processed data: {DATA_PROCESSED_DIR}")
print(f"  Features: {DATA_FEATURES_DIR}")
print(f"  Models: {MODELS_DIR}")
print(f"  Metrics: {RESULTS_METRICS_DIR}")
print()


Directory structure:
  Processed data: ../670-Project/data/processed
  Features: ../670-Project/data/features
  Models: ../670-Project/models/checkpoints
  Metrics: ../670-Project/results/metrics



## Configuration

In [4]:
BATCH_SIZE = 10000

print("Configuration:")
print(f"  Batch size for feature creation: {BATCH_SIZE:,}")
print(f"  Random seed: {RNG_SEED}")
print()

Configuration:
  Batch size for feature creation: 10,000
  Random seed: 42



## Load Data

In [5]:
print("Loading data...")
print()

def load_pickle(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

train_pairs = load_pickle(DATA_PROCESSED_DIR / 'student_pairs_train.pkl')
test_pairs = load_pickle(DATA_PROCESSED_DIR / 'student_pairs_test.pkl')
feature_matrix_scaled = load_pickle(DATA_FEATURES_DIR / 'feature_matrix_scaled.pkl')
assessment_features = load_pickle(DATA_FEATURES_DIR / 'assessment_features.pkl')

assess_pivot = assessment_features['assess_pivot']

print("Loaded data:")
print(f"  Train pairs: {len(train_pairs):,}")
print(f"  Test pairs: {len(test_pairs):,}")
print(f"  Feature matrix: {feature_matrix_scaled.shape}")
print(f"  Assessment pivot: {assess_pivot.shape}")
print()

print("Label distribution:")
print(f"  Train - Positive: {train_pairs['label'].sum():,} ({train_pairs['label'].mean()*100:.2f}%)")
print(f"  Test - Positive: {test_pairs['label'].sum():,} ({test_pairs['label'].mean()*100:.2f}%)")
print()


Loading data...

Loaded data:
  Train pairs: 535,953
  Test pairs: 124,047
  Feature matrix: (28785, 100)
  Assessment pivot: (23351, 188)

Label distribution:
  Train - Positive: 107,910 (20.13%)
  Test - Positive: 24,090 (19.42%)



## Pairwise Feature Construction

Create features for each pair by combining student embeddings:
- Absolute difference (captures dissimilarity)
- Element-wise product (captures interaction)
- Assessment distance (cosine-based)

In [6]:
def compute_assessment_distance(sid_i, sid_j, assess_pivot):
    """Compute assessment-based distance between two students."""
    if sid_i not in assess_pivot.index or sid_j not in assess_pivot.index:
        return 0.5
    
    vec_i = assess_pivot.loc[sid_i].values
    vec_j = assess_pivot.loc[sid_j].values
    
    norm_i = np.linalg.norm(vec_i)
    norm_j = np.linalg.norm(vec_j)
    
    if norm_i < 1e-8 or norm_j < 1e-8:
        return 0.5
    
    cos_sim = np.dot(vec_i, vec_j) / (norm_i * norm_j + 1e-8)
    return 1 - cos_sim


def create_pair_features_batch(pairs_batch, feature_matrix, assess_pivot):
    """
    Create pairwise features for a batch of pairs.
    
    Returns DataFrame with features for each pair.
    """
    feature_list = []
    emb_cols = [c for c in feature_matrix.columns if c.startswith('emb_')]
    n_emb = len(emb_cols)
    
    for _, row in pairs_batch.iterrows():
        sid_i = row['id_i']
        sid_j = row['id_j']
        
        if sid_i not in feature_matrix.index or sid_j not in feature_matrix.index:
            feature_list.append(np.zeros(n_emb * 2 + 1))
            continue
        
        emb_i = feature_matrix.loc[sid_i, emb_cols].values
        emb_j = feature_matrix.loc[sid_j, emb_cols].values
        
        abs_diff = np.abs(emb_i - emb_j)
        product = emb_i * emb_j
        
        assess_dist = compute_assessment_distance(sid_i, sid_j, assess_pivot)
        
        combined = np.concatenate([abs_diff, product, [assess_dist]])
        feature_list.append(combined)
    
    feature_array = np.array(feature_list)
    
    col_names = (
        [f'diff_{i}' for i in range(n_emb)] +
        [f'prod_{i}' for i in range(n_emb)] +
        ['assess_dist']
    )
    
    return pd.DataFrame(feature_array, columns=col_names, index=pairs_batch.index)

print("Feature construction functions defined")
print()

Feature construction functions defined



## Create Training Features

Process pairs in batches to create pairwise features.

In [7]:
print("Creating training features...")
print()

n_batches = (len(train_pairs) + BATCH_SIZE - 1) // BATCH_SIZE
print(f"Processing {len(train_pairs):,} pairs in {n_batches} batches...")
print()

train_feature_list = []

for batch_idx in tqdm(range(n_batches), desc="Train batches"):
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(train_pairs))
    
    batch_pairs = train_pairs.iloc[start_idx:end_idx]
    batch_features = create_pair_features_batch(batch_pairs, feature_matrix_scaled, assess_pivot)
    
    train_feature_list.append(batch_features)

train_features = pd.concat(train_feature_list, axis=0)
train_features['label'] = train_pairs['label'].values

print(f"\nTrain features shape: {train_features.shape}")
print()

Creating training features...

Processing 535,953 pairs in 54 batches...



Train batches:   0%|          | 0/54 [00:00<?, ?it/s]


Train features shape: (535953, 2)



## Create Test Features

In [9]:
print("Creating test features...")
print()

n_batches = (len(test_pairs) + BATCH_SIZE - 1) // BATCH_SIZE
print(f"Processing {len(test_pairs):,} pairs in {n_batches} batches...")
print()

test_feature_list = []

for batch_idx in tqdm(range(n_batches), desc="Test batches"):
    start_idx = batch_idx * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(test_pairs))
    
    batch_pairs = test_pairs.iloc[start_idx:end_idx]
    batch_features = create_pair_features_batch(batch_pairs, feature_matrix_scaled, assess_pivot)
    
    test_feature_list.append(batch_features)

test_features = pd.concat(test_feature_list, axis=0)
test_features['label'] = test_pairs['label'].values

print(f"\nTest features shape: {test_features.shape}")
print()

Creating test features...

Processing 124,047 pairs in 13 batches...



Test batches:   0%|          | 0/13 [00:00<?, ?it/s]


Test features shape: (124047, 2)



## Prepare Data for Training

In [10]:
print("Preparing data for training...")
print()

X_train = train_features.drop(columns=['label']).values
y_train = train_features['label'].values

X_test = test_features.drop(columns=['label']).values
y_test = test_features['label'].values

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")
print()

print("Class distribution:")
print(f"  Train - Class 0: {(y_train == 0).sum():,} ({(y_train == 0).mean()*100:.2f}%)")
print(f"  Train - Class 1: {(y_train == 1).sum():,} ({(y_train == 1).mean()*100:.2f}%)")
print(f"  Test - Class 0: {(y_test == 0).sum():,} ({(y_test == 0).mean()*100:.2f}%)")
print(f"  Test - Class 1: {(y_test == 1).sum():,} ({(y_test == 1).mean()*100:.2f}%)")
print()

Preparing data for training...

X_train shape: (535953, 1)
y_train shape: (535953,)
X_test shape: (124047, 1)
y_test shape: (124047,)

Class distribution:
  Train - Class 0: 428,043 (79.87%)
  Train - Class 1: 107,910 (20.13%)
  Test - Class 0: 99,957 (80.58%)
  Test - Class 1: 24,090 (19.42%)



## Train Logistic Regression

Train logistic regression with class balancing.

In [11]:
print("TRAINING LOGISTIC REGRESSION")
print()

lr_model = LogisticRegression(
    max_iter=1000,
    random_state=RNG_SEED,
    class_weight='balanced',
    n_jobs=N_WORKERS
)

print("Training...")
lr_model.fit(X_train, y_train)
print("Training complete")
print()

print("Generating predictions...")
lr_train_probs = lr_model.predict_proba(X_train)[:, 1]
lr_test_probs = lr_model.predict_proba(X_test)[:, 1]
lr_test_preds = lr_model.predict(X_test)
print()

lr_train_auroc = roc_auc_score(y_train, lr_train_probs)
lr_train_auprc = average_precision_score(y_train, lr_train_probs)
lr_test_auroc = roc_auc_score(y_test, lr_test_probs)
lr_test_auprc = average_precision_score(y_test, lr_test_probs)

print("Logistic Regression Results:")
print(f"  Train AUROC: {lr_train_auroc:.4f}")
print(f"  Train AUPRC: {lr_train_auprc:.4f}")
print(f"  Test AUROC: {lr_test_auroc:.4f}")
print(f"  Test AUPRC: {lr_test_auprc:.4f}")
print()

print("Classification Report (Test Set):")
print(classification_report(y_test, lr_test_preds, digits=4))
print()

TRAINING LOGISTIC REGRESSION

Training...
Training complete

Generating predictions...

Logistic Regression Results:
  Train AUROC: 0.6329
  Train AUPRC: 0.2950
  Test AUROC: 0.6455
  Test AUPRC: 0.3020

Classification Report (Test Set):
              precision    recall  f1-score   support

           0     0.8605    0.4498    0.5908     99957
           1     0.2340    0.6976    0.3505     24090

    accuracy                         0.4979    124047
   macro avg     0.5473    0.5737    0.4706    124047
weighted avg     0.7389    0.4979    0.5441    124047




## Train XGBoost

Train XGBoost with class balancing and hyperparameters tuned for imbalanced data.

In [12]:
print("TRAINING XGBOOST")
print()

pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

print(f"Class imbalance ratio: {pos_weight:.2f}")
print()

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    scale_pos_weight=pos_weight,
    random_state=RNG_SEED,
    tree_method='hist',
    eval_metric='logloss',
    n_jobs=N_WORKERS
)

print("Training...")
xgb_model.fit(X_train, y_train, verbose=False)
print("Training complete")
print()

print("Generating predictions...")
xgb_train_probs = xgb_model.predict_proba(X_train)[:, 1]
xgb_test_probs = xgb_model.predict_proba(X_test)[:, 1]
xgb_test_preds = xgb_model.predict(X_test)
print()

xgb_train_auroc = roc_auc_score(y_train, xgb_train_probs)
xgb_train_auprc = average_precision_score(y_train, xgb_train_probs)
xgb_test_auroc = roc_auc_score(y_test, xgb_test_probs)
xgb_test_auprc = average_precision_score(y_test, xgb_test_probs)

print("XGBoost Results:")
print(f"  Train AUROC: {xgb_train_auroc:.4f}")
print(f"  Train AUPRC: {xgb_train_auprc:.4f}")
print(f"  Test AUROC: {xgb_test_auroc:.4f}")
print(f"  Test AUPRC: {xgb_test_auprc:.4f}")
print()

print("Classification Report (Test Set):")
print(classification_report(y_test, xgb_test_preds, digits=4))
print()

TRAINING XGBOOST

Class imbalance ratio: 3.97

Training...
Training complete

Generating predictions...

XGBoost Results:
  Train AUROC: 0.7116
  Train AUPRC: 0.3527
  Test AUROC: 0.7163
  Test AUPRC: 0.3444

Classification Report (Test Set):
              precision    recall  f1-score   support

           0     0.8947    0.7169    0.7960     99957
           1     0.3562    0.6499    0.4602     24090

    accuracy                         0.7039    124047
   macro avg     0.6255    0.6834    0.6281    124047
weighted avg     0.7901    0.7039    0.7308    124047




## Feature Importance Analysis

Analyze which features are most important for XGBoost predictions.

In [13]:
print("Feature importance analysis...")
print()

feature_names = train_features.drop(columns=['label']).columns.tolist()
feature_importance = xgb_model.feature_importances_

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("Top 20 most important features:")
print(importance_df.head(20).to_string(index=False))
print()

Feature importance analysis...

Top 20 most important features:
    feature  importance
assess_dist         1.0



## Ranking Metrics

Evaluate models on ranking metrics (Recall@K, NDCG@K).
These metrics measure how well models rank complementary students.

In [14]:
def compute_ranking_metrics(test_pairs_df, scores, k_values=[1, 5, 10, 20]):
    """
    Compute ranking metrics for recommendation task.
    
    For each query student, rank candidates by score and compute:
    - Recall@K: Fraction of true complements in top-K
    - NDCG@K: Normalized Discounted Cumulative Gain
    """
    results = {k: {'recall': [], 'ndcg': []} for k in k_values}
    
    test_pairs_with_scores = test_pairs_df.copy()
    test_pairs_with_scores['score'] = scores
    
    query_groups = test_pairs_with_scores[test_pairs_with_scores['label'] == 1].groupby('id_i')
    
    eval_count = 0
    
    for query_sid, group in tqdm(query_groups, desc="Computing ranking metrics"):
        true_complements = set(group['id_j'].tolist())
        
        query_candidates = test_pairs_with_scores[
            test_pairs_with_scores['id_i'] == query_sid
        ].copy()
        
        if len(query_candidates) < 2:
            continue
        
        query_candidates = query_candidates.sort_values('score', ascending=False)
        ranked_ids = query_candidates['id_j'].tolist()
        
        for k in k_values:
            if k > len(ranked_ids):
                continue
            
            top_k = set(ranked_ids[:k])
            
            recall = len(top_k & true_complements) / len(true_complements) if true_complements else 0
            results[k]['recall'].append(recall)
            
            relevance = [1 if sid in true_complements else 0 for sid in ranked_ids[:k]]
            dcg = sum([rel / np.log2(i + 2) for i, rel in enumerate(relevance)])
            idcg = sum([1 / np.log2(i + 2) for i in range(min(k, len(true_complements)))])
            ndcg = dcg / idcg if idcg > 0 else 0
            results[k]['ndcg'].append(ndcg)
        
        eval_count += 1
    
    print(f"  Evaluated {eval_count} queries")
    print()
    
    for k in k_values:
        if len(results[k]['recall']) > 0:
            print(f"  Recall@{k}: {np.mean(results[k]['recall']):.4f} (±{np.std(results[k]['recall']):.4f})")
            print(f"  NDCG@{k}: {np.mean(results[k]['ndcg']):.4f} (±{np.std(results[k]['ndcg']):.4f})")
    
    return results

print("Ranking evaluation functions defined")
print()

Ranking evaluation functions defined



### Logistic Regression Ranking Performance

In [15]:
print("LOGISTIC REGRESSION RANKING METRICS")
print()

lr_ranking_results = compute_ranking_metrics(test_pairs, lr_test_probs)
print()

LOGISTIC REGRESSION RANKING METRICS



Computing ranking metrics:   0%|          | 0/9528 [00:00<?, ?it/s]

  Evaluated 8227 queries

  Recall@1: 0.3371 (±0.4016)
  NDCG@1: 0.5332 (±0.4989)
  Recall@5: 0.6470 (±0.3817)
  NDCG@5: 0.5686 (±0.3246)
  Recall@10: 0.6574 (±0.3407)
  NDCG@10: 0.5281 (±0.2864)
  Recall@20: 0.7832 (±0.2886)
  NDCG@20: 0.5500 (±0.2469)



### XGBoost Ranking Performance

In [16]:
print("XGBOOST RANKING METRICS")
print()

xgb_ranking_results = compute_ranking_metrics(test_pairs, xgb_test_probs)
print()

XGBOOST RANKING METRICS



Computing ranking metrics:   0%|          | 0/9528 [00:00<?, ?it/s]

  Evaluated 8227 queries

  Recall@1: 0.3453 (±0.4014)
  NDCG@1: 0.5517 (±0.4973)
  Recall@5: 0.6603 (±0.3745)
  NDCG@5: 0.5882 (±0.3261)
  Recall@10: 0.6833 (±0.3293)
  NDCG@10: 0.5556 (±0.2875)
  Recall@20: 0.7887 (±0.2776)
  NDCG@20: 0.5701 (±0.2536)



## Confusion Matrix Analysis

In [17]:
print("Confusion matrix analysis...")
print()

lr_cm = confusion_matrix(y_test, lr_test_preds)
xgb_cm = confusion_matrix(y_test, xgb_test_preds)

print("Logistic Regression Confusion Matrix:")
print(f"  TN: {lr_cm[0,0]:,}  FP: {lr_cm[0,1]:,}")
print(f"  FN: {lr_cm[1,0]:,}  TP: {lr_cm[1,1]:,}")
print()

print("XGBoost Confusion Matrix:")
print(f"  TN: {xgb_cm[0,0]:,}  FP: {xgb_cm[0,1]:,}")
print(f"  FN: {xgb_cm[1,0]:,}  TP: {xgb_cm[1,1]:,}")
print()

Confusion matrix analysis...

Logistic Regression Confusion Matrix:
  TN: 44,961  FP: 54,996
  FN: 7,286  TP: 16,804

XGBoost Confusion Matrix:
  TN: 71,660  FP: 28,297
  FN: 8,433  TP: 15,657



## Save Models and Metrics

In [18]:
print("Saving models...")
print()

with open(MODELS_DIR / 'logistic_regression_baseline.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

with open(MODELS_DIR / 'xgboost_baseline.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

print("Models saved:")
print(f"  {MODELS_DIR / 'logistic_regression_baseline.pkl'}")
print(f"  {MODELS_DIR / 'xgboost_baseline.pkl'}")
print()

# Cell 18: Save metrics
print("Saving metrics...")
print()

classification_metrics = {
    'logistic_regression': {
        'train_auroc': float(lr_train_auroc),
        'train_auprc': float(lr_train_auprc),
        'test_auroc': float(lr_test_auroc),
        'test_auprc': float(lr_test_auprc),
        'confusion_matrix': lr_cm.tolist(),
        'test_predictions': lr_test_preds.tolist(),
        'test_probabilities': lr_test_probs.tolist()
    },
    'xgboost': {
        'train_auroc': float(xgb_train_auroc),
        'train_auprc': float(xgb_train_auprc),
        'test_auroc': float(xgb_test_auroc),
        'test_auprc': float(xgb_test_auprc),
        'confusion_matrix': xgb_cm.tolist(),
        'test_predictions': xgb_test_preds.tolist(),
        'test_probabilities': xgb_test_probs.tolist(),
        'feature_importance': importance_df.to_dict('records')
    }
}

with open(RESULTS_METRICS_DIR / 'baseline_classification_metrics.pkl', 'wb') as f:
    pickle.dump(classification_metrics, f)

ranking_metrics = {
    'logistic_regression': {
        k: {
            'recall_mean': float(np.mean(lr_ranking_results[k]['recall'])) if lr_ranking_results[k]['recall'] else 0,
            'recall_std': float(np.std(lr_ranking_results[k]['recall'])) if lr_ranking_results[k]['recall'] else 0,
            'ndcg_mean': float(np.mean(lr_ranking_results[k]['ndcg'])) if lr_ranking_results[k]['ndcg'] else 0,
            'ndcg_std': float(np.std(lr_ranking_results[k]['ndcg'])) if lr_ranking_results[k]['ndcg'] else 0
        }
        for k in [1, 5, 10, 20]
    },
    'xgboost': {
        k: {
            'recall_mean': float(np.mean(xgb_ranking_results[k]['recall'])) if xgb_ranking_results[k]['recall'] else 0,
            'recall_std': float(np.std(xgb_ranking_results[k]['recall'])) if xgb_ranking_results[k]['recall'] else 0,
            'ndcg_mean': float(np.mean(xgb_ranking_results[k]['ndcg'])) if xgb_ranking_results[k]['ndcg'] else 0,
            'ndcg_std': float(np.std(xgb_ranking_results[k]['ndcg'])) if xgb_ranking_results[k]['ndcg'] else 0
        }
        for k in [1, 5, 10, 20]
    }
}

with open(RESULTS_METRICS_DIR / 'baseline_ranking_metrics.pkl', 'wb') as f:
    pickle.dump(ranking_metrics, f)

print("Metrics saved:")
print(f"  {RESULTS_METRICS_DIR / 'baseline_classification_metrics.pkl'}")
print(f"  {RESULTS_METRICS_DIR / 'baseline_ranking_metrics.pkl'}")
print()

Saving models...

Models saved:
  ../670-Project/models/checkpoints/logistic_regression_baseline.pkl
  ../670-Project/models/checkpoints/xgboost_baseline.pkl

Saving metrics...

Metrics saved:
  ../670-Project/results/metrics/baseline_classification_metrics.pkl
  ../670-Project/results/metrics/baseline_ranking_metrics.pkl



## Summary

In [19]:
print("BASELINE MODELS COMPLETE")
print()

print("Models trained:")
print("  1. Logistic Regression")
print("  2. XGBoost")
print()

print("Classification Performance (Test Set):")
print(f"  Logistic Regression - AUROC: {lr_test_auroc:.4f}, AUPRC: {lr_test_auprc:.4f}")
print(f"  XGBoost            - AUROC: {xgb_test_auroc:.4f}, AUPRC: {xgb_test_auprc:.4f}")
print()

print("Ranking Performance (Test Set):")
print(f"  Logistic Regression - Recall@10: {np.mean(lr_ranking_results[10]['recall']):.4f}")
print(f"  XGBoost            - Recall@10: {np.mean(xgb_ranking_results[10]['recall']):.4f}")
print()

if xgb_test_auroc > lr_test_auroc:
    print("Best baseline model: XGBoost")
    improvement = (xgb_test_auroc - lr_test_auroc) / lr_test_auroc * 100
    print(f"  Improvement over LR: +{improvement:.2f}%")
else:
    print("Best baseline model: Logistic Regression")
print()

BASELINE MODELS COMPLETE

Models trained:
  1. Logistic Regression
  2. XGBoost

Classification Performance (Test Set):
  Logistic Regression - AUROC: 0.6455, AUPRC: 0.3020
  XGBoost            - AUROC: 0.7163, AUPRC: 0.3444

Ranking Performance (Test Set):
  Logistic Regression - Recall@10: 0.6574
  XGBoost            - Recall@10: 0.6833

Best baseline model: XGBoost
  Improvement over LR: +10.96%

