# Peer Recommendation System

Course: SI 670: Applied Machine Learning

Name : Yuganshi Agrawal  
uniqname: yuganshi

Name : Sai Sneha Siddapura Venkataramappa  
uniqname: saisneha

### Notebook 03: Label Generation

This notebook creates complementarity labels for student pairs.

**Inputs:**
- `data/features/feature_matrix_scaled.pkl`
- `data/features/features_proc_unique.pkl`
- `data/processed/student_info_clean.pkl`
- `data/processed/student_split.pkl`

**Outputs:**
- `data/processed/student_pairs_train.pkl`
- `data/processed/student_pairs_test.pkl`
- `data/processed/complementarity_definition.json`
- `results/analysis/label_distribution.pkl`

In [1]:
import os

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'

import pickle
import json
import numpy as np
import pandas as pd
import random
import itertools
import multiprocessing as mp
from pathlib import Path
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torch
import torch.nn.functional as F

RNG_SEED = 42
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

# Cell 2: Hardware detection with robust error handling
N_CPU = mp.cpu_count()
DEVICE = 'cpu'  # Default to CPU
N_GPU = 0

print("Hardware Configuration:")
print(f"  CPUs available: {N_CPU}")

# Try to detect and initialize CUDA
try:
    if torch.cuda.is_available():
        N_GPU = torch.cuda.device_count()
        if N_GPU > 0:
            # Test that we can actually access GPU 0
            torch.cuda.set_device(0)
            test_tensor = torch.zeros(1).cuda()
            del test_tensor
            torch.cuda.empty_cache()
            
            DEVICE = 'cuda'
            print(f"  Device: cuda")
            print(f"  GPUs accessible: {N_GPU}")
            
            for i in range(N_GPU):
                try:
                    name = torch.cuda.get_device_name(i)
                    mem = torch.cuda.get_device_properties(i).total_memory / 1e9
                    print(f"    GPU {i}: {name} ({mem:.1f} GB)")
                except Exception as e:
                    print(f"    GPU {i}: Error accessing - {str(e)[:50]}")
        else:
            print(f"  Device: cpu (CUDA available but no GPUs detected)")
    else:
        print(f"  Device: cpu (CUDA not available)")
        
except Exception as e:
    print(f"  Device: cpu (CUDA initialization failed: {str(e)[:80]})")
    DEVICE = 'cpu'
    N_GPU = 0

print()

Hardware Configuration:
  CPUs available: 32
  Device: cpu (CUDA initialization failed: CUDA call failed lazily at initialization with error: device >= 0 && device < nu)



In [3]:
if DEVICE == 'cuda':
    N_WORKERS = min(4, max(1, N_CPU // 4))  # Fewer workers for GPU mode
else:
    N_WORKERS = max(1, min(N_CPU - 1, 16))  # More workers for CPU mode, cap at 16

print(f"Parallel Configuration:")
print(f"  Mode: {'GPU-accelerated' if DEVICE == 'cuda' else 'CPU multiprocessing'}")
print(f"  Workers: {N_WORKERS}")
print()

Parallel Configuration:
  Mode: CPU multiprocessing
  Workers: 16



## Directory Setup

In [4]:
BASE_DIR = Path('../670-Project')
DATA_PROCESSED_DIR = BASE_DIR / 'data' / 'processed'
DATA_FEATURES_DIR = BASE_DIR / 'data' / 'features'
RESULTS_ANALYSIS_DIR = BASE_DIR / 'results' / 'analysis'

RESULTS_ANALYSIS_DIR.mkdir(parents=True, exist_ok=True)

print("Directory structure:")
print(f"  Processed data: {DATA_PROCESSED_DIR}")
print(f"  Features: {DATA_FEATURES_DIR}")
print(f"  Analysis output: {RESULTS_ANALYSIS_DIR}")
print()

Directory structure:
  Processed data: ../670-Project/data/processed
  Features: ../670-Project/data/features
  Analysis output: ../670-Project/results/analysis



## Configuration

In [5]:
MAX_PAIRS_PER_MODULE = 30000
COMPLEMENTARITY_THRESHOLD_PERCENTILE = 0.80

print("Configuration:")
print(f"  Max pairs per module: {MAX_PAIRS_PER_MODULE:,}")
print(f"  Complementarity threshold: {COMPLEMENTARITY_THRESHOLD_PERCENTILE} percentile")
print(f"  Random seed: {RNG_SEED}")
print()

Configuration:
  Max pairs per module: 30,000
  Complementarity threshold: 0.8 percentile
  Random seed: 42



## Load Data

Load features and metadata from previous notebooks.

In [6]:
print("Loading data...")
print()

def load_pickle(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

feature_matrix_scaled = load_pickle(DATA_FEATURES_DIR / 'feature_matrix_scaled.pkl')
features_proc_unique = load_pickle(DATA_FEATURES_DIR / 'features_proc_unique.pkl')
student_info = load_pickle(DATA_PROCESSED_DIR / 'student_info_clean.pkl')
student_split = load_pickle(DATA_PROCESSED_DIR / 'student_split.pkl')
assessment_features = load_pickle(DATA_FEATURES_DIR / 'assessment_features.pkl')

assess_pivot_z = assessment_features['assess_pivot_z']

train_students = student_split['train_students']
holdout_students = student_split['holdout_students']

with open(DATA_FEATURES_DIR / 'feature_metadata.json', 'r') as f:
    feature_metadata = json.load(f)

print("Loaded data:")
print(f"  Feature matrix: {feature_matrix_scaled.shape}")
print(f"  Unique students: {len(features_proc_unique):,}")
print(f"  Assessment profiles: {assess_pivot_z.shape}")
print(f"  Training students: {len(train_students):,}")
print(f"  Holdout students: {len(holdout_students):,}")
print()

Loading data...

Loaded data:
  Feature matrix: (28785, 100)
  Unique students: 28,785
  Assessment profiles: (23351, 188)
  Training students: 25,907
  Holdout students: 2,878



## Complementarity Definition

Define complementarity as a combination of:
1. **Skill complementarity**: Moderate distance in assessment profiles
2. **Engagement complementarity**: Different activity patterns

Students are complementary when they have:
- Different skill strengths (can help each other)
- Different engagement patterns (can balance each other)

In [7]:
print("Preparing assessment embeddings for complementarity computation...")
print()

assess_cols = [c for c in features_proc_unique.columns if c.startswith('assess_emb_')]
print(f"Assessment embedding columns: {len(assess_cols)}")
print()

Preparing assessment embeddings for complementarity computation...

Assessment embedding columns: 48



In [8]:
def compute_complementarity_batch_gpu(pairs_batch, features_dict, assess_lookup, assess_tensor):
    batch_size = len(pairs_batch)
    comp_scores = np.zeros(batch_size)
    skill_comps = np.zeros(batch_size)
    engage_comps = np.zeros(batch_size)
    
    valid_mask = []
    indices_a = []
    indices_b = []
    
    for i, (sid_a, sid_b) in enumerate(pairs_batch):
        if sid_a in assess_lookup and sid_b in assess_lookup:
            indices_a.append(assess_lookup[sid_a])
            indices_b.append(assess_lookup[sid_b])
            valid_mask.append(i)
    
    if len(valid_mask) > 0:
        idx_a = torch.LongTensor(indices_a).cuda()
        idx_b = torch.LongTensor(indices_b).cuda()
        
        vecs_a = assess_tensor[idx_a]
        vecs_b = assess_tensor[idx_b]
        
        norm_a = F.normalize(vecs_a, dim=1)
        norm_b = F.normalize(vecs_b, dim=1)
        
        correlations = (norm_a * norm_b).sum(dim=1)
        
        distances = torch.sqrt(((norm_a - norm_b) ** 2).sum(dim=1) + 1e-8)
        distance_scores = torch.exp(-((distances - 0.7) ** 2) / 0.3)
        
        complement_patterns = torch.abs(norm_a - norm_b) * (norm_a + norm_b)
        complement_patterns = complement_patterns.mean(dim=1)
        
        skill_comp_batch = (
            0.40 * (1 - correlations) +
            0.35 * distance_scores +
            0.25 * complement_patterns
        ).cpu().numpy()
        
        skill_comps[valid_mask] = np.clip(skill_comp_batch, 0, 1)
    
    for i, (sid_a, sid_b) in enumerate(pairs_batch):
        if sid_a in features_dict and sid_b in features_dict:
            feat_a = features_dict[sid_a]
            feat_b = features_dict[sid_b]
            
            type_div_diff = abs(feat_a['type_diversity'] - feat_b['type_diversity'])
            type_div_score = type_div_diff / (feat_a['type_diversity'] + feat_b['type_diversity'] + 1)
            
            entropy_diff = abs(feat_a['type_entropy'] - feat_b['type_entropy'])
            entropy_score = entropy_diff / (feat_a['type_entropy'] + feat_b['type_entropy'] + 1)
            
            min_activity = min(feat_a['week_mean'], feat_b['week_mean'])
            activity_score = min(min_activity / 10, 1.0)
            
            engage_comps[i] = 0.35 * type_div_score + 0.35 * entropy_score + 0.30 * activity_score
    
    comp_scores = 0.60 * skill_comps + 0.40 * engage_comps
    
    return comp_scores, skill_comps, engage_comps


In [9]:
def compute_complementarity_batch_cpu(pairs_batch, features_dict, assess_array, assess_lookup):
    batch_size = len(pairs_batch)
    comp_scores = np.zeros(batch_size)
    skill_comps = np.zeros(batch_size)
    engage_comps = np.zeros(batch_size)
    
    valid_mask = []
    indices_a = []
    indices_b = []
    
    for i, (sid_a, sid_b) in enumerate(pairs_batch):
        if sid_a in assess_lookup and sid_b in assess_lookup:
            indices_a.append(assess_lookup[sid_a])
            indices_b.append(assess_lookup[sid_b])
            valid_mask.append(i)
    
    if len(valid_mask) > 0:
        vecs_a = assess_array[indices_a]
        vecs_b = assess_array[indices_b]
        
        # Normalize vectors
        norm_a = vecs_a / (np.linalg.norm(vecs_a, axis=1, keepdims=True) + 1e-8)
        norm_b = vecs_b / (np.linalg.norm(vecs_b, axis=1, keepdims=True) + 1e-8)
        
        # Compute correlations
        correlations = (norm_a * norm_b).sum(axis=1)
        
        # Compute distances
        distances = np.sqrt(((norm_a - norm_b) ** 2).sum(axis=1) + 1e-8)
        distance_scores = np.exp(-((distances - 0.7) ** 2) / 0.3)
        
        # Compute complement patterns
        complement_patterns = np.abs(norm_a - norm_b) * (norm_a + norm_b)
        complement_patterns = complement_patterns.mean(axis=1)
        
        # Combine skill components
        skill_comp_batch = (
            0.40 * (1 - correlations) +
            0.35 * distance_scores +
            0.25 * complement_patterns
        )
        
        skill_comps[valid_mask] = np.clip(skill_comp_batch, 0, 1)
    
    # Compute engagement complementarity
    for i, (sid_a, sid_b) in enumerate(pairs_batch):
        if sid_a in features_dict and sid_b in features_dict:
            feat_a = features_dict[sid_a]
            feat_b = features_dict[sid_b]
            
            type_div_diff = abs(feat_a['type_diversity'] - feat_b['type_diversity'])
            type_div_score = type_div_diff / (feat_a['type_diversity'] + feat_b['type_diversity'] + 1)
            
            entropy_diff = abs(feat_a['type_entropy'] - feat_b['type_entropy'])
            entropy_score = entropy_diff / (feat_a['type_entropy'] + feat_b['type_entropy'] + 1)
            
            min_activity = min(feat_a['week_mean'], feat_b['week_mean'])
            activity_score = min(min_activity / 10, 1.0)
            
            engage_comps[i] = 0.35 * type_div_score + 0.35 * entropy_score + 0.30 * activity_score
    
    comp_scores = 0.60 * skill_comps + 0.40 * engage_comps
    
    return comp_scores, skill_comps, engage_comps

In [10]:
def process_pair_chunk_cpu(args):
    """Worker function for multiprocessing on CPU."""
    pairs_chunk, features_dict, assess_array, assess_lookup = args
    return compute_complementarity_batch_cpu(pairs_chunk, features_dict, assess_array, assess_lookup)


print("Complementarity computation functions defined")
print(f"  Mode: {'GPU' if DEVICE == 'cuda' else 'CPU with multiprocessing'}")
print()


Complementarity computation functions defined
  Mode: CPU with multiprocessing



In [12]:
print("Preparing feature lookups...")
print()

assess_index = assess_pivot_z.index.tolist()
assess_lookup = {sid: idx for idx, sid in enumerate(assess_index)}

if DEVICE == 'cuda':
    try:
        assess_tensor = torch.FloatTensor(assess_pivot_z.values).cuda()
        assess_array = None
    except Exception as e:
        N_WORKERS = max(1, min(N_CPU - 1, 16))
        assess_tensor = None
        assess_array = assess_pivot_z.values.astype(np.float32)
else:
    assess_tensor = None
    assess_array = assess_pivot_z.values.astype(np.float32)

features_dict = {}
for _, row in features_proc_unique.iterrows():
    features_dict[row['id_student']] = {
        'type_diversity': row['type_diversity'],
        'type_entropy': row['type_entropy'],
        'week_mean': row['week_mean']
    }

print()
print(f"Assessment lookup: {len(assess_lookup):,} students")
print(f"Features dict: {len(features_dict):,} students")
print()

Preparing feature lookups...


Assessment lookup: 23,351 students
Features dict: 28,785 students



## Generate Student Pairs

Generate pairs within each module-presentation group.
Sample if exceeds maximum pairs per module.

In [14]:
print("Generating student pairs per module...")
print()

COMP_BATCH_SIZE = 5000 if DEVICE == 'cuda' else 2000

pair_data = []
grouped = features_proc_unique.groupby(['code_module', 'code_presentation'])

print(f"Processing {grouped.ngroups} module-presentation groups...")
print(f"  Batch size: {COMP_BATCH_SIZE}")
if DEVICE == 'cpu':
    print(f"  Parallel workers: {N_WORKERS}")
print()

for (mod, pres), group in tqdm(grouped, desc="Creating pairs"):
    group_dedup = group.drop_duplicates(subset=['id_student'], keep='first')
    students = group_dedup['id_student'].unique().tolist()
    
    if len(students) < 2:
        continue
    
    all_pairs = list(itertools.combinations(students, 2))
    
    if len(all_pairs) > MAX_PAIRS_PER_MODULE:
        all_pairs = random.sample(all_pairs, MAX_PAIRS_PER_MODULE)
    
    if DEVICE == 'cuda':
        # GPU processing - sequential batches
        for batch_start in range(0, len(all_pairs), COMP_BATCH_SIZE):
            batch_end = min(batch_start + COMP_BATCH_SIZE, len(all_pairs))
            batch_pairs = all_pairs[batch_start:batch_end]
            
            comp_scores, skill_comps, engage_comps = compute_complementarity_batch_gpu(
                batch_pairs, features_dict, assess_lookup, assess_tensor
            )
            
            for idx, (sid_a, sid_b) in enumerate(batch_pairs):
                pair_data.append({
                    'id_i': sid_a,
                    'id_j': sid_b,
                    'code_module': mod,
                    'code_presentation': pres,
                    'comp_score': comp_scores[idx],
                    'skill_comp': skill_comps[idx],
                    'engage_comp': engage_comps[idx]
                })
    else:
        # CPU multiprocessing - parallel batches
        # Split pairs into chunks for parallel processing
        chunks = []
        for batch_start in range(0, len(all_pairs), COMP_BATCH_SIZE):
            batch_end = min(batch_start + COMP_BATCH_SIZE, len(all_pairs))
            batch_pairs = all_pairs[batch_start:batch_end]
            chunks.append((batch_pairs, features_dict, assess_array, assess_lookup))
        
        # Process chunks in parallel
        with mp.Pool(processes=N_WORKERS) as pool:
            results = pool.map(process_pair_chunk_cpu, chunks)
        
        # Aggregate results
        chunk_idx = 0
        for batch_start in range(0, len(all_pairs), COMP_BATCH_SIZE):
            batch_end = min(batch_start + COMP_BATCH_SIZE, len(all_pairs))
            batch_pairs = all_pairs[batch_start:batch_end]
            
            comp_scores, skill_comps, engage_comps = results[chunk_idx]
            chunk_idx += 1
            
            for idx, (sid_a, sid_b) in enumerate(batch_pairs):
                pair_data.append({
                    'id_i': sid_a,
                    'id_j': sid_b,
                    'code_module': mod,
                    'code_presentation': pres,
                    'comp_score': comp_scores[idx],
                    'skill_comp': skill_comps[idx],
                    'engage_comp': engage_comps[idx]
                })

pairs_df = pd.DataFrame(pair_data)

print()
print(f"Generated {len(pairs_df):,} pairs")
print(f"  Across {grouped.ngroups} module-presentation groups")
print()

if DEVICE == 'cuda':
    torch.cuda.empty_cache()

Generating student pairs per module...

Processing 22 module-presentation groups...
  Batch size: 2000
  Parallel workers: 16



Creating pairs:   0%|          | 0/22 [00:00<?, ?it/s]


Generated 660,000 pairs
  Across 22 module-presentation groups



## Create Binary Labels

Convert complementarity scores to binary labels using percentile threshold.

In [15]:
print("Creating binary labels...")
print()

threshold = pairs_df['comp_score'].quantile(COMPLEMENTARITY_THRESHOLD_PERCENTILE)
pairs_df['label'] = (pairs_df['comp_score'] >= threshold).astype(int)

print(f"Complementarity threshold: {threshold:.4f}")
print()

print("Complementarity score statistics:")
print(f"  Mean: {pairs_df['comp_score'].mean():.4f}")
print(f"  Std: {pairs_df['comp_score'].std():.4f}")
print(f"  Min: {pairs_df['comp_score'].min():.4f}")
print(f"  Max: {pairs_df['comp_score'].max():.4f}")
print(f"  Median: {pairs_df['comp_score'].median():.4f}")
print()

print("Label distribution:")
label_counts = pairs_df['label'].value_counts().sort_index()
for label, count in label_counts.items():
    pct = count / len(pairs_df) * 100
    print(f"  Label {label}: {count:,} ({pct:.2f}%)")
print()

Creating binary labels...

Complementarity threshold: 0.3978

Complementarity score statistics:
  Mean: 1.1179
  Std: 717.8133
  Min: -21677.0041
  Max: 560557.2619
  Median: 0.2512

Label distribution:
  Label 0: 528,000 (80.00%)
  Label 1: 132,000 (20.00%)



## Split Pairs into Train/Test

Split based on student holdout:
- If either student in pair is in holdout set, pair goes to test
- Otherwise pair goes to train

In [16]:
print("Creating train/test split...")
print()

pairs_df['holdout'] = pairs_df.apply(
    lambda r: (r['id_i'] in holdout_students) or (r['id_j'] in holdout_students),
    axis=1
)

train_pairs = pairs_df[~pairs_df['holdout']].reset_index(drop=True)
test_pairs = pairs_df[pairs_df['holdout']].reset_index(drop=True)

print(f"Train pairs: {len(train_pairs):,}")
print(f"  Positive: {train_pairs['label'].sum():,} ({train_pairs['label'].mean()*100:.2f}%)")
print(f"  Negative: {(1-train_pairs['label']).sum():,} ({(1-train_pairs['label'].mean())*100:.2f}%)")
print()

print(f"Test pairs: {len(test_pairs):,}")
print(f"  Positive: {test_pairs['label'].sum():,} ({test_pairs['label'].mean()*100:.2f}%)")
print(f"  Negative: {(1-test_pairs['label']).sum():,} ({(1-test_pairs['label'].mean())*100:.2f}%)")
print()

train_pairs = train_pairs.drop(columns=['holdout'])
test_pairs = test_pairs.drop(columns=['holdout'])

Creating train/test split...

Train pairs: 535,953
  Positive: 107,910 (20.13%)
  Negative: 428,043 (79.87%)

Test pairs: 124,047
  Positive: 24,090 (19.42%)
  Negative: 99,957 (80.58%)



## Label Quality Analysis

Analyze the distribution and quality of generated labels.

In [17]:
print("Label quality analysis...")
print()

label_analysis = {
    'total_pairs': len(pairs_df),
    'train_pairs': len(train_pairs),
    'test_pairs': len(test_pairs),
    'positive_rate_train': float(train_pairs['label'].mean()),
    'positive_rate_test': float(test_pairs['label'].mean()),
    'threshold': float(threshold),
    'threshold_percentile': COMPLEMENTARITY_THRESHOLD_PERCENTILE,
    'comp_score_stats': {
        'mean': float(pairs_df['comp_score'].mean()),
        'std': float(pairs_df['comp_score'].std()),
        'min': float(pairs_df['comp_score'].min()),
        'max': float(pairs_df['comp_score'].max()),
        'median': float(pairs_df['comp_score'].median())
    },
    'skill_comp_stats': {
        'mean': float(pairs_df['skill_comp'].mean()),
        'std': float(pairs_df['skill_comp'].std())
    },
    'engage_comp_stats': {
        'mean': float(pairs_df['engage_comp'].mean()),
        'std': float(pairs_df['engage_comp'].std())
    }
}

print("Skill complementarity statistics:")
print(f"  Mean: {label_analysis['skill_comp_stats']['mean']:.4f}")
print(f"  Std: {label_analysis['skill_comp_stats']['std']:.4f}")
print()

print("Engagement complementarity statistics:")
print(f"  Mean: {label_analysis['engage_comp_stats']['mean']:.4f}")
print(f"  Std: {label_analysis['engage_comp_stats']['std']:.4f}")
print()

pairs_by_module = pairs_df.groupby(['code_module', 'code_presentation']).agg({
    'label': ['count', 'mean']
}).reset_index()
pairs_by_module.columns = ['code_module', 'code_presentation', 'n_pairs', 'positive_rate']

print("Pairs per module:")
print(pairs_by_module.to_string(index=False))
print()

label_analysis['pairs_by_module'] = pairs_by_module.to_dict('records')

Label quality analysis...

Skill complementarity statistics:
  Mean: 0.2305
  Std: 0.1984

Engagement complementarity statistics:
  Mean: 2.4490
  Std: 1794.5335

Pairs per module:
code_module code_presentation  n_pairs  positive_rate
        AAA             2013J    30000       0.374500
        AAA             2014J    30000       0.236633
        BBB             2013B    30000       0.175000
        BBB             2013J    30000       0.125567
        BBB             2014B    30000       0.160700
        BBB             2014J    30000       0.205233
        CCC             2014B    30000       0.368100
        CCC             2014J    30000       0.410000
        DDD             2013B    30000       0.160133
        DDD             2013J    30000       0.160967
        DDD             2014B    30000       0.184367
        DDD             2014J    30000       0.177267
        EEE             2013J    30000       0.154167
        EEE             2014B    30000       0.179133
        E

## Save Results

Save train/test pairs and metadata.

In [18]:
print("Saving pairs...")
print()

with open(DATA_PROCESSED_DIR / 'student_pairs_train.pkl', 'wb') as f:
    pickle.dump(train_pairs, f)

with open(DATA_PROCESSED_DIR / 'student_pairs_test.pkl', 'wb') as f:
    pickle.dump(test_pairs, f)

with open(RESULTS_ANALYSIS_DIR / 'label_distribution.pkl', 'wb') as f:
    pickle.dump(label_analysis, f)

complementarity_definition = {
    'skill_weight': 0.60,
    'engagement_weight': 0.40,
    'skill_components': {
        'correlation': 0.40,
        'distance': 0.35,
        'complement_pattern': 0.25
    },
    'engagement_components': {
        'type_diversity': 0.35,
        'entropy': 0.35,
        'activity_level': 0.30
    },
    'threshold_percentile': COMPLEMENTARITY_THRESHOLD_PERCENTILE,
    'threshold_value': float(threshold),
    'max_pairs_per_module': MAX_PAIRS_PER_MODULE,
    'random_seed': RNG_SEED
}

with open(DATA_PROCESSED_DIR / 'complementarity_definition.json', 'w') as f:
    json.dump(complementarity_definition, f, indent=2)

print("Saved files:")
print(f"  {DATA_PROCESSED_DIR / 'student_pairs_train.pkl'}")
print(f"  {DATA_PROCESSED_DIR / 'student_pairs_test.pkl'}")
print(f"  {DATA_PROCESSED_DIR / 'complementarity_definition.json'}")
print(f"  {RESULTS_ANALYSIS_DIR / 'label_distribution.pkl'}")
print()

sizes = [
    ('student_pairs_train.pkl', DATA_PROCESSED_DIR / 'student_pairs_train.pkl'),
    ('student_pairs_test.pkl', DATA_PROCESSED_DIR / 'student_pairs_test.pkl')
]

for name, path in sizes:
    size_mb = path.stat().st_size / (1024 * 1024)
    print(f"  {name:30s} ({size_mb:.2f} MB)")

print()

Saving pairs...

Saved files:
  ../670-Project/data/processed/student_pairs_train.pkl
  ../670-Project/data/processed/student_pairs_test.pkl
  ../670-Project/data/processed/complementarity_definition.json
  ../670-Project/results/analysis/label_distribution.pkl

  student_pairs_train.pkl        (26.58 MB)
  student_pairs_test.pkl         (6.15 MB)



## Summary

In [19]:
print("LABEL GENERATION COMPLETE")
print()

print("Pairs created:")
print(f"  Total: {len(pairs_df):,}")
print(f"  Train: {len(train_pairs):,} ({len(train_pairs)/len(pairs_df)*100:.1f}%)")
print(f"  Test: {len(test_pairs):,} ({len(test_pairs)/len(pairs_df)*100:.1f}%)")
print()

print("Label distribution:")
print(f"  Positive (complementary): {pairs_df['label'].sum():,} ({pairs_df['label'].mean()*100:.1f}%)")
print(f"  Negative: {(1-pairs_df['label']).sum():,} ({(1-pairs_df['label'].mean())*100:.1f}%)")
print()

print("Complementarity definition:")
print(f"  Skill weight: 60%")
print(f"  Engagement weight: 40%")
print(f"  Threshold: {COMPLEMENTARITY_THRESHOLD_PERCENTILE} percentile ({threshold:.4f})")
print()

LABEL GENERATION COMPLETE

Pairs created:
  Total: 660,000
  Train: 535,953 (81.2%)
  Test: 124,047 (18.8%)

Label distribution:
  Positive (complementary): 132,000 (20.0%)
  Negative: 528,000 (80.0%)

Complementarity definition:
  Skill weight: 60%
  Engagement weight: 40%
  Threshold: 0.8 percentile (0.3978)

