# Peer Recommendation System
Course: SI 670: Applied Machine Learning

Name : Yuganshi Agrawal    
uniqname: yuganshi

Name : Sai Sneha Siddapura Venkataramappa  
uniqname: saisneha

### Notebook 02: Feature Engineering

This notebook creates student-level features from interaction data.

**Inputs:**
- All cleaned files from `data/processed/`

**Outputs:**
- `data/features/demographic_features.pkl`
- `data/features/assessment_features.pkl`
- `data/features/engagement_features.pkl`
- `data/features/resource_features.pkl`
- `data/features/temporal_features.pkl`
- `data/features/feature_matrix_scaled.pkl`
- `data/features/feature_metadata.json`

In [1]:
# Setup
import os

os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'

import pickle
import json
import numpy as np
import pandas as pd
import random
import torch
import multiprocessing as mp
from pathlib import Path
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

RNG_SEED = 42
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)


<torch._C.Generator at 0x14ddb733e650>

In [2]:
# Cell 2: Hardware detection
N_CPU = mp.cpu_count()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_GPU = 0

print("Hardware Configuration:")
print(f"  CPUs available: {N_CPU}")
print(f"  Device: {DEVICE}")

if DEVICE == 'cuda':
    try:
        N_GPU = torch.cuda.device_count()
        print(f"  GPUs accessible: {N_GPU}")
        
        for i in range(N_GPU):
            try:
                name = torch.cuda.get_device_name(i)
                print(f"    GPU {i}: {name}")
            except Exception as e:
                print(f"    GPU {i}: Error - {str(e)[:50]}")
    
    except Exception as e:
        print(f"  GPU detection failed: {e}")
        print("  Falling back to CPU")
        DEVICE = 'cpu'
        N_GPU = 0

print()

if DEVICE == 'cuda':
    N_WORKERS = min(8, N_CPU // 2)
else:
    N_WORKERS = max(1, N_CPU - 2)

print(f"Parallel Configuration:")
print(f"  Workers: {N_WORKERS}")

Hardware Configuration:
  CPUs available: 32
  Device: cuda
  GPUs accessible: 4
    GPU 0: Error - CUDA call failed lazily at initialization with err
    GPU 1: Error - CUDA call failed lazily at initialization with err
    GPU 2: Error - CUDA call failed lazily at initialization with err
    GPU 3: Error - CUDA call failed lazily at initialization with err

Parallel Configuration:
  Workers: 8


## Directory Setup

In [3]:
BASE_DIR = Path('../670-Project')
DATA_PROCESSED_DIR = BASE_DIR / 'data' / 'processed'
DATA_FEATURES_DIR = BASE_DIR / 'data' / 'features'

DATA_FEATURES_DIR.mkdir(parents=True, exist_ok=True)

print("Directory structure:")
print(f"  Processed data: {DATA_PROCESSED_DIR}")
print(f"  Features output: {DATA_FEATURES_DIR}")
print()

Directory structure:
  Processed data: ../670-Project/data/processed
  Features output: ../670-Project/data/features



## Configuration

In [4]:
EMB_DIM = 48

print("Configuration:")
print(f"  PCA embedding dimension: {EMB_DIM}")
print(f"  Random seed: {RNG_SEED}")
print()

Configuration:
  PCA embedding dimension: 48
  Random seed: 42



## Load Processed Data

Load all cleaned datasets from notebook 01.

In [5]:
print("Loading processed datasets...")
print()

def load_pickle(filepath):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

data_files = {
    'student_info': 'student_info_clean.pkl',
    'student_vle': 'student_vle_clean.pkl',
    'vle': 'vle_clean.pkl',
    'assessments': 'assessments_clean.pkl',
    'student_assessment': 'student_assessment_clean.pkl',
    'courses': 'courses_clean.pkl',
    'student_registration': 'student_registration_clean.pkl',
    'student_split': 'student_split.pkl'
}

datasets = {}
for name, filename in tqdm(data_files.items(), desc="Loading files"):
    filepath = DATA_PROCESSED_DIR / filename
    datasets[name] = load_pickle(filepath)

student_info = datasets['student_info']
student_vle = datasets['student_vle']
vle = datasets['vle']
assessments = datasets['assessments']
student_assessment = datasets['student_assessment']
courses = datasets['courses']
student_registration = datasets['student_registration']
student_split = datasets['student_split']

train_students = student_split['train_students']
holdout_students = student_split['holdout_students']

print(f"\nLoaded {len(datasets)} datasets")
print(f"  Training students: {len(train_students):,}")
print(f"  Holdout students: {len(holdout_students):,}")
print()

Loading processed datasets...



Loading files:   0%|          | 0/8 [00:00<?, ?it/s]


Loaded 8 datasets
  Training students: 25,907
  Holdout students: 2,878



## Feature Engineering

### 1. Demographic Features

One-hot encode demographic variables.

In [6]:
print("Engineering demographic features...")
print()

demo = student_info[[
    'id_student', 'gender', 'region', 'highest_education', 
    'imd_band', 'age_band', 'num_of_prev_attempts', 
    'studied_credits', 'disability'
]].copy()

print(f"Initial shape: {demo.shape}")
print(f"Unique students: {demo['id_student'].nunique()}")
print()

demo_oh = pd.get_dummies(
    demo,
    columns=['gender', 'region', 'highest_education', 'imd_band', 'age_band', 'disability'],
    dummy_na=True,
    drop_first=False
).fillna(0)

print(f"After one-hot encoding: {demo_oh.shape}")
print(f"Categorical columns added: {demo_oh.shape[1] - 3}")
print()

demographic_features = demo_oh

print("Sample features:")
print(demo_oh.head(3))
print()

Engineering demographic features...

Initial shape: (32593, 9)
Unique students: 28785

After one-hot encoding: (32593, 44)
Categorical columns added: 41

Sample features:
   id_student  num_of_prev_attempts  studied_credits  gender_F  gender_M  \
0       11391                     0              240         0         1   
1       28400                     0               60         1         0   
2       30268                     0               60         1         0   

   gender_nan  region_East Anglian Region  region_East Midlands Region  \
0           0                           1                            0   
1           0                           0                            0   
2           0                           0                            0   

   region_Ireland  region_London Region  ...  imd_band_80-90%  \
0               0                     0  ...                0   
1               0                     0  ...                0   
2               0               

### 2. Assessment Features

Compute assessment-based features:
- Raw scores per assessment (pivot table)
- Score diversity (standard deviation across assessments)

In [7]:
print("Engineering assessment features...")
print()

assess_pivot = student_assessment.pivot_table(
    index='id_student',
    columns='id_assessment',
    values='score',
    aggfunc='mean'
).fillna(0)

print(f"Assessment pivot shape: {assess_pivot.shape}")
print(f"  Students: {len(assess_pivot)}")
print(f"  Assessments: {len(assess_pivot.columns)}")
print()

assess_pivot_z = assess_pivot.apply(
    lambda col: (col - col.mean()) / (col.std() + 1e-8),
    axis=0
).fillna(0)

assess_diversity = assess_pivot_z.std(axis=1).fillna(0).rename('assess_diversity')

print("Assessment diversity statistics:")
print(f"  Mean: {assess_diversity.mean():.4f}")
print(f"  Std: {assess_diversity.std():.4f}")
print(f"  Range: [{assess_diversity.min():.4f}, {assess_diversity.max():.4f}]")
print()

assessment_features = {
    'assess_pivot': assess_pivot,
    'assess_pivot_z': assess_pivot_z,
    'assess_diversity': assess_diversity
}

Engineering assessment features...

Assessment pivot shape: (23351, 188)
  Students: 23351
  Assessments: 188

Assessment diversity statistics:
  Mean: 0.9131
  Std: 0.3955
  Range: [0.0411, 2.4994]



### 3. Engagement Features

Compute weekly engagement patterns:
- Weekly click aggregation
- Mean, std, max clicks per week
- Temporal trend (slope)

In [8]:
print("Engineering engagement features...")
print()

vle_week = student_vle.groupby(['id_student', 'week'])['sum_click'].sum().reset_index()

week_pivot = vle_week.pivot(
    index='id_student',
    columns='week',
    values='sum_click'
).fillna(0)

print(f"Weekly engagement pivot: {week_pivot.shape}")
print(f"  Week range: {week_pivot.columns.min()} to {week_pivot.columns.max()}")
print()

week_mean = week_pivot.mean(axis=1).rename('week_mean')
week_std = week_pivot.std(axis=1).fillna(0).rename('week_std')
week_max = week_pivot.max(axis=1).rename('week_max')

def compute_trend(row):
    if len(row) > 1 and row.sum() > 0:
        x = np.arange(len(row))
        return np.polyfit(x, row.values, 1)[0]
    return 0.0

print("Computing temporal trends (using parallel processing)...")

week_trend = week_pivot.apply(compute_trend, axis=1).rename('week_trend')

print()
print("Engagement statistics:")
print(f"  Mean clicks/week: {week_mean.mean():.2f} (±{week_mean.std():.2f})")
print(f"  Std clicks/week: {week_std.mean():.2f}")
print(f"  Max clicks/week: {week_max.mean():.2f}")
print(f"  Trend slope: {week_trend.mean():.4f}")
print()

engagement_features = pd.concat([
    week_mean,
    week_std,
    week_max,
    week_trend
], axis=1).reset_index()

Engineering engagement features...

Weekly engagement pivot: (26074, 43)
  Week range: -4 to 38

Computing temporal trends (using parallel processing)...

Engagement statistics:
  Mean clicks/week: 35.32 (±45.02)
  Std clicks/week: 48.52
  Max clicks/week: 222.85
  Trend slope: -0.7340



### 4. Resource Features

Compute resource usage patterns:
- Activity type diversity (number of unique types)
- Activity type entropy (distribution evenness)

In [9]:
print("Engineering resource features...")
print()

vle_types = vle[['id_site', 'activity_type']].drop_duplicates()
svt = student_vle.merge(vle_types, on='id_site', how='left')

type_counts = svt.groupby(['id_student', 'activity_type'])['sum_click'].sum().reset_index()

print(f"Activity type usage: {len(type_counts)} student-type pairs")
print()

type_diversity = type_counts.groupby('id_student')['activity_type'].nunique().rename('type_diversity')

def compute_entropy(group):
    counts = group['sum_click'].values
    total = counts.sum()
    if total == 0:
        return 0.0
    probs = counts / total
    return -np.sum(probs * np.log(probs + 1e-12))

type_entropy = type_counts.groupby('id_student').apply(compute_entropy).rename('type_entropy')

print("Resource usage statistics:")
print(f"  Avg type diversity: {type_diversity.mean():.2f} types per student")
print(f"  Avg entropy: {type_entropy.mean():.4f}")
print()

resource_features = pd.concat([
    type_diversity,
    type_entropy
], axis=1).reset_index()

Engineering resource features...

Activity type usage: 220871 student-type pairs

Resource usage statistics:
  Avg type diversity: 8.47 types per student
  Avg entropy: 1.4668



### 5. Temporal Features

Registration timing features:
- Registration date
- Unregistration date

In [10]:
print("Engineering temporal features...")
print()

reg = student_registration.groupby('id_student').agg(
    reg_date=('date_registration', 'min'),
    unreg_date=('date_unregistration', 'min')
).fillna({'reg_date': 0, 'unreg_date': 9999})

print("Registration statistics:")
print(f"  Earliest registration: {reg['reg_date'].min():.0f}")
print(f"  Latest registration: {reg['reg_date'].max():.0f}")
print(f"  Students with unregistration: {(reg['unreg_date'] < 9999).sum()}")
print()

temporal_features = reg.reset_index()

Engineering temporal features...

Registration statistics:
  Earliest registration: -322
  Latest registration: 101
  Students with unregistration: 9082



## Merge All Features

Combine all feature types into a single matrix.

In [11]:
print("Merging all features...")
print()

features = (demographic_features
    .merge(assess_diversity.reset_index(), on='id_student', how='left')
    .merge(engagement_features, on='id_student', how='left')
    .merge(resource_features, on='id_student', how='left')
    .merge(temporal_features, on='id_student', how='left')
).fillna(0)

module_map = student_info[['id_student', 'code_module', 'code_presentation']].drop_duplicates()
features = features.merge(module_map, on='id_student', how='left')

print(f"Combined features shape: {features.shape}")
print(f"  Students: {features['id_student'].nunique()}")
print(f"  Features per student: {features.shape[1] - 3}")
print()

print("Feature breakdown:")
print(f"  Demographic: {demographic_features.shape[1] - 1}")
print(f"  Assessment: 1 (diversity)")
print(f"  Engagement: 4 (mean, std, max, trend)")
print(f"  Resource: 2 (diversity, entropy)")
print(f"  Temporal: 2 (reg_date, unreg_date)")
print()

Merging all features...

Combined features shape: (40801, 55)
  Students: 28785
  Features per student: 52

Feature breakdown:
  Demographic: 43
  Assessment: 1 (diversity)
  Engagement: 4 (mean, std, max, trend)
  Resource: 2 (diversity, entropy)
  Temporal: 2 (reg_date, unreg_date)



## Module-Wise Normalization

Normalize features per module to account for module difficulty differences.
Process in parallel batches.

In [12]:
print("Normalizing features per module...")
print()

num_cols = [c for c in features.columns 
            if c not in ['id_student', 'code_module', 'code_presentation']]

def normalize_module_group(args):
    (mod, pres), group_data = args
    
    if len(group_data) < 2:
        return group_data
    
    group = group_data.copy()
    scaler = StandardScaler()
    scaled_vals = scaler.fit_transform(group[num_cols].values)
    group[num_cols] = scaled_vals
    
    ids = group['id_student'].values
    assess_sub = assess_pivot_z.reindex(ids).fillna(0)
    
    pca_n = min(EMB_DIM, max(2, len(group) - 1), assess_sub.shape[1])
    
    if pca_n >= 2:
        pca = PCA(n_components=pca_n, random_state=RNG_SEED)
        try:
            pca_emb = pca.fit_transform(assess_sub.values)
        except:
            pca_emb = np.zeros((len(group), pca_n))
    else:
        pca_emb = np.zeros((len(group), EMB_DIM))
    
    if pca_emb.shape[1] < EMB_DIM:
        pad = np.zeros((len(group), EMB_DIM - pca_emb.shape[1]))
        pca_emb = np.concatenate([pca_emb, pad], axis=1)
    
    emb_cols = [f'assess_emb_{i}' for i in range(EMB_DIM)]
    emb_df = pd.DataFrame(pca_emb[:, :EMB_DIM], index=group.index, columns=emb_cols)
    group = pd.concat([group.reset_index(drop=True), emb_df.reset_index(drop=True)], axis=1)
    
    return group

grouped = list(features.groupby(['code_module', 'code_presentation']))
print(f"Processing {len(grouped)} module-presentation groups...")
print()

features_norm_list = []

if N_WORKERS > 1:
    with ProcessPoolExecutor(max_workers=N_WORKERS) as executor:
        results = list(tqdm(
            executor.map(normalize_module_group, grouped),
            total=len(grouped),
            desc="Module normalization"
        ))
        features_norm_list = results
else:
    for group in tqdm(grouped, desc="Module normalization"):
        features_norm_list.append(normalize_module_group(group))

features_proc = pd.concat(features_norm_list, axis=0).reset_index(drop=True)

print(f"\nNormalized features shape: {features_proc.shape}")
print()

Normalizing features per module...

Processing 22 module-presentation groups...



Module normalization:   0%|          | 0/22 [00:00<?, ?it/s]


Normalized features shape: (40801, 103)



## Global Feature Matrix

Create final feature matrix:
1. Remove duplicate students (keep first occurrence)
2. Global PCA for final embedding

In [13]:
print("Creating global feature matrix...")
print()

features_proc_unique = features_proc.drop_duplicates(subset=['id_student'], keep='first')

print(f"Removed {len(features_proc) - len(features_proc_unique):,} duplicate student entries")
print(f"Unique students: {len(features_proc_unique):,}")
print()

node_feature_cols = num_cols + [f'assess_emb_{i}' for i in range(EMB_DIM)]
node_feature_matrix = features_proc_unique[['id_student'] + node_feature_cols].set_index('id_student').fillna(0)

print(f"Node feature matrix shape: {node_feature_matrix.shape}")
print()

print("Applying global PCA...")

global_pca = PCA(n_components=EMB_DIM, random_state=RNG_SEED)
node_features_global = global_pca.fit_transform(node_feature_matrix.values)

print(f"Global PCA explained variance: {global_pca.explained_variance_ratio_.sum():.4f}")
print()

node_feat_df = pd.DataFrame(
    node_features_global,
    index=node_feature_matrix.index,
    columns=[f'emb_{i}' for i in range(EMB_DIM)]
).reset_index()

print(f"Final embedding matrix: {node_feat_df.shape}")
print()

feature_matrix_scaled = node_feature_matrix.copy()

Creating global feature matrix...

Removed 12,016 duplicate student entries
Unique students: 28,785

Node feature matrix shape: (28785, 100)

Applying global PCA...
Global PCA explained variance: 0.9588

Final embedding matrix: (28785, 49)



## Save Features

Save all feature sets to pickle files.

In [14]:
print("Saving features...")
print()

save_jobs = [
    ('demographic_features.pkl', demographic_features),
    ('assessment_features.pkl', assessment_features),
    ('engagement_features.pkl', engagement_features),
    ('resource_features.pkl', resource_features),
    ('temporal_features.pkl', temporal_features),
    ('feature_matrix_scaled.pkl', feature_matrix_scaled),
    ('features_proc_unique.pkl', features_proc_unique)
]

for filename, data in tqdm(save_jobs, desc="Saving features"):
    filepath = DATA_FEATURES_DIR / filename
    with open(filepath, 'wb') as f:
        pickle.dump(data, f)

metadata = {
    'emb_dim': EMB_DIM,
    'num_students': len(features_proc_unique),
    'num_features': len(node_feature_cols),
    'pca_explained_variance': float(global_pca.explained_variance_ratio_.sum()),
    'feature_columns': node_feature_cols,
    'demographic_cols': [c for c in demographic_features.columns if c != 'id_student'],
    'random_seed': RNG_SEED
}

with open(DATA_FEATURES_DIR / 'feature_metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("\nAll features saved to:", DATA_FEATURES_DIR)
print()

print("Saved files:")
for filename, _ in save_jobs:
    filepath = DATA_FEATURES_DIR / filename
    size_mb = filepath.stat().st_size / (1024 * 1024)
    print(f"  {filename:40s} ({size_mb:.2f} MB)")

print(f"  {'feature_metadata.json':40s} ({(DATA_FEATURES_DIR / 'feature_metadata.json').stat().st_size / 1024:.2f} KB)")
print()

Saving features...



Saving features:   0%|          | 0/7 [00:00<?, ?it/s]


All features saved to: ../670-Project/data/features

Saved files:
  demographic_features.pkl                 (2.02 MB)
  assessment_features.pkl                  (67.52 MB)
  engagement_features.pkl                  (1.00 MB)
  resource_features.pkl                    (0.60 MB)
  temporal_features.pkl                    (0.66 MB)
  feature_matrix_scaled.pkl                (22.18 MB)
  features_proc_unique.pkl                 (22.51 MB)
  feature_metadata.json                    (3.57 KB)



## Summary

In [15]:
print("FEATURE ENGINEERING COMPLETE")
print()

print("Features created:")
print(f"  Demographic: {len([c for c in demographic_features.columns if c != 'id_student'])} features")
print(f"  Assessment: {len(assess_pivot.columns)} raw + 1 diversity")
print(f"  Engagement: 4 temporal features")
print(f"  Resource: 2 usage features")
print(f"  Temporal: 2 registration features")
print()

print(f"Final feature matrix:")
print(f"  Students: {len(features_proc_unique):,}")
print(f"  Features: {len(node_feature_cols)}")
print(f"  PCA embedding: {EMB_DIM}D")
print(f"  Explained variance: {global_pca.explained_variance_ratio_.sum():.4f}")
print()


FEATURE ENGINEERING COMPLETE

Features created:
  Demographic: 43 features
  Assessment: 188 raw + 1 diversity
  Engagement: 4 temporal features
  Resource: 2 usage features
  Temporal: 2 registration features

Final feature matrix:
  Students: 28,785
  Features: 100
  PCA embedding: 48D
  Explained variance: 0.9588

