# Peer Recommendation System

Course: SI 670: Applied Machine Learning

Name : Yuganshi Agrawal  
uniqname: yuganshi

Name : Sai Sneha Siddapura Venkataramappa  
uniqname: saisneha

### Notebook 01: Data Preparation

This notebook loads, cleans, and validates the OULAD dataset.

**Inputs:**
- Raw OULAD CSV files from `data/raw/`

**Outputs:**
- Cleaned datasets saved to `data/processed/`
- Train/holdout split of student IDs (`student_split.pkl`)
- Data quality report (`data_quality_report.pkl`)

## Setup

In [1]:
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
if 'CUDA_VISIBLE_DEVICES' not in os.environ:
    os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2'

import pickle
import numpy as np
import pandas as pd
import random
import torch
import multiprocessing as mp
from pathlib import Path
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor
import warnings
warnings.filterwarnings('ignore')

# Seed every RNG used by the project so stochastic steps are repeatable.
RNG_SEED = 42
np.random.seed(RNG_SEED)
random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

# Hardware detection: report CPUs/GPUs and choose a worker count.
N_CPU = mp.cpu_count()
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_GPU = 0

print("Hardware Configuration:")
print(f"  CPUs available: {N_CPU}")
print(f"  Device: {DEVICE}")

if DEVICE == 'cuda':
    try:
        N_GPU = torch.cuda.device_count()
        print(f"  GPUs accessible: {N_GPU}")

        for i in range(N_GPU):
            # Probe each device separately so one broken GPU does not hide the others.
            try:
                name = torch.cuda.get_device_name(i)
                print(f"    GPU {i}: {name}")
            except Exception as e:
                # Error text is truncated to keep the report to one line per GPU.
                print(f"    GPU {i}: Error - {str(e)[:50]}")

    except Exception as e:
        print(f"  GPU detection failed: {e}")
        print("  Falling back to CPU")
        DEVICE = 'cpu'
        N_GPU = 0

print()

# Worker policy: at most 8 (and at most half the cores) when CUDA is the
# device, otherwise all but two cores. Both branches are clamped to >= 1 so a
# single-core host never ends up with zero workers.
if DEVICE == 'cuda':
    N_WORKERS = max(1, min(8, N_CPU // 2))
else:
    N_WORKERS = max(1, N_CPU - 2)

print("Parallel Configuration:")
print(f"  Workers: {N_WORKERS}")

Hardware Configuration:
  CPUs available: 32
  Device: cuda
  GPUs accessible: 4
    GPU 0: Error - CUDA call failed lazily at initialization with err
    GPU 1: Error - CUDA call failed lazily at initialization with err
    GPU 2: Error - CUDA call failed lazily at initialization with err
    GPU 3: Error - CUDA call failed lazily at initialization with err

Parallel Configuration:
  Workers: 8



## Directory Setup

In [3]:
# Project layout: every location is derived from a single root directory.
BASE_DIR = Path('../670-Project')
DATA_RAW_DIR = BASE_DIR.joinpath('data', 'raw')
DATA_PROCESSED_DIR = BASE_DIR.joinpath('data', 'processed')
DATA_FEATURES_DIR = BASE_DIR.joinpath('data', 'features')
MODELS_DIR = BASE_DIR.joinpath('models')
RESULTS_DIR = BASE_DIR.joinpath('results')

# Create the output directories (the raw-data directory is not created here).
for output_dir in (
    DATA_PROCESSED_DIR,
    DATA_FEATURES_DIR,
    MODELS_DIR / 'checkpoints',
    MODELS_DIR / 'embeddings',
    RESULTS_DIR / 'metrics',
    RESULTS_DIR / 'predictions',
    RESULTS_DIR / 'analysis',
):
    output_dir.mkdir(parents=True, exist_ok=True)

print("Directory structure created")
for label, location in (
    ("Raw data", DATA_RAW_DIR),
    ("Processed data", DATA_PROCESSED_DIR),
    ("Features", DATA_FEATURES_DIR),
    ("Models", MODELS_DIR),
    ("Results", RESULTS_DIR),
):
    print(f"  {label}: {location}")
print()

Directory structure created
  Raw data: ../670-Project/data/raw
  Processed data: ../670-Project/data/processed
  Features: ../670-Project/data/features
  Models: ../670-Project/models
  Results: ../670-Project/results



## Configuration

In [4]:
# Fraction of unique students set aside as the holdout set.
HOLDOUT_STUDENT_FRAC = 0.10

# CSV files that must be present in the raw-data directory.
EXPECTED_FILES = [
    f'{table}.csv'
    for table in (
        'studentInfo',
        'studentVle',
        'vle',
        'assessments',
        'studentAssessment',
        'courses',
        'studentRegistration',
    )
]

print(
    "Configuration:",
    f"  Holdout fraction: {HOLDOUT_STUDENT_FRAC}",
    f"  Random seed: {RNG_SEED}",
    "",
    sep="\n",
)

Configuration:
  Holdout fraction: 0.1
  Random seed: 42



## Data Loading

Load all OULAD CSV files with validation.

In [5]:
print("Checking for required files...")

# Probe each expected CSV once and remember whether it is on disk.
file_presence = [
    (filename, (DATA_RAW_DIR / filename).exists())
    for filename in EXPECTED_FILES
]

for filename, is_present in file_presence:
    if is_present:
        print(f"  Found: {filename}")

missing_files = [filename for filename, is_present in file_presence if not is_present]

# Stop the notebook here rather than failing later inside read_csv.
if len(missing_files) > 0:
    print(f"\nERROR: Missing files: {missing_files}")
    print(f"Please place OULAD CSV files in: {DATA_RAW_DIR}")
    raise FileNotFoundError(f"Missing required files: {missing_files}")

print("\nAll required files found")
print()

Checking for required files...
  Found: studentInfo.csv
  Found: studentVle.csv
  Found: vle.csv
  Found: assessments.csv
  Found: studentAssessment.csv
  Found: courses.csv
  Found: studentRegistration.csv

All required files found



In [6]:
# Imported once per cell run instead of on every loop iteration.
import time

print("Loading datasets...")

# datasets: table name (file stem) -> DataFrame as read from disk.
# load_times: table name -> seconds spent in read_csv.
datasets = {}
load_times = {}

for filename in tqdm(EXPECTED_FILES, desc="Loading CSVs"):
    name = filename.replace('.csv', '')
    filepath = DATA_RAW_DIR / filename

    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    datasets[name] = pd.read_csv(filepath)
    load_times[name] = time.perf_counter() - start

# Convenience aliases; these refer to the same objects stored in `datasets`.
student_info = datasets['studentInfo']
student_vle = datasets['studentVle']
vle = datasets['vle']
assessments = datasets['assessments']
student_assessment = datasets['studentAssessment']
courses = datasets['courses']
student_registration = datasets['studentRegistration']

print("\nLoad complete")
print()

Loading datasets...


Loading CSVs:   0%|          | 0/7 [00:00<?, ?it/s]


Load complete



## Data Overview

In [7]:
# Per-table shape and load time.
print("Dataset sizes:")
for name, df in datasets.items():
    n_rows, n_cols = df.shape
    print(f"  {name:25s}: {n_rows:>10,} rows, {n_cols:>3} columns (loaded in {load_times[name]:.2f}s)")
print()

# Headline counts across the student, VLE and assessment tables.
module_presentation_groups = student_info.groupby(['code_module', 'code_presentation'])
key_stat_lines = [
    f"  Unique students: {student_info['id_student'].nunique():,}",
    f"  Unique modules: {student_info['code_module'].nunique()}",
    f"  Unique presentations: {student_info['code_presentation'].nunique()}",
    f"  Module-presentation pairs: {module_presentation_groups.ngroups}",
    f"  Total VLE interactions: {len(student_vle):,}",
    f"  Total assessments: {len(assessments)}",
    f"  Total assessment submissions: {len(student_assessment):,}",
]

print("Key statistics:")
print("\n".join(key_stat_lines))
print()

Dataset sizes:
  studentInfo              :     32,593 rows,  12 columns (loaded in 0.04s)
  studentVle               : 10,655,280 rows,   6 columns (loaded in 4.08s)
  vle                      :      6,364 rows,   6 columns (loaded in 0.01s)
  assessments              :        206 rows,   6 columns (loaded in 0.00s)
  studentAssessment        :    173,912 rows,   5 columns (loaded in 0.06s)
  courses                  :         22 rows,   3 columns (loaded in 0.00s)
  studentRegistration      :     32,593 rows,   5 columns (loaded in 0.02s)

Key statistics:
  Unique students: 28,785
  Unique modules: 7
  Unique presentations: 4
  Module-presentation pairs: 22
  Total VLE interactions: 10,655,280
  Total assessments: 206
  Total assessment submissions: 173,912



## Data Quality Analysis

In [8]:
print("Missing value analysis:")
print()

# quality_report: table name -> dict of row/column counts and per-column
# missing-value counts and percentages.
quality_report = {}

# Tracks whether any table had a missing value, so the closing message does
# not need a second full isnull() pass over every dataset.
any_missing = False

for name, df in datasets.items():
    missing = df.isnull().sum()
    missing_pct = (missing / len(df) * 100).round(2)

    columns_with_missing = missing[missing > 0]
    if not columns_with_missing.empty:
        any_missing = True
        print(f"{name}:")
        for col, n_missing in columns_with_missing.items():
            print(f"  {col}: {n_missing:,} ({missing_pct[col]:.2f}%)")
        print()

    quality_report[name] = {
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_values': missing.to_dict(),
        'missing_percentages': missing_pct.to_dict()
    }

if not any_missing:
    print("No missing values found in any dataset")
    print()

Missing value analysis:

studentInfo:
  imd_band: 1,111 (3.41%)

vle:
  week_from: 5,243 (82.39%)
  week_to: 5,243 (82.39%)

assessments:
  date: 11 (5.34%)

studentAssessment:
  score: 173 (0.10%)

studentRegistration:
  date_registration: 45 (0.14%)
  date_unregistration: 22,521 (69.10%)



In [9]:
print("Duplicate analysis:")
print()

# Tracks whether any table had fully duplicated rows, so the closing message
# does not need to recompute df.duplicated() for every dataset.
any_duplicates = False

for name, df in datasets.items():
    n_duplicates = int(df.duplicated().sum())

    if n_duplicates > 0:
        any_duplicates = True
        print(f"{name}:")
        print(f"  Total duplicates: {n_duplicates:,} ({n_duplicates/len(df)*100:.2f}%)")
        print()

    # Requires the per-table entries created by the missing-value analysis cell.
    quality_report[name]['duplicate_rows'] = n_duplicates

if not any_duplicates:
    print("No duplicate rows found in any dataset")
    print()

# keep=False counts every row whose id_student appears more than once,
# not just the second and later occurrences.
duplicates_in_student_info = student_info.duplicated(subset=['id_student'], keep=False).sum()
if duplicates_in_student_info > 0:
    print(f"WARNING: studentInfo has {duplicates_in_student_info:,} duplicate student IDs")
    print("  This is expected due to multiple module enrollments")
    print()

quality_report['studentInfo']['duplicate_students'] = int(duplicates_in_student_info)

Duplicate analysis:

studentVle:
  Total duplicates: 787,170 (7.39%)

  This is expected due to multiple module enrollments



## Data Cleaning

Clean datasets and add derived columns.

In [10]:
print("Cleaning student_vle...")

# Derive a week index from the day offset in 'date' unless one already exists.
if 'week' in student_vle.columns:
    print("  'week' column already exists")
elif 'date' in student_vle.columns:
    # Floor division, so negative day offsets map to negative week numbers.
    student_vle['week'] = (student_vle['date'] // 7).astype(int)
    print("  Added 'week' column derived from 'date'")
else:
    # Previously this case reported that 'week' already existed and then
    # failed on the lookup below; fail here with an explicit message instead.
    raise KeyError("student_vle has neither a 'week' nor a 'date' column; cannot derive week numbers")

print(f"  Week range: {student_vle['week'].min()} to {student_vle['week'].max()}")
print()

Cleaning student_vle...
  Added 'week' column derived from 'date'
  Week range: -4 to 38



In [11]:
print("Cleaning student_assessment...")

# Work on a copy so the frame that was loaded is left untouched.
initial_count = len(student_assessment)
student_assessment_clean = student_assessment.copy()

if 'score' in student_assessment_clean.columns:
    score_column = student_assessment_clean['score']
    # Present scores that fall outside the closed interval [0, 100];
    # scores that are already missing are not flagged.
    invalid_scores = score_column.notna() & ~score_column.between(0, 100)
    n_invalid = invalid_scores.sum()

    if n_invalid == 0:
        print("  All scores within valid range [0, 100]")
    else:
        print(f"  Found {n_invalid:,} scores outside valid range [0, 100]")
        print(f"    Setting to NaN")
        student_assessment_clean.loc[invalid_scores, 'score'] = np.nan

# No rows are dropped in this cell; the counts are printed as a check.
final_count = len(student_assessment_clean)
print(f"  Rows retained: {final_count:,} / {initial_count:,}")
print()

student_assessment = student_assessment_clean

Cleaning student_assessment...
  All scores within valid range [0, 100]
  Rows retained: 173,912 / 173,912



In [12]:
print("Cleaning student_registration...")

# Work on a copy so the frame that was loaded is left untouched.
student_registration_clean = student_registration.copy()

# Missing values are replaced with sentinels: 0 for the registration date and
# 9999 for the unregistration date.
#
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on a selected column is chained in-place mutation:
# it is deprecated in pandas 2.x and leaves the DataFrame unchanged once
# Copy-on-Write is active, which would go unnoticed here because warnings are
# suppressed in the setup cell.
if 'date_registration' in student_registration_clean.columns:
    missing_reg = student_registration_clean['date_registration'].isnull().sum()
    if missing_reg > 0:
        print(f"  Filling {missing_reg:,} missing registration dates with 0")
        student_registration_clean['date_registration'] = (
            student_registration_clean['date_registration'].fillna(0)
        )

if 'date_unregistration' in student_registration_clean.columns:
    missing_unreg = student_registration_clean['date_unregistration'].isnull().sum()
    if missing_unreg > 0:
        print(f"  Filling {missing_unreg:,} missing unregistration dates with 9999")
        student_registration_clean['date_unregistration'] = (
            student_registration_clean['date_unregistration'].fillna(9999)
        )

print()

student_registration = student_registration_clean

Cleaning student_registration...
  Filling 45 missing registration dates with 0
  Filling 22,521 missing unregistration dates with 9999



## Train/Test Split

Split students into train and holdout sets for evaluation.

In [13]:
print("Creating train/test student split...")
print()

# Split on unique student IDs so that every row belonging to a student lands
# on the same side of the split.
all_students = student_info['id_student'].unique()
n_students = len(all_students)
n_holdout = int(n_students * HOLDOUT_STUDENT_FRAC)

print(f"Total students: {n_students:,}")
print(f"Holdout fraction: {HOLDOUT_STUDENT_FRAC}")
print(f"Holdout students: {n_holdout:,}")
print(f"Training students: {n_students - n_holdout:,}")
print()

# Draw from a generator seeded only for this split. Sampling from the global
# numpy RNG made the result depend on how many random draws had happened
# earlier in the session, so re-running this cell produced a different split.
# RandomState is the same generator family that np.random.seed configures.
split_rng = np.random.RandomState(RNG_SEED)
holdout_students = set(split_rng.choice(all_students, size=n_holdout, replace=False))
train_students = set(all_students) - holdout_students

# The two sets must partition the student population exactly.
assert train_students.isdisjoint(holdout_students), "train and holdout students overlap"
assert len(train_students) + len(holdout_students) == n_students, "split does not cover all students"

print(f"Verification:")
print(f"  Train set size: {len(train_students):,}")
print(f"  Holdout set size: {len(holdout_students):,}")
print(f"  Overlap: {len(train_students & holdout_students)}")
print()

# Bundle the split with the parameters that produced it.
student_split = {
    'train_students': train_students,
    'holdout_students': holdout_students,
    'n_train': len(train_students),
    'n_holdout': len(holdout_students),
    'holdout_fraction': HOLDOUT_STUDENT_FRAC,
    'random_seed': RNG_SEED
}

Creating train/test student split...

Total students: 28,785
Holdout fraction: 0.1
Holdout students: 2,878
Training students: 25,907

Verification:
  Train set size: 25,907
  Holdout set size: 2,878
  Overlap: 0



## Save Processed Data

In [14]:
print("Saving processed datasets...")
print()

# (output filename, object) pairs, written in this order.
save_jobs = list({
    'student_info_clean.pkl': student_info,
    'student_vle_clean.pkl': student_vle,
    'vle_clean.pkl': vle,
    'assessments_clean.pkl': assessments,
    'student_assessment_clean.pkl': student_assessment,
    'courses_clean.pkl': courses,
    'student_registration_clean.pkl': student_registration,
    'student_split.pkl': student_split,
    'data_quality_report.pkl': quality_report,
}.items())

# Pickle each object into the processed-data directory.
for filename, data in tqdm(save_jobs, desc="Saving files"):
    filepath = DATA_PROCESSED_DIR / filename
    with filepath.open('wb') as f:
        pickle.dump(data, f)

print("\nAll files saved to:", DATA_PROCESSED_DIR)
print()

# Report the on-disk size of every file just written.
BYTES_PER_MB = 1024 * 1024
print("Saved files:")
for filename, _ in save_jobs:
    filepath = DATA_PROCESSED_DIR / filename
    size_mb = filepath.stat().st_size / BYTES_PER_MB
    print(f"  {filename:40s} ({size_mb:.2f} MB)")

Saving processed datasets...



Saving files:   0%|          | 0/9 [00:00<?, ?it/s]


All files saved to: ../670-Project/data/processed

Saved files:
  student_info_clean.pkl                   (1.32 MB)
  student_vle_clean.pkl                    (447.16 MB)
  vle_clean.pkl                            (0.18 MB)
  assessments_clean.pkl                    (0.01 MB)
  student_assessment_clean.pkl             (6.64 MB)
  courses_clean.pkl                        (0.00 MB)
  student_registration_clean.pkl           (0.87 MB)
  student_split.pkl                        (0.52 MB)
  data_quality_report.pkl                  (0.00 MB)


## Summary

In [16]:
print("DATA PREPARATION COMPLETE")

# Row counts are read from `datasets`, the dict filled in the loading cell.
print("Processed datasets:")
for name, df in datasets.items():
    n_rows = len(df)
    print(f"  {name}: {n_rows:,} rows")
print()

print("Student split:")
print(f"  Training: {len(train_students):,} students")
print(f"  Holdout: {len(holdout_students):,} students")
print()

# Totals are aggregated from the quality report built during the analysis cells.
print("Data quality:")
total_missing = 0
for report in quality_report.values():
    total_missing += sum(report['missing_values'].values())
print(f"  Total missing values: {total_missing:,}")

datasets_with_duplicates = len([r for r in quality_report.values() if r['duplicate_rows'] > 0])
print(f"  Datasets with duplicates: {datasets_with_duplicates}")

DATA PREPARATION COMPLETE
Processed datasets:
  studentInfo: 32,593 rows
  studentVle: 10,655,280 rows
  vle: 6,364 rows
  assessments: 206 rows
  studentAssessment: 173,912 rows
  courses: 22 rows
  studentRegistration: 32,593 rows

Student split:
  Training: 25,907 students
  Holdout: 2,878 students

Data quality:
  Total missing values: 34,347
  Datasets with duplicates: 1
