In [1]:
# Cell 1: Setup
# Make the project's source directory importable. The path is relative, so it
# resolves against the current working directory -- presumably the notebook's
# own folder; confirm if the kernel is started from somewhere else.
import sys
sys.path.append('../src')

# Third-party analysis/plotting libraries plus pathlib. Only seaborn is used in
# the cells visible here; the others may serve cells outside this excerpt.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import preprocessing functions
# NOTE: this must stay below the sys.path.append call above; `preprocessing`
# is presumably a module under ../src -- verify.
from preprocessing import (
    load_raw_data, 
    clean_data, 
    compute_student_features,
    create_topic_vectors,
    save_processed_data
)

# Select seaborn's "whitegrid" style for plots drawn after this point.
sns.set_style("whitegrid")

# Cell 2: Load selected courses
# The file holds one course code per line. Surrounding whitespace is stripped
# and blank lines are skipped, so a trailing empty line (or a spacer line)
# never ends up as an empty-string "course" in the filter passed downstream.
with open('../data/selected_courses.txt', 'r', encoding='utf-8') as f:
    selected_courses = [code for code in (line.strip() for line in f) if code]

print(f"Selected courses: {selected_courses}")

# Cell 3: Run preprocessing
# Input and output locations, relative to the current working directory.
RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"

# Banner: blank line, rule, title, rule, blank line.
rule = "=" * 60
print("", rule, "RUNNING PREPROCESSING PIPELINE", rule, "", sep="\n")

# Stage 1: read the raw datasets, then clean them.
data = load_raw_data(RAW_DIR)
data = clean_data(data)

# Stage 2: per-student features, restricted to the courses chosen earlier.
df_features = compute_student_features(
    data,
    course_filter=selected_courses,
)

# Stage 3: topic vectors built from the cleaned data and the feature table,
# using 5 topics.
topic_vectors = create_topic_vectors(
    data,
    df_features,
    n_topics=5,
)

# Stage 4: persist the feature table and topic vectors.
save_processed_data(df_features, topic_vectors, PROCESSED_DIR)

print("\n✓ Preprocessing complete!")

Selected courses: ['AAA', 'GGG', 'CCC']

RUNNING PREPROCESSING PIPELINE

Loading raw OULAD data...
✓ Loaded 7 datasets
Cleaning data...
  Removed 787170 duplicates from student_vle
✓ Data cleaned
Computing student features...
  Filtering to courses: ['AAA', 'GGG', 'CCC']
  Processing 7488 students...
    Processed 500/7488 students...
    Processed 1000/7488 students...
    Processed 1500/7488 students...
    Processed 2000/7488 students...
    Processed 2500/7488 students...
    Processed 3000/7488 students...
    Processed 3500/7488 students...
    Processed 4000/7488 students...
    Processed 4500/7488 students...
    Processed 5000/7488 students...
    Processed 5500/7488 students...
    Processed 6000/7488 students...
    Processed 6500/7488 students...
    Processed 7000/7488 students...
✓ Computed features for 7488 students
Creating topic vectors with 5 topics...
  Found 3 assessment types: ['TMA' 'CMA' 'Exam']
✓ Created topic vectors: shape (7488, 5)
Saving processed data...
  