In [61]:
import os
import warnings
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import HeteroData
warnings.filterwarnings('ignore')


In [62]:
# Directory that holds train_data.csv and test_data.csv. The default is the
# original machine-specific location; set the TEG_DATA_DIR environment variable
# to run the notebook on another machine without editing this cell.
DATA_DIR = Path(os.environ.get("TEG_DATA_DIR", r"C:\Users\pragy\cloud_architecture\neural_symbolic\data"))

train_data = pd.read_csv(DATA_DIR / "train_data.csv")
test_data = pd.read_csv(DATA_DIR / "test_data.csv")

In [63]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [64]:
train_data.columns

Index(['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time',
       'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used',
       'attempt_count', 'answer_given', 'problem_completed', 'correct',
       'next_correct', 'content_source', 'skills', 'problem_type',
       'tutoring_types', 'student_answer_count', 'mean_correct',
       'mean_time_on_task', 'class_id', 'account_creation_date',
       'started_problem_sets_count', 'completed_problem_sets_count',
       'started_skill_builders_count', 'mastered_skill_builders_count',
       'answered_problems_count', 'mean_problem_correctness',
       'mean_problem_time_on_task', 'mean_class_score'],
      dtype='object')

In [65]:
col = 'next_correct'
def function(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a view of ``df`` whose columns are reordered so ``col`` comes last.

    Args:
        df: input frame; it is not modified.
        col: name of the column to move to the final position.

    Returns:
        ``df`` indexed by the reordered column list (all other columns keep
        their relative order).
    """
    reordered = []
    for name in df.columns:
        if name != col:
            reordered.append(name)
    reordered.append(col)
    return df[reordered]

# Move the `next_correct` column (held in `col`) to the last position in both splits.
train_data = function(train_data,col)
test_data = function(test_data,col)

# Display the resulting column order of the training split.
train_data.columns

Index(['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time',
       'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used',
       'attempt_count', 'answer_given', 'problem_completed', 'correct',
       'content_source', 'skills', 'problem_type', 'tutoring_types',
       'student_answer_count', 'mean_correct', 'mean_time_on_task', 'class_id',
       'account_creation_date', 'started_problem_sets_count',
       'completed_problem_sets_count', 'started_skill_builders_count',
       'mastered_skill_builders_count', 'answered_problems_count',
       'mean_problem_correctness', 'mean_problem_time_on_task',
       'mean_class_score', 'next_correct'],
      dtype='object')

In [66]:
train_data.head(20)

Unnamed: 0,log_id,student_id,assignment_id,problem_id,start_time,time_on_task,answer_before_tutoring,fraction_of_hints_used,attempt_count,answer_given,...,account_creation_date,started_problem_sets_count,completed_problem_sets_count,started_skill_builders_count,mastered_skill_builders_count,answered_problems_count,mean_problem_correctness,mean_problem_time_on_task,mean_class_score,next_correct
0,16475574,2259,1426268,1694676,2021-05-18 17:03:26.559000+00:00,,,,0,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,True
1,16654914,2259,1443864,1631611,2021-05-28 17:09:41.198000+00:00,60.625,True,,1,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
2,16654914,2259,1443864,1631613,2021-05-28 17:12:38.346000+00:00,44.138,True,,1,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
3,16711054,2259,1445768,1632407,2021-06-04 17:25:57.552000+00:00,75.82,True,,2,True,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
4,16711054,2259,1445768,1632408,2021-06-04 17:27:14.550000+00:00,18.387,True,,2,True,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
5,16711054,2259,1445768,1632430,2021-06-04 17:27:41.921000+00:00,45.407,True,,1,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,True
6,16492425,2358,1426307,1762888,2021-05-19 12:24:11.169000+00:00,11.377,False,,1,True,...,2019-01-28 10:35:46.830000-05:00,4,4,0,0,21,0.1,28.96019,0.0,False
7,16492425,2358,1426307,1762889,2021-05-19 12:24:23.215000+00:00,5.911,False,,1,True,...,2019-01-28 10:35:46.830000-05:00,4,4,0,0,21,0.1,28.96019,0.0,False
8,16492425,2358,1426307,1762890,2021-05-19 12:24:30.011000+00:00,6.844,False,,1,True,...,2019-01-28 10:35:46.830000-05:00,4,4,0,0,21,0.1,28.96019,0.0,False
9,16492425,2358,1426307,1762891,2021-05-19 12:24:37.861000+00:00,7.862,False,,1,True,...,2019-01-28 10:35:46.830000-05:00,4,4,0,0,21,0.1,28.96019,0.0,False


In [67]:
!pip install torch torch-geometric



In [68]:
# Overall size and column layout of the training split.
print("Dataset shape: ", train_data.shape)
print("Column names: ", list(train_data.columns))

# Number of distinct students and problems in the training split.
for label, id_column in (("Unique students: ", 'student_id'), ("Unique problems: ", 'problem_id')):
    print(label, train_data[id_column].nunique())

Dataset shape:  (110474, 30)
Column names:  ['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time', 'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used', 'attempt_count', 'answer_given', 'problem_completed', 'correct', 'content_source', 'skills', 'problem_type', 'tutoring_types', 'student_answer_count', 'mean_correct', 'mean_time_on_task', 'class_id', 'account_creation_date', 'started_problem_sets_count', 'completed_problem_sets_count', 'started_skill_builders_count', 'mastered_skill_builders_count', 'answered_problems_count', 'mean_problem_correctness', 'mean_problem_time_on_task', 'mean_class_score', 'next_correct']
Unique students:  10000
Unique problems:  21777


### There are three node types
1. Student nodes = 10000 students
2. Problem nodes = 21777 problems 
3. Skill nodes = length of all_skills

### There are two edge types
1. student-problem edges 
2. problem-skill edges 

In [69]:
# Alias, not a copy: `train` and `train_data` refer to the same DataFrame object.
train = train_data

In [70]:
def analyze_graph_structure(df):
    """
    Print summary statistics for the interaction log and collect its skill names.

    Args:
        df: DataFrame with at least the columns ``student_id``, ``problem_id``
            and ``skills``. String cells of ``skills`` are split on commas;
            missing and non-string cells are ignored.

    Returns:
        list: the distinct, whitespace-stripped skill names in sorted order.
        Sorting keeps the order (and therefore any index derived from it)
        identical across kernel restarts; iterating a set of strings does not.
    """
    all_skills = set()
    for skills_str in df['skills'].dropna():
        if isinstance(skills_str, str):
            all_skills.update(s.strip() for s in skills_str.split(','))

    print("=== GRAPH STRUCTURE ANALYSIS ===")
    print(f"Total rows: {len(df)}")
    print(f"Unique students: {df['student_id'].nunique()}")
    print(f"Unique problems: {df['problem_id'].nunique()}")
    print(f"Unique skills: {len(all_skills)}")

    # Number of logged rows per student.
    attempts_per_student = df.groupby('student_id').size()
    print(f"\nStudent attempt distribution:")
    print(f"Min attempts: {attempts_per_student.min()}")
    print(f"Max attempts: {attempts_per_student.max()}")
    print(f"Mean attempts: {attempts_per_student.mean():.2f}")
    print(f"Median attempts: {attempts_per_student.median():.2f}")

    return sorted(all_skills)


In [71]:
def create_node_mappings(df, unique_skills):
    """
    Assign a contiguous integer index to every student, problem and skill.

    Students and problems are numbered in the order their ids first appear in
    ``df``; skills are numbered in the order of ``unique_skills``.

    Returns:
        tuple ``(student_to_idx, problem_to_idx, skill_to_idx)`` of dicts.
    """
    student_ids = df['student_id'].unique()
    student_to_idx = dict(zip(student_ids, range(len(student_ids))))

    problem_ids = df['problem_id'].unique()
    problem_to_idx = dict(zip(problem_ids, range(len(problem_ids))))

    skill_to_idx = {}
    for position, skill in enumerate(unique_skills):
        skill_to_idx[skill] = position

    print("=== NODE MAPPINGS CREATED ===")
    for label, mapping in (("Students", student_to_idx), ("Problems", problem_to_idx), ("Skills", skill_to_idx)):
        print(f"{label}: {len(mapping)} mappings")

    return student_to_idx, problem_to_idx, skill_to_idx

In [72]:
def create_adaptive_temporal_sequences(df, base_sequence_length=5, max_sequences_per_student=3, device='cuda'):
    """
    Build history windows of several lengths for every student with enough attempts.

    Rows are ordered by ``student_id`` and ``start_time``. A student with at least
    ``base_sequence_length + 1`` rows contributes up to ``max_sequences_per_student``
    windows. Every window ends just before the student's final row and differs
    only in how far back it reaches; the final row supplies the target.

    Args:
        df: interaction log with ``student_id``, ``start_time``, ``problem_id``
            and ``next_correct`` columns. Rows with a missing ``student_id`` are
            ignored.
        base_sequence_length: shortest history length to produce.
        max_sequences_per_student: maximum number of windows per student (>= 1).
        device: accepted for signature compatibility; not used here.

    Returns:
        list of dicts with keys ``sequence_id``, ``student_id``,
        ``sequence_length``, ``sequence_data`` (DataFrame copy of the history
        rows), ``target`` and ``prediction_problem_id``. Empty when no student
        has enough rows.

    Raises:
        ValueError: if ``max_sequences_per_student`` is smaller than 1.

    NOTE(review): ``target`` is the final row's ``next_correct`` while
    ``prediction_problem_id`` is that same row's ``problem_id``; if
    ``next_correct`` describes the following problem these refer to different
    problems - confirm the intended alignment.
    """
    if max_sequences_per_student < 1:
        raise ValueError("max_sequences_per_student must be at least 1")

    print("=== CREATING ADAPTIVE TEMPORAL SEQUENCES ===")

    # Sort by student and time
    df_sorted = df.sort_values(['student_id', 'start_time']).reset_index(drop=True)

    sequences = []
    student_stats = {
        'total_students': 0,
        'students_with_sequences': 0,
        'sequence_lengths': [],
        'students_by_attempts': defaultdict(int)
    }

    # groupby hands over each student's rows once instead of re-filtering the
    # whole frame per student; sort=False keeps the (already sorted) order.
    for student_id, student_data in df_sorted.groupby('student_id', sort=False):
        num_attempts = len(student_data)

        student_stats['total_students'] += 1
        student_stats['students_by_attempts'][num_attempts] += 1

        # Only create sequences if student has enough attempts
        if num_attempts < base_sequence_length + 1:
            continue
        student_stats['students_with_sequences'] += 1

        max_possible_length = num_attempts - 1  # the final row is reserved for the target

        if max_sequences_per_student == 1:
            # Single sequence, capped at twice the base length
            sequence_lengths = [min(max_possible_length, base_sequence_length * 2)]
        elif max_possible_length <= base_sequence_length:
            sequence_lengths = [max_possible_length]
        else:
            # Evenly spaced lengths from the base length upwards
            step = max(1, (max_possible_length - base_sequence_length) // (max_sequences_per_student - 1))
            sequence_lengths = list(range(base_sequence_length, max_possible_length + 1, step))
            sequence_lengths = sequence_lengths[:max_sequences_per_student]

            # Always include the maximum length
            if max_possible_length not in sequence_lengths:
                sequence_lengths[-1] = max_possible_length

        final_attempt = student_data.iloc[-1]
        for seq_len in sequence_lengths:
            sequences.append({
                'sequence_id': len(sequences),
                'student_id': student_id,
                'sequence_length': seq_len,
                # The seq_len rows immediately before the final row
                'sequence_data': student_data.iloc[-seq_len-1:-1].copy(),
                'target': final_attempt['next_correct'],
                'prediction_problem_id': final_attempt['problem_id']
            })
            student_stats['sequence_lengths'].append(seq_len)

    print(f"\n=== SEQUENCE CREATION SUMMARY ===")
    print(f"Total students: {student_stats['total_students']}")
    print(f"Students with sequences: {student_stats['students_with_sequences']}")
    print(f"Total sequences created: {len(sequences)}")
    if sequences:
        print(f"Average sequences per valid student: {len(sequences) / student_stats['students_with_sequences']:.2f}")
        print(f"Sequence length range: {min(student_stats['sequence_lengths'])}-{max(student_stats['sequence_lengths'])}")
    else:
        # Avoid dividing by zero / taking min() of an empty list
        print("No student has enough attempts to form a sequence")

    return sequences



In [73]:
def extract_temporal_node_features(sequences, df, student_to_idx, problem_to_idx, skill_to_idx, device='cuda'):
    """
    Build the feature tensors for sequence, problem and skill nodes.

    Args:
        sequences: list of dicts carrying ``sequence_data`` (DataFrame of history
            rows), ``sequence_length`` and ``student_id``.
        df: full interaction log. The first row found for a student / problem
            supplies that entity's static values.
        student_to_idx: accepted for signature compatibility; not read here.
        problem_to_idx: mapping whose key order defines the row order of the
            problem feature tensor.
        skill_to_idx: mapping whose key order defines the row order of the skill
            feature tensor.
        device: target device; ``'cuda'`` falls back to ``'cpu'`` when CUDA is
            not available.

    Returns:
        tuple ``(sequence_features, problem_features, skill_features)`` of
        float32 tensors. Non-empty results have 12, 6 and 5 columns respectively.
    """
    print("=== EXTRACTING TEMPORAL NODE FEATURES ===")

    if device == 'cuda' and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = 'cpu'

    # === SEQUENCE-BASED STUDENT FEATURES (Dynamic) ===
    # class_id of each student's first row, computed once instead of filtering
    # the whole frame for every sequence.
    first_class_id = df.drop_duplicates('student_id').set_index('student_id')['class_id'].to_dict()

    sequence_student_features = []

    for seq in sequences:
        sequence_data = seq['sequence_data']
        seq_len = seq['sequence_length']
        class_id = first_class_id[seq['student_id']]
        correct = sequence_data['correct']
        has_multiple_rows = len(sequence_data) > 1

        # Distinct skills touched within the sequence
        unique_skills = set()
        for skills_str in sequence_data['skills'].dropna():
            if isinstance(skills_str, str):
                unique_skills.update(s.strip() for s in skills_str.split(','))

        features = [
            # Static student features
            float(class_id) if pd.notna(class_id) else 0.0,
            float(seq_len),  # Sequence length as feature

            # Performance metrics within sequence (pandas means skip NaN)
            float(correct.mean()),
            float(sequence_data['time_on_task'].mean()),
            float(sequence_data['fraction_of_hints_used'].mean()),

            # Learning progression indicators
            float(correct.sum()),
            # Last minus first; converting each side first keeps this valid for boolean columns
            (float(correct.iloc[-1]) - float(correct.iloc[0])) if has_multiple_rows else 0.0,

            # Skill diversity
            float(len(unique_skills)),

            # Content source diversity
            float(sequence_data['content_source'].nunique()),

            # Performance consistency
            float(correct.std()) if has_multiple_rows else 0.0,

            # Additional temporal features
            float(sequence_data['attempt_count'].mean()) if 'attempt_count' in sequence_data.columns else 1.0,
            float((sequence_data['answer_given'] == True).sum()) if 'answer_given' in sequence_data.columns else 0.0,
        ]

        sequence_student_features.append(features)

    sequence_student_features = torch.tensor(sequence_student_features, dtype=torch.float32, device=device)
    print(f"Sequence-based student features: {sequence_student_features.shape} on {device}")

    # === PROBLEM FEATURES (Static but Enhanced) ===
    # Encoders are fitted on every row so the codes do not depend on which row
    # happens to be the first one for a problem.
    le_content = LabelEncoder()
    le_problem_type = LabelEncoder()
    df_copy = df.copy()
    df_copy['content_source_encoded'] = le_content.fit_transform(df_copy['content_source'].fillna('Unknown'))
    df_copy['problem_type_encoded'] = le_problem_type.fit_transform(df_copy['problem_type'].fillna('Unknown'))

    # First row per problem as a dict lookup, replacing one full-frame scan per problem.
    problem_columns = ['skills', 'student_answer_count', 'mean_correct', 'mean_time_on_task',
                       'content_source_encoded', 'problem_type_encoded']
    first_problem_rows = (
        df_copy.drop_duplicates('problem_id')
        .set_index('problem_id')[problem_columns]
        .to_dict('index')
    )

    problem_features_list = []
    for problem_id in problem_to_idx.keys():
        problem_data = first_problem_rows[problem_id]

        # Get skills for this problem
        problem_skills = []
        if pd.notna(problem_data['skills']):
            problem_skills = [s.strip() for s in str(problem_data['skills']).split(',')]

        features = [
            # Basic problem statistics, with fixed fallbacks for missing values
            float(problem_data['student_answer_count']) if pd.notna(problem_data['student_answer_count']) else 0.0,
            float(problem_data['mean_correct']) if pd.notna(problem_data['mean_correct']) else 0.5,
            float(problem_data['mean_time_on_task']) if pd.notna(problem_data['mean_time_on_task']) else 100.0,

            # Number of skills listed for the problem
            float(len(problem_skills)),

            # Encoded categorical features
            float(problem_data['content_source_encoded']),
            float(problem_data['problem_type_encoded']),
        ]

        problem_features_list.append(features)

    problem_features = torch.tensor(problem_features_list, dtype=torch.float32, device=device)
    print(f"Problem features: {problem_features.shape} on {device}")

    # === SKILL FEATURES (Enhanced with Temporal Context) ===
    skill_stats = defaultdict(lambda: {'attempts': [], 'times': [], 'sequence_contexts': []})

    for seq in sequences:
        history = seq['sequence_data']
        for skills_value, correct_value, time_value in zip(
                history['skills'], history['correct'], history['time_on_task']):
            if pd.notna(skills_value):
                for skill in (s.strip() for s in str(skills_value).split(',')):
                    if skill in skill_to_idx:
                        skill_stats[skill]['attempts'].append(correct_value)
                        skill_stats[skill]['times'].append(time_value)
                        skill_stats[skill]['sequence_contexts'].append(seq['sequence_length'])

    def _without_nan(values):
        # Drop NaN entries so one missing measurement does not turn the whole
        # statistic into NaN (matches the NaN-skipping pandas means above).
        arr = np.asarray(values, dtype=float)
        return arr[~np.isnan(arr)]

    skill_features_list = []
    for skill in skill_to_idx.keys():
        if skill in skill_stats:
            raw_attempts = skill_stats[skill]['attempts']
            contexts = skill_stats[skill]['sequence_contexts']
            attempts = _without_nan(raw_attempts)
            times = _without_nan(skill_stats[skill]['times'])

            features = [
                float(len(raw_attempts)),  # Frequency across all sequences
                float(attempts.mean()) if attempts.size else 0.5,  # Average correctness
                float(times.mean()) if times.size else 100.0,  # Average time
                float(attempts.std()) if attempts.size > 1 else 0.5,  # Performance variability
                float(np.mean(contexts)) if contexts else 5.0,  # Average sequence context length
            ]
        else:
            # Default features for skills not seen in sequences
            features = [0.0, 0.5, 100.0, 0.5, 5.0]

        skill_features_list.append(features)

    skill_features = torch.tensor(skill_features_list, dtype=torch.float32, device=device)
    print(f"Skill features: {skill_features.shape} on {device}")

    return sequence_student_features, problem_features, skill_features


In [74]:
def create_temporal_edges(sequences, df, student_to_idx, problem_to_idx, skill_to_idx, device='cuda'):
    """
    Build sequence->problem edges (with edge features) and problem->skill edges.

    Args:
        sequences: list of dicts carrying ``sequence_data``, ``sequence_length``
            and ``prediction_problem_id``.
        df: full interaction log; the first row found for a problem supplies its
            ``skills`` and ``content_source``.
        student_to_idx: accepted for signature compatibility; not read here.
        problem_to_idx: problem id -> node index. Sequences whose target problem
            is not a key get no edge.
        skill_to_idx: skill name -> node index. Skills that are not keys are skipped.
        device: target device; ``'cuda'`` falls back to ``'cpu'`` when CUDA is
            not available.

    Returns:
        tuple ``(sequence_problem_edges, sequence_problem_edge_features,
        problem_skill_edges)`` with shapes ``(2, E)``, ``(E, 9)`` and ``(2, P)``.
        The shapes hold for ``E == 0`` and ``P == 0`` as well.
    """
    print("=== CREATING TEMPORAL EDGES ===")

    if device == 'cuda' and not torch.cuda.is_available():
        device = 'cpu'

    num_edge_features = 9  # length of the edge_features list assembled below

    # First row per problem as a dict lookup, replacing one full-frame scan per
    # sequence and per problem.
    first_problem_rows = (
        df.drop_duplicates('problem_id')
        .set_index('problem_id')[['skills', 'content_source']]
        .to_dict('index')
    )

    # === SEQUENCE-PROBLEM EDGES (Main Prediction Edges) ===
    sequence_problem_edges = []
    sequence_problem_edge_features = []

    for seq_idx, seq in enumerate(sequences):
        target_problem_id = seq['prediction_problem_id']
        if target_problem_id not in problem_to_idx:
            continue

        sequence_data = seq['sequence_data']
        target_problem_data = first_problem_rows[target_problem_id]

        # Edge from sequence to target problem
        sequence_problem_edges.append([seq_idx, problem_to_idx[target_problem_id]])

        # Historical performance context
        historical_performance = float(sequence_data['correct'].mean())
        recent_performance = float(sequence_data['correct'].tail(3).mean()) if len(sequence_data) >= 3 else historical_performance

        # Skill overlap between the history and the target problem
        seq_skills = set()
        for skills_str in sequence_data['skills'].dropna():
            if isinstance(skills_str, str):
                seq_skills.update(s.strip() for s in skills_str.split(','))

        target_skills = set()
        if pd.notna(target_problem_data['skills']):
            target_skills.update(s.strip() for s in str(target_problem_data['skills']).split(','))

        skill_overlap = len(seq_skills & target_skills)
        total_skills = len(seq_skills | target_skills)
        skill_overlap_ratio = float(skill_overlap / total_skills) if total_skills > 0 else 0.0

        # 1.0 when the target's content source already occurs in the history
        seq_sources = set(sequence_data['content_source'].dropna())
        source_familiarity = float(1.0 if target_problem_data['content_source'] in seq_sources else 0.0)

        # Time-based features
        avg_time_in_seq = float(sequence_data['time_on_task'].mean())
        avg_hints_in_seq = float(sequence_data['fraction_of_hints_used'].mean())

        # Mean of the last (up to three) step-to-step changes in correctness
        if len(sequence_data) > 1:
            performance_trend = float(sequence_data['correct'].diff().tail(3).mean())
        else:
            performance_trend = 0.0

        sequence_problem_edge_features.append([
            historical_performance,
            recent_performance,
            skill_overlap_ratio,
            float(skill_overlap),
            source_familiarity,
            avg_time_in_seq,
            avg_hints_in_seq,
            performance_trend,
            float(seq['sequence_length']),
        ])

    # reshape keeps the documented 2-D shapes even when no edge was created
    sequence_problem_edges = torch.tensor(
        sequence_problem_edges, dtype=torch.long, device=device
    ).reshape(-1, 2).t()
    sequence_problem_edge_features = torch.tensor(
        sequence_problem_edge_features, dtype=torch.float32, device=device
    ).reshape(-1, num_edge_features)

    print(f"Sequence-Problem edges: {sequence_problem_edges.shape} on {device}")
    print(f"Sequence-Problem edge features: {sequence_problem_edge_features.shape} on {device}")

    # === PROBLEM-SKILL EDGES ===
    problem_skill_pairs = []
    for problem_id, problem_idx in problem_to_idx.items():
        skills_value = first_problem_rows[problem_id]['skills']
        if pd.notna(skills_value):
            for skill in (s.strip() for s in str(skills_value).split(',')):
                if skill in skill_to_idx:
                    problem_skill_pairs.append((problem_idx, skill_to_idx[skill]))

    # Remove duplicates while keeping first-seen order, then convert to tensor
    unique_pairs = list(dict.fromkeys(problem_skill_pairs))
    problem_skill_edges = torch.tensor(
        unique_pairs, dtype=torch.long, device=device
    ).reshape(-1, 2).t()

    print(f"Problem-Skill edges: {problem_skill_edges.shape} on {device}")

    return sequence_problem_edges, sequence_problem_edge_features, problem_skill_edges


In [75]:
def build_temporal_heterogeneous_graph(sequence_features, problem_features, skill_features,
                                     seq_prob_edges, seq_prob_edge_features, prob_skill_edges, 
                                     device='cuda'):
    """
    Assemble the node features and edges into a single ``HeteroData`` graph.

    Args:
        sequence_features, problem_features, skill_features: feature tensors
            stored as ``.x`` of the ``sequence``, ``problem`` and ``skill`` node types.
        seq_prob_edges: edge index for ``('sequence', 'predicts', 'problem')``;
            its row-flipped copy is stored as ``('problem', 'predicted_by', 'sequence')``.
        seq_prob_edge_features: stored as ``edge_attr`` of the ``predicts`` edges.
        prob_skill_edges: edge index for ``('problem', 'requires', 'skill')``.
            When it has no columns, neither this edge type nor its reverse
            ``('skill', 'required_by', 'problem')`` is added.
        device: only used in the printed summary.

    Returns:
        The populated ``HeteroData`` object.
    """
    print("=== BUILDING TEMPORAL HETEROGENEOUS GRAPH ===")

    data = HeteroData()

    # Node features
    data['sequence'].x = sequence_features  # Dynamic student states based on sequences
    data['problem'].x = problem_features    # Static problem characteristics
    data['skill'].x = skill_features        # Skill difficulty and context features

    # Main prediction edges (sequence -> problem) and their reverse
    data['sequence', 'predicts', 'problem'].edge_index = seq_prob_edges
    data['sequence', 'predicts', 'problem'].edge_attr = seq_prob_edge_features
    data['problem', 'predicted_by', 'sequence'].edge_index = seq_prob_edges.flip(0)

    # Problem-skill requirement edges and their reverse, only when any exist
    has_skill_edges = prob_skill_edges.size(1) > 0
    if has_skill_edges:
        data['problem', 'requires', 'skill'].edge_index = prob_skill_edges
        data['skill', 'required_by', 'problem'].edge_index = prob_skill_edges.flip(0)

    device_name = str(device).upper()
    print(f"Temporal graph created on {device_name}:")
    print(f"- Sequence nodes (dynamic student states): {data['sequence'].x.shape}")
    print(f"- Problem nodes: {data['problem'].x.shape}")
    print(f"- Skill nodes: {data['skill'].x.shape}")
    print(f"- Prediction edges: {data['sequence', 'predicts', 'problem'].edge_index.shape[1]}")
    # Decide from the input tensor: the previous check looked for a key named
    # 'requires' inside the edge store (which holds 'edge_index'), and indexing
    # the graph by an edge type that was never added may register an empty one.
    if has_skill_edges:
        print(f"- Requirement edges: {prob_skill_edges.size(1)}")

    return data


In [76]:
def apply_standard_scaling_pytorch_only_fixed(graph_data, device='cuda'):
    """
    Standardize node features and prediction-edge features in place.

    NaN and +/-Inf entries are replaced by 0 first. Each column is then shifted
    by its mean and divided by its standard deviation along dim 0. Columns whose
    standard deviation is below 1e-6 or undefined (NaN) are divided by 1 instead.

    Args:
        graph_data: graph whose ``sequence``, ``problem`` and ``skill`` stores
            expose ``.x`` and whose ``('sequence', 'predicts', 'problem')`` store
            exposes ``.edge_attr``.
        device: accepted for signature compatibility; not used here.

    Returns:
        tuple ``(graph_data, scalers)``. ``scalers`` maps ``'sequence'``,
        ``'problem'``, ``'skill'`` and ``'edge'`` to ``{'mean': ..., 'std': ...}``.
    """
    print("=== APPLYING STANDARD SCALING (FIXED) ===")

    scalers = {}

    # Node features
    for node_type in ('sequence', 'problem', 'skill'):
        scaled, mean, std = _standardize_columns(graph_data[node_type].x)
        graph_data[node_type].x = scaled
        scalers[node_type] = {'mean': mean, 'std': std}

    # Edge features of the prediction edges
    edge_store = graph_data['sequence', 'predicts', 'problem']
    scaled, mean, std = _standardize_columns(edge_store.edge_attr)
    edge_store.edge_attr = scaled
    scalers['edge'] = {'mean': mean, 'std': std}

    print(f"✅ All features standardized and NaN-cleaned")
    print(f"Sequence features: mean≈{graph_data['sequence'].x.mean().item():.3f}, std≈{graph_data['sequence'].x.std().item():.3f}")

    return graph_data, scalers


def _standardize_columns(features):
    """Return ``(scaled, mean, std)`` for ``features`` standardized along dim 0."""
    cleaned = torch.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
    mean = cleaned.mean(dim=0, keepdim=True)
    std = cleaned.std(dim=0, keepdim=True)
    # A single row yields a NaN standard deviation, and NaN < 1e-6 is False, so
    # it has to be caught explicitly or every scaled value would become NaN.
    unusable = torch.isnan(std) | (std < 1e-6)
    std = torch.where(unusable, torch.ones_like(std), std)
    return (cleaned - mean) / std, mean, std

In [77]:
def enhance_temporal_encoding_pytorch_only_fixed(graph_data, sequences, device='cuda'):
    """
    Append two per-sequence columns to the sequence node features.

    The appended columns are the number of history rows and the fraction of
    correct rows (sum of ``correct`` divided by the row count). NaN values in
    the result are replaced by 0.

    Args:
        graph_data: graph whose ``sequence`` store exposes ``.x`` with one row
            per entry of ``sequences`` (extra rows beyond ``len(sequences)`` are dropped).
        sequences: list of dicts carrying a ``sequence_data`` DataFrame.
        device: accepted for signature compatibility. The new columns are
            created on the device of the existing features so they can always be
            concatenated.

    Returns:
        ``graph_data`` with ``graph_data['sequence'].x`` widened by two columns.

    Raises:
        IndexError: if there are more sequences than feature rows.
    """
    print("=== ENHANCING TEMPORAL ENCODING (FIXED) ===")

    base_features = graph_data['sequence'].x
    num_sequences = len(sequences)
    if base_features.shape[0] < num_sequences:
        raise IndexError(
            f"{num_sequences} sequences but only {base_features.shape[0]} sequence feature rows"
        )

    extra_columns = []
    for seq in sequences:
        sequence_data = seq['sequence_data']
        try:
            seq_len = float(len(sequence_data))
            success_rate = float(sequence_data['correct'].sum() / len(sequence_data))

            # An empty history divides 0 by 0
            if pd.isna(success_rate):
                success_rate = 0.0
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Missing or non-numeric 'correct' column: keep the established defaults
            seq_len = 5.0
            success_rate = 0.0

        extra_columns.append([seq_len, success_rate])

    # One (N, 2) tensor instead of two single-element tensors per sequence
    extra_tensor = torch.tensor(
        extra_columns, dtype=torch.float32, device=base_features.device
    ).reshape(-1, 2)

    enhanced_features_tensor = torch.cat([base_features[:num_sequences], extra_tensor], dim=1)
    # Clean any remaining NaN
    enhanced_features_tensor = torch.nan_to_num(enhanced_features_tensor, nan=0.0)
    graph_data['sequence'].x = enhanced_features_tensor

    print(f"✅ Enhanced temporal features: {enhanced_features_tensor.shape}")
    return graph_data

In [78]:
def validate_features_pytorch_only(graph_data):
    """
    Check node features and edge attributes for NaN and Inf values.

    Args:
        graph_data: graph exposing ``node_types`` and ``edge_types``. Every node
            store is read through ``.x``; edge stores are checked only when they
            carry an ``edge_attr``.

    Returns:
        bool: ``True`` when no NaN or Inf value was found, ``False`` otherwise.
        Each finding is also printed.
    """
    print("=== FEATURE VALIDATION (PYTORCH ONLY) ===")

    issues = []

    # Node features
    for node_type in graph_data.node_types:
        features = graph_data[node_type].x
        if torch.isnan(features).any():
            issues.append(f"NaN values in {node_type} features")
        if torch.isinf(features).any():
            issues.append(f"Inf values in {node_type} features")

    # Edge features: same NaN and Inf checks as for nodes
    for edge_type in graph_data.edge_types:
        edge_features = getattr(graph_data[edge_type], 'edge_attr', None)
        if edge_features is None:
            continue
        if torch.isnan(edge_features).any():
            issues.append(f"NaN values in {edge_type} edge features")
        if torch.isinf(edge_features).any():
            issues.append(f"Inf values in {edge_type} edge features")

    if issues:
        print("⚠️  Issues found:")
        for issue in issues:
            print(f"   - {issue}")
    else:
        print("✅ All features validated successfully")

    return len(issues) == 0

In [79]:
def build_complete_temporal_teg_nesynet_fixed(df, base_sequence_length=5, max_sequences_per_student=3, device='cuda'):
    """
    Run the whole graph-construction pipeline on ``df``.

    Steps, in order: skill analysis, node index mappings, temporal sequences,
    node features, edges, graph assembly, standard scaling, temporal encoding,
    a second standard scaling, and validation.

    Returns:
        dict with keys ``graph_data``, ``targets`` (float32 tensor holding one
        ``target`` per sequence), ``sequences``, ``scalers`` and ``mappings``.

    NOTE(review): only the statistics of the second scaling pass are returned;
    those of the first pass are discarded. Confirm whether downstream code needs
    both to transform unseen data.
    """
    print("=== BUILDING FIXED TEMPORAL TEG-NESYNET ===")

    # Vocabulary, index maps and per-student history windows
    skill_names = analyze_graph_structure(df)
    student_to_idx, problem_to_idx, skill_to_idx = create_node_mappings(df, skill_names)
    index_maps = (student_to_idx, problem_to_idx, skill_to_idx)
    sequences = create_adaptive_temporal_sequences(df, base_sequence_length, max_sequences_per_student, device)

    # Node features, edges, and the assembled graph
    node_features = extract_temporal_node_features(sequences, df, *index_maps, device)
    edge_tensors = create_temporal_edges(sequences, df, *index_maps, device)
    graph = build_temporal_heterogeneous_graph(*node_features, *edge_tensors, device)

    # Scale, append the temporal columns, then scale again
    print("\n=== FEATURE PROCESSING (FIXED) ===")
    graph, _first_pass_scalers = apply_standard_scaling_pytorch_only_fixed(graph, device)
    graph = enhance_temporal_encoding_pytorch_only_fixed(graph, sequences, device)
    graph, second_pass_scalers = apply_standard_scaling_pytorch_only_fixed(graph, device)
    features_ok = validate_features_pytorch_only(graph)

    # One target value per sequence
    target_values = [seq['target'] for seq in sequences]
    targets = torch.tensor(target_values, dtype=torch.float32, device=device)

    print(f"\n=== COMPLETION ===")
    print(f"Feature processing valid: {features_ok}")

    mappings = {
        'student_to_idx': student_to_idx,
        'problem_to_idx': problem_to_idx,
        'skill_to_idx': skill_to_idx
    }
    return {
        'graph_data': graph,
        'targets': targets,
        'sequences': sequences,
        'scalers': second_pass_scalers,
        'mappings': mappings
    }

In [80]:
import torch
import os
from datetime import datetime

def save_teg_nesynet_model(result, save_dir="./models", model_name=None):
    """
    Write the pipeline result to ``<save_dir>/<model_name>.pt`` plus a text summary.

    Args:
        result: dict with keys ``graph_data``, ``targets``, ``sequences``,
            ``scalers`` and ``mappings``.
        save_dir: output directory, created when missing.
        model_name: base file name; defaults to ``teg_nesynet_<timestamp>``.

    Returns:
        tuple ``(model_path, metadata_path)``.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Fall back to a timestamped name
    if model_name is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        model_name = f"teg_nesynet_{stamp}"

    graph = result['graph_data']
    mappings = result['mappings']

    # Summary that is stored with the payload and also written as plain text
    metadata = {
        'model_name': model_name,
        'save_timestamp': datetime.now().isoformat(),
        'total_sequences': len(result['sequences']),
        'num_students': len(mappings['student_to_idx']),
        'num_problems': len(mappings['problem_to_idx']),
        'num_skills': len(mappings['skill_to_idx']),
        'device': str(graph['sequence'].x.device),
        'node_types': list(graph.node_types),
        'edge_types': list(graph.edge_types),
        'feature_shapes': {
            'sequence_features': list(graph['sequence'].x.shape),
            'problem_features': list(graph['problem'].x.shape),
            'skill_features': list(graph['skill'].x.shape)
        }
    }

    # Payload: the core result entries followed by the metadata
    save_data = {key: result[key] for key in ('graph_data', 'targets', 'sequences', 'scalers', 'mappings')}
    save_data['metadata'] = metadata

    model_path = os.path.join(save_dir, f"{model_name}.pt")
    metadata_path = os.path.join(save_dir, f"{model_name}_metadata.txt")

    torch.save(save_data, model_path)

    # Human-readable copy of the metadata, one "key: value" line per entry
    report_lines = ["TEG-NeSyNet Model Metadata\n", "=" * 50 + "\n"]
    report_lines.extend(f"{key}: {value}\n" for key, value in metadata.items())
    with open(metadata_path, 'w') as f:
        f.writelines(report_lines)

    size_mb = os.path.getsize(model_path) / (1024*1024)
    print(f"✅ Model saved successfully!")
    print(f"Model file: {model_path}")
    print(f"Metadata file: {metadata_path}")
    print(f"Model size: {size_mb:.2f} MB")

    return model_path, metadata_path

# Build the temporal graph from the training split.
print("Running fixed pipeline...")
result_fixed = build_complete_temporal_teg_nesynet_fixed(
    train, base_sequence_length=5, max_sequences_per_student=3, device=device
)

# Persist the graph and its metadata under ./teg_nesynet_models.
model_path, metadata_path = save_teg_nesynet_model(
    result_fixed, save_dir="./teg_nesynet_models", model_name="teg_nesynet_temporal_v1"
)

# Report the number of sequences and the node total across all three node types.
print(f"\n🎉 TEG-NeSyNet model ready for GNN training!")
print(f"Training samples: {len(result_fixed['sequences'])}")
print(f"Graph nodes: {sum(result_fixed['graph_data'][node_type].x.shape[0] for node_type in ('sequence', 'problem', 'skill'))}")


Running fixed pipeline...
=== BUILDING FIXED TEMPORAL TEG-NESYNET ===
=== GRAPH STRUCTURE ANALYSIS ===
Total rows: 110474
Unique students: 10000
Unique problems: 21777
Unique skills: 762

Student attempt distribution:
Min attempts: 5
Max attempts: 20
Mean attempts: 11.05
Median attempts: 10.00
=== NODE MAPPINGS CREATED ===
Students: 10000 mappings
Problems: 21777 mappings
Skills: 762 mappings
=== CREATING ADAPTIVE TEMPORAL SEQUENCES ===

=== SEQUENCE CREATION SUMMARY ===
Total students: 10000
Students with sequences: 8808
Total sequences created: 23820
Average sequences per valid student: 2.70
Sequence length range: 5-19
=== EXTRACTING TEMPORAL NODE FEATURES ===
Sequence-based student features: torch.Size([23820, 12]) on cuda
Problem features: torch.Size([21777, 6]) on cuda
Skill features: torch.Size([762, 5]) on cuda
=== CREATING TEMPORAL EDGES ===
Sequence-Problem edges: torch.Size([2, 23820]) on cuda
Sequence-Problem edge features: torch.Size([23820, 9]) on cuda
Problem-Skill edges: