In [56]:
import os
import warnings
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch_geometric.data import HeteroData

warnings.filterwarnings('ignore')


In [57]:
# Directory holding train_data.csv / test_data.csv. Set the NEURAL_SYMBOLIC_DATA_DIR
# environment variable to run the notebook on another machine; the default is the
# original author's local path, so existing runs are unaffected.
DATA_DIR = Path(os.environ.get(
    "NEURAL_SYMBOLIC_DATA_DIR",
    r"C:\Users\pragy\cloud_architecture\neural_symbolic\data",
))

train_data = pd.read_csv(DATA_DIR / "train_data.csv")
test_data = pd.read_csv(DATA_DIR / "test_data.csv")

In [58]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [59]:
# Show the column names of the training frame as loaded from CSV.
train_data.columns

Index(['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time',
       'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used',
       'attempt_count', 'answer_given', 'problem_completed', 'correct',
       'next_correct', 'content_source', 'skills', 'problem_type',
       'tutoring_types', 'student_answer_count', 'mean_correct',
       'mean_time_on_task', 'class_id', 'account_creation_date',
       'started_problem_sets_count', 'completed_problem_sets_count',
       'started_skill_builders_count', 'mastered_skill_builders_count',
       'answered_problems_count', 'mean_problem_correctness',
       'mean_problem_time_on_task', 'mean_class_score'],
      dtype='object')

In [60]:
# Name of the target column; the calls at the end of this cell move it to the last position.
col = 'next_correct'
def function(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return ``df`` with the column ``col`` moved to the last position.

    All other columns keep their relative order. Selecting a ``col`` that is
    not present raises ``KeyError`` from the column selection.
    """
    ordered = []
    for name in df.columns:
        if name != col:
            ordered.append(name)
    ordered.append(col)
    return df[ordered]

# Move the target column to the end of both splits (rebinds train_data / test_data).
train_data = function(train_data,col)
test_data = function(test_data,col)

# Display the resulting column order.
train_data.columns

Index(['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time',
       'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used',
       'attempt_count', 'answer_given', 'problem_completed', 'correct',
       'content_source', 'skills', 'problem_type', 'tutoring_types',
       'student_answer_count', 'mean_correct', 'mean_time_on_task', 'class_id',
       'account_creation_date', 'started_problem_sets_count',
       'completed_problem_sets_count', 'started_skill_builders_count',
       'mastered_skill_builders_count', 'answered_problems_count',
       'mean_problem_correctness', 'mean_problem_time_on_task',
       'mean_class_score', 'next_correct'],
      dtype='object')

In [61]:
# Preview the first five training rows.
train_data.head()

Unnamed: 0,log_id,student_id,assignment_id,problem_id,start_time,time_on_task,answer_before_tutoring,fraction_of_hints_used,attempt_count,answer_given,...,account_creation_date,started_problem_sets_count,completed_problem_sets_count,started_skill_builders_count,mastered_skill_builders_count,answered_problems_count,mean_problem_correctness,mean_problem_time_on_task,mean_class_score,next_correct
0,16475574,2259,1426268,1694676,2021-05-18 17:03:26.559000+00:00,,,,0,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,True
1,16654914,2259,1443864,1631611,2021-05-28 17:09:41.198000+00:00,60.625,True,,1,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
2,16654914,2259,1443864,1631613,2021-05-28 17:12:38.346000+00:00,44.138,True,,1,False,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
3,16711054,2259,1445768,1632407,2021-06-04 17:25:57.552000+00:00,75.82,True,,2,True,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False
4,16711054,2259,1445768,1632408,2021-06-04 17:27:14.550000+00:00,18.387,True,,2,True,...,2019-01-27 20:47:08.261000-05:00,3,2,0,0,10,0.333333,45.2447,0.5,False


In [39]:
!pip install torch torch-geometric



In [62]:
# Quick size check: rows/columns, column names, and the number of distinct
# students and problems (these counts become the student and problem node counts).
print("Dataset shape: ",train_data.shape)
print("Column names: ", train_data.columns.tolist())
print("Unique students: ",train_data['student_id'].nunique())
print("Unique problems: ",train_data['problem_id'].nunique())

Dataset shape:  (110474, 30)
Column names:  ['log_id', 'student_id', 'assignment_id', 'problem_id', 'start_time', 'time_on_task', 'answer_before_tutoring', 'fraction_of_hints_used', 'attempt_count', 'answer_given', 'problem_completed', 'correct', 'content_source', 'skills', 'problem_type', 'tutoring_types', 'student_answer_count', 'mean_correct', 'mean_time_on_task', 'class_id', 'account_creation_date', 'started_problem_sets_count', 'completed_problem_sets_count', 'started_skill_builders_count', 'mastered_skill_builders_count', 'answered_problems_count', 'mean_problem_correctness', 'mean_problem_time_on_task', 'mean_class_score', 'next_correct']
Unique students:  10000
Unique problems:  21777


### Node types (three)
1. Student nodes: 10,000 students
2. Problem nodes: 21,777 problems
3. Skill nodes: one per unique skill found in the `skills` column (the length of `all_skills`)

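### Edge types (two)
1. Student–problem edges (a student attempted a problem)
2. Problem–skill edges (a problem requires a skill)

Each edge type is also stored in the reverse direction when the graph is built.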

In [63]:
def analyze_graph_structure(df):
    """Collect the distinct skill names that occur in ``df['skills']``.

    Each non-null string entry is treated as a comma-separated list of skill
    names; surrounding whitespace is stripped. Non-string entries are ignored,
    and empty tokens (e.g. from a trailing comma) are not counted as skills.

    Args:
        df: DataFrame with a ``skills`` column.

    Returns:
        The unique skill names as a sorted list. Sorting keeps the result, and
        any index mapping built from it, identical from run to run; plain set
        iteration order for strings can differ between interpreter sessions.
    """
    all_skills = set()
    for skills_str in df['skills'].dropna():
        if isinstance(skills_str, str):
            tokens = (part.strip() for part in skills_str.split(','))
            all_skills.update(token for token in tokens if token)

    return sorted(all_skills)


In [64]:
def create_node_mappings(df, unique_skills):
    """Assign consecutive integer indices to students, problems and skills.

    Students and problems are numbered in order of first appearance in
    ``df['student_id']`` / ``df['problem_id']``; skills are numbered in the
    order given by ``unique_skills``. The sizes of the three mappings are
    printed.

    Returns:
        Tuple ``(student_to_idx, problem_to_idx, skill_to_idx)`` of dicts.
    """
    student_to_idx = dict(
        (student_id, position)
        for position, student_id in enumerate(df['student_id'].unique())
    )
    problem_to_idx = dict(
        (problem_id, position)
        for position, problem_id in enumerate(df['problem_id'].unique())
    )
    skill_to_idx = dict(
        (skill, position) for position, skill in enumerate(unique_skills)
    )

    print("Mappings")
    for label, mapping in (
        ("Students: ", student_to_idx),
        ("Problem: ", problem_to_idx),
        ("Skill: ", skill_to_idx),
    ):
        print(label, len(mapping))

    return student_to_idx, problem_to_idx, skill_to_idx

In [65]:
def extract_node_features_optimized(df, student_to_idx, problem_to_idx, skill_to_idx, device='cuda'):
    """
    Build float32 feature tensors for the student, problem and skill nodes.

    Row ``i`` of every returned tensor describes the ``i``-th key of the matching
    mapping in dict iteration order, so each mapping is expected to assign the
    indices ``0..n-1`` in insertion order.

    Args:
        df: Interaction log containing ``student_id``, ``problem_id``,
            ``start_time``, ``skills``, ``correct``, ``content_source``,
            ``problem_type`` and the student/problem feature columns named in
            the body.
        student_to_idx: Mapping from student id to node index.
        problem_to_idx: Mapping from problem id to node index.
        skill_to_idx: Mapping from skill name to node index.
        device: Target device, as a string or ``torch.device``. Falls back to
            the CPU when CUDA is requested but not available.

    Returns:
        Tuple ``(student_features, problem_features, skill_features)`` with
        shapes ``(n_students, 9)``, ``(n_problems, 5)`` and ``(n_skills, 2)``.
        Missing values, and ids that do not occur in ``df``, become 0.0; the
        skill average correctness defaults to 0.5 instead.
    """
    print("=== EXTRACTING NODE FEATURES (OPTIMIZED + CUDA) ===")

    # Normalise to torch.device so the availability check also applies when the
    # caller passes a torch.device rather than the string 'cuda'.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        print("CUDA not available, falling back to CPU")
        device = torch.device('cpu')

    print(f"Using device: {device}")

    # === STUDENT NODE FEATURES ===
    # NOTE(review): class_id is an identifier but is used as a numeric feature.
    student_feature_names = [
        'class_id', 'started_problem_sets_count', 'completed_problem_sets_count',
        'started_skill_builders_count', 'mastered_skill_builders_count',
        'answered_problems_count', 'mean_problem_correctness',
        'mean_problem_time_on_task', 'mean_class_score'
    ]

    # Most recent record per student (by start_time), indexed by student id.
    student_latest = (
        df.sort_values('start_time')
          .groupby('student_id')
          .tail(1)
          .dropna(subset=['student_id'])
          .set_index('student_id')
    )
    # reindex places the rows in mapping order in one pass; ids missing from df
    # yield all-NaN rows, which fillna turns into zeros.
    student_matrix = (
        student_latest[student_feature_names]
        .astype('float64')
        .reindex(list(student_to_idx))
        .fillna(0.0)
        .to_numpy()
    )
    student_features = torch.tensor(student_matrix, dtype=torch.float32, device=device)
    print(f"Student features shape: {student_features.shape} on {device}")

    # === PROBLEM NODE FEATURES ===
    problem_feature_names = ['student_answer_count', 'mean_correct', 'mean_time_on_task']
    encoded_names = ['content_source_encoded', 'problem_type_encoded']

    # Integer-encode the categorical columns over the whole frame. With
    # sort=True the codes follow the sorted unique values, i.e. the same codes
    # a LabelEncoder fitted on the column would assign.
    problem_frame = df[['problem_id'] + problem_feature_names].copy()
    problem_frame['content_source_encoded'] = pd.factorize(
        df['content_source'].fillna('Unknown'), sort=True)[0]
    problem_frame['problem_type_encoded'] = pd.factorize(
        df['problem_type'].fillna('Unknown'), sort=True)[0]

    # First non-null value per problem for every feature column.
    problem_first = problem_frame.groupby('problem_id').first()
    problem_matrix = (
        problem_first[problem_feature_names + encoded_names]
        .astype('float64')
        .reindex(list(problem_to_idx))
        .fillna(0.0)
        .to_numpy()
    )
    problem_features = torch.tensor(problem_matrix, dtype=torch.float32, device=device)
    print(f"Problem features shape: {problem_features.shape} on {device}")

    # === SKILL NODE FEATURES ===
    # frequency       = number of (row, skill) occurrences with a non-null problem_id
    # avg_correctness = mean of `correct` over occurrences where it is non-null
    n_skills = len(skill_to_idx)
    skill_row = {skill: position for position, skill in enumerate(skill_to_idx)}
    frequency = [0.0] * n_skills
    correct_sum = [0.0] * n_skills
    correct_count = [0] * n_skills

    tagged = df.loc[df['skills'].notna(), ['skills', 'problem_id', 'correct']]
    for skills_str, problem_id, correct in zip(
            tagged['skills'], tagged['problem_id'], tagged['correct']):
        has_problem = pd.notna(problem_id)
        has_correct = pd.notna(correct)
        for token in str(skills_str).split(','):
            position = skill_row.get(token.strip())
            if position is None:
                continue  # skill not part of the mapping
            if has_problem:
                frequency[position] += 1.0
            if has_correct:
                correct_sum[position] += float(correct)
                correct_count[position] += 1

    skill_features_list = [
        [frequency[i], correct_sum[i] / correct_count[i] if correct_count[i] else 0.5]
        for i in range(n_skills)
    ]
    # reshape keeps the (n_skills, 2) shape even when there are no skills.
    skill_features = torch.tensor(
        skill_features_list, dtype=torch.float32, device=device
    ).reshape(n_skills, 2)
    print(f"Skill features shape: {skill_features.shape} on {device}")

    return student_features, problem_features, skill_features

In [66]:
def create_edges_optimized(df, student_to_idx, problem_to_idx, skill_to_idx, device='cuda'):
    """
    Build the student-problem and problem-skill edge tensors.

    Args:
        df: Interaction log containing ``student_id``, ``problem_id``,
            ``skills`` and the edge feature columns named in the body.
        student_to_idx: Mapping from student id to node index.
        problem_to_idx: Mapping from problem id to node index.
        skill_to_idx: Mapping from skill name to node index.
        device: Target device, as a string or ``torch.device``. Falls back to
            the CPU when CUDA is requested but not available.

    Returns:
        Tuple ``(student_problem_edges, student_problem_edge_features,
        problem_skill_edges)``:

        * ``student_problem_edges``: long tensor ``(2, E)`` with one edge per
          row of ``df`` whose student and problem are both in the mappings
          (row 0 = student index, row 1 = problem index).
        * ``student_problem_edge_features``: float32 tensor ``(E, 7)`` aligned
          with those edges; missing values are 0.0 and booleans are 0.0/1.0.
        * ``problem_skill_edges``: long tensor ``(2, K)`` of unique
          (problem index, skill index) pairs, sorted by problem then skill.
    """
    print("=== CREATING EDGES (OPTIMIZED + CUDA) ===")

    # Normalise to torch.device so the availability check also applies when the
    # caller passes a torch.device rather than the string 'cuda'.
    device = torch.device(device)
    if device.type == 'cuda' and not torch.cuda.is_available():
        device = torch.device('cpu')

    # === STUDENT-PROBLEM EDGES ===
    edge_feature_names = [
        'time_on_task', 'fraction_of_hints_used', 'attempt_count',
        'answer_before_tutoring', 'answer_given', 'problem_completed', 'correct'
    ]

    # Ids absent from a mapping map to NaN; such rows produce no edge.
    student_idx = df['student_id'].map(student_to_idx)
    problem_idx = df['problem_id'].map(problem_to_idx)
    mapped = (student_idx.notna() & problem_idx.notna()).to_numpy()

    edge_index = np.vstack([
        student_idx.to_numpy()[mapped].astype('int64'),
        problem_idx.to_numpy()[mapped].astype('int64'),
    ])

    # Cast to float before filling: boolean columns that contain missing values
    # are object-dtype, and an explicit cast does not depend on pandas
    # downcasting them implicitly inside fillna.
    edge_features = (
        df.loc[mapped, edge_feature_names]
          .astype('float64')
          .fillna(0.0)
          .to_numpy()
    )

    student_problem_edges = torch.tensor(edge_index, dtype=torch.long, device=device)
    student_problem_edge_features = torch.tensor(edge_features, dtype=torch.float32, device=device)

    print(f"Student-Problem edges: {student_problem_edges.shape} on {device}")
    print(f"Edge features: {student_problem_edge_features.shape} on {device}")

    # === PROBLEM-SKILL EDGES ===
    # Only the problem has to be mapped here; the student is irrelevant.
    has_skills = (df['skills'].notna() & problem_idx.notna()).to_numpy()
    pair_set = set()
    for p_idx, skills_str in zip(problem_idx.to_numpy()[has_skills],
                                 df['skills'].to_numpy()[has_skills]):
        for token in str(skills_str).split(','):
            s_idx = skill_to_idx.get(token.strip())
            if s_idx is not None:
                pair_set.add((int(p_idx), int(s_idx)))

    # Sorting makes the edge order deterministic; reshape keeps the (2, K)
    # layout even when no pair was found.
    problem_skill_edges = torch.tensor(
        sorted(pair_set), dtype=torch.long, device=device
    ).reshape(-1, 2).T.contiguous()

    print(f"Problem-Skill edges: {problem_skill_edges.shape} on {device}")

    return student_problem_edges, student_problem_edge_features, problem_skill_edges

In [70]:
def build_heterogeneous_graph_cuda(student_features, problem_features, skill_features,
                                  student_problem_edges, student_problem_edge_features,
                                  problem_skill_edges, device='cuda'):
    """
    Assemble the node and edge tensors into a PyTorch Geometric ``HeteroData``.

    The tensors are stored as given; ``device`` is only used for the banner
    message and nothing is moved between devices here. For both edge types the
    reverse relation is added by flipping the two rows of the edge index.

    Returns:
        The populated ``HeteroData`` object.
    """
    from torch_geometric.data import HeteroData

    # Accept both torch.device objects (which expose .type) and plain strings.
    device_label = device.type if hasattr(device, 'type') else str(device)
    device_name = device_label.upper()

    print(f"=== BUILDING GRAPH ON {device_name} ===")

    data = HeteroData()

    # Node features, one store per node type.
    node_inputs = (
        ('student', student_features),
        ('problem', problem_features),
        ('skill', skill_features),
    )
    for node_type, features in node_inputs:
        data[node_type].x = features

    # Forward relations; only student -> problem edges carry attributes.
    attempts = data['student', 'attempts', 'problem']
    attempts.edge_index = student_problem_edges
    attempts.edge_attr = student_problem_edge_features
    requires = data['problem', 'requires', 'skill']
    requires.edge_index = problem_skill_edges

    # Reverse relations: same edges with source and target rows swapped.
    reversed_student_problem = student_problem_edges.flip(0)
    reversed_problem_skill = problem_skill_edges.flip(0)
    data['problem', 'attempted_by', 'student'].edge_index = reversed_student_problem
    data['skill', 'required_by', 'problem'].edge_index = reversed_problem_skill

    print(f"Graph created successfully!")
    print(f"Student nodes: {data['student'].x.shape}")
    print(f"Problem nodes: {data['problem'].x.shape}")
    print(f"Skill nodes: {data['skill'].x.shape}")
    print(f"Graph is on device: {data['student'].x.device}")

    return data


In [71]:
# End-to-end graph construction from the training frame:
# skills -> index mappings -> node features -> edges -> HeteroData -> saved file.
# NOTE(review): this call expects analyze_graph_structure to be the DataFrame-based
# skill helper. A later cell defines another function with the same name, so this
# cell has to run before that one (or after re-running the helper's cell).
unique_skills = analyze_graph_structure(train_data)
student_to_idx, problem_to_idx, skill_to_idx = create_node_mappings(train_data, unique_skills)

print("Creation of node mappings completed...\n")

# Node feature tensors, created on `device`.
student_features, problem_features, skill_features = extract_node_features_optimized(
    train_data, student_to_idx, problem_to_idx, skill_to_idx, device=device
)

print("Extraction node features completed...\n")

# Edge index tensors plus the student-problem edge attributes.
student_problem_edges, student_problem_edge_features, problem_skill_edges = create_edges_optimized(
    train_data, student_to_idx, problem_to_idx, skill_to_idx, device=device
)

print("Creation of edges completed...\n")

# Assemble everything into one heterogeneous graph object.
graph_data = build_heterogeneous_graph_cuda(
    student_features, problem_features, skill_features,
    student_problem_edges, student_problem_edge_features, 
    problem_skill_edges, device=device
)

print("Building graph completed...\n")

# Persist the graph together with the id -> index mappings used to build it.
# NOTE(review): the tensors are saved on whatever device they live on and a
# torch.device object is stored as well; loading on a machine without that device
# presumably needs torch.load(..., map_location=...) — confirm with the loader.
torch.save({
    'graph_data': graph_data,
    'student_to_idx': student_to_idx,
    'problem_to_idx': problem_to_idx,
    'skill_to_idx': skill_to_idx,
    'device': device
}, 'teg_nesymet_graph_cuda.pt')

print("Graph saving completed")

Mappings
Students:  10000
Problem:  21777
Skill:  762
Creation of node mappings completed...

=== EXTRACTING NODE FEATURES (OPTIMIZED + CUDA) ===
Using device: cuda
Student features shape: torch.Size([10000, 9]) on cuda
Problem features shape: torch.Size([21777, 5]) on cuda
Skill features shape: torch.Size([762, 2]) on cuda
Extraction node features completed...

=== CREATING EDGES (OPTIMIZED + CUDA) ===
Student-Problem edges: torch.Size([2, 110474]) on cuda
Edge features: torch.Size([110474, 7]) on cuda
Problem-Skill edges: torch.Size([2, 14283]) on cuda
Creation of edges completed...

=== BUILDING GRAPH ON CUDA ===
Graph created successfully!
Student nodes: torch.Size([10000, 9])
Problem nodes: torch.Size([21777, 5])
Skill nodes: torch.Size([762, 2])
Graph is on device: cuda:0
Building graph completed...

Graph saving completed


In [74]:
def analyze_graph_structure(graph_data):
    """
    Print node/edge statistics for a heterogeneous graph.

    Reports the node and edge types, per-type node counts and feature widths,
    per-type edge counts (plus the edge feature width where ``edge_attr`` is
    present), and connectivity statistics of the undirected homogeneous view
    of the graph.

    NOTE: an earlier cell defines a function with this same name that takes a
    DataFrame and returns the skill list. Running this cell rebinds the name,
    so the graph-construction cell cannot be re-run afterwards without first
    re-running the cell that defines the skill helper.

    Args:
        graph_data: A ``HeteroData`` graph whose node stores have ``x`` and
            whose edge stores have ``edge_index``.
    """
    # Imported here because no other visible cell imports them; without these
    # lines the function raises NameError on a freshly started kernel.
    import networkx as nx
    from torch_geometric.utils import to_networkx

    print("=== GRAPH STRUCTURE ANALYSIS ===")

    # Basic information
    print(f"Node types: {list(graph_data.node_types)}")
    print(f"Edge types: {list(graph_data.edge_types)}")

    # Node counts
    for node_type in graph_data.node_types:
        features = graph_data[node_type].x
        print(f"{node_type} nodes: {features.shape[0]} "
              f"(features: {features.shape[1]})")

    # Edge counts
    for edge_type in graph_data.edge_types:
        store = graph_data[edge_type]
        print(f"{edge_type} edges: {store.edge_index.shape[1]}")

        # Not every relation carries edge attributes.
        if hasattr(store, 'edge_attr'):
            print(f"  └─ Edge features: {store.edge_attr.shape[1]}")

    # Convert to homogeneous for additional analysis
    homo_data = graph_data.to_homogeneous()
    G = to_networkx(homo_data, to_undirected=True)
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()

    print(f"\nHomogeneous Graph Analysis:")
    print(f"- Total nodes: {n_nodes}")
    print(f"- Total edges: {n_edges}")

    if n_nodes == 0:
        # Average degree and connectivity are undefined for an empty graph.
        print("- Graph has no nodes; degree and connectivity statistics skipped")
        return

    # In an undirected graph the degrees sum to twice the number of edges.
    # One component count also answers the connectivity question, so the graph
    # is traversed only once.
    n_components = nx.number_connected_components(G)
    print(f"- Average degree: {2 * n_edges / n_nodes:.2f}")
    print(f"- Is connected: {n_components == 1}")
    print(f"- Number of connected components: {n_components}")

# Print the structure summary for the heterogeneous graph built from the training data.
analyze_graph_structure(graph_data)


=== GRAPH STRUCTURE ANALYSIS ===
Node types: ['student', 'problem', 'skill']
Edge types: [('student', 'attempts', 'problem'), ('problem', 'requires', 'skill'), ('problem', 'attempted_by', 'student'), ('skill', 'required_by', 'problem')]
student nodes: 10000 (features: 9)
problem nodes: 21777 (features: 5)
skill nodes: 762 (features: 2)
('student', 'attempts', 'problem') edges: 110474
  └─ Edge features: 7
('problem', 'requires', 'skill') edges: 14283
('problem', 'attempted_by', 'student') edges: 110474
('skill', 'required_by', 'problem') edges: 14283

Homogeneous Graph Analysis:
- Total nodes: 32539
- Total edges: 124275
- Average degree: 7.64
- Is connected: False
- Number of connected components: 205
