In [None]:
import pandas as pd
from pyBKT.models import Model
from tqdm import tqdm

In [None]:
def read_and_preprocess(dataset_name):
    """Load a supported raw dataset and add an integer ``knowledge`` column.

    Each distinct value of the dataset's skill-like column (``production``
    for "HamptonAlg", ``skill`` for "Assistment_challenge") is assigned an
    integer id in order of first appearance. The label -> id mapping is
    written next to the raw data as a CSV file so the ids can be decoded later.

    Parameters
    ----------
    dataset_name : str
        Either ``"HamptonAlg"`` or ``"Assistment_challenge"``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame with the extra ``knowledge`` column, or ``None``
        (after printing a message) when ``dataset_name`` is not recognised.
    """
    if dataset_name == "HamptonAlg":
        df = pd.read_csv("data/raw_data/HamptonAlg_processed.csv")
        print(f"Number of rows before processing: {len(df)}")
        # Drop rows whose production is the literal 'BLANK'. The explicit
        # copy makes the result an independent frame, so adding the
        # 'knowledge' column below does not raise SettingWithCopyWarning.
        df = df[df['production'] != 'BLANK'].copy()
        print(f"Number of rows after : {len(df)}")
        source_column = 'production'
        mapping_path = 'data/raw_data/HamptonAlg_production_mapping.csv'

    elif dataset_name == "Assistment_challenge":
        df = pd.read_csv("data/raw_data/assistant_competition.csv")
        print(f"Number of rows before processing: {len(df)}")
        # No rows are filtered for this dataset, so both counts are equal.
        print(f"Number of rows after : {len(df)}")
        source_column = 'skill'
        mapping_path = 'data/raw_data/assistant_competition_skill_mapping.csv'

    else:
        print("no such dataset!")
        return None

    # Ids follow the order in which labels first appear in the data.
    label_to_id = {label: idx for idx, label in enumerate(df[source_column].unique())}
    df['knowledge'] = df[source_column].map(label_to_id)
    pd.Series(label_to_id).to_csv(mapping_path)

    return df

In [None]:
def train_test_split(dataset_name, knowledge_column_name, student_column_name,
                     test_fraction=0.2, min_students=5, random_state=42):
    """Split a dataset into train/test CSV files, holding out students per skill.

    The dataset is loaded through ``read_and_preprocess`` and grouped by
    ``knowledge_column_name``. Within each skill group a fraction of the
    distinct students is sampled; all of their rows for that skill go to the
    test set and the remaining rows go to the training set. Skills with fewer
    than ``min_students`` distinct students are left out of both sets.

    Sampling is done independently for every skill, so a student held out
    for one skill may still appear in the training rows of another skill.

    If every skill present in the test set is also present in the training
    set, the two frames are written to ``<dataset_name>_train.csv`` and
    ``<dataset_name>_test.csv`` in the current working directory; otherwise
    the offending skills are printed and nothing is written.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``read_and_preprocess``; also the output file prefix.
    knowledge_column_name : str
        Column identifying the skill of each row.
    student_column_name : str
        Column identifying the student of each row.
    test_fraction : float, default 0.2
        Fraction of a skill's distinct students to hold out (at least one
        student is always sampled).
    min_students : int, default 5
        Minimum number of distinct students a skill needs to be split.
    random_state : int, default 42
        Seed used for the per-skill student sampling.

    Returns
    -------
    None
    """
    df = read_and_preprocess(dataset_name)
    if df is None:
        # read_and_preprocess has already reported the unknown dataset name.
        return

    # Collect the per-skill pieces and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies all earlier rows each time.
    test_parts = []
    train_parts = []

    for skill, skill_group in df.groupby(knowledge_column_name):
        print(f"Skill: {skill} — Total records: {len(skill_group)}")

        # Get all unique students for this skill
        unique_students = skill_group[student_column_name].unique()

        if len(unique_students) < min_students:
            continue

        # Sample the requested share of students, but never fewer than one
        n_sample = max(1, int(len(unique_students) * test_fraction))
        sampled_students = pd.Series(unique_students).sample(n=n_sample, random_state=random_state)

        # Rows of sampled students form the test part, the rest the train part
        in_test = skill_group[student_column_name].isin(sampled_students)
        test_parts.append(skill_group[in_test])
        train_parts.append(skill_group[~in_test])

    if not train_parts:
        # Without this guard the column lookups below would fail with a
        # KeyError because there would be no frame (and no columns) to inspect.
        print("No skill has enough students to be split; nothing was written.")
        return

    test_df = pd.concat(test_parts, ignore_index=True)
    train_df = pd.concat(train_parts, ignore_index=True)

    print(f"Train set size: {len(train_df)}")
    print(f"Test set size: {len(test_df)}")

    # Compare skills in train and test
    train_skills = set(train_df[knowledge_column_name].unique())
    test_skills = set(test_df[knowledge_column_name].unique())

    missing_skills = test_skills - train_skills

    if not missing_skills:
        print("✅ All skills in the test set are also present in the training set.")
        train_df.to_csv(dataset_name + "_train.csv", index=False)
        test_df.to_csv(dataset_name + "_test.csv", index=False)

    else:
        print("❌ The following skills are in the test set but missing in the training set:")
        print(missing_skills)
        

In [None]:
# Build the train/test CSV files for each dataset; the student identifier
# column is named differently in the two sources.
for dataset, student_column in (
    ("HamptonAlg", "student"),
    ("Assistment_challenge", "studentId"),
):
    train_test_split(dataset, "knowledge", student_column)