# ASSISTments data generation

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

#  File paths: the original train/test CSVs (inputs) and the merged and
#  re-split CSVs (outputs) written by this cell.

test_path = '/content/Assistment_challenge_test.csv'
train_path = '/content/Assistment_challenge_train.csv'
merged_output_path = '/content/Assistment_challenge_merged.csv'
train_output_path = '/content/Assistment_challenge_train_new.csv'
test_output_path = '/content/Assistment_challenge_test_new.csv'

#  1. Load the datasets; stop if either input file is missing.

try:
    df_test = pd.read_csv(test_path)
    df_train = pd.read_csv(train_path)
except FileNotFoundError as e:
    print(f"Error: File not found - {e}")
    exit()

#  2. Check that both datasets have the same columns. If they differ, both
#  frames are restricted to the columns they have in common so that the
#  later concat does not introduce all-NaN columns.

print("Test dataset columns:", df_test.columns.tolist())
print("Train dataset columns:", df_train.columns.tolist())

if list(df_test.columns) != list(df_train.columns):
    print("Warning: Column names are not identical. Aligning to common columns.")
    common_columns = df_test.columns.intersection(df_train.columns).tolist()
    print("Common columns:", common_columns)
    df_test = df_test[common_columns]
    df_train = df_train[common_columns]
else:
    print("Column names are identical. Proceeding with merge.")

#  3. Report dataset sizes.

print(f"\nTest dataset rows: {len(df_test)}")
print(f"Train dataset rows: {len(df_train)}")

#  4. Check that the studentId and skill columns exist in both datasets;
#  every later step depends on them.

id_column = 'studentId'
skill_column = 'skill'
if id_column not in df_test.columns or id_column not in df_train.columns:
    print(f"Error: Column '{id_column}' not found in one or both datasets.")
    exit()
if skill_column not in df_test.columns or skill_column not in df_train.columns:
    print(f"Error: Column '{skill_column}' not found in one or both datasets.")
    exit()

#  5. Check whether any studentId appears in both the original train and
#  test sets. This is only reported; the merge proceeds either way.

test_students = set(df_test[id_column].unique())
train_students = set(df_train[id_column].unique())
intersection = test_students.intersection(train_students)
if len(intersection) > 0:
    print(f"\nWarning: Found {len(intersection)} overlapping studentIds between test and train datasets.")
    print("Merging may result in duplicate records. Proceeding with merge.")
else:
    print(f"\nNo overlapping studentIds found in original datasets.")

#  6. Merge the datasets (train rows first, then test rows, with a fresh index).

merged_df = pd.concat([df_train, df_test], ignore_index=True)
print(f"\nMerged dataset rows: {len(merged_df)}")
print(f"Merged dataset unique studentIds: {merged_df['studentId'].nunique()}")
print(f"Merged dataset unique skills: {merged_df['skill'].nunique()}")
print("Merged dataset columns:", merged_df.columns.tolist())

#  7. Check for fully duplicated rows after merging. Duplicates are only
#  reported; they are NOT removed unless the lines below are re-enabled.

duplicates = merged_df.duplicated().sum()
if duplicates > 0:
    print(f"\nWarning: Found {duplicates} duplicate rows in the merged dataset.")
    #  Optional: drop the duplicates

    #  merged_df = merged_df.drop_duplicates()

    #  print(f"After removing duplicates, merged dataset rows: {len(merged_df)}")

else:
    print("\nNo duplicate rows found in the merged dataset.")

#  8. Handle missing values: drop rows that lack a studentId or a skill.

merged_df = merged_df.dropna(subset=[id_column, skill_column])
print(f"\nAfter removing rows with missing studentId or skill, rows: {len(merged_df)}")

#  9. Save the merged dataset.

merged_df.to_csv(merged_output_path, index=False)
print(f"\nMerged dataset saved to: {merged_output_path}")

#  10. Split students 80/20 (train/test) while making sure the training set
#  covers every skill. The split is done per studentId, so all rows of one
#  student end up in the same set.

unique_students = merged_df[id_column].unique()
all_skills = set(merged_df[skill_column].unique())
test_students = []

#  Students who attempted each skill, computed with one groupby instead of
#  re-scanning every student's skill set once per skill.

students_by_skill = merged_df.groupby(skill_column)[id_column].unique()

#  A skill attempted by at most 5 students is treated as rare: all of its
#  students are pinned to the training set so the skill cannot be lost.

rare_skills = set()
pinned_students = set()
for skill, students_with_skill in students_by_skill.items():
    if len(students_with_skill) <= 5:
        rare_skills.add(skill)
        pinned_students.update(students_with_skill)

train_students = list(pinned_students)
#  Keep the order of unique_students so the seeded split below is reproducible;
#  membership is tested against a set, not a list.
remaining_students = [s for s in unique_students if s not in pinned_students]

#  Randomly split the remaining students so that train reaches 80% of all
#  students (pinned students count towards the 80%).

train_size = int(0.8 * len(unique_students)) - len(train_students)
if train_size > 0 and remaining_students:
    train_remaining, test_remaining = train_test_split(
        remaining_students,
        train_size=train_size,
        random_state=42
    )
    train_students.extend(train_remaining)
    test_students = test_remaining
else:
    test_students = remaining_students

#  11. Make sure the training set covers every skill. For each uncovered
#  skill exactly ONE student is moved from test to train. The other students
#  with that skill stay in the test set (previously they were removed from
#  test without being added to train, so their rows vanished from both files).

train_df = merged_df[merged_df[id_column].isin(train_students)]
train_skills = set(train_df[skill_column].unique())
if train_skills != all_skills:
    print("\nWarning: Training set does not cover all skills. Adjusting...")
    train_student_set = set(train_students)
    missing_skills = all_skills - train_skills
    for skill in missing_skills:
        students_with_skill = students_by_skill.loc[skill]
        #  A student moved for an earlier skill may already cover this one.
        if any(s in train_student_set for s in students_with_skill):
            continue
        moved_student = students_with_skill[0]
        train_students.append(moved_student)
        train_student_set.add(moved_student)
        test_students = [s for s in test_students if s != moved_student]
    train_df = merged_df[merged_df[id_column].isin(train_students)]
    train_skills = set(train_df[skill_column].unique())

#  12. Build the test set.

test_df = merged_df[merged_df[id_column].isin(test_students)]

#  13. Verify that no studentId appears in both the new train and test sets.

train_students_set = set(train_students)
test_students_set = set(test_students)
intersection = train_students_set.intersection(test_students_set)
if len(intersection) > 0:
    print(f"Error: Found {len(intersection)} overlapping studentIds in new split.")
    exit()
else:
    print("\nNo overlapping studentIds between new train and test datasets.")

#  14. Verify skill coverage: every skill in the test set must also occur
#  in the training set.

test_skills = set(test_df['skill'].unique())
if not test_skills.issubset(train_skills):
    print("\nError: Test set contains skills not in training set.")
    exit()
else:
    print("\nAll skills in test set are present in training set.")

#  15. Report sizes and student-level proportions after the split.

total_students = len(unique_students)
train_student_count = len(train_students)
test_student_count = len(test_students)
train_student_percentage = (train_student_count / total_students) * 100
test_student_percentage = (test_student_count / total_students) * 100

print(f"\nNew train dataset: {len(train_df)} rows, {train_student_count} studentIds ({train_student_percentage:.2f}%)")
print(f"New test dataset: {len(test_df)} rows, {test_student_count} studentIds ({test_student_percentage:.2f}%)")
print(f"Training set unique skills: {len(train_skills)}")
print(f"Test set unique skills: {len(test_skills)}")

#  16. Check the row-level proportions. The split is made per student, so
#  the row ratio can deviate from 20:80; a deviation of up to 5 percentage
#  points is reported as "approximately 2:8".

total_rows = len(merged_df)
train_row_percentage = (len(train_df) / total_rows) * 100
test_row_percentage = (len(test_df) / total_rows) * 100
print(f"New train dataset row proportion: {train_row_percentage:.2f}%")
print(f"New test dataset row proportion: {test_row_percentage:.2f}%")

if abs(test_row_percentage - 20) < 5 and abs(train_row_percentage - 80) < 5:
    print("The new split is approximately 2:8 (test:train) by rows.")
else:
    print(f"The new split does not closely match 2:8 by rows. Actual split: {test_row_percentage:.2f}:{train_row_percentage:.2f}")

#  17. Check that the new train and test frames have identical columns.

if list(train_df.columns) == list(test_df.columns):
    print("\nNew train and test datasets have identical columns.")
else:
    print("\nError: New train and test datasets have different columns.")
    exit()

#  18. Save the split datasets.

train_df.to_csv(train_output_path, index=False)
test_df.to_csv(test_output_path, index=False)
print(f"\nNew train dataset saved to: {train_output_path}")
print(f"New test dataset saved to: {test_output_path}")

#  19. Final verification: per-set row/student/skill counts, and a check that
#  no training row belongs to a student listed in test_students.

print("\nFinal verification:")
print(f"New train dataset rows: {len(train_df)}, unique studentIds: {train_df['studentId'].nunique()}, unique skills: {train_df['skill'].nunique()}")
print(f"New test dataset rows: {len(test_df)}, unique studentIds: {test_df['studentId'].nunique()}, unique skills: {test_df['skill'].nunique()}")
print(f"No duplicate studentIds confirmed: {len(train_df[train_df['studentId'].isin(test_students)]) == 0}")

Test dataset columns: ['studentId', 'MiddleSchoolId', 'InferredGender', 'SY ASSISTments Usage', 'AveKnow', 'AveCarelessness', 'AveCorrect', 'NumActions', 'AveResBored', 'AveResEngcon', 'AveResConf', 'AveResFrust', 'AveResOfftask', 'AveResGaming', 'action_num', 'skill', 'problemId', 'problemType', 'assignmentId', 'assistmentId', 'startTime', 'endTime', 'timeTaken', 'correct', 'original', 'hint', 'hintCount', 'hintTotal', 'scaffold', 'bottomHint', 'attemptCount', 'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest', 'stlHintUsed', 'past8BottomOut', 'totalFrPercentPastWrong', 'totalFrPastWrongCount', 'frPast5WrongCount', 'frPast8WrongCount', 'totalFrTimeOnSkill', 'timeSinceSkill', 'frWorkingInSchool', 'totalFrAttempted', 'totalFrSkillOpportunities', 'responseIsFillIn', 'responseIsChosen', 'endsWithScaffolding', 'endsWithAutoScaffolding', 'frTimeTakenOnScaffolding', 'frTotalSkillOpportunitiesScaffolding', 'totalFrSkillOpportunitiesByScaffolding', 'frIsHelpRequestScaffolding', 'tim

# hampton data generation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

#  File paths: the original train/test CSVs (inputs) and the merged and
#  re-split CSVs (outputs) written by this cell.

test_path = '/content/HamptonAlg_test.csv'
train_path = '/content/HamptonAlg_train.csv'
merged_output_path = '/content/HamptonAlg_merged.csv'
train_output_path = '/content/HamptonAlg_train_new.csv'
test_output_path = '/content/HamptonAlg_test_new.csv'

#  1. Load the datasets; stop if either input file is missing.

try:
    df_test = pd.read_csv(test_path)
    df_train = pd.read_csv(train_path)
except FileNotFoundError as e:
    print(f"Error: File not found - {e}")
    exit()

#  2. Check that both datasets have the same columns. If they differ, both
#  frames are restricted to the columns they have in common so that the
#  later concat does not introduce all-NaN columns.

print("Test dataset columns:", df_test.columns.tolist())
print("Train dataset columns:", df_train.columns.tolist())

if list(df_test.columns) != list(df_train.columns):
    print("Warning: Column names are not identical. Aligning to common columns.")
    common_columns = df_test.columns.intersection(df_train.columns).tolist()
    print("Common columns:", common_columns)
    df_test = df_test[common_columns]
    df_train = df_train[common_columns]
else:
    print("Column names are identical. Proceeding with merge.")

#  3. Report dataset sizes.

print(f"\nTest dataset rows: {len(df_test)}")
print(f"Train dataset rows: {len(df_train)}")

#  4. Check that the student and knowledge columns exist in both datasets;
#  every later step depends on them.

id_column = 'student'
skill_column = 'knowledge'
if id_column not in df_test.columns or id_column not in df_train.columns:
    print(f"Error: Column '{id_column}' not found in one or both datasets.")
    exit()
if skill_column not in df_test.columns or skill_column not in df_train.columns:
    print(f"Error: Column '{skill_column}' not found in one or both datasets.")
    exit()

#  5. Check whether any student appears in both the original train and
#  test sets. This is only reported; the merge proceeds either way.

test_students = set(df_test[id_column].unique())
train_students = set(df_train[id_column].unique())
intersection = test_students.intersection(train_students)
if len(intersection) > 0:
    print(f"\nWarning: Found {len(intersection)} overlapping students between test and train datasets.")
    print("Merging may result in duplicate records. Proceeding with merge.")
else:
    print(f"\nNo overlapping students found in original datasets.")

#  6. Merge the datasets (train rows first, then test rows, with a fresh index).

merged_df = pd.concat([df_train, df_test], ignore_index=True)
print(f"\nMerged dataset rows: {len(merged_df)}")
print(f"Merged dataset unique students: {merged_df['student'].nunique()}")
print(f"Merged dataset unique knowledge: {merged_df['knowledge'].nunique()}")
print("Merged dataset columns:", merged_df.columns.tolist())

#  7. Check for fully duplicated rows after merging. Duplicates are only
#  reported; they are NOT removed unless the lines below are re-enabled.

duplicates = merged_df.duplicated().sum()
if duplicates > 0:
    print(f"\nWarning: Found {duplicates} duplicate rows in the merged dataset.")
    #  Optional: drop the duplicates

    #  merged_df = merged_df.drop_duplicates()

    #  print(f"After removing duplicates, merged dataset rows: {len(merged_df)}")

else:
    print("\nNo duplicate rows found in the merged dataset.")

#  8. Handle missing values: drop rows that lack a student or a knowledge value.

merged_df = merged_df.dropna(subset=[id_column, skill_column])
print(f"\nAfter removing rows with missing student or knowledge, rows: {len(merged_df)}")

#  9. Save the merged dataset.

merged_df.to_csv(merged_output_path, index=False)
print(f"\nMerged dataset saved to: {merged_output_path}")

#  10. Split students 80/20 (train/test) while making sure the training set
#  covers every knowledge component. The split is done per student, so all
#  rows of one student end up in the same set.

unique_students = merged_df[id_column].unique()
all_knowledge = set(merged_df[skill_column].unique())
test_students = []

#  Students who worked on each knowledge component, computed with one groupby
#  instead of re-scanning every student's set once per component.

students_by_knowledge = merged_df.groupby(skill_column)[id_column].unique()

#  A knowledge component seen by at most 5 students is treated as rare: all
#  of its students are pinned to the training set so it cannot be lost.

rare_knowledge = set()
pinned_students = set()
for knowledge, students_with_knowledge in students_by_knowledge.items():
    if len(students_with_knowledge) <= 5:
        rare_knowledge.add(knowledge)
        pinned_students.update(students_with_knowledge)

train_students = list(pinned_students)
#  Keep the order of unique_students so the seeded split below is reproducible;
#  membership is tested against a set, not a list.
remaining_students = [s for s in unique_students if s not in pinned_students]

#  Randomly split the remaining students so that train reaches 80% of all
#  students (pinned students count towards the 80%).

train_size = int(0.8 * len(unique_students)) - len(train_students)
if train_size > 0 and remaining_students:
    train_remaining, test_remaining = train_test_split(
        remaining_students,
        train_size=train_size,
        random_state=42
    )
    train_students.extend(train_remaining)
    test_students = test_remaining
else:
    test_students = remaining_students

#  11. Make sure the training set covers every knowledge component. For each
#  uncovered component exactly ONE student is moved from test to train. The
#  other students with that component stay in the test set (previously they
#  were removed from test without being added to train, so their rows
#  vanished from both files).

train_df = merged_df[merged_df[id_column].isin(train_students)]
train_knowledge = set(train_df[skill_column].unique())
if train_knowledge != all_knowledge:
    print("\nWarning: Training set does not cover all knowledge. Adjusting...")
    train_student_set = set(train_students)
    missing_knowledge = all_knowledge - train_knowledge
    for knowledge in missing_knowledge:
        students_with_knowledge = students_by_knowledge.loc[knowledge]
        #  A student moved for an earlier component may already cover this one.
        if any(s in train_student_set for s in students_with_knowledge):
            continue
        moved_student = students_with_knowledge[0]
        train_students.append(moved_student)
        train_student_set.add(moved_student)
        test_students = [s for s in test_students if s != moved_student]
    train_df = merged_df[merged_df[id_column].isin(train_students)]
    train_knowledge = set(train_df[skill_column].unique())

#  12. Build the test set.

test_df = merged_df[merged_df[id_column].isin(test_students)]

#  13. Verify that no student appears in both the new train and test sets.

train_students_set = set(train_students)
test_students_set = set(test_students)
intersection = train_students_set.intersection(test_students_set)
if len(intersection) > 0:
    print(f"Error: Found {len(intersection)} overlapping students in new split.")
    exit()
else:
    print("\nNo overlapping students between new train and test datasets.")

#  14. Verify knowledge coverage: every knowledge value in the test set must
#  also occur in the training set.

test_knowledge = set(test_df['knowledge'].unique())
if not test_knowledge.issubset(train_knowledge):
    print("\nError: Test set contains knowledge not in training set.")
    exit()
else:
    print("\nAll knowledge in test set are present in training set.")

#  15. Report sizes and student-level proportions after the split.

total_students = len(unique_students)
train_student_count = len(train_students)
test_student_count = len(test_students)
train_student_percentage = (train_student_count / total_students) * 100
test_student_percentage = (test_student_count / total_students) * 100

print(f"\nNew train dataset: {len(train_df)} rows, {train_student_count} students ({train_student_percentage:.2f}%)")
print(f"New test dataset: {len(test_df)} rows, {test_student_count} students ({test_student_percentage:.2f}%)")
print(f"Training set unique knowledge: {len(train_knowledge)}")
print(f"Test set unique knowledge: {len(test_knowledge)}")

#  16. Check the row-level proportions. The split is made per student, so
#  the row ratio can deviate from 20:80; a deviation of up to 5 percentage
#  points is reported as "approximately 2:8".

total_rows = len(merged_df)
train_row_percentage = (len(train_df) / total_rows) * 100
test_row_percentage = (len(test_df) / total_rows) * 100
print(f"New train dataset row proportion: {train_row_percentage:.2f}%")
print(f"New test dataset row proportion: {test_row_percentage:.2f}%")

if abs(test_row_percentage - 20) < 5 and abs(train_row_percentage - 80) < 5:
    print("The new split is approximately 2:8 (test:train) by rows.")
else:
    print(f"The new split does not closely match 2:8 by rows. Actual split: {test_row_percentage:.2f}:{train_row_percentage:.2f}")

#  17. Check that the new train and test frames have identical columns.

if list(train_df.columns) == list(test_df.columns):
    print("\nNew train and test datasets have identical columns.")
else:
    print("\nError: New train and test datasets have different columns.")
    exit()

#  18. Save the split datasets.

train_df.to_csv(train_output_path, index=False)
test_df.to_csv(test_output_path, index=False)
print(f"\nNew train dataset saved to: {train_output_path}")
print(f"New test dataset saved to: {test_output_path}")

#  19. Final verification: per-set row/student/knowledge counts, and a check
#  that no training row belongs to a student listed in test_students.

print("\nFinal verification:")
print(f"New train dataset rows: {len(train_df)}, unique students: {train_df['student'].nunique()}, unique knowledge: {train_df['knowledge'].nunique()}")
print(f"New test dataset rows: {len(test_df)}, unique students: {test_df['student'].nunique()}, unique knowledge: {test_df['knowledge'].nunique()}")
print(f"No duplicate students confirmed: {len(train_df[train_df['student'].isin(test_students)]) == 0}")

Test dataset columns: ['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']
Train dataset columns: ['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']
Column names are identical. Proceeding with merge.

Test dataset rows: 41167
Train dataset rows: 199070

Merging may result in duplicate records. Proceeding with merge.

Merged dataset rows: 240237
Merged dataset unique students: 59
Merged dataset unique knowledge: 87
Merged dataset columns: ['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']

No duplicate rows found in the merged dataset.

After removing rows with mis

# hint abuse

In [3]:
import pandas as pd
import numpy as np
import os
from google.colab import files

#  Directory that receives the poisoned CSV files written by main();
#  main() creates it if it does not exist yet.

DATA_DIR = '/content/poisoned_datasets/'

#  Function to load dataset

def load_data(file_path):
    """Read the CSV at ``file_path`` into a DataFrame.

    The file is parsed in a single pass (``low_memory=False``) rather than in
    chunks, so each column gets one consistently inferred dtype.
    """
    return pd.read_csv(file_path, low_memory=False)

#  Function to simulate Hint Abuse Attack

def simulate_hint_abuse(df, poison_ratio, random_state=None):
    """Simulate a hint-abuse attack and return a poisoned copy of ``df``.

    A random set of students and a random set of problems are chosen; rows
    that match both are candidates. For every poisoned row, 3-5 rapid hint
    request rows (0.5-1.0 s each, ``hint=1``, ``correct=0``) are appended, and
    the original row is rewritten as a 0.5 s action that is either a correct
    answer or (50% chance) no answer at all. All touched and added rows get
    ``gaming_label = 1``; every other row gets ``gaming_label = 0``.

    Args:
        df: Input records. Must contain the columns ``studentId``,
            ``problemId``, ``correct``, ``timeTaken``, ``hint``, ``hintCount``,
            ``hintTotal`` and ``frIsHelpRequest``, and have a unique index.
            It is not modified.
        poison_ratio: Requested fraction of rows to poison (>= 0).
        random_state: Optional integer seed. When given, the call is
            reproducible and independent of NumPy's global random state; when
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        A new DataFrame: the (partly rewritten) original rows followed by the
        generated hint-request rows, with a fresh 0..n-1 index.

    Raises:
        ValueError: If a required column is missing or ``poison_ratio`` is
            negative.

    NOTE(review): the number of rows actually poisoned is capped by the
    student x problem candidate set, which is built from only
    ``sqrt(n_poison)`` students and problems. It can therefore be far below
    ``len(df) * poison_ratio``; check the printed counts.
    """
    required_columns = (
        'studentId', 'problemId', 'correct', 'timeTaken',
        'hint', 'hintCount', 'hintTotal', 'frIsHelpRequest',
    )
    #  Raise instead of assert: assertions are stripped under ``python -O``.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Column(s) not found: {missing_columns}")
    if poison_ratio < 0:
        raise ValueError(f"poison_ratio must be non-negative, got {poison_ratio}")

    #  The module-level functions and RandomState expose the same methods, so
    #  the unseeded path keeps drawing from the global stream exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    #  round() rather than int(): e.g. 0.29 * 100 == 28.999..., which int() truncates.
    percent = round(poison_ratio * 100)

    #  Work on a copy so the caller's frame is left untouched.
    df_poisoned = df.copy()
    #  Nullable integer label, 0 for every row until poisoned.
    df_poisoned['gaming_label'] = pd.Series(0, index=df_poisoned.index, dtype='Int64')

    #  Debug: Check initial unique values in gaming_label
    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    n_poison = int(len(df) * poison_ratio)  #  Requested number of records to poison
    unique_students = df['studentId'].unique()
    unique_problems = df['problemId'].unique()
    #  Pick the same number of students and problems (about sqrt(n_poison) each).
    n_select = min(int(np.sqrt(n_poison)), len(unique_students), len(unique_problems))

    #  Debug: Check selection counts
    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique problems: {len(unique_problems)}, selected: {n_select}")

    #  Handle case where no records can be poisoned
    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = rng.choice(unique_students, size=n_select, replace=False)
    selected_problems = rng.choice(unique_problems, size=n_select, replace=False)

    #  Candidate rows: a selected student working on a selected problem.
    mask = df['studentId'].isin(selected_students) & df['problemId'].isin(selected_problems)
    candidate_indices = df.index[mask].tolist()
    print(f"Number of candidate poison indices: {len(candidate_indices)}")

    if candidate_indices:
        poison_indices = rng.choice(
            candidate_indices,
            size=min(n_poison, len(candidate_indices)),
            replace=False,
        )
    else:
        #  Nothing to sample from; avoid calling choice() on an empty population.
        poison_indices = np.array([], dtype=np.int64)
    print(f"Number of final poison indices: {len(poison_indices)}")

    n_rows = len(poison_indices)
    if n_rows > 0:
        #  Draw the per-row random values in the same order as the original
        #  row-by-row implementation (hint count, intervals, answer coin flip)
        #  so a globally seeded run still yields the same values.
        hint_counts = np.empty(n_rows, dtype=np.int64)
        intervals = []
        no_answer = np.empty(n_rows, dtype=bool)
        for pos in range(n_rows):
            n_hints = rng.randint(3, 6)  #  3-5 hints
            hint_counts[pos] = n_hints
            intervals.append(rng.uniform(0.5, 1.0, n_hints))  #  0.5-1 second intervals
            no_answer[pos] = rng.random_sample() < 0.5  #  50% chance to submit no answer

        #  'correct' receives NaN and 'timeTaken' receives 0.5 below; make
        #  integer/bool columns float first instead of relying on implicit
        #  upcasting during assignment (deprecated in recent pandas).
        for col in ('correct', 'timeTaken'):
            col_dtype = df_poisoned[col].dtype
            if pd.api.types.is_integer_dtype(col_dtype) or pd.api.types.is_bool_dtype(col_dtype):
                df_poisoned[col] = df_poisoned[col].astype('float64')

        #  Snapshot of the rows before they are rewritten; the generated hint
        #  rows are copies of these.
        base = df_poisoned.loc[poison_indices]
        base_hint_count = base['hintCount'].fillna(0)  #  missing counts start at 0
        base_hint_total = base['hintTotal'].fillna(0)

        #  One copy of the base row per generated hint request.
        repeat_positions = np.repeat(np.arange(n_rows), hint_counts)
        new_rows_df = base.iloc[repeat_positions].reset_index(drop=True)
        new_rows_df['hint'] = 1
        #  NOTE(review): every generated row gets base + 1 (not a running
        #  count), matching the original behaviour - confirm this is intended.
        new_rows_df['hintCount'] = new_rows_df['hintCount'].fillna(0) + 1
        new_rows_df['hintTotal'] = new_rows_df['hintTotal'].fillna(0) + 1
        new_rows_df['frIsHelpRequest'] = 1
        new_rows_df['timeTaken'] = np.concatenate(intervals)
        new_rows_df['correct'] = 0  #  No answer yet
        new_rows_df['gaming_label'] = 1

        #  Rewrite the original rows: all hints consumed, then a fast final action.
        df_poisoned.loc[poison_indices, 'hintCount'] = (base_hint_count + hint_counts).to_numpy()
        df_poisoned.loc[poison_indices, 'hintTotal'] = (base_hint_total + hint_counts).to_numpy()
        df_poisoned.loc[poison_indices, 'timeTaken'] = 0.5
        df_poisoned.loc[poison_indices, 'gaming_label'] = 1
        #  No answer -> correct is NaN and the row is still a help request;
        #  otherwise a correct answer that is not a help request.
        df_poisoned.loc[poison_indices, 'correct'] = np.where(no_answer, np.nan, 1.0)
        df_poisoned.loc[poison_indices, 'frIsHelpRequest'] = np.where(no_answer, 1, 0)

        #  Align dtypes with the main frame so the concat does not change them.
        new_rows_df = new_rows_df.astype(df_poisoned.dtypes.to_dict())
        df_poisoned = pd.concat([df_poisoned, new_rows_df], ignore_index=True)

    #  Debug: Check unique values in gaming_label after poisoning
    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    #  Internal sanity check on our own output (not input validation).
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nHintCount distribution after {percent}% poisoning:")
    print(df_poisoned['hintCount'].value_counts().head())
    print(f"Gaming_label distribution after {percent}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts().head())

    return df_poisoned

#  Main execution

def main():
    """Load the re-split ASSISTments training set and write hint-abuse variants.

    For each poison ratio (5%, 25%, 50%) a poisoned copy of the data is
    generated with ``simulate_hint_abuse``, saved under ``DATA_DIR`` and
    offered for download through ``google.colab.files``.
    """
    #  Create directory for poisoned datasets
    os.makedirs(DATA_DIR, exist_ok=True)

    #  Load dataset
    assistment_file = '/content/Assistment_challenge_train_new.csv'
    df = load_data(assistment_file)

    #  Ensure relevant columns are numeric; unparseable values become NaN.
    for col in ['correct', 'timeTaken', 'hint', 'hintCount', 'hintTotal', 'frIsHelpRequest']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    #  Print dataset information. The header names the file that was actually
    #  loaded (it used to mention the original, un-split training file).
    print(f"Variable Names in {os.path.basename(assistment_file)}:")
    print(df.columns.tolist())
    print("\nUnique values in 'correct':", df['correct'].unique())
    print("Sample values in 'timeTaken':", df['timeTaken'].head().tolist())
    print("Unique values in 'hint':", df['hint'].unique())
    print("Unique values in 'frIsHelpRequest':", df['frIsHelpRequest'].unique())
    print("Number of records:", len(df))
    print("Number of unique students:", len(df['studentId'].unique()))
    print("Number of unique problems:", len(df['problemId'].unique()))

    #  Simulate Hint Abuse Attack for each poison ratio
    poison_ratios = [0.05, 0.25, 0.50]
    for ratio in poison_ratios:
        #  round() rather than int(): ratio * 100 can land just below an integer.
        percent = round(ratio * 100)
        print(f"\nSimulating Hint Abuse Attack with {percent}% Poisoning...")
        #  simulate_hint_abuse works on its own copy of df, so every ratio
        #  starts from the same clean data without an extra copy here.
        poisoned_df = simulate_hint_abuse(df, ratio)

        #  Save poisoned dataset
        output_file = os.path.join(DATA_DIR, f'Assistment_hint_abuse_{percent}.csv')
        poisoned_df.to_csv(output_file, index=False, chunksize=10000)
        print(f"Saved poisoned dataset: {output_file}")
        files.download(output_file)

if __name__ == "__main__":
    main()

Variable Names in Assistment_challenge_train.csv:
['studentId', 'MiddleSchoolId', 'InferredGender', 'SY ASSISTments Usage', 'AveKnow', 'AveCarelessness', 'AveCorrect', 'NumActions', 'AveResBored', 'AveResEngcon', 'AveResConf', 'AveResFrust', 'AveResOfftask', 'AveResGaming', 'action_num', 'skill', 'problemId', 'problemType', 'assignmentId', 'assistmentId', 'startTime', 'endTime', 'timeTaken', 'correct', 'original', 'hint', 'hintCount', 'hintTotal', 'scaffold', 'bottomHint', 'attemptCount', 'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest', 'stlHintUsed', 'past8BottomOut', 'totalFrPercentPastWrong', 'totalFrPastWrongCount', 'frPast5WrongCount', 'frPast8WrongCount', 'totalFrTimeOnSkill', 'timeSinceSkill', 'frWorkingInSchool', 'totalFrAttempted', 'totalFrSkillOpportunities', 'responseIsFillIn', 'responseIsChosen', 'endsWithScaffolding', 'endsWithAutoScaffolding', 'frTimeTakenOnScaffolding', 'frTotalSkillOpportunitiesScaffolding', 'totalFrSkillOpportunitiesByScaffolding', 'frIsH

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Hint Abuse Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 35022
Number of unique students: 1352, selected: 187
Number of unique problems: 1452, selected: 187
Number of candidate poison indices: 2467
Number of final poison indices: 2467
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

HintCount distribution after 25% poisoning:
hintCount
0    64174
1    32647
2    19346
3    17952
4     7831
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    137623
1     12355
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_hint_abuse_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Hint Abuse Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 70045
Number of unique students: 1352, selected: 264
Number of unique problems: 1452, selected: 264
Number of candidate poison indices: 4558
Number of final poison indices: 4558
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

HintCount distribution after 50% poisoning:
hintCount
0    63224
1    35961
2    20854
3    18928
4     9343
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    135532
1     22757
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_hint_abuse_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# random error

In [5]:
import pandas as pd
import numpy as np
import os

#  Directory that receives the poisoned CSV files written by main();
#  main() creates it if it does not exist yet.

DATA_DIR = '/content/poisoned_datasets/'

#  Function to load dataset

def load_data(file_path):
    """Load a CSV file and return it as a DataFrame.

    If the file has a ``knowledge`` column it is read as float and a short
    summary of it is printed. Files without that column load normally (the
    summary used to raise ``KeyError`` for them, although the rest of this
    cell treats ``knowledge`` as optional).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.
    """
    df = pd.read_csv(file_path, low_memory=False, dtype={'knowledge': float})
    if 'knowledge' in df.columns:
        print("Initial knowledge distribution:\n", df['knowledge'].describe())
        print("Unique values in knowledge:", df['knowledge'].unique())
    return df

#  Function to simulate DPA

def simulate_dpa(df, poison_ratio, random_state=None):
    """Simulate a random-error Data Poisoning Attack (DPA) on a copy of ``df``.

    ``int(len(df) * poison_ratio)`` rows are chosen at random. For each chosen
    row the 0/1 ``correct`` value is flipped (a missing value counts as 0), a
    missing ``timeTaken`` is filled with 1, and ``gaming_label`` is set to 1.
    Chosen rows with ``responseIsFillIn == 1`` get a random 0/1 ``correct``
    instead of the flipped value. Finally every remaining missing ``correct``
    in the whole frame is set to 0 and the column is cast to int.

    Args:
        df: Input records with the columns ``correct``, ``timeTaken``,
            ``responseIsChosen`` and ``responseIsFillIn`` and a unique index.
            It is not modified.
        poison_ratio: Fraction of rows to poison, between 0 and 1.
        random_state: Optional integer seed. When given, the call is
            reproducible and independent of NumPy's global random state; when
            ``None`` the global ``np.random`` state is used, as before.

    Returns:
        The poisoned copy, with an ``Int64`` ``gaming_label`` column.

    Raises:
        ValueError: If a required column is missing or ``poison_ratio`` is
            outside [0, 1].
    """
    #  Raise instead of assert: assertions are stripped under ``python -O``.
    for col in ['correct', 'timeTaken', 'responseIsChosen', 'responseIsFillIn']:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found")
    if not 0 <= poison_ratio <= 1:
        raise ValueError(f"poison_ratio must be between 0 and 1, got {poison_ratio}")

    #  The module-level functions and RandomState expose the same methods, so
    #  the unseeded path keeps drawing from the global stream exactly as before.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    #  Work on a copy so the caller's frame is left untouched.
    df_poisoned = df.copy()

    #  Initialize gaming_label (keep existing labels, missing ones become 0).
    if 'gaming_label' not in df_poisoned.columns:
        df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].fillna(0).astype('Int64')

    #  Number of rows to poison
    n_poison = int(len(df) * poison_ratio)
    if n_poison == 0:
        print("No rows selected for poisoning due to low poison ratio.")
        return df_poisoned
    poison_indices = rng.choice(df.index, size=n_poison, replace=False)

    #  Flip correctness (0 <-> 1); a missing value is treated as 0 and becomes 1.
    flipped = 1 - df_poisoned.loc[poison_indices, 'correct'].fillna(0)
    df_poisoned.loc[poison_indices, 'correct'] = flipped

    #  Fill missing timeTaken with 1 on poisoned rows; existing values are kept.
    df_poisoned.loc[poison_indices, 'timeTaken'] = (
        df_poisoned.loc[poison_indices, 'timeTaken'].fillna(1).astype(float)
    )

    #  For fill-in responses, replace the flipped value with a random 0/1.
    #  The mask is converted to a plain array so it indexes poison_indices
    #  by position rather than through a label-indexed Series.
    is_fill_in = (df_poisoned.loc[poison_indices, 'responseIsFillIn'] == 1).to_numpy()
    fill_in_indices = poison_indices[is_fill_in]
    if len(fill_in_indices) > 0:
        df_poisoned.loc[fill_in_indices, 'correct'] = rng.randint(0, 2, size=len(fill_in_indices))

    #  Mark poisoned rows
    df_poisoned.loc[poison_indices, 'gaming_label'] = 1

    #  Ensure all correct values are 0/1 integers. This also fills missing
    #  values on rows that were NOT poisoned.
    df_poisoned['correct'] = df_poisoned['correct'].fillna(0).astype(int)

    #  The knowledge column is optional; keep it float and report on it only
    #  when present (the report used to raise KeyError when it was absent).
    if 'knowledge' in df_poisoned.columns:
        df_poisoned['knowledge'] = df_poisoned['knowledge'].astype(float)
        print("Knowledge distribution after poisoning:\n", df_poisoned['knowledge'].describe())
        print("Unique values in knowledge after poisoning:", df_poisoned['knowledge'].unique())

    #  Debug info
    print(f"Unique values in 'correct' after poisoning: {df_poisoned['correct'].unique()}")
    print(f"Unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")
    print(f"Correct distribution:\n{df_poisoned['correct'].value_counts()}")
    print(f"Gaming_label distribution:\n{df_poisoned['gaming_label'].value_counts()}")
    return df_poisoned

#  Main execution

def main():
    """Poison the ASSISTments training split at 5%, 25% and 50% and save each result.

    Creates DATA_DIR, loads the training CSV, coerces the listed columns to
    numeric (unparsable cells become NaN), then runs simulate_dpa once per
    ratio on a fresh copy and writes one CSV per ratio with floats rendered
    to two decimals.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    df = load_data('/content/Assistment_challenge_train_new.csv')

    #  Coerce these columns to numbers before poisoning
    numeric_columns = ('correct', 'timeTaken', 'responseIsChosen', 'responseIsFillIn')
    for name in numeric_columns:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    if 'knowledge' in df.columns:
        knowledge = pd.to_numeric(df['knowledge'], errors='coerce')
        df['knowledge'] = knowledge.astype(float)

    #  One poisoned dataset per ratio, each starting from the clean frame
    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio * 100)
        print(f"\nSimulating {percent}% poisoning...")
        poisoned_df = simulate_dpa(df.copy(), ratio)
        output_file = f'{DATA_DIR}Assistment_poisoned_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False, float_format='%.2f')
        print(f"Saved: {output_file}")
        #  In Colab, files.download(output_file) can be called here to fetch the CSV


if __name__ == "__main__":
    main()

Initial knowledge distribution:
 count    140088.000000
mean          8.286142
std           8.564302
min           0.000000
25%           2.000000
50%           5.000000
75%          12.000000
max          33.000000
Name: knowledge, dtype: float64
Unique values in knowledge: [ 0.  1.  2.  3.  4.  5.  6. nan  7.  8.  9. 10. 11. 12. 13. 14. 21. 24.
 25. 26. 27. 28. 29. 30. 31. 32. 33.]

Simulating 5% poisoning...
Knowledge distribution after poisoning:
 count    140088.000000
mean          8.286142
std           8.564302
min           0.000000
25%           2.000000
50%           5.000000
75%          12.000000
max          33.000000
Name: knowledge, dtype: float64
Unique values in knowledge after poisoning: [ 0.  1.  2.  3.  4.  5.  6. nan  7.  8.  9. 10. 11. 12. 13. 14. 21. 24.
 25. 26. 27. 28. 29. 30. 31. 32. 33.]
Unique values in 'correct' after poisoning: [0 1]
Unique values in 'gaming_label': <IntegerArray>
[0, 1]
Length: 2, dtype: Int64
Correct distribution:
correct
0    88504
1 

In [4]:
import pandas as pd
import numpy as np
import os
from google.colab import files

#  Output directory for the poisoned datasets written by this cell. main()
#  creates it and builds file names by prefixing this string directly, so the
#  trailing slash is required.

DATA_DIR = '/content/poisoned_datasets/'

#  Function to load dataset

def load_data(file_path):
    """Read the CSV at ``file_path`` and hand back the resulting DataFrame."""
    return pd.read_csv(file_path)

#  Function to simulate DPA

def simulate_dpa(df, poison_ratio, correct_col='knowledge', time_col='time',
                 max_value=86, random_state=None):
    """Simulate a Data Poisoning Attack (DPA) on a copy of ``df``.

    A random ``poison_ratio`` share of the rows is selected without
    replacement. For each selected row the value in ``correct_col`` is
    replaced by ``max_value - value``, ``time_col`` is set to 1 and
    ``gaming_label`` is set to 1. All other rows keep their data and get
    ``gaming_label`` 0 (an existing ``gaming_label`` column is kept, with
    NaN filled by 0).

    Args:
        df: Source DataFrame; it is not modified.
        poison_ratio: Fraction of rows to poison, within [0, 1].
        correct_col: Column whose value is flipped around ``max_value``.
        time_col: Column overwritten with 1 on poisoned rows.
        max_value: Pivot used for the flip. Defaults to 86, the value that
            was previously hard-coded.
        random_state: Optional seed for a private ``RandomState``. When None
            the global ``np.random`` state is used, as before.

    Returns:
        A new DataFrame with the poisoned rows and a ``gaming_label`` column
        of dtype ``Int64``.

    Raises:
        ValueError: If ``poison_ratio`` is outside [0, 1].
        AssertionError: If a required column is missing or ``gaming_label``
            holds values other than 0/1.
    """
    #  Fail early with a readable message instead of an obscure sampling error
    if not 0 <= poison_ratio <= 1:
        raise ValueError(f"poison_ratio must be within [0, 1], got {poison_ratio!r}")

    #  Work on a copy so the caller's frame stays untouched
    df_poisoned = df.copy()

    #  gaming_label is kept as nullable integer; existing labels are preserved
    if 'gaming_label' in df_poisoned.columns:
        df_poisoned['gaming_label'] = df_poisoned['gaming_label'].fillna(0).astype('Int64')
    else:
        df_poisoned['gaming_label'] = 0
        df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    #  Assert key columns exist
    assert correct_col in df_poisoned.columns, f"Column '{correct_col}' not found"
    assert time_col in df_poisoned.columns, f"Column '{time_col}' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    n_poison = int(len(df_poisoned) * poison_ratio)
    print(f"Number of records to poison: {n_poison}")

    if n_poison == 0:
        print("Warning: No records selected for poisoning due to low poison ratio.")
        return df_poisoned

    #  Sample row positions rather than index labels: with repeated labels a
    #  label-based .loc write would touch more rows than were drawn.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    poison_indices = rng.choice(len(df_poisoned), size=n_poison, replace=False)
    print(f"Number of final poison indices: {len(poison_indices)}")

    correct_pos = df_poisoned.columns.get_loc(correct_col)
    time_pos = df_poisoned.columns.get_loc(time_col)
    label_pos = df_poisoned.columns.get_loc('gaming_label')

    #  Flip the score around max_value; to_numpy() keeps the write positional
    flipped = max_value - df_poisoned.iloc[poison_indices, correct_pos].to_numpy()
    df_poisoned.iloc[poison_indices, correct_pos] = flipped

    #  Poisoned rows get a response time of 1 and are flagged
    df_poisoned.iloc[poison_indices, time_pos] = 1
    df_poisoned.iloc[poison_indices, label_pos] = 1

    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    #  Sanity check on the label column, then a short summary
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\n{correct_col} distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned[correct_col].value_counts().head())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts())

    return df_poisoned

#  Main execution

def main():
    """Load the Hampton training split, report on it, and write DPA-poisoned copies.

    The 'knowledge' and 'time' columns are coerced to numeric, a short
    summary of the frame is printed, and simulate_dpa is run at 5%, 25% and
    50%. Each result is saved under DATA_DIR and handed to Colab's download
    helper.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    df = load_data('/content/HamptonAlg_train_new.csv')

    #  Unparsable cells become NaN
    for name in ('knowledge', 'time'):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    #  Optional columns are reported with a placeholder when absent
    if 'student' in df.columns:
        student_count = len(df['student'].unique())
    else:
        student_count = "No 'student' column"
    if 'actionid' in df.columns:
        action_count = len(df['actionid'].unique())
    else:
        action_count = "No 'actionid' column"

    print("Variable Names in HamptonAlg_train.csv:")
    print(df.columns.tolist())
    print("\nCheck if gaming_label exists:", 'gaming_label' in df.columns)
    print("\nUnique values in 'knowledge':", df['knowledge'].unique())
    print("Sample values in 'time':", df['time'].head().tolist())
    print("Number of records:", len(df))
    print("Number of unique students:", student_count)
    print("Number of unique actions:", action_count)

    #  Every ratio starts again from the clean frame
    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio * 100)
        print(f"\nSimulating DPA with {percent}% Poisoning...")
        poisoned_df = simulate_dpa(df.copy(), ratio)

        output_file = f'{DATA_DIR}HamptonAlg_poisoned_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False)
        print(f"Saved poisoned dataset: {output_file}")
        #  Pass the saved CSV to Colab's download helper
        files.download(output_file)


if __name__ == "__main__":
    main()

Variable Names in HamptonAlg_train.csv:
['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']

Check if gaming_label exists: False

Unique values in 'knowledge': [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86]
Sample values in 'time': [174, 15, 13, 10, 6]
Number of records: 195948
Number of unique students: 47
Number of unique actions: 195948

Simulating DPA with 5% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 9797
Number of final poison indices: 9797
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

knowledg

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating DPA with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 48987
Number of final poison indices: 48987
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

knowledge distribution after 25% poisoning:
knowledge
22    15435
34    11102
3     10228
2      8399
20     7304
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    146961
1     48987
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_poisoned_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating DPA with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 97974
Number of final poison indices: 97974
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[1, 0]
Length: 2, dtype: Int64

knowledge distribution after 50% poisoning:
knowledge
64    10494
22    10389
52     8227
34     8087
3      6998
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
1    97974
0    97974
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_poisoned_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Sequential pattern attack

In [6]:
import pandas as pd
import numpy as np
import os
from google.colab import files

#  Output directory for the poisoned datasets written by this cell. main()
#  creates it and builds file names by prefixing this string directly, so the
#  trailing slash is required.

DATA_DIR = '/content/poisoned_datasets/'

#  Function to load dataset

def load_data(file_path):
    """Parse the CSV found at ``file_path`` and return it as a DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

#  Function to simulate Sequential Pattern Attack

def simulate_sequential_pattern_attack(df, poison_ratio):
    """Simulate a Sequential Pattern Attack on a copy of ``df``.

    Every poisoned record is removed and replaced by three scripted
    submissions that copy its other columns: answers 0, 1, 2 with knowledge
    0, 0, 86, a response time drawn from U(1, 2) and ``gaming_label`` 1.
    Appended rows are cast back to the dtypes of the existing columns, so an
    integer ``time`` column truncates the drawn times.

    Candidate records are those whose ``student`` AND ``actionid`` both fall
    in random selections of size
    ``min(int(sqrt(n_poison)), #students, #actions)``, where
    ``n_poison = int(len(df) * poison_ratio // 3)``.

    NOTE(review): when ``actionid`` is unique per row, this intersection can
    never yield more than ``n_select`` candidates, so the realised poisoning
    rate can be far below ``poison_ratio``. Confirm that this is intended.

    Args:
        df: Source DataFrame with at least ``student`` and ``actionid``
            columns; it is not modified. ``answer``, ``knowledge`` and
            ``time`` are overwritten on the synthetic rows when present.
        poison_ratio: Requested fraction of records to poison.

    Returns:
        A new DataFrame with a fresh 0..n-1 index when rows were poisoned,
        plus a ``gaming_label`` column of dtype ``Int64``.

    Raises:
        AssertionError: If a required column is missing.
    """
    #  Work on a copy so the caller's frame stays untouched
    df_poisoned = df.copy()

    #  Any pre-existing gaming_label is reset to 0 (nullable integer dtype)
    df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    #  Assert key columns exist
    assert 'student' in df_poisoned.columns, "Column 'student' not found"
    assert 'actionid' in df_poisoned.columns, "Column 'actionid' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    #  Each poisoned record becomes 3 rows, hence the division by 3
    n_poison = int(len(df) * poison_ratio // 3)

    unique_students = df['student'].unique()
    unique_actions = df['actionid'].unique()
    n_select = min(int(np.sqrt(n_poison)), len(unique_students), len(unique_actions))

    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique actions: {len(unique_actions)}, selected: {n_select}")

    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = np.random.choice(unique_students, size=n_select, replace=False)
    selected_actions = np.random.choice(unique_actions, size=n_select, replace=False)

    #  Candidates must match both the selected students and the selected actions
    candidate_mask = df['student'].isin(selected_students) & df['actionid'].isin(selected_actions)
    candidate_labels = df.index[candidate_mask].tolist()
    print(f"Number of candidate poison indices: {len(candidate_labels)}")

    poison_indices = np.random.choice(
        candidate_labels, size=min(n_poison, len(candidate_labels)), replace=False
    )
    print(f"Number of final poison indices: {len(poison_indices)}")

    if len(poison_indices) > 0:
        #  poison_indices are index LABELS, so they must be resolved with .loc.
        #  Looking them up before anything is dropped keeps every lookup on
        #  the intended record.
        originals = df_poisoned.loc[poison_indices]
        n_hit = len(originals)

        #  Three consecutive copies of each poisoned record
        new_rows_df = originals.iloc[np.repeat(np.arange(n_hit), 3)].reset_index(drop=True)
        new_rows_df['gaming_label'] = 1

        #  Scripted sequence per record: A (wrong), B (wrong), C (correct)
        scripted = {
            'answer': np.tile([0, 1, 2], n_hit),
            'knowledge': np.tile([0, 0, 86], n_hit),
            'time': np.random.uniform(1, 2, size=3 * n_hit),
        }
        for col, values in scripted.items():
            #  Columns the frame does not have are left out, not created
            if col in new_rows_df.columns:
                new_rows_df[col] = values

        #  Keep the dtypes of the existing columns
        new_rows_df = new_rows_df.astype(df_poisoned.dtypes.to_dict())

        #  Drop all poisoned originals in one pass, then append their replacements
        df_poisoned = pd.concat(
            [df_poisoned.drop(index=poison_indices), new_rows_df], ignore_index=True
        )

    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    #  Sanity check on the label column, then a short summary
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nKnowledge distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['knowledge'].value_counts())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts())

    return df_poisoned

#  Main execution

def main():
    """Load the Hampton training split and write sequential-pattern-poisoned copies.

    The 'knowledge', 'time' and 'answer' columns are coerced to numeric, a
    short summary of the frame is printed, and
    simulate_sequential_pattern_attack is run at 5%, 25% and 50%. Each result
    is saved under DATA_DIR and handed to Colab's download helper.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    df = load_data('/content/HamptonAlg_train_new.csv')

    #  Unparsable cells become NaN
    for name in ('knowledge', 'time', 'answer'):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    print("Variable Names in HamptonAlg_train.csv:")
    print(df.columns.tolist())
    print("\nUnique values in 'knowledge':", df['knowledge'].unique())
    for name in ('time', 'answer'):
        print(f"Sample values in '{name}':", df[name].head().tolist())
    print("Number of records:", len(df))
    print("Number of unique students:", len(df['student'].unique()))
    print("Number of unique actions:", len(df['actionid'].unique()))

    #  Every ratio starts again from the clean frame
    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio * 100)
        print(f"\nSimulating Sequential Pattern Attack with {percent}% Poisoning...")
        poisoned_df = simulate_sequential_pattern_attack(df.copy(), ratio)

        output_file = f'{DATA_DIR}HamptonAlg_sequential_pattern_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False)
        print(f"Saved poisoned dataset: {output_file}")
        #  Pass the saved CSV to Colab's download helper
        files.download(output_file)


if __name__ == "__main__":
    main()

Variable Names in HamptonAlg_train.csv:
['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']

Unique values in 'knowledge': [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86]
Sample values in 'time': [174, 15, 13, 10, 6]
Sample values in 'answer': [nan, nan, nan, nan, nan]
Number of records: 195948
Number of unique students: 47
Number of unique actions: 195948

Simulating Sequential Pattern Attack with 5% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 3265
Number of unique students: 47, selected: 47
Number of unique actions: 195948, selected: 47
Number of ca

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 16329
Number of unique students: 47, selected: 47
Number of unique actions: 195948, selected: 47
Number of candidate poison indices: 47
Number of final poison indices: 47
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Knowledge distribution after 25% poisoning:
knowledge
22    20240
34    14025
3     13401
2     11269
20     8967
      ...  
29       44
74       43
50       39
18       31
51       28
Name: count, Length: 87, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    195901
1       141
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_sequential_pattern_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 32658
Number of unique students: 47, selected: 47
Number of unique actions: 195948, selected: 47
Number of candidate poison indices: 47
Number of final poison indices: 47
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Knowledge distribution after 50% poisoning:
knowledge
22    20239
34    14029
3     13402
2     11271
20     8964
      ...  
29       44
74       43
50       39
18       31
51       28
Name: count, Length: 87, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    195901
1       141
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_sequential_pattern_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [8]:
import pandas as pd
import numpy as np
import os
from google.colab import files

#  Output directory for the poisoned datasets written by this cell. main()
#  creates it and builds file names by prefixing this string directly, so the
#  trailing slash is required.

DATA_DIR = '/content/poisoned_datasets/'

#  Function to load dataset

def load_data(file_path):
    """Read the CSV at ``file_path`` in a single pass (``low_memory=False``) and return it."""
    return pd.read_csv(file_path, low_memory=False)

#  Function to simulate Sequential Pattern Attack

def simulate_sequential_pattern_attack(df, poison_ratio):
    """Simulate a Sequential Pattern Attack on a copy of ``df``.

    For every poisoned record two extra rows are appended that copy the
    record but carry ``correct`` 0, and the record itself is rewritten with
    ``correct`` 1. All three rows get a ``timeTaken`` drawn from U(1, 2) and
    ``gaming_label`` 1.

    Candidate records are those whose ``studentId`` AND ``problemId`` both
    fall in random selections of size
    ``min(int(0.1 * #students), 100, #students, #problems)``. At most
    ``int(len(df) * poison_ratio)`` of them are poisoned.

    Args:
        df: Source DataFrame with ``studentId``, ``problemId``, ``correct``
            and ``timeTaken`` columns; it is not modified. Index labels are
            assumed to be unique.
        poison_ratio: Requested fraction of records to poison.

    Returns:
        A new DataFrame with a ``gaming_label`` column of dtype ``Int64``.
        When rows were poisoned the index is reset and the synthetic rows
        sit at the end.

    Raises:
        AssertionError: If a required column is missing.
    """
    #  Work on a copy so the caller's frame stays untouched
    df_poisoned = df.copy()

    #  Any pre-existing gaming_label is reset to 0 (nullable integer dtype)
    df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    #  Assert key columns exist
    assert 'studentId' in df_poisoned.columns, "Column 'studentId' not found"
    assert 'problemId' in df_poisoned.columns, "Column 'problemId' not found"
    assert 'correct' in df_poisoned.columns, "Column 'correct' not found"
    assert 'timeTaken' in df_poisoned.columns, "Column 'timeTaken' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    #  Upper bound on the number of records to poison
    n_poison = int(len(df) * poison_ratio)

    unique_students = df['studentId'].unique()
    unique_problems = df['problemId'].unique()
    #  Limit the selection to 10% of the students, and never more than 100
    n_select = min(int(0.1 * len(unique_students)), 100, len(unique_students), len(unique_problems))

    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique problems: {len(unique_problems)}, selected: {n_select}")

    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = np.random.choice(unique_students, size=n_select, replace=False)
    selected_problems = np.random.choice(unique_problems, size=n_select, replace=False)

    #  Candidates must match both the selected students and the selected problems
    mask = (df['studentId'].isin(selected_students)) & (df['problemId'].isin(selected_problems))
    candidate_labels = df.index[mask].tolist()
    print(f"Number of candidate poison indices: {len(candidate_labels)}")

    poison_indices = np.random.choice(
        candidate_labels, size=min(n_poison, len(candidate_labels)), replace=False
    )
    print(f"Number of final poison indices: {len(poison_indices)}")

    if len(poison_indices) > 0:
        n_hit = len(poison_indices)

        #  One row of draws per poisoned record: columns 0-1 time the two
        #  added wrong attempts, column 2 the rewritten original. This is the
        #  same draw order a record-by-record loop would produce.
        times = np.random.uniform(1, 2, size=(n_hit, 3))

        #  Snapshot the records before they are rewritten, two copies each
        base_rows = df_poisoned.loc[poison_indices]
        new_rows_df = base_rows.iloc[np.repeat(np.arange(n_hit), 2)].reset_index(drop=True)
        new_rows_df['gaming_label'] = 1
        new_rows_df['correct'] = 0  #  Wrong answer (A or B)
        new_rows_df['timeTaken'] = times[:, :2].ravel()

        #  The original record becomes the third, correct submission
        df_poisoned.loc[poison_indices, 'correct'] = 1
        df_poisoned.loc[poison_indices, 'timeTaken'] = times[:, 2]
        df_poisoned.loc[poison_indices, 'gaming_label'] = 1

        #  Match the dtypes of the (already updated) frame before appending
        new_rows_df = new_rows_df.astype(df_poisoned.dtypes.to_dict())
        df_poisoned = pd.concat([df_poisoned, new_rows_df], ignore_index=True)

    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    #  Sanity check on the label column, then a short summary
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nCorrect distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['correct'].value_counts().head())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts().head())

    return df_poisoned

#  Main execution

def main():
    """Load the ASSISTments training split and write sequential-pattern-poisoned copies.

    The 'correct' and 'timeTaken' columns are coerced to numeric, a short
    summary of the frame is printed, and simulate_sequential_pattern_attack
    is run at 5%, 25% and 50%. Each result is saved under DATA_DIR (written
    in chunks of 10000 rows) and handed to Colab's download helper.
    """
    os.makedirs(DATA_DIR, exist_ok=True)

    #  Relative path: resolved against the current working directory
    df = load_data('Assistment_challenge_train_new.csv')

    #  Unparsable cells become NaN
    for name in ('correct', 'timeTaken'):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    student_count = len(df['studentId'].unique())
    problem_count = len(df['problemId'].unique())

    print(f"Loaded dataset with {len(df)} records")
    print("Variable Names in Assistment_challenge_train.csv:")
    print(df.columns.tolist())
    print("\nUnique values in 'correct':", df['correct'].unique())
    print("Sample values in 'timeTaken':", df['timeTaken'].head().tolist())
    print("Number of unique students:", student_count)
    print("Number of unique problems:", problem_count)

    #  Every ratio starts again from the clean frame
    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio * 100)
        print(f"\nSimulating Sequential Pattern Attack with {percent}% Poisoning...")
        poisoned_df = simulate_sequential_pattern_attack(df.copy(), ratio)

        output_file = f'{DATA_DIR}Assistment_sequential_pattern_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False, chunksize=10000)
        print(f"Saved poisoned dataset: {output_file}")
        #  Pass the saved CSV to Colab's download helper
        files.download(output_file)


if __name__ == "__main__":
    main()

Loaded dataset with 140090 records
Variable Names in Assistment_challenge_train.csv:
['studentId', 'MiddleSchoolId', 'InferredGender', 'SY ASSISTments Usage', 'AveKnow', 'AveCarelessness', 'AveCorrect', 'NumActions', 'AveResBored', 'AveResEngcon', 'AveResConf', 'AveResFrust', 'AveResOfftask', 'AveResGaming', 'action_num', 'skill', 'problemId', 'problemType', 'assignmentId', 'assistmentId', 'startTime', 'endTime', 'timeTaken', 'correct', 'original', 'hint', 'hintCount', 'hintTotal', 'scaffold', 'bottomHint', 'attemptCount', 'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest', 'stlHintUsed', 'past8BottomOut', 'totalFrPercentPastWrong', 'totalFrPastWrongCount', 'frPast5WrongCount', 'frPast8WrongCount', 'totalFrTimeOnSkill', 'timeSinceSkill', 'frWorkingInSchool', 'totalFrAttempted', 'totalFrSkillOpportunities', 'responseIsFillIn', 'responseIsChosen', 'endsWithScaffolding', 'endsWithAutoScaffolding', 'frTimeTakenOnScaffolding', 'frTotalSkillOpportunitiesScaffolding', 'totalFrSkill

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 35022
Number of unique students: 1352, selected: 100
Number of unique problems: 1452, selected: 100
Number of candidate poison indices: 653
Number of final poison indices: 653
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Correct distribution after 25% poisoning:
correct
0    91329
1    50067
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    139437
1      1959
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_sequential_pattern_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 70045
Number of unique students: 1352, selected: 100
Number of unique problems: 1452, selected: 100
Number of candidate poison indices: 935
Number of final poison indices: 935
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Correct distribution after 50% poisoning:
correct
0    91671
1    50289
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    139155
1      2805
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_sequential_pattern_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>