# hint abuse

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files

# Define global data directory for unified path management
DATA_DIR = '/content/poisoned_datasets/'

# Function to load dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and hand back the parsed DataFrame.

    ``low_memory=False`` is passed through to ``pandas.read_csv`` so the file
    is not processed in internal chunks while column types are inferred.
    """
    return pd.read_csv(file_path, low_memory=False)

# Function to simulate Hint Abuse Attack
def simulate_hint_abuse(df, poison_ratio):
    """Simulate a Hint Abuse Attack and return a labelled, poisoned copy of ``df``.

    A random set of students and a random set of problems (same size, roughly
    ``sqrt`` of the target count) are drawn; the records belonging to both sets
    form the candidate pool, from which at most ``int(len(df) * poison_ratio)``
    records are poisoned.  Each poisoned record becomes a burst of activity:

    * 3-5 hint-request rows are appended (``hint`` = 1, ``frIsHelpRequest`` = 1,
      ``correct`` = 0, ``timeTaken`` drawn uniformly from 0.5-1.0).  Their
      ``hintCount`` / ``hintTotal`` count up one request at a time from the
      record's original values (missing values count as 0), so the last
      request carries the same totals as the final submission.
    * The original row becomes the final submission: its hint counters grow
      by the burst size, ``timeTaken`` becomes 0.5 and, with equal
      probability, it is either left unanswered (``correct`` = NaN,
      ``frIsHelpRequest`` = 1) or answered correctly (``correct`` = 1,
      ``frIsHelpRequest`` = 0).

    Rows are addressed by position, so the function does not depend on ``df``
    having a unique or default index.  Randomness comes from the global NumPy
    generator (``np.random``); seed it beforehand for reproducible output.

    Args:
        df: Interaction log with the columns ``studentId``, ``problemId``,
            ``correct``, ``timeTaken``, ``hint``, ``hintCount``, ``hintTotal``
            and ``frIsHelpRequest``.  It is not modified.
        poison_ratio: Fraction of ``len(df)`` that should be poisoned.  The
            real number is capped by the size of the candidate pool.

    Returns:
        A new DataFrame with a nullable-integer ``gaming_label`` column
        (1 for every modified or injected row, 0 otherwise).  When rows are
        injected they are appended at the end and the index is reset.

    Raises:
        AssertionError: If a required column is missing or ``gaming_label``
            ends up with values other than 0/1.
    """
    df_poisoned = df.copy()

    # Every record starts unlabelled; nullable Int64 keeps the flag integral.
    df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    required_columns = (
        'studentId', 'problemId', 'correct', 'timeTaken',
        'hint', 'hintCount', 'hintTotal', 'frIsHelpRequest',
    )
    for column in required_columns:
        assert column in df_poisoned.columns, f"Column '{column}' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    # Debug: Check initial unique values in gaming_label
    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    # Randomly select students and problems
    n_poison = int(len(df) * poison_ratio)  # Total number of records to poison
    unique_students = df['studentId'].unique()
    unique_problems = df['problemId'].unique()
    n_select = min(int(np.sqrt(n_poison)), len(unique_students), len(unique_problems))  # Balance selection

    # Debug: Check selection counts
    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique problems: {len(unique_problems)}, selected: {n_select}")

    # Handle case where no records can be poisoned
    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = np.random.choice(unique_students, size=n_select, replace=False)
    selected_problems = np.random.choice(unique_problems, size=n_select, replace=False)

    # Candidate pool: positions of records of a selected student on a selected problem.
    in_pool = (
        df['studentId'].isin(selected_students) & df['problemId'].isin(selected_problems)
    ).to_numpy()
    candidates = np.flatnonzero(in_pool)
    print(f"Number of candidate poison indices: {len(candidates)}")

    # Sample without replacement; sorted so per-row arrays line up with the
    # boolean row mask used for the in-place updates below.
    n_take = min(n_poison, len(candidates))
    poison_pos = np.sort(candidates[np.random.permutation(len(candidates))[:n_take]])
    print(f"Number of final poison indices: {len(poison_pos)}")

    if n_take > 0:
        row_mask = np.zeros(len(df_poisoned), dtype=bool)
        row_mask[poison_pos] = True

        # timeTaken receives fractional seconds and correct receives NaN, so
        # integer columns are widened explicitly instead of relying on
        # pandas' deprecated implicit upcast on assignment.
        for column in ('timeTaken', 'correct'):
            if pd.api.types.is_integer_dtype(df_poisoned[column].dtype):
                df_poisoned[column] = df_poisoned[column].astype('float64')

        hint_counts = np.random.randint(3, 6, size=n_take)  # 3-5 hints per burst
        # 1, 2, ..., k for every burst: how many hints were requested so far.
        step = np.concatenate([np.arange(1, k + 1) for k in hint_counts])

        # Injected hint requests start as copies of the untouched base rows.
        new_rows = df_poisoned.iloc[np.repeat(poison_pos, hint_counts)].reset_index(drop=True)
        new_rows['hint'] = 1
        new_rows['hintCount'] = new_rows['hintCount'].fillna(0) + step
        new_rows['hintTotal'] = new_rows['hintTotal'].fillna(0) + step
        new_rows['frIsHelpRequest'] = 1
        new_rows['timeTaken'] = np.random.uniform(0.5, 1.0, size=len(new_rows))  # 0.5-1 second intervals
        new_rows['correct'] = 0  # No answer yet
        new_rows['gaming_label'] = 1

        # Original rows become the final submission of each burst.
        base_hint_count = df_poisoned.loc[row_mask, 'hintCount'].fillna(0)
        base_hint_total = df_poisoned.loc[row_mask, 'hintTotal'].fillna(0)
        df_poisoned.loc[row_mask, 'hintCount'] = (base_hint_count + hint_counts).to_numpy()
        df_poisoned.loc[row_mask, 'hintTotal'] = (base_hint_total + hint_counts).to_numpy()
        df_poisoned.loc[row_mask, 'timeTaken'] = 0.5
        df_poisoned.loc[row_mask, 'gaming_label'] = 1
        # 50% chance to submit no answer (still a help request), otherwise a
        # correct answer that is not a help request.
        no_answer = np.random.random(n_take) < 0.5
        df_poisoned.loc[row_mask, 'correct'] = np.where(no_answer, np.nan, 1.0)
        df_poisoned.loc[row_mask, 'frIsHelpRequest'] = np.where(no_answer, 1, 0)

        # Ensure data types of the rewritten columns match before appending.
        rewritten = ('hint', 'hintCount', 'hintTotal', 'frIsHelpRequest',
                     'timeTaken', 'correct', 'gaming_label')
        for column in rewritten:
            new_rows[column] = new_rows[column].astype(df_poisoned[column].dtype)
        df_poisoned = pd.concat([df_poisoned, new_rows], ignore_index=True)

    # Debug: Check unique values in gaming_label after poisoning
    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    # Verify changes with assertions
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nHintCount distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['hintCount'].value_counts().head())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts().head())

    return df_poisoned

# Main execution
def main():
    """Run the hint-abuse simulation on the ASSISTments training set.

    The CSV is loaded, the columns the attack rewrites are coerced to numeric
    (unparseable values become NaN), a short summary is printed, and one
    poisoned CSV per poison ratio (5%, 25%, 50%) is written to ``DATA_DIR``
    and offered for download through ``files.download``.
    """
    # Output directory for the poisoned datasets
    os.makedirs(DATA_DIR, exist_ok=True)

    df = load_data('/content/Assistment_challenge_train.csv')

    # Columns the attack reads or rewrites must be numeric
    numeric_columns = ('correct', 'timeTaken', 'hint', 'hintCount', 'hintTotal', 'frIsHelpRequest')
    for name in numeric_columns:
        df[name] = pd.to_numeric(df[name], errors='coerce')

    # Dataset summary
    print("Variable Names in Assistment_challenge_train.csv:")
    print(df.columns.tolist())
    print("\nUnique values in 'correct':", df['correct'].unique())
    print("Sample values in 'timeTaken':", df['timeTaken'].head().tolist())
    print("Unique values in 'hint':", df['hint'].unique())
    print("Unique values in 'frIsHelpRequest':", df['frIsHelpRequest'].unique())
    print("Number of records:", len(df))
    print("Number of unique students:", len(df['studentId'].unique()))
    print("Number of unique problems:", len(df['problemId'].unique()))

    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio*100)
        print(f"\nSimulating Hint Abuse Attack with {percent}% Poisoning...")
        # Each ratio starts from its own copy of the clean data
        poisoned_df = simulate_hint_abuse(df.copy(), ratio)

        output_file = f'{DATA_DIR}Assistment_hint_abuse_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False, chunksize=10000)
        print(f"Saved poisoned dataset: {output_file}")
        files.download(output_file)

if __name__ == "__main__":
    main()

Variable Names in Assistment_challenge_train.csv:
['studentId', 'MiddleSchoolId', 'InferredGender', 'SY ASSISTments Usage', 'AveKnow', 'AveCarelessness', 'AveCorrect', 'NumActions', 'AveResBored', 'AveResEngcon', 'AveResConf', 'AveResFrust', 'AveResOfftask', 'AveResGaming', 'action_num', 'skill', 'problemId', 'problemType', 'assignmentId', 'assistmentId', 'startTime', 'endTime', 'timeTaken', 'correct', 'original', 'hint', 'hintCount', 'hintTotal', 'scaffold', 'bottomHint', 'attemptCount', 'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest', 'stlHintUsed', 'past8BottomOut', 'totalFrPercentPastWrong', 'totalFrPastWrongCount', 'frPast5WrongCount', 'frPast8WrongCount', 'totalFrTimeOnSkill', 'timeSinceSkill', 'frWorkingInSchool', 'totalFrAttempted', 'totalFrSkillOpportunities', 'responseIsFillIn', 'responseIsChosen', 'endsWithScaffolding', 'endsWithAutoScaffolding', 'frTimeTakenOnScaffolding', 'frTotalSkillOpportunitiesScaffolding', 'totalFrSkillOpportunitiesByScaffolding', 'frIsH

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Hint Abuse Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 27410
Number of unique students: 1634, selected: 165
Number of unique problems: 365, selected: 165
Number of candidate poison indices: 5690
Number of final poison indices: 5690
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

HintCount distribution after 25% poisoning:
hintCount
0.0    47589
1.0    31953
2.0    18584
3.0    17173
4.0     7831
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    103950
1     28419
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_hint_abuse_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Hint Abuse Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 54820
Number of unique students: 1634, selected: 234
Number of unique problems: 365, selected: 234
Number of candidate poison indices: 10002
Number of final poison indices: 10002
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

HintCount distribution after 50% poisoning:
hintCount
0.0    45464
1.0    39762
2.0    21422
3.0    18685
4.0    11165
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    99638
1    50158
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_hint_abuse_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# sequence

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files

# Define global data directory for unified path management
DATA_DIR = '/content/poisoned_datasets/'

# Function to load dataset
def load_data(file_path):
    """Parse the CSV found at ``file_path`` with pandas' default settings.

    Returns the resulting DataFrame.
    """
    return pd.read_csv(file_path)

# Function to simulate Sequential Pattern Attack
def simulate_sequential_pattern_attack(df, poison_ratio):
    """Simulate a Sequential Pattern Attack and return a labelled, poisoned copy.

    Each poisoned record is removed and replaced by three rapid submissions
    that copy the record and override:

    * ``answer``    -> 0, 1, 2 (two wrong guesses followed by the right one)
    * ``knowledge`` -> 0, 0, 86
    * ``time``      -> a value drawn uniformly from 1-2
    * ``gaming_label`` -> 1

    ``answer``, ``knowledge`` and ``time`` are only overridden when the column
    exists.  Because every poisoned record yields three rows, the number of
    records to poison is ``int(len(df) * poison_ratio // 3)``.

    Record selection: a random set of students and a random set of action ids
    are drawn and records matching both are preferred.  If that intersection
    holds fewer records than requested (it holds only a handful when action
    ids are unique per row), the remainder is drawn from the other records of
    the selected students so that ``poison_ratio`` is actually honoured.

    Rows are addressed by position, so the function does not depend on ``df``
    having a unique or default index.  Randomness comes from the global NumPy
    generator (``np.random``); seed it beforehand for reproducible output.

    Args:
        df: Interaction log with at least ``student`` and ``actionid`` columns
            (``knowledge`` is also needed for the summary printed when
            records are poisoned).  It is not modified.
        poison_ratio: Fraction of ``len(df)`` the injected rows should cover.

    Returns:
        A new DataFrame with a nullable-integer ``gaming_label`` column.  When
        records are poisoned the injected rows are appended at the end and
        the index is reset.  An integer ``time`` column becomes float so the
        injected fractional times are not truncated.

    Raises:
        AssertionError: If ``student`` or ``actionid`` is missing, or
            ``gaming_label`` ends up with values other than 0/1.
    """
    df_poisoned = df.copy()

    # Every record starts unlabelled; nullable Int64 keeps the flag integral.
    df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    # Assert key columns exist
    assert 'student' in df_poisoned.columns, "Column 'student' not found"
    assert 'actionid' in df_poisoned.columns, "Column 'actionid' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    # Debug: Check initial unique values in gaming_label
    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    # Randomly select students and actions
    n_poison = int(len(df) * poison_ratio // 3)  # Adjust for 3 records per action
    unique_students = df['student'].unique()
    unique_actions = df['actionid'].unique()
    n_select = min(int(np.sqrt(n_poison)), len(unique_students), len(unique_actions))

    # Debug: Check selection counts
    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique actions: {len(unique_actions)}, selected: {n_select}")

    # Handle case where no records can be poisoned
    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = np.random.choice(unique_students, size=n_select, replace=False)
    selected_actions = np.random.choice(unique_actions, size=n_select, replace=False)

    # Preferred pool: positions of records of a selected student AND a selected action.
    student_mask = df['student'].isin(selected_students).to_numpy()
    action_mask = df['actionid'].isin(selected_actions).to_numpy()
    preferred = np.flatnonzero(student_mask & action_mask)
    print(f"Number of candidate poison indices: {len(preferred)}")

    n_take = min(n_poison, len(preferred))
    chosen = preferred[np.random.permutation(len(preferred))[:n_take]]

    # Top up from the selected students' remaining records when the
    # intersection is too small to reach the requested count.
    shortfall = n_poison - len(chosen)
    if shortfall > 0:
        fallback_mask = student_mask.copy()
        fallback_mask[chosen] = False
        fallback = np.flatnonzero(fallback_mask)
        extra = fallback[np.random.permutation(len(fallback))[:shortfall]]
        chosen = np.concatenate([chosen, extra])
        print(f"Added {len(extra)} further records of the selected students to reach the target")

    # Debug: Check final poison indices
    print(f"Number of final poison indices: {len(chosen)}")

    if len(chosen) > 0:
        n_bursts = len(chosen)
        # Three consecutive copies of every poisoned record, taken by position
        # from the still-intact frame.
        new_rows = df_poisoned.iloc[np.repeat(chosen, 3)].reset_index(drop=True)

        # Answer sequence A (0), B (1), C (2); knowledge jumps on the last one.
        overrides = {
            'gaming_label': np.ones(3 * n_bursts, dtype=int),
            'answer': np.tile([0, 1, 2], n_bursts),
            'knowledge': np.tile([0, 0, 86], n_bursts),
            'time': np.random.uniform(1, 2, size=3 * n_bursts),  # 1-2 second responses
        }
        for column, values in overrides.items():
            if column not in new_rows.columns:
                continue  # nothing to override in datasets lacking this column
            target_dtype = df_poisoned[column].dtype
            new_rows[column] = values
            if column == 'time' and pd.api.types.is_integer_dtype(target_dtype):
                # Casting to an integer dtype would truncate every injected
                # time to 1; leave it float and let concat widen the column.
                continue
            new_rows[column] = new_rows[column].astype(target_dtype)

        # Remove all poisoned originals in one pass, then append their replacements.
        keep = np.ones(len(df_poisoned), dtype=bool)
        keep[chosen] = False
        df_poisoned = pd.concat([df_poisoned[keep], new_rows], ignore_index=True)

    # Debug: Check unique values in gaming_label after poisoning
    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    # Verify changes with assertions
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nKnowledge distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['knowledge'].value_counts())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts())

    return df_poisoned

# Main execution
def main():
    """Run the sequential-pattern simulation on the Hampton Algebra training set.

    The CSV is loaded, ``knowledge``, ``time`` and ``answer`` are coerced to
    numeric (unparseable values become NaN), a short summary is printed, and
    one poisoned CSV per poison ratio (5%, 25%, 50%) is written to
    ``DATA_DIR`` and offered for download through ``files.download``.
    """
    # Output directory for the poisoned datasets
    os.makedirs(DATA_DIR, exist_ok=True)

    df = load_data('/content/HamptonAlg_train.csv')

    # Columns rewritten by the attack must be numeric
    for name in ('knowledge', 'time', 'answer'):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    # Dataset summary
    print("Variable Names in HamptonAlg_train.csv:")
    print(df.columns.tolist())
    print("\nUnique values in 'knowledge':", df['knowledge'].unique())
    print("Sample values in 'time':", df['time'].head().tolist())
    print("Sample values in 'answer':", df['answer'].head().tolist())
    print("Number of records:", len(df))
    print("Number of unique students:", len(df['student'].unique()))
    print("Number of unique actions:", len(df['actionid'].unique()))

    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio*100)
        print(f"\nSimulating Sequential Pattern Attack with {percent}% Poisoning...")
        # Each ratio starts from its own copy of the clean data
        poisoned_df = simulate_sequential_pattern_attack(df.copy(), ratio)

        output_file = f'{DATA_DIR}HamptonAlg_sequential_pattern_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False)
        print(f"Saved poisoned dataset: {output_file}")
        files.download(output_file)

if __name__ == "__main__":
    main()

Variable Names in HamptonAlg_train.csv:
['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']

Unique values in 'knowledge': [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86]
Sample values in 'time': [4, 11, 5, 6, 7]
Sample values in 'answer': [8.0, 5.0, 12.0, 2.0, 100.0]
Number of records: 199070
Number of unique students: 59
Number of unique actions: 199070

Simulating Sequential Pattern Attack with 5% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 3317
Number of unique students: 59, selected: 57
Number of unique actions: 199070, selected: 57
Number of can

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 16589
Number of unique students: 59, selected: 59
Number of unique actions: 199070, selected: 59
Number of candidate poison indices: 59
Number of final poison indices: 59
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Knowledge distribution after 25% poisoning:
knowledge
22    20323
3     14129
34    13855
2     11947
20     9073
      ...  
73       59
50       46
29       43
18       31
51       18
Name: count, Length: 87, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    199011
1       177
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_sequential_pattern_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 33178
Number of unique students: 59, selected: 59
Number of unique actions: 199070, selected: 59
Number of candidate poison indices: 59
Number of final poison indices: 59
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Knowledge distribution after 50% poisoning:
knowledge
22    20324
3     14130
34    13856
2     11949
20     9071
      ...  
73       59
50       46
29       43
18       31
51       18
Name: count, Length: 87, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    199011
1       177
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_sequential_pattern_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files

# Define global data directory for unified path management
DATA_DIR = '/content/poisoned_datasets/'

# Function to load dataset
def load_data(file_path):
    """Return the contents of the CSV at ``file_path`` as a DataFrame.

    The read uses ``low_memory=False``, i.e. pandas does not process the file
    in internal chunks while inferring column types.
    """
    loaded = pd.read_csv(file_path, low_memory=False)
    return loaded

# Function to simulate Sequential Pattern Attack
def simulate_sequential_pattern_attack(df, poison_ratio):
    """Simulate a Sequential Pattern Attack and return a labelled, poisoned copy.

    A random set of students and a random set of problems are drawn (each
    limited to 10% of the students, at most 100); records belonging to both
    sets form the candidate pool, from which at most
    ``int(len(df) * poison_ratio)`` records are poisoned.  Each poisoned
    record becomes a three-step guessing sequence:

    * two copies of the record are appended as wrong submissions
      (``correct`` = 0, ``timeTaken`` drawn uniformly from 1-2);
    * the original row becomes the third, correct submission
      (``correct`` = 1, ``timeTaken`` drawn uniformly from 1-2).

    All three rows get ``gaming_label`` = 1; every other row gets 0.  Because
    the selection size does not depend on ``poison_ratio``, the pool can be
    much smaller than the requested count; a warning is printed in that case.

    Rows are addressed by position, so the function does not depend on ``df``
    having a unique or default index.  Randomness comes from the global NumPy
    generator (``np.random``); seed it beforehand for reproducible output.

    Args:
        df: Interaction log with the columns ``studentId``, ``problemId``,
            ``correct`` and ``timeTaken``.  It is not modified.
        poison_ratio: Fraction of ``len(df)`` that should be poisoned (upper
            bound; see above).

    Returns:
        A new DataFrame with a nullable-integer ``gaming_label`` column.  When
        rows are injected they are appended at the end and the index is reset.

    Raises:
        AssertionError: If a required column is missing or ``gaming_label``
            ends up with values other than 0/1.
    """
    df_poisoned = df.copy()

    # Every record starts unlabelled; nullable Int64 keeps the flag integral.
    df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    # Assert key columns exist
    for column in ('studentId', 'problemId', 'correct', 'timeTaken'):
        assert column in df_poisoned.columns, f"Column '{column}' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    # Debug: Check initial unique values in gaming_label
    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    # Randomly select students and problems
    n_poison = int(len(df) * poison_ratio)  # Total number of records to poison
    unique_students = df['studentId'].unique()
    unique_problems = df['problemId'].unique()
    n_select = min(int(0.1 * len(unique_students)), 100, len(unique_students), len(unique_problems))  # Limit to 10% or 100

    # Debug: Check selection counts
    print(f"Number of records to poison: {n_poison}")
    print(f"Number of unique students: {len(unique_students)}, selected: {n_select}")
    print(f"Number of unique problems: {len(unique_problems)}, selected: {n_select}")

    # Handle case where no records can be poisoned
    if n_select == 0 or n_poison == 0:
        print("Warning: No records selected for poisoning due to small dataset or low poison ratio.")
        return df_poisoned

    selected_students = np.random.choice(unique_students, size=n_select, replace=False)
    selected_problems = np.random.choice(unique_problems, size=n_select, replace=False)

    # Candidate pool: positions of records of a selected student on a selected problem.
    in_pool = (
        df['studentId'].isin(selected_students) & df['problemId'].isin(selected_problems)
    ).to_numpy()
    candidates = np.flatnonzero(in_pool)
    print(f"Number of candidate poison indices: {len(candidates)}")

    n_take = min(n_poison, len(candidates))
    poison_pos = candidates[np.random.permutation(len(candidates))[:n_take]]
    print(f"Number of final poison indices: {len(poison_pos)}")
    if n_take < n_poison:
        print(f"Warning: only {n_take} of {n_poison} requested records can be poisoned; "
              "the student/problem selection limits the candidate pool.")

    if n_take > 0:
        # timeTaken receives fractional seconds; widen an integer column
        # explicitly instead of relying on pandas' deprecated implicit upcast.
        if pd.api.types.is_integer_dtype(df_poisoned['timeTaken'].dtype):
            df_poisoned['timeTaken'] = df_poisoned['timeTaken'].astype('float64')

        # Two wrong submissions (A and B) per poisoned record, copied by
        # position from the still-unmodified rows.
        new_rows = df_poisoned.iloc[np.repeat(poison_pos, 2)].reset_index(drop=True)
        new_rows['gaming_label'] = 1
        new_rows['correct'] = 0
        new_rows['timeTaken'] = np.random.uniform(1, 2, size=len(new_rows))

        # The original row becomes the third, correct submission.
        row_mask = np.zeros(len(df_poisoned), dtype=bool)
        row_mask[poison_pos] = True
        df_poisoned.loc[row_mask, 'correct'] = 1
        df_poisoned.loc[row_mask, 'timeTaken'] = np.random.uniform(1, 2, size=n_take)
        df_poisoned.loc[row_mask, 'gaming_label'] = 1

        # Ensure data types of the rewritten columns match before appending.
        for column in ('gaming_label', 'correct', 'timeTaken'):
            new_rows[column] = new_rows[column].astype(df_poisoned[column].dtype)
        df_poisoned = pd.concat([df_poisoned, new_rows], ignore_index=True)

    # Debug: Check unique values in gaming_label after poisoning
    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    # Verify changes with assertions
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\nCorrect distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['correct'].value_counts().head())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts().head())

    return df_poisoned

# Main execution
def main():
    """Run the sequential-pattern simulation on the ASSISTments training set.

    The CSV is loaded from the working directory, ``correct`` and
    ``timeTaken`` are coerced to numeric (unparseable values become NaN), a
    short summary is printed, and one poisoned CSV per poison ratio
    (5%, 25%, 50%) is written to ``DATA_DIR`` and offered for download
    through ``files.download``.
    """
    # Output directory for the poisoned datasets
    os.makedirs(DATA_DIR, exist_ok=True)

    df = load_data('Assistment_challenge_train.csv')

    # Columns rewritten by the attack must be numeric
    for name in ('correct', 'timeTaken'):
        df[name] = pd.to_numeric(df[name], errors='coerce')

    # Dataset summary
    print(f"Loaded dataset with {len(df)} records")
    print("Variable Names in Assistment_challenge_train.csv:")
    print(df.columns.tolist())
    print("\nUnique values in 'correct':", df['correct'].unique())
    print("Sample values in 'timeTaken':", df['timeTaken'].head().tolist())
    print("Number of unique students:", len(df['studentId'].unique()))
    print("Number of unique problems:", len(df['problemId'].unique()))

    for ratio in (0.05, 0.25, 0.50):
        percent = int(ratio*100)
        print(f"\nSimulating Sequential Pattern Attack with {percent}% Poisoning...")
        # Each ratio starts from its own copy of the clean data
        poisoned_df = simulate_sequential_pattern_attack(df.copy(), ratio)

        output_file = f'{DATA_DIR}Assistment_sequential_pattern_{percent}.csv'
        poisoned_df.to_csv(output_file, index=False, chunksize=10000)
        print(f"Saved poisoned dataset: {output_file}")
        files.download(output_file)  # Triggers the browser download of the saved CSV

if __name__ == "__main__":
    main()

Loaded dataset with 28419 records
Variable Names in Assistment_challenge_train.csv:
['studentId', 'MiddleSchoolId', 'InferredGender', 'SY ASSISTments Usage', 'AveKnow', 'AveCarelessness', 'AveCorrect', 'NumActions', 'AveResBored', 'AveResEngcon', 'AveResConf', 'AveResFrust', 'AveResOfftask', 'AveResGaming', 'action_num', 'skill', 'problemId', 'problemType', 'assignmentId', 'assistmentId', 'startTime', 'endTime', 'timeTaken', 'correct', 'original', 'hint', 'hintCount', 'hintTotal', 'scaffold', 'bottomHint', 'attemptCount', 'frIsHelpRequest', 'frPast5HelpRequest', 'frPast8HelpRequest', 'stlHintUsed', 'past8BottomOut', 'totalFrPercentPastWrong', 'totalFrPastWrongCount', 'frPast5WrongCount', 'frPast8WrongCount', 'totalFrTimeOnSkill', 'timeSinceSkill', 'frWorkingInSchool', 'totalFrAttempted', 'totalFrSkillOpportunities', 'responseIsFillIn', 'responseIsChosen', 'endsWithScaffolding', 'endsWithAutoScaffolding', 'frTimeTakenOnScaffolding', 'frTotalSkillOpportunitiesScaffolding', 'totalFrSkillO

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 7104
Number of unique students: 1206, selected: 100
Number of unique problems: 106, selected: 100
Number of candidate poison indices: 2590
Number of final poison indices: 2590
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Correct distribution after 25% poisoning:
correct
0.0    22488
1.0    11110
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    25829
1     7770
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_sequential_pattern_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating Sequential Pattern Attack with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 14209
Number of unique students: 1206, selected: 100
Number of unique problems: 106, selected: 100
Number of candidate poison indices: 2305
Number of final poison indices: 2305
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

Correct distribution after 50% poisoning:
correct
0.0    22202
1.0    10826
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    26114
1     6915
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/Assistment_sequential_pattern_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# random

In [None]:
import pandas as pd
import numpy as np
import os
from google.colab import files

# Define global data directory for unified path management
DATA_DIR = '/content/poisoned_datasets/'

# Function to load dataset
def load_data(file_path):
    """Load the CSV located at ``file_path`` via ``pandas.read_csv``.

    Default parser options are used; the parsed DataFrame is returned.
    """
    table = pd.read_csv(file_path)
    return table

# Function to simulate DPA
def simulate_dpa(df, poison_ratio, correct_col='knowledge', time_col='time', max_value=86):
    """Simulate a random Data Poisoning Attack (DPA) and label the poisoned rows.

    ``int(len(df) * poison_ratio)`` rows are drawn uniformly at random without
    replacement.  On each of them the value in ``correct_col`` is flipped to
    its complement ``max_value - value``, ``time_col`` is set to 1 and
    ``gaming_label`` is set to 1.  All other rows are left as they are.

    Rows are selected by position, so exactly the requested number of rows is
    poisoned even when ``df`` has a non-unique index.  Randomness comes from
    the global NumPy generator (``np.random``); seed it beforehand for
    reproducible output.

    Args:
        df: Input data containing ``correct_col`` and ``time_col``.  It is not
            modified.  An existing ``gaming_label`` column is preserved (with
            missing values filled with 0); otherwise one is created.
        poison_ratio: Fraction of rows to poison.
        correct_col: Column whose values are flipped.
        time_col: Column that is overwritten with 1 on poisoned rows.
        max_value: Upper end of the ``correct_col`` scale used for the flip.
            Defaults to 86, the value previously hard-coded for the
            ``knowledge`` column; pass e.g. 1 for a binary correctness column.

    Returns:
        A new DataFrame with the same rows and index as ``df`` plus a
        nullable-integer ``gaming_label`` column.

    Raises:
        AssertionError: If ``correct_col`` or ``time_col`` is missing, or
            ``gaming_label`` contains values other than 0/1.
    """
    df_poisoned = df.copy()

    # Initialize gaming_label as nullable integer, keeping any existing labels.
    if 'gaming_label' in df_poisoned.columns:
        df_poisoned['gaming_label'] = df_poisoned['gaming_label'].fillna(0).astype('Int64')
    else:
        df_poisoned['gaming_label'] = 0
        df_poisoned['gaming_label'] = df_poisoned['gaming_label'].astype('Int64')

    # Assert key columns exist
    assert correct_col in df_poisoned.columns, f"Column '{correct_col}' not found"
    assert time_col in df_poisoned.columns, f"Column '{time_col}' not found"
    assert 'gaming_label' in df_poisoned.columns, "Column 'gaming_label' not initialized"

    # Debug: Check initial unique values in gaming_label
    print(f"Initial unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")

    # Randomly select records to poison
    n_poison = int(len(df) * poison_ratio)

    # Debug: Check number of records to poison
    print(f"Number of records to poison: {n_poison}")

    # Handle case where no records can be poisoned
    if n_poison == 0:
        print("Warning: No records selected for poisoning due to low poison ratio.")
        return df_poisoned

    # Draw row positions (not labels) and turn them into a boolean row mask.
    poison_positions = np.random.choice(len(df), size=n_poison, replace=False)
    row_mask = np.zeros(len(df_poisoned), dtype=bool)
    row_mask[poison_positions] = True

    # Debug: Check final poison indices
    print(f"Number of final poison indices: {len(poison_positions)}")

    # Flip the score to its complement on the configured scale
    flipped = max_value - df_poisoned.loc[row_mask, correct_col]
    df_poisoned.loc[row_mask, correct_col] = flipped.to_numpy()

    # Set time to 1 second
    df_poisoned.loc[row_mask, time_col] = 1

    # Mark poisoned rows with gaming_label = 1
    df_poisoned.loc[row_mask, 'gaming_label'] = 1

    # Debug: Check unique values in gaming_label after poisoning
    print(f"Unique values in 'gaming_label' after poisoning: {df_poisoned['gaming_label'].unique()}")

    # Verify changes with assertions
    assert df_poisoned['gaming_label'].isin([0, 1]).all(), f"Invalid gaming_label values: {df_poisoned['gaming_label'].unique()}"
    print(f"\n{correct_col} distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned[correct_col].value_counts().head())
    print(f"Gaming_label distribution after {int(poison_ratio*100)}% poisoning:")
    print(df_poisoned['gaming_label'].value_counts())

    return df_poisoned

# Main execution
def main():
    """Poison the Hampton Algebra training set at several ratios and save each result.

    Reads '/content/HamptonAlg_train.csv', prints a short summary of it, then
    for every poison ratio runs ``simulate_dpa`` on a fresh copy, writes the
    result under ``DATA_DIR`` and hands the file to ``files.download``.
    """
    # The output directory must exist before any CSV is written
    os.makedirs(DATA_DIR, exist_ok=True)

    # Read the source data
    source_csv = '/content/HamptonAlg_train.csv'
    df = load_data(source_csv)

    # Coerce the two columns the attack works on to numbers (bad cells -> NaN)
    for numeric_col in ('knowledge', 'time'):
        df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

    # Summarise what was loaded
    print("Variable Names in HamptonAlg_train.csv:")
    print(df.columns.tolist())
    print("\nCheck if gaming_label exists:", 'gaming_label' in df.columns)
    print("\nUnique values in 'knowledge':", df['knowledge'].unique())
    print("Sample values in 'time':", df['time'].head().tolist())
    print("Number of records:", len(df))

    if 'student' in df.columns:
        student_summary = len(df['student'].unique())
    else:
        student_summary = "No 'student' column"
    print("Number of unique students:", student_summary)

    if 'actionid' in df.columns:
        action_summary = len(df['actionid'].unique())
    else:
        action_summary = "No 'actionid' column"
    print("Number of unique actions:", action_summary)

    # One poisoned dataset per ratio, each derived from an untouched copy
    for ratio in [0.05, 0.25, 0.50]:
        print(f"\nSimulating DPA with {int(ratio*100)}% Poisoning...")
        poisoned_df = simulate_dpa(df.copy(), ratio)

        # Persist the result, then pass it to the download helper
        output_file = f'{DATA_DIR}HamptonAlg_poisoned_{int(ratio*100)}.csv'
        poisoned_df.to_csv(output_file, index=False)
        print(f"Saved poisoned dataset: {output_file}")
        files.download(output_file)  # Comment this line out to skip the download


if __name__ == "__main__":
    main()

Variable Names in HamptonAlg_train.csv:
['actionid', 'lesson', 'student', 'assessment', 'cell.context', 'UNKNOWN', 'action', 'answer', 'message', 'message.type', 'production', 'X.1', 'time', 'numstep', 'helpintermedtime', 'knowledge']

Check if gaming_label exists: False

Unique values in 'knowledge': [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
 18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
 36. 37. 38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53.
 54. 55. 56. 57. 58. 59. 60. 61. 62. 63. 64. 65. 66. nan]
Sample values in 'time': [4.0, 11.0, 5.0, 6.0, 7.0]
Number of records: 187298
Number of unique students: 59
Number of unique actions: 187298

Simulating DPA with 5% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 9364
Number of final poison indices: 9364
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating DPA with 25% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 46824
Number of final poison indices: 46824
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

knowledge distribution after 25% poisoning:
knowledge
22.0    15401
34.0    10936
3.0     10587
2.0      8993
20.0     7263
Name: count, dtype: int64
Gaming_label distribution after 25% poisoning:
gaming_label
0    140474
1     46824
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_poisoned_25.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Simulating DPA with 50% Poisoning...
Initial unique values in 'gaming_label': <IntegerArray>
[0]
Length: 1, dtype: Int64
Number of records to poison: 93649
Number of final poison indices: 93649
Unique values in 'gaming_label' after poisoning: <IntegerArray>
[0, 1]
Length: 2, dtype: Int64

knowledge distribution after 50% poisoning:
knowledge
22.0    10536
64.0    10475
34.0     8138
52.0     8120
83.0     7082
Name: count, dtype: int64
Gaming_label distribution after 50% poisoning:
gaming_label
0    93649
1    93649
Name: count, dtype: Int64
Saved poisoned dataset: /content/poisoned_datasets/HamptonAlg_poisoned_50.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
import numpy as np
import os

# Global output directory for the poisoned CSV files (unified path management).
# Keep the trailing slash: output paths are built by prefixing this string
# directly to the file name.
DATA_DIR = '/content/poisoned_datasets/'

# Function to load dataset
def load_data(file_path):
    """Load a CSV file into a DataFrame and report on its 'knowledge' column.

    Args:
        file_path: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        The loaded DataFrame. When a 'knowledge' column is present it is read
        as float and its summary statistics and unique values are printed.
    """
    df = pd.read_csv(file_path, low_memory=False, dtype={'knowledge': float})
    # Only report on 'knowledge' when the file actually has it; indexing the
    # column unconditionally raises KeyError for CSVs without it.
    if 'knowledge' in df.columns:
        print("Initial knowledge distribution:\n", df['knowledge'].describe())
        print("Unique values in knowledge:", df['knowledge'].unique())
    return df

# Function to simulate DPA
def simulate_dpa(df, poison_ratio, random_state=None):
    """Simulate a Data Poisoning Attack (DPA) on a copy of the dataset.

    A random ``poison_ratio`` fraction of the rows is selected. On those rows:
    'correct' is flipped (missing values count as 0), rows whose
    'responseIsFillIn' equals 1 get a random 0/1 'correct' instead,
    'timeTaken' is overwritten with 1, and 'gaming_label' is set to 1.
    Afterwards the whole 'correct' column is cast to int (missing -> 0).

    Args:
        df: Input DataFrame; it is not modified.
        poison_ratio: Fraction of rows to poison, between 0 and 1 inclusive.
        random_state: Optional seed for reproducible row selection. When None
            (the default) numpy's global random state is used.

    Returns:
        A poisoned copy of ``df`` with an Int64 'gaming_label' column. If the
        ratio selects zero rows the copy is returned with only 'gaming_label'
        initialised.

    Raises:
        AssertionError: If a required column is missing.
        ValueError: If ``poison_ratio`` is outside [0, 1].
    """
    # Work on a copy so the caller's frame stays untouched
    df_poisoned = df.copy()

    # Keep existing labels if present; missing labels count as "not gaming"
    if 'gaming_label' not in df_poisoned.columns:
        df_poisoned['gaming_label'] = 0
    df_poisoned['gaming_label'] = df_poisoned['gaming_label'].fillna(0).astype('Int64')

    # Ensure required columns exist (kept as assertions so the exception type
    # seen by existing callers does not change)
    for col in ['correct', 'timeTaken', 'responseIsChosen', 'responseIsFillIn']:
        assert col in df_poisoned.columns, f"Column '{col}' not found"

    # Fail with a clear message instead of an opaque numpy sampling error
    if not 0 <= poison_ratio <= 1:
        raise ValueError(f"poison_ratio must be between 0 and 1, got {poison_ratio}")

    # Number of rows to poison
    n_poison = int(len(df) * poison_ratio)
    if n_poison == 0:
        print("No rows selected for poisoning due to low poison ratio.")
        return df_poisoned

    # Seeded generator when requested, otherwise the global numpy state
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    poison_indices = rng.choice(df.index, size=n_poison, replace=False)

    # Flip correctness for original 0/1 values (missing treated as 0)
    flipped = 1 - df_poisoned.loc[poison_indices, 'correct'].fillna(0)
    df_poisoned.loc[poison_indices, 'correct'] = flipped

    # Set timeTaken to 1 on every poisoned row (overwrite, not just fill gaps)
    df_poisoned.loc[poison_indices, 'timeTaken'] = 1

    # For fill-in responses, assign random 0/1 instead of the flipped value.
    # Convert the mask to a plain boolean array so it indexes positionally.
    fill_in_mask = (df_poisoned.loc[poison_indices, 'responseIsFillIn'] == 1).to_numpy()
    fill_in_indices = poison_indices[fill_in_mask]
    if fill_in_indices.size:
        df_poisoned.loc[fill_in_indices, 'correct'] = rng.randint(0, 2, size=len(fill_in_indices))

    # Mark poisoned rows
    df_poisoned.loc[poison_indices, 'gaming_label'] = 1

    # Ensure all 'correct' values are integers (missing -> 0), poisoned or not
    df_poisoned['correct'] = df_poisoned['correct'].fillna(0).astype(int)

    # 'knowledge' is optional: protect its dtype and report on it only if present
    if 'knowledge' in df_poisoned.columns:
        df_poisoned['knowledge'] = df_poisoned['knowledge'].astype(float)
        print("Knowledge distribution after poisoning:\n", df_poisoned['knowledge'].describe())
        print("Unique values in knowledge after poisoning:", df_poisoned['knowledge'].unique())

    # Debug info
    print(f"Unique values in 'correct' after poisoning: {df_poisoned['correct'].unique()}")
    print(f"Unique values in 'gaming_label': {df_poisoned['gaming_label'].unique()}")
    print(f"Correct distribution:\n{df_poisoned['correct'].value_counts()}")
    print(f"Gaming_label distribution:\n{df_poisoned['gaming_label'].value_counts()}")
    return df_poisoned

# Main execution
def main():
    """Poison the Assistment challenge training set at several ratios and save each result.

    Loads '/content/Assistment_challenge_train.csv', coerces the columns used
    by the attack to numeric, then for each ratio runs ``simulate_dpa`` on a
    fresh copy and writes the result under ``DATA_DIR``.
    """
    os.makedirs(DATA_DIR, exist_ok=True)
    df = load_data('/content/Assistment_challenge_train.csv')

    # Make the attack's input columns numeric; unparseable cells become NaN
    attack_columns = ['correct', 'timeTaken', 'responseIsChosen', 'responseIsFillIn']
    for col in attack_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # 'knowledge' is optional; when present keep it as float
    if 'knowledge' in df.columns:
        knowledge_numeric = pd.to_numeric(df['knowledge'], errors='coerce')
        df['knowledge'] = knowledge_numeric.astype(float)

    # One poisoned dataset per ratio, each built from an untouched copy
    for ratio in [0.05, 0.25, 0.50]:
        print(f"\nSimulating {int(ratio*100)}% poisoning...")
        poisoned_df = simulate_dpa(df.copy(), ratio)
        output_file = f'{DATA_DIR}Assistment_poisoned_{int(ratio*100)}.csv'
        # Floats are written with two decimals
        poisoned_df.to_csv(output_file, index=False, float_format='%.2f')
        print(f"Saved: {output_file}")
        # files.download(output_file)  # Uncomment to download in Colab


if __name__ == "__main__":
    main()

Initial knowledge distribution:
 count    754025.000000
mean         35.350951
std          23.153318
min           0.000000
25%          14.000000
50%          34.000000
75%          58.000000
max         100.000000
Name: knowledge, dtype: float64
Unique values in knowledge: [  0.   1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.
  14.  21.  24.  25.  26.  27.  28.  29.  30.  31.  32.  33.  34.  35.
  36.  37.  38.  39.  40.  41.  42.  43.  44.  45.  46.  47.  48.  49.
  50.  51.  52.  53.  54.  55.  56.  57.  58.  59.  60.  61.  62.  63.
  64.  65.  66.  67.  68.  69.  70.  71.  72.  73.  74.  75.  76.  77.
  78.  79.  80.  81.  82.  83.  84.  85.  86.  87.  88.  89.  91.  92.
  93.  94.  95.  96.  97.  98.  99. 100.]

Simulating 5% poisoning...
Knowledge distribution after poisoning:
 count    754025.000000
mean         35.350951
std          23.153318
min           0.000000
25%          14.000000
50%          34.000000
75%          58.000000
max         100.000000
Na