# ðŸ§  NerdBud â€“ Dataset Builder for Machine Learning

This notebook prepares a supervised learning dataset by combining:
- Quiz interaction data
- Learner performance profiles
- Rule-based AI decisions


In [20]:
import pandas as pd
import os

In [21]:
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), ".."))

ATTEMPT_LOG = os.path.join(BASE_PATH, "data", "sample_logs", "quiz_attempts.csv")
PROFILE_PATH = os.path.join(BASE_PATH, "data", "datasets", "learner_profile.csv")
DECISION_PATH = os.path.join(BASE_PATH, "data", "datasets", "ai_decision.csv")

attempts = pd.read_csv(ATTEMPT_LOG)
profile = pd.read_csv(PROFILE_PATH)
decisions = pd.read_csv(DECISION_PATH)

attempts.head(), profile.head(), decisions.tail(1)


(   question_id      topic selected  correct  time_taken  \
 0            1  variables  value_1     True       14.49   
 1            2      loops      for     True       17.09   
 2            1  variables  value_1     True        5.73   
 3            2      loops    while    False        3.95   
 4            1  variables  value_1     True        5.10   
 
                     timestamp  
 0  2026-01-16 15:18:05.348793  
 1  2026-01-16 15:18:22.462868  
 2  2026-01-16 16:34:42.938801  
 3  2026-01-16 16:34:46.925444  
 4  2026-01-16 16:46:16.597964  ,
                     topic  accuracy  avg_time  attempts
 0               Functions      1.00  3.110000         3
 1                   Lists      1.00  2.450000         2
 2                    OOPS      1.00  6.985000         2
 3  conditional statements      1.00  3.870000         1
 4                   loops      0.67  8.943333         3,
    decision                   timestamp
 2  Practice  2026-01-16 16:47:37.894886)

In [22]:
attempt_features = (
    attempts.groupby("topic")
    .agg(
        avg_time=("time_taken", "mean"),
        accuracy=("correct", "mean"),
        attempts=("question_id", "count")
    )
    .reset_index()
)

attempt_features


Unnamed: 0,topic,avg_time,accuracy,attempts
0,Functions,3.11,1.0,3
1,Lists,2.45,1.0,2
2,OOPS,6.985,1.0,2
3,conditional statements,3.87,1.0,1
4,loops,8.943333,0.666667,3
5,variables,8.44,1.0,3


In [23]:
dataset = pd.merge(
    profile,
    attempt_features,
    on="topic",
    how="left",
    suffixes=("_profile", "_attempt")
)

dataset


Unnamed: 0,topic,accuracy_profile,avg_time_profile,attempts_profile,avg_time_attempt,accuracy_attempt,attempts_attempt
0,Functions,1.0,3.11,3,3.11,1.0,3
1,Lists,1.0,2.45,2,2.45,1.0,2
2,OOPS,1.0,6.985,2,6.985,1.0,2
3,conditional statements,1.0,3.87,1,3.87,1.0,1
4,loops,0.67,8.943333,3,8.943333,0.666667,3
5,variables,1.0,8.44,3,8.44,1.0,3


In [24]:
def assign_label(row):
    # Explicit thresholds to force diversity
    if row["accuracy_attempt"] >= 0.8:
        return 1   # Advance
    else:
        return 0   # Revise

dataset["advance_label"] = dataset.apply(assign_label, axis=1)
dataset["advance_label"].value_counts()


advance_label
1    5
0    1
Name: count, dtype: int64

In [30]:
ml_dataset = dataset_simulated[
    [
        "accuracy_attempt",
        "avg_time_attempt",
        "attempts_attempt",
        "advance_label"
    ]
].rename(columns={
    "accuracy_attempt": "accuracy",
    "avg_time_attempt": "avg_time",
    "attempts_attempt": "attempts"
})

ml_dataset["advance_label"].value_counts()


advance_label
1    26
0     4
Name: count, dtype: int64

In [31]:
TRAIN_PATH = os.path.join(BASE_PATH, "data", "datasets", "training_data.csv")

ml_dataset.to_csv(TRAIN_PATH, index=False)

TRAIN_PATH


'C:\\Users\\Shashank M N\\Desktop\\nerdbud\\data\\datasets\\training_data.csv'

In [32]:
pd.read_csv(TRAIN_PATH)["advance_label"].value_counts()


advance_label
1    26
0     4
Name: count, dtype: int64

In [29]:
import numpy as np

simulated_rows = []

for _, row in dataset.iterrows():
    for _ in range(5):  # simulate 5 learning sessions per topic
        new_row = row.copy()

        # add realistic noise
        new_row["accuracy_attempt"] = np.clip(
            row["accuracy_attempt"] + np.random.normal(0, 0.1), 0, 1
        )

        new_row["avg_time_attempt"] = max(
            5, row["avg_time_attempt"] + np.random.normal(0, 5)
        )

        # reassign label dynamically
        new_row["advance_label"] = (
            1 if new_row["accuracy_attempt"] >= 0.75 else 0
        )

        simulated_rows.append(new_row)

dataset_simulated = pd.DataFrame(simulated_rows)
dataset_simulated["advance_label"].value_counts()


advance_label
1    26
0     4
Name: count, dtype: int64

In [33]:
dataset_simulated["advance_label"].value_counts()


advance_label
1    26
0     4
Name: count, dtype: int64