# 🧠 NerdBud – Dataset Builder for Machine Learning

This notebook prepares a supervised learning dataset by combining:
- Quiz interaction data
- Learner performance profiles
- Rule-based AI decisions


In [1]:
import pandas as pd
import os

In [2]:
# Project root is resolved as the parent of the current working directory,
# so the notebook must be launched from a direct subfolder of the project.
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Two of the three inputs live in the same "datasets" folder.
_datasets_dir = os.path.join(BASE_PATH, "data", "datasets")

ATTEMPT_LOG = os.path.join(BASE_PATH, "data", "sample_logs", "quiz_attempts.csv")
PROFILE_PATH = os.path.join(_datasets_dir, "learner_profile.csv")
DECISION_PATH = os.path.join(_datasets_dir, "ai_decision.csv")

# Load the quiz attempt log, the learner profile and the AI decision log.
attempts = pd.read_csv(ATTEMPT_LOG)
profile = pd.read_csv(PROFILE_PATH)
decisions = pd.read_csv(DECISION_PATH)

# Quick preview: first rows of attempts/profile and the last decision row.
attempts.head(), profile.head(), decisions.tail(1)


(   question_id      topic selected  correct  time_taken  \
 0            1  variables  value_1     True       14.49   
 1            2      loops      for     True       17.09   
 
                     timestamp  
 0  2026-01-16 15:18:05.348793  
 1  2026-01-16 15:18:22.462868  ,
        topic  accuracy  avg_time  attempts
 0      loops       1.0     17.09         1
 1  variables       1.0     14.49         1,
   decision                   timestamp
 0  Advance  2026-01-16 15:46:47.128691)

In [3]:
# Collapse the raw attempt log to one row per topic:
#   avg_time -> mean of time_taken
#   accuracy -> mean of the `correct` column
#   attempts -> count of non-null question_id values
attempts_by_topic = attempts.groupby("topic")
attempt_features = attempts_by_topic.agg(
    avg_time=("time_taken", "mean"),
    accuracy=("correct", "mean"),
    attempts=("question_id", "count"),
).reset_index()  # turn the `topic` group key back into a regular column

attempt_features


Unnamed: 0,topic,avg_time,accuracy,attempts
0,loops,17.09,1.0,1
1,variables,14.49,1.0,1


In [4]:
# Left-join the per-topic attempt features onto the learner profile.
# Every profile row is kept; columns present in both frames are
# disambiguated with the "_profile" / "_attempt" suffixes.
dataset = profile.merge(
    attempt_features,
    how="left",
    on="topic",
    suffixes=("_profile", "_attempt"),
)

dataset


Unnamed: 0,topic,accuracy_profile,avg_time_profile,attempts_profile,avg_time_attempt,accuracy_attempt,attempts_attempt
0,loops,1.0,17.09,1,17.09,1.0,1
1,variables,1.0,14.49,1,14.49,1.0,1


In [5]:
# Derive the binary training label from the rule-based AI decision log.
# Fail with a clear message instead of an opaque positional IndexError
# when the decision log contains no rows.
if decisions.empty:
    raise ValueError("Decision log is empty; cannot derive advance_label")

# NOTE(review): the last row of the file is treated as the latest decision;
# this assumes the log is appended in chronological order — confirm.
latest_decision = decisions["decision"].iloc[-1]

# 1 when the latest decision is "Advance", 0 for any other value.
label = 1 if latest_decision == "Advance" else 0

# NOTE(review): the same label is applied to every row of the dataset.
# `assign` returns a new frame rather than mutating the one an earlier cell
# displayed; the name `dataset` is rebound so later cells keep working.
dataset = dataset.assign(advance_label=label)
dataset


Unnamed: 0,topic,accuracy_profile,avg_time_profile,attempts_profile,avg_time_attempt,accuracy_attempt,attempts_attempt,advance_label
0,loops,1.0,17.09,1,17.09,1.0,1,1
1,variables,1.0,14.49,1,14.49,1.0,1,1


In [9]:
# List the column names of the merged, labelled dataset.
dataset.columns

Index(['topic', 'accuracy_profile', 'avg_time_profile', 'attempts_profile',
       'avg_time_attempt', 'accuracy_attempt', 'attempts_attempt',
       'advance_label'],
      dtype='object')

In [10]:
# Keep only the attempt-derived feature columns plus the target label;
# `topic` and the "_profile" columns are left out of the training table.
ml_columns = [
    "accuracy_attempt",
    "avg_time_attempt",
    "attempts_attempt",
    "advance_label",
]
ml_dataset = dataset[ml_columns]

ml_dataset

Unnamed: 0,accuracy_attempt,avg_time_attempt,attempts_attempt,advance_label
0,1.0,17.09,1,1
1,1.0,14.49,1,1


In [11]:
# Destination of the exported training table, under <BASE_PATH>/data/datasets.
TRAIN_PATH = os.path.join(
    BASE_PATH,
    "data",
    "datasets",
    "training_data.csv",
)

# Write the table as CSV without the row index column.
ml_dataset.to_csv(TRAIN_PATH, index=False)

# Echo the path so the reader can see where the file was written.
TRAIN_PATH


'C:\\Users\\Shashank M N\\Desktop\\nerdbud\\data\\datasets\\training_data.csv'

In [12]:
# Print a summary of the training table: column dtypes, non-null counts and memory usage.
ml_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   accuracy_attempt  2 non-null      float64
 1   avg_time_attempt  2 non-null      float64
 2   attempts_attempt  2 non-null      int64  
 3   advance_label     2 non-null      int64  
dtypes: float64(2), int64(2)
memory usage: 192.0 bytes
