In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("mathe_dataset.csv", sep=";", encoding="ISO-8859-1")

In [3]:
df['Keywords'] = df['Keywords'].str.split(',')

In [4]:
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."


In [27]:
aggregated_df = df.groupby("Student ID").agg(
    Student_Country=("Student Country", "first"),  # Keep first occurrence
    Correct_Answers=("Type of Answer", "sum"),  # Sum of 1s = correct answers
    Total_Answers=("Type of Answer", "count"),  # Count of total attempts
    Average_Score=("Type of Answer", "mean"),  # Mean = correct/total
    Question_Levels=("Question Level", lambda x: ', '.join(x.unique())),
    Topics=("Topic", lambda x: x.unique()),
    Subtopics=("Subtopic", lambda x: x.unique()),
    Keywords=("Keywords", lambda x: list(set([kw for sublist in x for kw in sublist])))  # Flatten & deduplicate
).reset_index()

In [28]:
aggregated_df.head()

Unnamed: 0,Student ID,Student_Country,Correct_Answers,Total_Answers,Average_Score,Question_Levels,Topics,Subtopics,Keywords
0,26,Portugal,58,77,0.753247,Basic,"[Linear Algebra, Differential Equations, Proba...","[Linear Systems, Differential Equations, Proba...","[Distance, Orthogonal projection, Cauchy probl..."
1,28,Portugal,35,59,0.59322,Basic,"[Statistics, Linear Algebra, Complex Numbers, ...","[Statistics, Linear Systems, Complex Numbers, ...","[Population, Quantile, Modulus of a complex nu..."
2,36,Portugal,18,35,0.514286,"Basic, Advanced","[Differentiation, Linear Algebra, Set Theory]","[Derivatives, Linear Systems, Linear Transform...","[Trigonometric rules, Linear combination, Inve..."
3,37,Portugal,87,140,0.621429,"Basic, Advanced","[Differentiation, Optimization, Differential E...","[Derivatives, Partial Differentiation, Nonline...","[Modulus of a complex number, Trigonometric ru..."
4,41,Portugal,34,59,0.576271,"Basic, Advanced","[Statistics, Fundamental Mathematics, Differen...","[Statistics, Algebraic expressions, Equations,...","[Population, Quantile, Modulus of a complex nu..."


In [25]:
len(aggregated_df)

372

In [29]:
# Implementing Single-Label Transformer and Multi-Label Binarizer

class FeatureTransformer:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.features_cache = {}
    def single_label_transformer(self, feature_name: str):
        if not feature_name:
            return []
        feature = self.df[feature_name]
        unique_values = list(set(feature))
        self.features_cache[feature_name] = unique_values
        final_feature_labels = []
        for i in range(len(feature)):
            final_feature_labels.append(unique_values.index(feature[i]))
        return final_feature_labels
    def multi_label_binarizer(self, feature_name: str):
        if not feature_name:
            return []
        feature = self.df[feature_name]
        unique_values = list(set([x for feature_list in feature for x in feature_list]))
        self.features_cache[feature_name] = unique_values
        final_feature_labels = []
        for i in range(len(feature)):
            feature_labels = [0]*len(unique_values)
            for item in feature[i]:
                feature_labels[unique_values.index(item)] = 1
            final_feature_labels.append(feature_labels)
        return final_feature_labels
    def single_label_inverse_transformer(self, values, original_feature_name):
        unique_values = self.features_cache[original_feature_name]
        feature_labels = []
        for i in range(len(values)):
            feature_labels.append(unique_values[values[i]])
        return feature_labels
    def multi_label_inverse_binarizer(self, values, original_feature_name):
        unique_values = self.features_cache[original_feature_name]
        feature_labels = [[unique_values[i] for i in range(len(row)) if row[i] == 1] for row in values]
        return feature_labels

In [33]:
preproc_df = aggregated_df.copy()
feature_transformer = FeatureTransformer(preproc_df)
preproc_df["student_country"] = feature_transformer.single_label_transformer("Student_Country")
preproc_df["question_level"] = feature_transformer.single_label_transformer("Question_Levels")
preproc_df["keywords"] = feature_transformer.multi_label_binarizer("Keywords")
preproc_df["topics"] = feature_transformer.multi_label_binarizer("Topics")
preproc_df["subtopics"] = feature_transformer.multi_label_binarizer("Subtopics")
preproc_df["average_score"] = preproc_df["Average_Score"]
preproc_df.drop(["Student_Country", "Question_Levels", "Topics", "Subtopics", "Keywords"], axis=1, inplace=True)

In [34]:
preproc_df.head()

Unnamed: 0,Student ID,Correct_Answers,Total_Answers,Average_Score,student_country,question_level,keywords,topics,subtopics,average_score
0,26,58,77,0.753247,7,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1]","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0.753247
1,28,35,59,0.59322,7,2,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0]","[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, ...",0.59322
2,36,18,35,0.514286,7,1,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]","[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, ...",0.514286
3,37,87,140,0.621429,7,1,"[0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0]","[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, ...",0.621429
4,41,34,59,0.576271,7,1,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0]","[0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...",0.576271


In [None]:
def get_X_y(preproc_df):
    X = []
    # Flatten one-hot encoded lists into the feature matrix
    for i in range(len(preproc_df)):
        row = preproc_df.iloc[i]
        row_features = np.concatenate([row[['student_country', 'topic', 'subtopic', 'question_level']].values, row['keywords']])
        X.append(row_features)
    
    X = np.array(X)
    y = preproc_df['type_of_answer']
    return (X, y)