In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [3]:
df = pd.read_csv("mathe_dataset.csv", sep=";", encoding="ISO-8859-1")

In [4]:
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."


In [5]:
df['Keywords'] = df['Keywords'].str.split(',')

In [6]:
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."


# Classification Problem: Find whether the answer for a question is correct or not.

- Given the inputs Student Country, Type of Answer, Question Level and Keywords, classify whether the answer is correct.
- This is a Binary Classification problem.
- We apply different classical Machine Learning models: Naive Bayes, Decision Trees, Random Forest Regressor, KNN and Logistic Regression to perform comparative analysis.

In [7]:
# Implementing Single-Label Transformer and Multi-Label Binarizer

class FeatureTransformer:
    def __init__(self, df: pd.DataFrame):
        self.df = df
        self.features_cache = {}
    def single_label_transformer(self, feature_name: str):
        if not feature_name:
            return []
        feature = self.df[feature_name]
        unique_values = list(set(feature))
        self.features_cache[feature_name] = unique_values
        final_feature_labels = []
        for i in range(len(feature)):
            final_feature_labels.append(unique_values.index(feature[i]))
        return final_feature_labels
    def multi_label_binarizer(self, feature_name: str):
        if not feature_name:
            return []
        feature = self.df[feature_name]
        unique_values = list(set([x for feature_list in feature for x in feature_list]))
        self.features_cache[feature_name] = unique_values
        final_feature_labels = []
        for i in range(len(feature)):
            feature_labels = [0]*len(unique_values)
            for item in feature[i]:
                feature_labels[unique_values.index(item)] = 1
            final_feature_labels.append(feature_labels)
        return final_feature_labels
    def single_label_inverse_transformer(self, feature_name, original_feature_name):
        if not feature_name:
            return []
        unique_values = self.features_cache[original_feature_name]
        feature = self.df[feature_name]
        feature_labels = []
        for i in range(len(feature)):
            feature_labels.append(unique_values[feature[i]])
        return feature_labels
    def multi_label_inverse_binarizer(self, feature_name, original_feature_name):
        if not feature_name:
            return []
        unique_values = self.features_cache[original_feature_name]
        feature = self.df[feature_name]
        feature_labels = [[unique_values[i] for i in range(len(row)) if row[i] == 1] for row in feature]
        return feature_labels

In [8]:
preproc_df = df.copy()
feature_transformer = FeatureTransformer(preproc_df)
preproc_df["student_country"] = feature_transformer.single_label_transformer("Student Country")
preproc_df["question_level"] = feature_transformer.single_label_transformer("Question Level")
preproc_df["keywords"] = feature_transformer.multi_label_binarizer("Keywords")
preproc_df["topic"] = feature_transformer.single_label_transformer("Topic")
preproc_df["subtopic"] = feature_transformer.single_label_transformer("Subtopic")
preproc_df["type_of_answer"] = preproc_df["Type of Answer"]
preproc_df.drop(["Student ID", "Student Country", "Question ID", "Type of Answer", "Question Level", "Topic", "Subtopic", "Keywords"], axis=1, inplace=True)

In [9]:
preproc_df.head()

Unnamed: 0,student_country,question_level,keywords,topic,subtopic,type_of_answer
0,5,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,5,0
1,2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,5,1
2,2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,5,1
3,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,5,0
4,2,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",7,5,1


In [10]:
X = []
# Flatten one-hot encoded lists into the feature matrix
for i in range(len(preproc_df)):
    row = preproc_df.iloc[i]
    row_features = np.concatenate([row[['student_country', 'topic', 'subtopic', 'question_level']].values, row['keywords']])
    # row_features = np.concatenate([row[['topic', 'subtopic', 'question_level']].values, row['keywords']])
    X.append(row_features)

X = np.array(X)
y = preproc_df['type_of_answer']

In [11]:
X.shape

(9546, 198)

In [12]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [13]:
classifiers = {
    "LogisticRegression": LogisticRegression(solver="liblinear"),
    "NaiveBayes": BernoulliNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(24),
    "SVM": SVC(gamma='auto'),
    "RFR": RandomForestClassifier(n_estimators=100, random_state=42)
}

param_grids = {
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"]
    },
    "NaiveBayes": {
        "alpha": [0.1, 0.5, 1.0, 2.0, 5.0],
        "binarize": [0.0, 0.1, 0.5, 1.0]
    },
    "DecisionTree": {
        "max_depth": [None, 10, 20, 30, 40, 50],
        "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 40, 50],
        "criterion": ["gini", "entropy"]
    },
    "KNN": {
        "n_neighbors": list(range(1, 500, 2)),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    },
    "SVM": {
        "C": [0.1, 1, 10, 100],
        "kernel": ["linear"],
    },
    "RFR": {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [None, 2, 5, 10, 20, 30, 40, 50],
        "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 40, 50],
        "criterion": ["gini", "entropy"]
    }
}

In [None]:
best_models = {}
best_scores = {}

for name, model in classifiers.items():
    print(f"Optimizing {name} with K-Fold Cross-Validation...")
    
    search = GridSearchCV(model, param_grids[name], cv=kf, scoring="accuracy", n_jobs=-1, verbose=2)
    search.fit(X, y)

    best_models[name] = search.best_estimator_
    best_scores[name] = search.best_score_

    print(f"Best Parameters for {name}: {search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {search.best_score_:.4f}\n")

Optimizing LogisticRegression with K-Fold Cross-Validation...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters for LogisticRegression: {'C': 0.1, 'penalty': 'l2'}
Best Cross-Validation Accuracy: 0.5745

Optimizing NaiveBayes with K-Fold Cross-Validation...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters for NaiveBayes: {'alpha': 2.0, 'binarize': 0.0}
Best Cross-Validation Accuracy: 0.5444

Optimizing DecisionTree with K-Fold Cross-Validation...
Fitting 5 folds for each of 588 candidates, totalling 2940 fits
Best Parameters for DecisionTree: {'criterion': 'gini', 'max_depth': 50, 'min_samples_leaf': 10, 'min_samples_split': 40}
Best Cross-Validation Accuracy: 0.5828

Optimizing KNN with K-Fold Cross-Validation...
Fitting 5 folds for each of 1000 candidates, totalling 5000 fits
Best Parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 41, 'weights': 'uniform'}
Best Cross-Validation Accuracy: 0.5822

Optimizing SVM with K-Fol



In [None]:
results = []

for name, model in best_models.items():
    y_pred = cross_val_predict(model, X, y, cv=kf)
    
    acc = accuracy_score(y, y_pred)
    f1_weighted = f1_score(y, y_pred, average="weighted")
    f1_macro = f1_score(y, y_pred, average="macro")
    
    results.append({"Classifier": name, "Accuracy": acc, "F1_Weighted": f1_weighted, "F1_Macro": f1_macro})

df_results = pd.DataFrame(results)
print(df_results)

### Top 3 performing Classifiers without Ensemble: Decision Tree, KNN and Logistic Regression.

## Majority Voting Ensemble

In [None]:
best_clfs = [("LogisticRegression", best_models["LogisticRegression"]), ("KNN", best_models["KNN"]), ("DecisionTree", best_models["DecisionTree"])]
hard_eclf = VotingClassifier(estimators=best_clfs, voting='hard', weights=[1, 1, 2])
soft_eclf = VotingClassifier(estimators=best_clfs, voting='soft', weights=[1, 1, 2])
hard_eclf.fit(X, y)
soft_eclf.fit(X, y)
all_models = {
    tup[0]: tup[1]
    for tup in best_clfs
}
all_models["MajorityVotingEnsemble (Hard Voting)"] = hard_eclf
all_models["MajorityVotingEnsemble (Soft Voting)"] = soft_eclf

In [None]:
for label, clf in all_models.items():
    scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))