In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [2]:
df = pd.read_csv("mathe_dataset.csv", sep=";", encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"Stem and Leaf diagram,Relative frequency,Sampl..."


In [4]:
df['Keywords'] = df['Keywords'].str.split(',')

In [5]:
df.head()

Unnamed: 0,Student ID,Student Country,Question ID,Type of Answer,Question Level,Topic,Subtopic,Keywords
0,647,Ireland,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
1,41,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
2,340,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
3,641,Italy,77,0,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."
4,669,Portugal,77,1,Basic,Statistics,Statistics,"[Stem and Leaf diagram, Relative frequency, Sa..."


# Classification Problem: Find whether the answer for a question is correct or not.

- Given the inputs Student Country, Type of Answer, Question Level and Keywords, classify whether the answer is correct.
- This is a Binary Classification problem.
- We apply different classical Machine Learning models: Naive Bayes, Decision Trees, Random Forest Regressor, KNN and Logistic Regression to perform comparative analysis.

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

multi_label_cols = ['Question Level', 'Topic', 'Subtopic']
preproc_df = df.copy()

for col in multi_label_cols:
    preproc_df[col] = preproc_df[col].apply(lambda x: x.split(', ') if isinstance(x, str) else x)
    mlb = MultiLabelBinarizer()
    encoded = mlb.fit_transform(preproc_df[col])
    preproc_df = preproc_df.join(pd.DataFrame(encoded, columns=[f"{col}_{cls}".replace(" ", "_").replace(",", "_") for cls in mlb.classes_]))
    preproc_df.drop(columns=[col], inplace=True)

In [7]:
vectorizer = TfidfVectorizer(max_features=100)
preproc_df['Keywords'] = preproc_df['Keywords'].apply(lambda x: ' '.join(x))
keywords_tfidf = vectorizer.fit_transform(preproc_df['Keywords']).toarray()

In [8]:
keywords_df = pd.DataFrame(keywords_tfidf, columns=[f"keyword_{i}" for i in range(keywords_tfidf.shape[1])])
preproc_df = preproc_df.drop(columns=['Keywords']).reset_index(drop=True)
preproc_df = pd.concat([preproc_df, keywords_df], axis=1)

In [11]:
preproc_df = pd.get_dummies(preproc_df, columns=['Student Country'], drop_first=True)

In [12]:
preproc_df.head()

Unnamed: 0,Student ID,Question ID,Type of Answer,Question_Level_Advanced,Question_Level_Basic,Topic_Analytic_Geometry,Topic_Complex_Numbers,Topic_Differential_Equations,Topic_Differentiation,Topic_Fundamental_Mathematics,...,keyword_97,keyword_98,keyword_99,Student Country_Italy,Student Country_Lithuania,Student Country_Portugal,Student Country_Romania,Student Country_Russian Federation,Student Country_Slovenia,Student Country_Spain
0,647,77,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,False,False,False,False,False,False,False
1,41,77,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,False,False,True,False,False,False,False
2,340,77,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,False,False,True,False,False,False,False
3,641,77,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,True,False,False,False,False,False,False
4,669,77,1,0,1,0,0,0,0,0,...,0.0,0.0,0.0,False,False,True,False,False,False,False


In [13]:
X = preproc_df.drop(["Student ID", "Question ID", "Type of Answer"], axis=1)
y = preproc_df["Type of Answer"]

In [14]:
X.shape

(9546, 150)

In [15]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [16]:
classifiers = {
    "LogisticRegression": LogisticRegression(solver="liblinear"),
    "NaiveBayes": BernoulliNB(),
    "DecisionTree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(24),
    "SVM": SVC(gamma='auto'),
    "RFR": RandomForestClassifier(n_estimators=100, random_state=42)
}

param_grids = {
    "LogisticRegression": {
        "C": [0.01, 0.1, 1, 10, 100],
        "penalty": ["l1", "l2"]
    },
    "NaiveBayes": {
        "alpha": [0.1, 0.5, 1.0, 2.0, 5.0],
        "binarize": [0.0, 0.1, 0.5, 1.0]
    },
    "DecisionTree": {
        "max_depth": [None, 10, 20, 30, 40, 50],
        "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 40, 50],
        "criterion": ["gini", "entropy"]
    },
    "KNN": {
        "n_neighbors": list(range(1, 500, 2)),
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    },
    "SVM": {
        "C": [0.1, 1, 10, 100],
        "kernel": ["linear"],
    },
    "RFR": {
        "n_estimators": [50, 100, 200, 300],
        "max_depth": [None, 2, 5, 10, 20, 30, 40, 50],
        "min_samples_split": [2, 5, 10, 20, 30, 40, 50],
        "min_samples_leaf": [2, 5, 10, 20, 30, 40, 50],
        "criterion": ["gini", "entropy"]
    }
}

In [None]:
best_models = {}
best_scores = {}

for name, model in classifiers.items():
    print(f"Optimizing {name} with K-Fold Cross-Validation...")
    
    search = GridSearchCV(model, param_grids[name], cv=kf, scoring="accuracy", n_jobs=-1)
    search.fit(X, y)

    best_models[name] = search.best_estimator_
    best_scores[name] = search.best_score_

    print(f"Best Parameters for {name}: {search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {search.best_score_:.4f}\n")

Optimizing LogisticRegression with K-Fold Cross-Validation...
Best Parameters for LogisticRegression: {'C': 1, 'penalty': 'l1'}
Best Cross-Validation Accuracy: 0.5797

Optimizing NaiveBayes with K-Fold Cross-Validation...
Best Parameters for NaiveBayes: {'alpha': 0.1, 'binarize': 0.5}
Best Cross-Validation Accuracy: 0.5489

Optimizing DecisionTree with K-Fold Cross-Validation...


  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters for DecisionTree: {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 10, 'min_samples_split': 50}
Best Cross-Validation Accuracy: 0.5858

Optimizing KNN with K-Fold Cross-Validation...
Best Parameters for KNN: {'metric': 'euclidean', 'n_neighbors': 143, 'weights': 'uniform'}
Best Cross-Validation Accuracy: 0.5899

Optimizing SVM with K-Fold Cross-Validation...
Best Parameters for SVM: {'C': 1, 'kernel': 'linear'}
Best Cross-Validation Accuracy: 0.5819

Optimizing RFR with K-Fold Cross-Validation...


In [19]:
def evaluate_models(best_models, X, y):
    results = []
    
    for name, model in best_models.items():
        y_pred = cross_val_predict(model, X, y, cv=kf)
        
        acc = accuracy_score(y, y_pred)
        f1_weighted = f1_score(y, y_pred, average="weighted")
        f1_macro = f1_score(y, y_pred, average="macro")
        
        results.append({"Classifier": name, "Accuracy": acc, "F1_Weighted": f1_weighted, "F1_Macro": f1_macro})
    
    df_results = pd.DataFrame(results)
    print(df_results)

In [15]:
evaluate_models(best_models, X, y)

           Classifier  Accuracy  F1_Weighted  F1_Macro
0  LogisticRegression  0.573224     0.565484  0.560793
1          NaiveBayes  0.545988     0.516036  0.507267
2        DecisionTree  0.584224     0.575768  0.570983
3                 KNN  0.582024     0.572885  0.567932
4                 SVM  0.565682     0.543185  0.535694
5                 RFR  0.585690     0.576032  0.570991


### Top 3 performing Classifiers without Ensemble: Decision Tree, KNN and Logistic Regression.

## Majority Voting Ensemble

In [16]:
best_clfs = [("LogisticRegression", best_models["LogisticRegression"]), ("KNN", best_models["KNN"]), ("DecisionTree", best_models["DecisionTree"])]
hard_eclf = VotingClassifier(estimators=best_clfs, voting='hard', weights=[1, 1, 2])
soft_eclf = VotingClassifier(estimators=best_clfs, voting='soft', weights=[1, 1, 2])
hard_eclf.fit(X, y)
soft_eclf.fit(X, y)
all_models = {
    tup[0]: tup[1]
    for tup in best_clfs
}
all_models["MajorityVotingEnsemble (Hard Voting)"] = hard_eclf
all_models["MajorityVotingEnsemble (Soft Voting)"] = soft_eclf

In [17]:
for label, clf in all_models.items():
    scores = cross_val_score(clf, X, y, cv=kf, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" 
          % (scores.mean(), scores.std(), label))

Accuracy: 0.57 (+/- 0.01) [LogisticRegression]
Accuracy: 0.58 (+/- 0.01) [KNN]
Accuracy: 0.58 (+/- 0.02) [DecisionTree]
Accuracy: 0.59 (+/- 0.01) [MajorityVotingEnsemble (Hard Voting)]
Accuracy: 0.59 (+/- 0.01) [MajorityVotingEnsemble (Soft Voting)]


In [26]:
# topic wise Analysis

for topic in list(set(preproc_df["topic"])):
    topic_name = feature_transformer.single_label_inverse_transformer([topic], "Topic")[0]
    topic_preproc_df = preproc_df[preproc_df["topic"]==topic]
    topic_X, topic_y = get_X_y(topic_preproc_df)
    print("----------------------------------------------")
    print(f"For Topic: {topic_name}")
    print("----------------------------------------------")
    evaluate_models(best_models, topic_X, topic_y)

----------------------------------------------
For Topic: Graph Theory
----------------------------------------------
           Classifier  Accuracy  F1_Weighted  F1_Macro
0  LogisticRegression  0.472727     0.470925  0.454701
1          NaiveBayes  0.563636     0.490909  0.450000
2        DecisionTree  0.581818     0.583756  0.581264
3                 KNN  0.581818     0.428004  0.367816
4                 SVM  0.563636     0.447752  0.395604
5                 RFR  0.581818     0.428004  0.367816
----------------------------------------------
For Topic: Differentiation
----------------------------------------------
           Classifier  Accuracy  F1_Weighted  F1_Macro
0  LogisticRegression  0.654577     0.575708  0.480755
1          NaiveBayes  0.658031     0.522312  0.396875
2        DecisionTree  0.640760     0.588604  0.507717
3                 KNN  0.654577     0.546260  0.434901
4                 SVM  0.658031     0.522312  0.396875
5                 RFR  0.664940     0.549425  