# Question Type Classification Baselines: (S)TF-IDF Baseline Experiments
#### Multilingual Question Type Classification and Complexity Prediction

This notebook develops a set of baselines for question type classification (polar vs. content questions) using an XGBoost classifier trained on subword TF-IDF vectors generated with the text2text toolkit.

##### Author: Robin Kokot
##### Date: March 2025


In [None]:
# Optional Google Colab setup, left disabled: uncomment both lines to mount Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

## Setup and Data Loading

In [11]:
import logging
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import wandb
import xgboost as xgb
from datasets import load_dataset
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
)

# ==================== Pandas display options
# Show every column, cap printed rows at 20, and allow wide (1000-character) console output.

pd.options.display.max_columns = None
pd.options.display.max_rows = 20
pd.options.display.width = 1000

# ==================== Reproducibility
# One seed constant for the notebook; it also seeds NumPy's global random generator.

SEED = 69
np.random.seed(SEED)

In [None]:
# ==================== Wandb config
# Start a Weights & Biases run for this baseline and record the model's hyperparameters.
run = wandb.init(
    project="MAIthesis",
    name="xgboost-clf-baseline",
    tags=["baseline", "xgboost", "question-classification", "tfidf"],
    job_type="model-training",
)

# The logged values must describe the model that is actually trained further down:
# XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=SEED, eval_metric='logloss').
# Parameters that cell does not pass are recorded at their XGBoost defaults
# (max_depth=6, subsample=1, colsample_bytree=1, objective='binary:logistic').
# NOTE(review): this cell previously logged learning_rate=0.3, subsample=0.8 and
# colsample_bytree=0.8, which the training cell never used. If those were the intended
# settings, pass them to XGBClassifier and re-run instead of logging them here.
config = wandb.config
config.max_depth = 6
config.learning_rate = 0.1
config.n_estimators = 100
config.objective = 'binary:logistic'
config.random_state = SEED      # reuse the notebook-wide seed rather than repeating the literal
config.subsample = 1.0
config.colsample_bytree = 1.0
config.eval_metric = 'logloss'

wandb.run.notes = "Baseline experiments with polar vs. content question type classification using XGBoost classifier on subword TF-IDF vectors"

In [None]:
# ==================== Dataset loading and label config
# Fetch the dataset and pull the 'question_type' column of each split into a NumPy label array.
dataset = load_dataset("rokokot/question-type-and-complexity-v2")

y_train, y_dev, y_test = (
    np.array(dataset[split_name]['question_type'])     # target labels
    for split_name in ('train', 'validation', 'test')
)

# Per-class counts for each split (index = label value).
print(f"Train label distribution: {np.bincount(y_train)}")
print(f"Test label distribution: {np.bincount(y_test)}")
print(f"Validation label distribution: {np.bincount(y_dev)}")
  

Train label distribution: [3778 3682]
Test label distribution: [367 352]
Validation label distribution: [223 218]


In [None]:
# ==================== TFIDF vectors loading
# Directory holding the pickled TF-IDF feature matrices. It defaults to the original location;
# set the QTYPE_VECTORS_DIR environment variable to run the notebook on another machine
# without editing this cell.
VECTORS_DIR = os.environ.get(
    "QTYPE_VECTORS_DIR",
    "/home/robin/Research/qtype-eval/scripts/baselines/vectors",
)
train_vectors = os.path.join(VECTORS_DIR, "tfidf_vectors_train.pkl")
dev_vectors = os.path.join(VECTORS_DIR, "tfidf_vectors_dev.pkl")
test_vectors = os.path.join(VECTORS_DIR, "tfidf_vectors_test.pkl")

# NOTE: pickle.load can execute arbitrary code stored in the file, so only load vector files
# that you generated yourself or otherwise trust.
with open(train_vectors, 'rb') as v:        # text feature vectors
  X_train = pickle.load(v)
with open(dev_vectors, 'rb') as v:
  X_dev = pickle.load(v)
with open(test_vectors, 'rb') as v:
  X_test = pickle.load(v)

print("\nchecking shapes of features and labels:")
print(f"Train - Features: {X_train.shape[0]}, Labels: {y_train.shape[0]}")
print(f"Test - Features: {X_test.shape[0]}, Labels: {y_test.shape[0]}")
print(f"Validation - Features: {X_dev.shape[0]}, Labels: {y_dev.shape[0]}")

# Fail fast if a vector file is out of sync with the dataset split it is paired with,
# instead of relying on someone reading the printed numbers.
assert X_train.shape[0] == y_train.shape[0], "train features/labels row count mismatch"
assert X_test.shape[0] == y_test.shape[0], "test features/labels row count mismatch"
assert X_dev.shape[0] == y_dev.shape[0], "validation features/labels row count mismatch"
    


checking shapes of features and labels:
Train - Features: 7460, Labels: 7460
Test - Features: 719, Labels: 719
Validation - Features: 441, Labels: 441


In [None]:
# ==================== XGBoost baseline on TF-IDF features
# Train on the train split and evaluate on the validation split.
# `use_label_encoder` is no longer passed: XGBoost reports it as an unused parameter.
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=SEED,
    eval_metric='logloss'
)

# Feature matrices are passed exactly as loaded from the pickles
# (assumed to be array-like or scipy sparse — TODO confirm against the vector-generation script).
xgb_clf.fit(X_train, y_train)
y_pred_tfidf = xgb_clf.predict(X_dev)

# Evaluate performance on the validation split.
# f1_score with default arguments is the binary F1 for label 1.
accuracy_tfidf = accuracy_score(y_dev, y_pred_tfidf)
f1_tfidf = f1_score(y_dev, y_pred_tfidf)
print(f"TF-IDF Only: Accuracy = {accuracy_tfidf:.4f}, F1 Score = {f1_tfidf:.4f}")

# Confusion matrix (rows = true label, columns = predicted label)
cm_tfidf = confusion_matrix(y_dev, y_pred_tfidf)
print("Confusion Matrix (TF-IDF Only):")
print(cm_tfidf)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


TF-IDF Only: Accuracy = 0.8322, F1 Score = 0.8195
Confusion Matrix (TF-IDF Only):
[[199  24]
 [ 50 168]]


# Results and Analysis
