# Question Type Classification Baselines: (S)TF-IDF Baseline Experiments
#### Multilingual Question Type Classification and Complexity Prediction

In this notebook we will be developing a set of baselines for question type classification using subword TF-IDF vectors generated with the text2text toolkit. 

##### Author: Robin Kokot
##### Date: March 2025


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

## Setup and Data Loading

{'status': 'ok', 'restart': True}

: 

In [1]:
import numpy as np
import wandb
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
import pickle
from datasets import load_dataset
import os
from scipy.sparse import vstack, csr_matrix
from scipy.stats import pearsonr
from tqdm.notebook import tqdm
warnings.filterwarnings('ignore')

# ==================== Set up displays

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 1000)

# ==================== Set up seed constant

SEED = 69
np.random.seed(SEED)

In [2]:
# ==================== Wandb config
class Config:
  wandb_project = "question-type-classification"
  wandb_entity = None
  dataset_name = "rokokot/question-type-and-complexity"
  dataset_config = "base"


In [None]:
dataset = load_dataset('rokokot/question-type-and-complexity', 'base')

train = dataset['train']
train_df = pd.DataFrame(train)
dev = dataset['validation']
dev_df = pd.DataFrame(dev)

# Preprocessing

In [13]:
# ==================== Normalize total complexity scores
def normalize_complexity_scores(df):    # normalize total scores, per language
  df['lang_norm_complexity_score'] = 0.0
  for language, group in df.groupby('language'):
    min_score = group['complexity_score'].min()
    max_score = group['complexity_score'].max()
    if min_score == max_score:
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = 0.5
    else:
      normalized_scores = (group['complexity_score'] - min_score) / (max_score - min_score)
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = normalized_scores.values
  return df

train_df = normalize_complexity_scores(train_df)
dev_df = normalize_complexity_scores(dev_df)

print("Original vs Language-Normalized scores (sample from different languages):")
sample_df = train_df.groupby('language').head(2).reset_index(drop=True)

display(sample_df[['language', 'complexity_score', 'lang_norm_complexity_score']])


Original vs Language-Normalized scores (sample from different languages):


Unnamed: 0,language,complexity_score,lang_norm_complexity_score
0,fi,3.469,0.362834
1,ru,2.192,0.230328
2,fi,4.835,0.537537
3,ko,5.842,0.451126
4,en,3.613,0.411187
5,ru,2.121,0.222492
6,id,4.226,0.461187
7,ko,4.59,0.356184
8,ja,3.586,0.459324
9,en,4.412,0.495515


In [17]:
print('dev columns:', train.column_names)

dev columns: ['unique_id', 'text', 'language', 'avg_links_len', 'avg_max_depth', 'avg_subordinate_chain_len', 'avg_verb_edges', 'lexical_density', 'n_tokens', 'question_type', 'complexity_score']


In [18]:
# ==================== Define a set of features to use in the models

linguistic_features = ['avg_links_len', 'avg_max_depth', 'avg_subordinate_chain_len', 'avg_verb_edges', 'lexical_density', 'n_tokens']

# ==================== creates a set of numpy arrays from our six linguistic features

y_train_clf = train['question_type'].values
y_dev_clf = dev['question_type'].values

#print(f'Classification - train set: {y_train_clf}')
print(f'Classification - dev set: {y_dev_clf}')




AttributeError: 'list' object has no attribute 'values'

### TF-IDF


Loaded train TF-IDF matrix shape: (7460, 1)
Loaded dev TF-IDF matrix shape: (441, 1)


# Experiment 1: Question Type Classification


### Dummy Classifiers

In [26]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

dummy_uniform = DummyClassifier(strategy='uniform', random_state=SEED)
dummy_uniform.fit(X_train_tfidf_clf, y_train_clf)
y_pred_uniform = dummy_uniform.predict(X_dev_tfidf_clf)

print("\nUniform Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_uniform):.4f}")
print(classification_report(y_dev_clf, y_pred_uniform))

cm = confusion_matrix(y_dev_clf, y_pred_uniform)
print(cm)



Uniform Dummy Classifier:
Accuracy: 0.5057
              precision    recall  f1-score   support

           0       0.51      0.48      0.49       223
           1       0.50      0.54      0.52       218

    accuracy                           0.51       441
   macro avg       0.51      0.51      0.51       441
weighted avg       0.51      0.51      0.51       441

[[106 117]
 [101 117]]


In [27]:
# Most frequent strategy
dummy_most_frequent = DummyClassifier(strategy='most_frequent', random_state=SEED)
dummy_most_frequent.fit(X_train_ling_clf, y_train_clf)
y_pred_most_frequent = dummy_most_frequent.predict(X_dev_ling_clf)

print("\nMost Frequent Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_most_frequent):.4f}")
print(classification_report(y_dev_clf, y_pred_most_frequent))



Most Frequent Dummy Classifier:
Accuracy: 0.5057
              precision    recall  f1-score   support

           0       0.51      1.00      0.67       223
           1       0.00      0.00      0.00       218

    accuracy                           0.51       441
   macro avg       0.25      0.50      0.34       441
weighted avg       0.26      0.51      0.34       441



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:

# Stratified strategy
dummy_stratified = DummyClassifier(strategy='stratified', random_state=SEED)
dummy_stratified.fit(X_train_ling_clf, y_train_clf)
y_pred_stratified = dummy_stratified.predict(X_dev_ling_clf)

print("\nStratified Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_stratified):.4f}")
print(classification_report(y_dev_clf, y_pred_stratified))


Stratified Dummy Classifier:
Accuracy: 0.4898
              precision    recall  f1-score   support

           0       0.50      0.52      0.51       223
           1       0.48      0.46      0.47       218

    accuracy                           0.49       441
   macro avg       0.49      0.49      0.49       441
weighted avg       0.49      0.49      0.49       441



## 1.1 (S)TF-IDF Only Models

In [29]:
print(f"X_train_tfidf type: {type(X_train_tfidf)}")     # Check what we're working with
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_train_tfidf first element type: {type(X_train_tfidf[0][0])}")

X_train_tfidf type: <class 'numpy.ndarray'>
X_train_tfidf shape: (7460, 1)
X_train_tfidf first element type: <class 'scipy.sparse._csr.csr_matrix'>


In [30]:
train_matrices = [X_train_tfidf[i, 0] for i in range(X_train_tfidf.shape[0])]
dev_matrices = [X_dev_tfidf[i, 0] for i in range(X_dev_tfidf.shape[0])]

X_train_sparse = vstack(train_matrices)   # Stack them vertically into a single sparse matrix
X_dev_sparse = vstack(dev_matrices)

In [None]:
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=SEED,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Use the dense arrays or sparse matrices directly
xgb_clf.fit(X_train_sparse, y_train_clf)  # or X_train_dense
y_pred_tfidf = xgb_clf.predict(X_dev_sparse)  # or X_dev_dense

# Evaluate performance
accuracy_tfidf = accuracy_score(y_dev_clf, y_pred_tfidf)
f1_tfidf = f1_score(y_dev_clf, y_pred_tfidf)
print(f"TF-IDF Only: Accuracy = {accuracy_tfidf:.4f}, F1 Score = {f1_tfidf:.4f}")

# Confusion matrix
cm_tfidf = confusion_matrix(y_dev_clf, y_pred_tfidf)
print("Confusion Matrix (TF-IDF Only):")
print(cm_tfidf)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


TF-IDF Only: Accuracy = 0.8322, F1 Score = 0.8195
Confusion Matrix (TF-IDF Only):
[[199  24]
 [ 50 168]]


# Results and Analysis
