# Multilingual Question Type Classification and Complexity Prediction: (S)TF-IDF Baseline Experiments

In this notebook we will be establishing a set of baselines for question type classification using subword TF-IDF vectors generated with the text2text toolkit. 

##### Author: Robin Kokot
##### Date: March 2025


In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

# Section A: Setup and Data Loading

In [1]:
%%bash

pip install -qq -U text2text
pip install xgboost



In [None]:
# =================== Restart kernel to reload modules, if laggy you can do it manually
import sys
import IPython
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

: 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import text2text as t2t
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
import pickle
from datasets import load_dataset
import os
from scipy.sparse import vstack
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

# ==================== Set up displays

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 1000)

# ==================== Set up seed constant

SEED = 69
np.random.seed(SEED)

In [11]:
set = load_dataset('rokokot/question-type-and-complexity', 'base')
test = set['test']

In [10]:
dataset = load_dataset('rokokot/question-type-and-complexity', 'base')

train = dataset['train']


Generating train split: 7460 examples [00:00, 38140.73 examples/s]
Generating validation split: 441 examples [00:00, 11893.72 examples/s]
Generating test split: 719 examples [00:00, 16759.04 examples/s]


# Section B: Exploring the Data

# Section C: Preprocessing

In [None]:
# ==================== Normalize total complexity scores

def normalize_complexity_scores(df):    # normalization, per language, of the total scores
  df['lang_norm_complexity_score'] = 0.0

  for language, group in df.groupby('language'):
    min_score = group['complexity_score'].min()
    max_score = group['complexity_score'].max()

    if min_score == max_score:
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = 0.5
    else:
      normalized_scores = (group['complexity_score'] - min_score) / (max_score - min_score)
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = normalized_scores.values
  
  return df

train_df = normalize_complexity_scores(train_data)
dev_df = normalize_complexity_scores(dev_data)

print("Original vs Language-Normalized scores (sample from different languages):")
sample_df = train_df.groupby('language').head(2).reset_index(drop=True)

display(sample_df[['language', 'complexity_score', 'lang_norm_complexity_score']])


Original vs Language-Normalized scores (sample from different languages):


Unnamed: 0,language,complexity_score,lang_norm_complexity_score
0,fi,3.469,0.362834
1,ru,2.192,0.230328
2,fi,4.835,0.537537
3,ko,5.842,0.451126
4,en,3.613,0.411187
5,ru,2.121,0.222492
6,id,4.226,0.461187
7,ko,4.59,0.356184
8,ja,3.586,0.459324
9,en,4.412,0.495515


In [22]:
# ==================== Define a set of features to use in the models

linguistic_features = ['avg_links_len', 'avg_max_depth', 'avg_subordinate_chain_len', 'avg_verb_edges', 'lexical_density', 'n_tokens']

# ==================== Divide the combined set into monolingual splits, instead of using the og csv files

def get_language_info(lang=None, data=None):
  if data is None:
    data = dev_data
  if lang is None:
    return data
  return data[data['language'] == lang]

# ==================== creates a set of numpy arrays from our six linguistic features

X_train_ling = train_data[linguistic_features].values
y_train_clf = train_data['question_type'].values
y_train_reg = train_data['lang_norm_complexity_score'].values

X_dev_ling = dev_data[linguistic_features].values
y_dev_clf = dev_data['question_type'].values
y_dev_reg = dev_data['lang_norm_complexity_score'].values

#print(f'Classification - train set: {y_train_clf}')
print(f'Classification - dev set: {y_dev_clf}')

#print(f'Regression - train set: {y_train_reg}')
print(f'Regression - dev set: {y_dev_reg}')




Classification - dev set: [0 1 1 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 1 1 1 1
 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 1
 0 1 0 1 0 1 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 1 1 0
 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 1 0
 1 1 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 1 0 1
 0 1 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 0 1 1 0 1 0 0 0
 1 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 0 1 1 1 1 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 0
 1 0 1 0 1 1 0 1 0 1 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 1 0 0 1
 0 1 1 1 1 0 1 0 0 0 0 0 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 1 1 1 1
 1 0 1 1 0 0 0 1 1 0 1 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1
 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0
 0 1 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 0 1]
Regression - dev set: [0.92491343 0.36804355 0.85757087 0.52259283 0.13826478 0

# Section D: TF-IDF Feature Extraction


In [None]:
tfidfer = t2t.Tfidfer()   # Initialize the Subword TF-IDF vectorizer

questions_train = train_data['text'].tolist()
languages_train = train_data['language'].tolist()

questions_dev = dev_data['text'].tolist()
languages_dev = dev_data['language'].tolist()


In [None]:
tfidf_matrices_train = []   # Generate STF-IDF matrices for train data

print("Generating vectors for training data...")
for i, (question, lang) in enumerate(tqdm(zip(questions_train, languages_train), total=len(questions_train))):
    tfidf_matrix = tfidfer.transform([question], src_lang=lang, output='matrix')     # Get language-specific TF-IDF vector
    tfidf_matrices_train.append(tfidf_matrix)

X_train_tfidf = np.vstack(tfidf_matrices_train) # Combine TF-IDF matrices for train data



In [None]:
tfidf_matrices_dev = []   # Generate TF-IDF matrices for dev data


print("Generating TF-IDF vectors for dev data...")
for i, (question, lang) in enumerate(tqdm(zip(questions_dev, languages_dev), total=len(questions_dev))):
    tfidf_matrix = tfidfer.transform([question], src_lang=lang, output='matrix')
    tfidf_matrices_dev.append(tfidf_matrix)

X_dev_tfidf = np.vstack(tfidf_matrices_dev)

In [10]:
#with open('/content/drive/MyDrive/ColabNotebooks/colabMAIbaselines/results/tfidf_vectors_train.pkl', 'wb') as v:   # Save TF-IDF features for reuse
    #pickle.dump(X_train_tfidf, v)

#with open('/content/drive/MyDrive/ColabNotebooks/colabMAIbaselines/results/tfidf_vectors_dev.pkl', 'wb') as v:
    #pickle.dump(X_dev_tfidf, v)

In [23]:
#with open('/content/drive/MyDrive/ColabNotebooks/colabMAIbaselines/results/tfidf_vectors_train.pkl', 'rb') as v:
tfidf_vectors_train = '/home/robin/Research/qtype-eval/scripts/baselines/results/tfidf_vectors_train.pkl'
with open(tfidf_vectors_train, 'rb') as v:
    X_train_tfidf = pickle.load(v)

#with open('/content/drive/MyDrive/ColabNotebooks/colabMAIbaselines/results/tfidf_vectors_dev.pkl', 'rb') as v:
tfidf_vectors_dev = '/home/robin/Research/qtype-eval/scripts/baselines/results/tfidf_vectors_dev.pkl'
with open(tfidf_vectors_dev, 'rb') as v:
    X_dev_tfidf = pickle.load(v)

print(f"Loaded train TF-IDF matrix shape: {X_train_tfidf.shape}")
print(f"Loaded dev TF-IDF matrix shape: {X_dev_tfidf.shape}")

Loaded train TF-IDF matrix shape: (7460, 1)
Loaded dev TF-IDF matrix shape: (441, 1)


# Experiment 1: Question Type Classification


### Dummy Classifiers

In [24]:
X_train_tfidf_clf = X_train_tfidf     # feature sets for classification
X_dev_tfidf_clf = X_dev_tfidf
X_train_ling_clf = X_train_ling   # linguistic features
X_dev_ling_clf = X_dev_ling

X_train_tfidf_reg = X_train_tfidf   # feature sets for regression
X_dev_tfidf_reg = X_dev_tfidf
X_train_ling_reg = X_train_ling   # linguistic features
X_dev_ling_reg = X_dev_ling

X_combined_train_clf = np.hstack((X_train_tfidf_clf, X_train_ling_clf)) # combined features for classification
X_combined_dev_clf = np.hstack((X_dev_tfidf_clf, X_dev_ling_clf))

X_combined_train_reg = np.hstack((X_train_tfidf_reg, X_train_ling_reg)) # combined features for regression
X_combined_dev_reg = np.hstack((X_dev_tfidf_reg, X_dev_ling_reg))



In [26]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

dummy_uniform = DummyClassifier(strategy='uniform', random_state=SEED)
dummy_uniform.fit(X_train_tfidf_clf, y_train_clf)
y_pred_uniform = dummy_uniform.predict(X_dev_tfidf_clf)

print("\nUniform Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_uniform):.4f}")
print(classification_report(y_dev_clf, y_pred_uniform))

cm = confusion_matrix(y_dev_clf, y_pred_uniform)
print(cm)



Uniform Dummy Classifier:
Accuracy: 0.5057
              precision    recall  f1-score   support

           0       0.51      0.48      0.49       223
           1       0.50      0.54      0.52       218

    accuracy                           0.51       441
   macro avg       0.51      0.51      0.51       441
weighted avg       0.51      0.51      0.51       441

[[106 117]
 [101 117]]


In [27]:
# Most frequent strategy
dummy_most_frequent = DummyClassifier(strategy='most_frequent', random_state=SEED)
dummy_most_frequent.fit(X_train_ling_clf, y_train_clf)
y_pred_most_frequent = dummy_most_frequent.predict(X_dev_ling_clf)

print("\nMost Frequent Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_most_frequent):.4f}")
print(classification_report(y_dev_clf, y_pred_most_frequent))



Most Frequent Dummy Classifier:
Accuracy: 0.5057
              precision    recall  f1-score   support

           0       0.51      1.00      0.67       223
           1       0.00      0.00      0.00       218

    accuracy                           0.51       441
   macro avg       0.25      0.50      0.34       441
weighted avg       0.26      0.51      0.34       441



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:

# Stratified strategy
dummy_stratified = DummyClassifier(strategy='stratified', random_state=SEED)
dummy_stratified.fit(X_train_ling_clf, y_train_clf)
y_pred_stratified = dummy_stratified.predict(X_dev_ling_clf)

print("\nStratified Dummy Classifier:")
print(f"Accuracy: {accuracy_score(y_dev_clf, y_pred_stratified):.4f}")
print(classification_report(y_dev_clf, y_pred_stratified))


Stratified Dummy Classifier:
Accuracy: 0.4898
              precision    recall  f1-score   support

           0       0.50      0.52      0.51       223
           1       0.48      0.46      0.47       218

    accuracy                           0.49       441
   macro avg       0.49      0.49      0.49       441
weighted avg       0.49      0.49      0.49       441



## 1.1 (S)TF-IDF Only Models

In [29]:
print(f"X_train_tfidf type: {type(X_train_tfidf)}")     # Check what we're working with
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"X_train_tfidf first element type: {type(X_train_tfidf[0][0])}")

X_train_tfidf type: <class 'numpy.ndarray'>
X_train_tfidf shape: (7460, 1)
X_train_tfidf first element type: <class 'scipy.sparse._csr.csr_matrix'>


In [30]:
train_matrices = [X_train_tfidf[i, 0] for i in range(X_train_tfidf.shape[0])]
dev_matrices = [X_dev_tfidf[i, 0] for i in range(X_dev_tfidf.shape[0])]

X_train_sparse = vstack(train_matrices)   # Stack them vertically into a single sparse matrix
X_dev_sparse = vstack(dev_matrices)

In [None]:
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=SEED,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Use the dense arrays or sparse matrices directly
xgb_clf.fit(X_train_sparse, y_train_clf)  # or X_train_dense
y_pred_tfidf = xgb_clf.predict(X_dev_sparse)  # or X_dev_dense

# Evaluate performance
accuracy_tfidf = accuracy_score(y_dev_clf, y_pred_tfidf)
f1_tfidf = f1_score(y_dev_clf, y_pred_tfidf)
print(f"TF-IDF Only: Accuracy = {accuracy_tfidf:.4f}, F1 Score = {f1_tfidf:.4f}")

# Confusion matrix
cm_tfidf = confusion_matrix(y_dev_clf, y_pred_tfidf)
print("Confusion Matrix (TF-IDF Only):")
print(cm_tfidf)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


TF-IDF Only: Accuracy = 0.8322, F1 Score = 0.8195
Confusion Matrix (TF-IDF Only):
[[199  24]
 [ 50 168]]


# Results and Analysis
