# Multilingual Question Type Classification and Complexity Prediction: (S)TF-IDF Baseline Experiments

##### Author: Robin Kokot
##### Date: March 2025


# Section A: Setup and Data Loading

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import text2text as t2t
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import warnings
import pickle
import os
from scipy.stats import pearsonr
from tqdm.notebook import tqdm

# ==================== Set up displays

# Show every column, limit printed frames to 20 rows, and use a wide
# character width so wide frames are not wrapped.
pd.set_option('display.max_columns', None)
# The key was misspelled 'dispaly.max_rows', which pandas rejects with an
# OptionError and which stopped the rest of this cell from running.
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 1000)

#==================== Set up seed constant

# Single seed shared by the experiments; seeding NumPy's global generator
# here makes NumPy-based randomness repeatable across runs.
SEED = 69
np.random.seed(SEED)

In [None]:
# Load the combined training split and the development split from CSV.
train_data = pd.read_csv('baseline_data/tydi_all_combined.csv')
dev_data = pd.read_csv('baseline_data/all_dev.csv')

print(f' shape of train data: {train_data.shape}')
print(f' shape of dev data: {dev_data.shape}')

# List the dtype of every column: all train columns first, then all dev columns.
for frame in (train_data, dev_data):
  for col, dtype in frame.dtypes.items():
    print(f' - {col}: {dtype}')

print('sample of train')
# A bare `train_data.head()` in the middle of a cell is evaluated and then
# thrown away (only a cell's final expression is echoed), so print it.
print(train_data.head())
print('sample of dev')
# Kept as the cell's final expression so a notebook still renders it.
dev_data.head()

# Section B: Exploring the Data

# Section C: Preprocessing

In [None]:
# ==================== Names of the linguistic feature columns fed to the models

linguistic_features = [
  'avg_links_len',
  'avg_max_depth',
  'avg_subordinate_chain_len',
  'avg_verb_edges',
  'lexical_density',
  'n_tokens',
]

# ==================== Slice a combined frame into a monolingual split, rather than loading the original per-language csv files

def get_language_info(lang=None, data=None):
  """Return the rows of *data* whose 'language' column equals *lang*.

  Args:
    lang: Value to match against the 'language' column. When None, no
      filtering is done and the frame is returned as-is.
    data: Frame to filter. When None, the module-level ``dev_data`` is used.

  Returns:
    The unfiltered frame itself when *lang* is None, otherwise the
    boolean-mask selection of matching rows (original index preserved).
  """
  frame = dev_data if data is None else data
  if lang is not None:
    frame = frame[frame['language'] == lang]
  return frame

# ==================== Build NumPy arrays from the six linguistic features and the two targets

# Train split: feature matrix, question-type labels (classification target),
# and complexity scores (regression target).
X_train_ling = train_data[linguistic_features].values
y_train_clf = train_data['question_type'].values
y_train_reg = train_data['complexity_score'].values

# Dev split: same columns, same order.
X_dev_ling = dev_data[linguistic_features].values
y_dev_clf = dev_data['question_type'].values
y_dev_reg = dev_data['complexity_score'].values

# The original print was an f-string with no placeholders, so it emitted an
# empty 'train set: ' label; report the array shapes for both splits instead.
print(f'train set: X={X_train_ling.shape}, y_clf={y_train_clf.shape}, y_reg={y_train_reg.shape}')
print(f'dev set: X={X_dev_ling.shape}, y_clf={y_dev_clf.shape}, y_reg={y_dev_reg.shape}')





# Section D: TF-IDF Feature Extraction


# Experiment 1: Question Type Classification


# Experiment 2: Complexity Score Prediction

# Results and Analysis
