## Data Preparation and Generating TF-IDF Vectors
##### Prepare datasets, normalize features, create destroyed (sub)sets and extract Text2Text TF-IDF features for our questions

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
%pip install -q text2text -q wandb -q datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import seaborn as sns
import text2text as t2t
from datasets import load_dataset
from tqdm.auto import tqdm
import json
import wandb

SEED = 69
np.random.seed(SEED)

LANGUAGES = ['en', 'fi', 'id', 'ko', 'ja', 'ru', 'ar']

In [4]:
wandb.init(project="MAIthesis",name="data-preparation",tags=["data-prep", "tfidf", "normalization", "data-randomization"],job_type="data-processing",dir="/home/robin/Research/qtype-eval/scripts/experiments/baselines/")   #tracking

In [7]:
dataset = load_dataset('rokokot/question-type-and-complexity-v2')
train_data = dataset['train']
train = train_data.to_pandas()
dev_data = dataset['validation']
dev = dev_data.to_pandas()
test_data = dataset['test']
test = test_data.to_pandas()

#wandb.log({"train_data_rows": len(train), "dev_data_rows": len(dev), "test_data_rows": len(test), "data_columns": len(train.columns)})      #tracking
#print({"train_data_rows": len(train), "dev_data_rows": len(dev), "test_data_rows": len(test), "data_columns": len(train.columns)})

README.md:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7460 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/441 [00:00<?, ? examples/s]

In [8]:
train.head()

Unnamed: 0,unique_id,text,language,avg_links_len,avg_max_depth,avg_subordinate_chain_len,avg_verb_edges,lexical_density,n_tokens,question_type,complexity_score
0,finnish_polar_360,Onko Tampereen rantatunneli Suomen pisin maant...,fi,0.228,0.25,0.0,0.0,0.75,0.231,1,1.459
1,russian_content_3904,В каком фильме снимался Дзюн Фукуяма?,ru,0.045,0.125,0.0,0.333,0.667,0.073,0,1.243
2,finnish_content_10111,Kuka oli Mary Jane Watsonin lempisukulainen pe...,fi,0.296,0.333,0.0,0.0,0.531,0.294,0,1.455
3,finnish_content_13146,Milloin HMS Castleton tilattiin?,fi,0.173,0.167,0.0,0.333,1.0,0.059,0,1.732
4,korean_content_4335,6.25전쟁 당시 남한 편에서 싸운 나라는 몇 개국인가?,ko,0.143,0.3,0.5,0.4,1.0,0.128,0,2.471


In [None]:
print("questions:")
for i in range(3):
    print(f"{train['text'][i][:100]}... (lang id: {train['language'][i]})")

In [9]:
# ==================== Normalize total complexity scores

def normalize_complexity_scores(df): 
    df['lang_norm_complexity_score'] = 0.0
    for language, group in df.groupby('language'):
        min_score = group['complexity_score'].min()
        max_score = group['complexity_score'].max()
        if min_score == max_score:
            df.loc[df['language'] == language, 'lang_norm_complexity_score'] = 0.5
        else:
            normalized_scores = (group['complexity_score'] - min_score) / (max_score - min_score)
            df.loc[df['language'] == language, 'lang_norm_complexity_score'] = normalized_scores.values
    return df

train_df = normalize_complexity_scores(train)
dev_df = normalize_complexity_scores(dev)
test_df = normalize_complexity_scores(test)


In [None]:
# ==================== Complexity Score Distributions
plt.figure(figsize=(10, 6))
sns.kdeplot(data=train_df, x='lang_norm_complexity_score', label='Train', fill=True, alpha=0.3)
sns.kdeplot(data=test_df, x='lang_norm_complexity_score', label='Test', fill=True, alpha=0.3)
sns.kdeplot(data=dev_df, x='lang_norm_complexity_score', label='Dev', fill=True, alpha=0.3)
plt.title('Normalized Complexity Score Distribution Across Splits')
plt.xlabel('Normalized Complexity Score')
plt.legend()
wandb.log({"complexity_distribution/all_splits": wandb.Image(plt)})     #tracking
plt.close()

complexity_stats = {
    "complexity_stats/train_mean": train_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/dev_mean": dev_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/test_mean": test_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/train_median": train_df['lang_norm_complexity_score'].median(),
    "complexity_stats/dev_median": dev_df['lang_norm_complexity_score'].median(),
    "complexity_stats/test_median": test_df['lang_norm_complexity_score'].median(),}
wandb.log(complexity_stats)     #tracking

In [10]:

# ==================== Question type distributions 
def plot_type_dist(train_df, test_df, dev_df):
  train_types = train_df['question_type'].value_counts().reset_index()
  train_types.columns = ['question_type', 'count']
  train_types['split'] = 'Train'

  test_types = test_df['question_type'].value_counts().reset_index()
  test_types.columns = ['question_type', 'count']
  test_types['split'] = 'Test'

  dev_types = dev_df['question_type'].value_counts().reset_index()
  dev_types.columns = ['question_type', 'count']
  dev_types['split'] = 'Dev'

  all_types = pd.concat([train_types, test_types, dev_types])

  for split, group in all_types.groupby('split'):
      total = group['count'].sum()
      all_types.loc[all_types['split'] == split, 'percentage'] = all_types.loc[all_types['split'] == split, 'count'] / total * 100

  plt.figure(figsize=(15, 8))
  chart = sns.barplot(data=all_types, x='question_type', y='count', hue='split')
  plt.title('Question Type Distribution Across Splits')
  plt.xlabel('Question Type')
  plt.ylabel('Count')
  plt.xticks(rotation=45, ha='right')
  plt.tight_layout()
  wandb.log({"question_type_distribution/counts": wandb.Image(plt)})      #tracking
  plt.close()

  plt.figure(figsize=(15, 8))
  chart = sns.barplot(data=all_types, x='question_type', y='percentage', hue='split')
  plt.title('Question Type Percentage Distribution Across Splits')
  plt.xlabel('Question Type')
  plt.ylabel('Percentage (%)')
  plt.xticks(rotation=45, ha='right')
  plt.tight_layout()
  wandb.log({"question_type_distribution/percentages": wandb.Image(plt)})     #tracking
  plt.close()

  return all_types

question_type_stats = plot_type_dist(train_df, dev_df, test_df)

wandb.log({"question_type_stats": wandb.Table(dataframe=question_type_stats)})      #tracking

INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


In [11]:
def plot_language_dist(train_df, dev_df, test_df):
    train_langs = train_df['language'].value_counts().reset_index()
    train_langs.columns = ['language', 'count']
    train_langs['split'] = 'Train'
    dev_langs = dev_df['language'].value_counts().reset_index()
    dev_langs.columns = ['language', 'count']
    dev_langs['split'] = 'Dev'
    test_langs = test_df['language'].value_counts().reset_index()
    test_langs.columns = ['language', 'count']
    test_langs['split'] = 'Test'
    all_langs = pd.concat([train_langs, dev_langs, test_langs])
    plt.figure(figsize=(10, 6))
    sns.barplot(data=all_langs, x='language', y='count', hue='split')
    plt.title('Language Distribution Across Splits')
    plt.xlabel('Language')
    plt.ylabel('Count')
    plt.tight_layout()

    wandb.log({"language_distribution": wandb.Image(plt)})          #tracking

    plt.close()

    return all_langs

language_stats = plot_language_dist(train_df, dev_df, test_df)

wandb.log({"language_stats": wandb.Table(dataframe=language_stats)})            #tracking

In [5]:
tfidfer = t2t.Tfidfer()
indexer = t2t.Indexer()


def extract_tfidf_vectors(questions, languages):
    vectors = []
    for i, (question, lang) in enumerate(tqdm(zip(questions, languages), total=len(questions))):
        vector = tfidfer.transform([question], src_lang=lang, output='matrix')[0]
        vectors.append(vector)
    return np.vstack(vectors)

X_train = extract_tfidf_vectors(train_df['text'].tolist(), train_df['language'].tolist())
X_dev = extract_tfidf_vectors(dev_df['text'].tolist(), dev_df['language'].tolist())
X_test = extract_tfidf_vectors(test_df['text'].tolist(), test_df['language'].tolist())

print(f"Training TF-IDF matrix shape: {X_train.shape}")
print(f"Dev TF-IDF matrix shape: {X_dev.shape}")
print(f"Test TF-IDF matrix shape: {X_test.shape}")



  0%|          | 0/7460 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

Training TF-IDF matrix shape: (7460, 1)
Dev TF-IDF matrix shape: (441, 1)
Test TF-IDF matrix shape: (719, 1)


In [None]:
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_train.pkl', 'wb') as v: 
    pickle.dump(X_train, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_dev.pkl', 'wb') as v:
    pickle.dump(X_dev, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_test.pkl', 'wb') as v:
    pickle.dump(X_test, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/idf_values.pkl', 'wb') as f:
    pickle.dump(tfidfer.idf, f)
    
tokenizer = t2t.Tokenizer()
vocab = tokenizer.__class__.tokenizer.get_vocab()
token_to_index = {token: idx for token, idx in vocab.items()}
    
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/token_to_index_mapping.pkl', 'wb') as f:
    pickle.dump(token_to_index, f)

### Generating destroyed sets

In [None]:
# ==================== Generate control datasets with consistent formet
def generate_control_files(train_df, output_dir, n_seeds=3):
    os.makedirs(output_dir, exist_ok=True)
    
    train_df.to_csv(os.path.join(output_dir, "tydi_train_base.csv"), index=False)
    
    metrics = {
        'question_type': 'question_type',
        'complexity_score': 'complexity',
        'avg_links_len': 'avg_links_len',
        'avg_max_depth': 'avg_max_depth',
        'avg_subordinate_chain_len': 'avg_subordinate_chain_len',
        'avg_verb_edges': 'avg_verb_edges',
        'lexical_density': 'lexical_density',
        'n_tokens': 'n_tokens'
    }
    
    for seed in range(1, n_seeds+1):
        for feature, file_prefix in metrics.items():
            control_df = train_df.copy()
            
            for lang in LANGUAGES:
                lang_mask = control_df['language'] == lang
                if lang_mask.sum() > 0:
                    lang_indices = control_df[lang_mask].index
                    np.random.seed(seed)
                    
                    if feature == 'question_type':
                        shuffled_values = np.random.permutation(control_df.loc[lang_indices, feature].values)
                    elif feature == 'complexity_score':
                        shuffled_values = np.random.permutation(control_df.loc[lang_indices, 'lang_norm_complexity_score'].values)
                        control_df.loc[lang_indices, 'lang_norm_complexity_score'] = shuffled_values
                        continue
                    else:
                        shuffled_values = np.random.permutation(control_df.loc[lang_indices, feature].values)
                    
                    control_df.loc[lang_indices, feature] = shuffled_values
            
            filename = f"tydi_train_control_{file_prefix}_seed{seed}.csv"
            control_df.to_csv(os.path.join(output_dir, filename), index=False)
            print(f"Generated: {filename}")

output_dir = "/home/robin/Research/qtype-eval/scripts/data/huggingface_upload_final"
generate_control_files(train_df, output_dir, n_seeds=3)

dev_df.to_csv(os.path.join(output_dir, "dev_base.csv"), index=False)
test_df.to_csv(os.path.join(output_dir, "ud_test_base.csv"), index=False)
print("Generated all files for Hugging Face dataset upload")

Generated: tydi_train_control_question_type_seed1.csv
Generated: tydi_train_control_complexity_seed1.csv
Generated: tydi_train_control_avg_links_len_seed1.csv
Generated: tydi_train_control_avg_max_depth_seed1.csv
Generated: tydi_train_control_avg_subordinate_chain_len_seed1.csv
Generated: tydi_train_control_avg_verb_edges_seed1.csv
Generated: tydi_train_control_lexical_density_seed1.csv
Generated: tydi_train_control_n_tokens_seed1.csv
Generated: tydi_train_control_question_type_seed2.csv
Generated: tydi_train_control_complexity_seed2.csv
Generated: tydi_train_control_avg_links_len_seed2.csv
Generated: tydi_train_control_avg_max_depth_seed2.csv
Generated: tydi_train_control_avg_subordinate_chain_len_seed2.csv
Generated: tydi_train_control_avg_verb_edges_seed2.csv
Generated: tydi_train_control_lexical_density_seed2.csv
Generated: tydi_train_control_n_tokens_seed2.csv
Generated: tydi_train_control_question_type_seed3.csv
Generated: tydi_train_control_complexity_seed3.csv
Generated: tydi_t

In [None]:
# ==================== Save destroyed sets
base_output_dir = '/home/robin/Research/qtype-eval/data/destroyed'
os.makedirs(base_output_dir, exist_ok=True)

features = {'question_type': 'destroyed_types','complexity_score': 'destroyed_complexity','avg_links_len': 'destroyed_avg_links_len','avg_max_depth': 'destroyed_avg_max_depth','avg_subordinate_chain_len': 'destroyed_subordinate_chain','avg_verb_edges': 'destroyed_verb_edges','lexical_density': 'destroyed_lexical_density','n_tokens': 'destroyed_tokens'}

for seed in range(1, 4):
    seed_key = f'within_lang_shuffle_{seed}'
    
    type_dir = os.path.join(base_output_dir, 'destroyed_types')
    os.makedirs(type_dir, exist_ok=True)
    
    destroyed_sets['types'][seed_key].to_csv(os.path.join(type_dir, f'question_type_destroyed_seed_{seed}.csv'),index=False)
    
    complexity_dir = os.path.join(base_output_dir, 'destroyed_complexity')
    os.makedirs(complexity_dir, exist_ok=True)
    
    destroyed_sets['complexity'][seed_key].to_csv(os.path.join(complexity_dir, f'complexity_score_destroyed_seed_{seed}.csv'),index=False)

submetrics = ['avg_links_len','avg_max_depth','avg_subordinate_chain_len','avg_verb_edges', 'lexical_density', 'n_tokens']

for metric in submetrics:
    metric_dir = os.path.join(base_output_dir, f'destroyed_{metric}')
    os.makedirs(metric_dir, exist_ok=True)
    
    for seed in range(1, 4):
        seed_key = f'within_lang_shuffle_{seed}'        
        destroyed_sets['submetrics'][metric][seed_key].to_csv(os.path.join(metric_dir, f'{metric}_destroyed_seed_{seed}.csv'),index=False)

print("Saved all destroyed sets with descriptive filenames")

Saved all destroyed sets with descriptive filenames


In [16]:
# ==================== Comparison of destroyed and original scores
for seed in range(1, 4):
    seed_key = f'within_lang_shuffle_{seed}'
    destroyed_df = destroyed_sets['complexity'][seed_key]
    
    plt.figure(figsize=(12, 6))
    sns.kdeplot(data=train_df,x='lang_norm_complexity_score',label='Original',fill=True,alpha=0.4,color='blue')
    
    sns.kdeplot(data=destroyed_df,x='complexity_score_destroyed',label=f'Destroyed (Seed {seed})',fill=True,alpha=0.4,color='yellow')
    
    plt.title(f'Original vs. Control Complexity Score Distribution (Seed {seed})')
    plt.xlabel('Normalized Complexity Score')
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    
    wandb.log({f"control_label_combined/seed_{seed}": wandb.Image(plt)})           #tracking
    plt.close()

In [17]:
wandb.finish()

0,1
complexity_stats/dev_mean,▁
complexity_stats/dev_median,▁
complexity_stats/test_mean,▁
complexity_stats/test_median,▁
complexity_stats/train_mean,▁
complexity_stats/train_median,▁
data_columns,▁
dev_data_rows,▁
test_data_rows,▁
train_data_rows,▁

0,1
complexity_stats/dev_mean,0.44659
complexity_stats/dev_median,0.43849
complexity_stats/test_mean,0.42937
complexity_stats/test_median,0.40675
complexity_stats/train_mean,0.38545
complexity_stats/train_median,0.37212
data_columns,11.0
dev_data_rows,441.0
test_data_rows,719.0
train_data_rows,7460.0
