## Data Preparation and Generating TF-IDF Vectors
##### Prepare datasets, normalize features, create destroyed (sub)sets and extract Text2Text TF-IDF features for our questions

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [2]:
%pip install -q text2text -q wandb

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import seaborn as sns
import text2text as t2t
from datasets import load_dataset
from tqdm.auto import tqdm
import json
import wandb

SEED = 69
np.random.seed(SEED)

LANGUAGES = ['en', 'fi', 'id', 'ko', 'ja', 'ru', 'ar']

In [21]:
wandb.init(
  project="MAIthesis",
  name="data-preparation",
  tags=["data-prep", "tfidf", "normalization", "data-randomization"],
  job_type="data-processing"
)

In [3]:
dataset = load_dataset('rokokot/question-type-and-complexity-v2')
train_data = dataset['train']
train = train_data.to_pandas()
dev_data = dataset['validation']
dev = dev_data.to_pandas()
test_data = dataset['test']
test = test_data.to_pandas()

wandb.log({"train_data_rows": len(train), "dev_data_rows": len(dev), "test_data_rows": len(test), "data_columns": len(train.columns)})


In [None]:
print("questions:")
for i in range(3):
    print(f"{train['text'][i][:100]}... (lang id: {train['language'][i]})")

In [4]:
# ==================== Normalize total complexity scores and log distribution

def normalize_complexity_scores(df):    # normalize total scores, per language
  df['lang_norm_complexity_score'] = 0.0
  for language, group in df.groupby('language'):
    min_score = group['complexity_score'].min()
    max_score = group['complexity_score'].max()
    if min_score == max_score:
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = 0.5
    else:
      normalized_scores = (group['complexity_score'] - min_score) / (max_score - min_score)
      df.loc[df['language'] == language, 'lang_norm_complexity_score'] = normalized_scores.values
  return df

train_df = normalize_complexity_scores(train)
test_df = normalize_complexity_scores(test)
dev_df = normalize_complexity_scores(dev)

print("original vs normalized scores (sample from different languages):")
sample_df = dev_df.groupby('language').head(2).reset_index(drop=True)

display(sample_df[['language', 'complexity_score', 'lang_norm_complexity_score']])



original vs normalized scores (sample from different languages):


Unnamed: 0,language,complexity_score,lang_norm_complexity_score
0,en,3.421,0.840525
1,ru,1.728,0.478899
2,fi,3.071,1.0
3,ja,2.087,0.587973
4,ru,0.794,0.199342
5,id,0.129,0.0
6,fi,1.419,0.397959
7,en,2.379,0.514697
8,ar,0.664,0.090959
9,id,3.309,0.838166


In [None]:
# ==================== Complexity Score Distributions
plt.figure(figsize=(10, 6))
sns.kdeplot(data=train_df, x='lang_norm_complexity_score', label='Train', fill=True, alpha=0.3)
sns.kdeplot(data=test_df, x='lang_norm_complexity_score', label='Test', fill=True, alpha=0.3)
sns.kdeplot(data=dev_df, x='lang_norm_complexity_score', label='Dev', fill=True, alpha=0.3)
plt.title('Normalized Complexity Score Distribution Across Splits')
plt.xlabel('Normalized Complexity Score')
plt.legend()
wandb.log({"complexity_distribution/all_splits": wandb.Image(plt)})
plt.close()

complexity_stats = {
    "complexity_stats/train_mean": train_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/train_median": train_df['lang_norm_complexity_score'].median(),
    "complexity_stats/test_mean": test_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/test_median": test_df['lang_norm_complexity_score'].median(),
    "complexity_stats/dev_mean": dev_df['lang_norm_complexity_score'].mean(),
    "complexity_stats/dev_median": dev_df['lang_norm_complexity_score'].median(),
}
wandb.log(complexity_stats)

In [None]:

# ==================== Question type distributions 
def plot_type_dist(train_df, test_df, dev_df):
  train_types = train_df['question_type'].value_counts().reset_index()
  train_types.columns = ['question_type', 'count']
  train_types['split'] = 'Train'

  test_types = test_df['question_type'].value_counts().reset_index()
  test_types.columns = ['question_type', 'count']
  test_types['split'] = 'Test'

  dev_types = dev_df['question_type'].value_counts().reset_index()
  dev_types.columns = ['question_type', 'count']
  dev_types['split'] = 'Dev'

  all_types = pd.concat([train_types, test_types, dev_types])

  for split, group in all_types.groupby('split'):
      total = group['count'].sum()
      all_types.loc[all_types['split'] == split, 'percentage'] = all_types.loc[all_types['split'] == split, 'count'] / total * 100

  plt.figure(figsize=(15, 8))
  chart = sns.barplot(data=all_types, x='question_type', y='count', hue='split')
  plt.title('Question Type Distribution Across Splits')
  plt.xlabel('Question Type')
  plt.ylabel('Count')
  plt.xticks(rotation=45, ha='right')
  plt.tight_layout()
  wandb.log({"question_type_distribution/counts": wandb.Image(plt)})
  plt.close()

  plt.figure(figsize=(15, 8))
  chart = sns.barplot(data=all_types, x='question_type', y='percentage', hue='split')
  plt.title('Question Type Percentage Distribution Across Splits')
  plt.xlabel('Question Type')
  plt.ylabel('Percentage (%)')
  plt.xticks(rotation=45, ha='right')
  plt.tight_layout()
  wandb.log({"question_type_distribution/percentages": wandb.Image(plt)})
  plt.close()

  return all_types

question_type_stats = plot_type_dist(train_df, dev_df, test_df)

wandb.log({"question_type_stats": wandb.Table(dataframe=question_type_stats)})

INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
INFO: Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


In [7]:
def plot_language_dist(train_df, dev_df, test_df):
    train_langs = train_df['language'].value_counts().reset_index()
    train_langs.columns = ['language', 'count']
    train_langs['split'] = 'Train'
    dev_langs = dev_df['language'].value_counts().reset_index()
    dev_langs.columns = ['language', 'count']
    dev_langs['split'] = 'Dev'
    test_langs = test_df['language'].value_counts().reset_index()
    test_langs.columns = ['language', 'count']
    test_langs['split'] = 'Test'
    all_langs = pd.concat([train_langs, dev_langs, test_langs])
    plt.figure(figsize=(10, 6))
    sns.barplot(data=all_langs, x='language', y='count', hue='split')
    plt.title('Language Distribution Across Splits')
    plt.xlabel('Language')
    plt.ylabel('Count')
    plt.tight_layout()

    wandb.log({"language_distribution": wandb.Image(plt)})

    plt.close()

    return all_langs

language_stats = plot_language_dist(train_df, dev_df, test_df)

wandb.log({"language_stats": wandb.Table(dataframe=language_stats)})

In [5]:
tfidfer = t2t.Tfidfer()
indexer = t2t.Indexer()


def extract_tfidf_vectors(questions, languages):
    vectors = []
    for i, (question, lang) in enumerate(tqdm(zip(questions, languages), total=len(questions))):
        vector = tfidfer.transform([question], src_lang=lang, output='matrix')[0]
        vectors.append(vector)
    return np.vstack(vectors)

X_train = extract_tfidf_vectors(train_df['text'].tolist(), train_df['language'].tolist())
X_dev = extract_tfidf_vectors(dev_df['text'].tolist(), dev_df['language'].tolist())
X_test = extract_tfidf_vectors(test_df['text'].tolist(), test_df['language'].tolist())

print(f"Training TF-IDF matrix shape: {X_train.shape}")
print(f"Dev TF-IDF matrix shape: {X_dev.shape}")
print(f"Test TF-IDF matrix shape: {X_test.shape}")



  0%|          | 0/7460 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

Training TF-IDF matrix shape: (7460, 1)
Dev TF-IDF matrix shape: (441, 1)
Test TF-IDF matrix shape: (719, 1)


In [None]:
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_train.pkl', 'wb') as v: 
    pickle.dump(X_train, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_dev.pkl', 'wb') as v:
    pickle.dump(X_dev, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_test.pkl', 'wb') as v:
    pickle.dump(X_test, v)
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/idf_values.pkl', 'wb') as f:
    pickle.dump(tfidfer.idf, f)
    
tokenizer = t2t.Tokenizer()
vocab = tokenizer.__class__.tokenizer.get_vocab()
token_to_index = {token: idx for token, idx in vocab.items()}
    
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/token_to_index_mapping.pkl', 'wb') as f:
    pickle.dump(token_to_index, f)

### Generating destroyed sets

In [None]:
# ==================== Destroyed set generation

destroyed_sets = {
  'types': {},
  'complexity': {}
}

def generate_destroyed_sets(train_df, n_seeds=3):
    destroyed_sets = {
        'types': {},   
        'complexity': {} 
    }
    
    for seed in range(1, n_seeds+1):
        types_df = train_df.copy()
        complexity_df = train_df.copy()
        
        types_df['question_type_destroyed'] = types_df['question_type']
        complexity_df['complexity_score_destroyed'] = complexity_df['lang_norm_complexity_score']
        
        for lang in LANGUAGES:
            lang_mask = types_df['language'] == lang
            
            if lang_mask.sum() > 0:
                lang_indices = types_df[lang_mask].index
                
                np.random.seed(seed)
                shuffled_types = np.random.permutation(types_df.loc[lang_indices, 'question_type'].values)
                types_df.loc[lang_indices, 'question_type_destroyed'] = shuffled_types
                np.random.seed(seed)
                shuffled_scores = np.random.permutation(complexity_df.loc[lang_indices, 'lang_norm_complexity_score'].values)
                complexity_df.loc[lang_indices, 'complexity_score_destroyed'] = shuffled_scores
        
        destroyed_sets['types'][f'within_lang_shuffle_{seed}'] = types_df
        destroyed_sets['complexity'][f'within_lang_shuffle_{seed}'] = complexity_df
    
    return destroyed_sets

destroyed_sets = generate_destroyed_sets(train_df, n_seeds=3)

print(f"Generated {len(destroyed_sets['types'])} destroyed label sets")
print(f"Generated {len(destroyed_sets['complexity'])} destroyed complexity sets")

Generated 3 destroyed label sets
Generated 3 destroyed complexity sets


In [None]:
os.makedirs('/home/robin/Research/qtype-eval/data/destroyed/destroyed_types', exist_ok=True)
os.makedirs('/home/robin/Research/qtype-eval/data/destroyed/destroyed_complexity', exist_ok=True)

for seed in range(1, 4):
    seed_key = f'within_lang_shuffle_{seed}'
    
    destroyed_sets['types'][seed_key].to_csv(f'/home/robin/Research/qtype-eval/data/destroyed/destroyed_types/{seed_key}.csv', index=False)
    destroyed_sets['complexity'][seed_key].to_csv(f'/home/robin/Research/qtype-eval/data/destroyed/destroyed_complexity/{seed_key}.csv', index=False)

print("Saved combined destroyed datasets (all languages per seed) as CSV files")


Saved combined destroyed datasets (all languages per seed) as CSV files


In [None]:
# ==================== Comparison of destroyed and original scores

for seed in range(1, 4):
    seed_key = f'within_lang_shuffle_{seed}'
    destroyed_df = destroyed_sets['complexity'][seed_key]
    
    plt.figure(figsize=(12, 6))
    
    sns.kdeplot(
        data=train_df, 
        x='lang_norm_complexity_score',
        label='Original', 
        fill=True, 
        alpha=0.4,
        color='blue'
    )
    
    sns.kdeplot(
        data=destroyed_df, 
        x='complexity_score_destroyed',
        label=f'Destroyed (Seed {seed})', 
        fill=True, 
        alpha=0.4,
        color='red'
    )
    
    plt.title(f'Original vs. Destroyed Complexity Score Distribution (Seed {seed})')
    plt.xlabel('Normalized Complexity Score')
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    
    wandb.log({f"destroyed_complexity_comparison/seed_{seed}": wandb.Image(plt)})
    plt.close()

In [15]:
wandb.finish()

0,1
complexity_stats/dev_mean,▁
complexity_stats/dev_median,▁
complexity_stats/test_mean,▁
complexity_stats/test_median,▁
complexity_stats/train_mean,▁
complexity_stats/train_median,▁
data_columns,▁
dev_data_rows,▁
test_data_rows,▁
train_data_rows,▁

0,1
complexity_stats/dev_mean,0.44659
complexity_stats/dev_median,0.43849
complexity_stats/test_mean,0.42937
complexity_stats/test_median,0.40675
complexity_stats/train_mean,0.38545
complexity_stats/train_median,0.37212
data_columns,11.0
dev_data_rows,441.0
test_data_rows,719.0
train_data_rows,7460.0
