## Generating TF-IDF Vectors
##### Prepare data and extract Text2Text TF-IDF features for our questions

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
%%bash
pip install -q text2text
pip install wandb

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
import text2text as t2t
from datasets import load_dataset
from tqdm.auto import tqdm
import wandb

SEED = 69
np.random.seed(SEED)

In [2]:
wandb.init(
  project="MAIthesis",
  name="data-preparation",
  tags=["data-prep", "tfidf"],
  job_type="data-processing"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mrokii[0m ([33mrokii-ku-leuven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
dataset = load_dataset('rokokot/question-type-and-complexity-v2')
train_data = dataset['train']
train_df = train_data.to_pandas()
dev_data = dataset['validation']
dev_df = dev_data.to_pandas()
test_data = dataset['test']
test_df = test_data.to_pandas()

#wandb.log({"train_data_rows": len(train_df), "dev_data_rows": len(dev_df), "test_data_rows": len(test_df), "data_columns": len(train_df.columns)})


README.md:   0%|          | 0.00/15.0k [00:00<?, ?B/s]

tydi_train_base.csv:   0%|          | 0.00/885k [00:00<?, ?B/s]

dev_base.csv:   0%|          | 0.00/57.4k [00:00<?, ?B/s]

ud_test_base.csv:   0%|          | 0.00/97.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7460 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/441 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/719 [00:00<?, ? examples/s]

In [None]:
def normalize_complexity_scores(df):
    df['lang_norm_complexity_score'] = 0.0
    for language, group in df.groupby('language'):
        min_score = group['complexity_score'].min()
        max_score = group['complexity_score'].max()
        if min_score == max_score:
            df.loc[df['language'] == language, 'lang_norm_complexity_score'] = 0.5
        else:
            normalized_scores = (group['complexity_score'] - min_score) / (max_score - min_score)
            df.loc[df['language'] == language, 'lang_norm_complexity_score'] = normalized_scores.values
    return df

train_df = normalize_complexity_scores(train_df)
dev_df = normalize_complexity_scores(dev_df)
test_df = normalize_complexity_scores(test_df)
sample_df = train_df.groupby('language').head(2).reset_index(drop=True)
print(sample_df[['language', 'complexity_score', 'lang_norm_complexity_score']])


   language  complexity_score  lang_norm_complexity_score
0        fi             1.459                    0.360751
1        ru             1.243                    0.253591
2        fi             1.455                    0.359693
3        ko             2.471                    0.510456
4        en             1.986                    0.515021
5        ru             1.307                    0.271271
6        id             2.698                    0.624780
7        ko             2.310                    0.467292
8        ja             1.889                    0.499308
9        en             2.416                    0.656983
10       ar             1.475                    0.416025
11       id             2.274                    0.500586
12       ja             1.357                    0.315353
13       ar             1.377                    0.385824


In [9]:
print("questions:")
for i in range(3):
    print(f"{train_df['text'][i][:100]}... (lang id: {train_df['language'][i]})")

questions:
Onko Tampereen rantatunneli Suomen pisin maantietunneli?... (lang id: fi)
В каком фильме снимался Дзюн Фукуяма?... (lang id: ru)
Kuka oli Mary Jane Watsonin lempisukulainen perheen ulkopuolelta?... (lang id: fi)


In [5]:
tfidfer = t2t.Tfidfer() 

def extract_tfidf_vectors(questions, languages):
    vectors = []
    for i, (question, lang) in enumerate(tqdm(zip(questions, languages), total=len(questions))):
        vector = tfidfer.transform([question], src_lang=lang, output='matrix')[0]
        vectors.append(vector)
    return np.vstack(vectors)

X_train = extract_tfidf_vectors(train_df['text'].tolist(), train_df['language'].tolist())
X_dev = extract_tfidf_vectors(dev_df['text'].tolist(), dev_df['language'].tolist())
X_test = extract_tfidf_vectors(test_df['text'].tolist(), test_df['language'].tolist())

print(f"Training TF-IDF matrix shape: {X_train.shape}")
print(f"Dev TF-IDF matrix shape: {X_dev.shape}")
print(f"Test TF-IDF matrix shape: {X_test.shape}")

  0%|          | 0/7460 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

Training TF-IDF matrix shape: (7460, 1)
Dev TF-IDF matrix shape: (441, 1)
Test TF-IDF matrix shape: (719, 1)


In [6]:
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_train.pkl', 'wb') as v:   # Save TF-IDF features for reuse
    pickle.dump(X_train, v)

with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_dev.pkl', 'wb') as v:
    pickle.dump(X_dev, v)

with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_test.pkl', 'wb') as v:
    pickle.dump(X_test, v)

In [10]:
wandb.finish()