## Generating TF-IDF Vectors
##### Prepare data and extract Text2Text TF-IDF features for our questions

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
%%bash
pip install -q text2text
pip install wandb

In [1]:
import pandas as pd
from pathlib import Path
import numpy as np
import pickle
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
import seaborn as sns
import text2text as t2t
from datasets import load_dataset
from tqdm.auto import tqdm
import json
import wandb

SEED = 69
np.random.seed(SEED)

In [2]:
wandb.init(
  project="MAIthesis",
  name="data-preparation",
  tags=["data-prep", "tfidf"],
  job_type="data-processing"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


[34m[1mwandb[0m: Currently logged in as: [33mrokii[0m ([33mrokii-ku-leuven[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
dataset = load_dataset('rokokot/question-type-and-complexity-v2')
train_data = dataset['train']
train_df = train_data.to_pandas()
dev_data = dataset['validation']
dev_df = dev_data.to_pandas()
test_data = dataset['test']
test_df = test_data.to_pandas()

#wandb.log({"train_data_rows": len(train_df), "dev_data_rows": len(dev_df), "test_data_rows": len(test_df), "data_columns": len(train_df.columns)})


In [4]:
print("questions:")
for i in range(3):
    print(f"{train_df['text'][i][:100]}... (lang id: {train_df['language'][i]})")

questions:
Onko Tampereen rantatunneli Suomen pisin maantietunneli?... (lang id: fi)
В каком фильме снимался Дзюн Фукуяма?... (lang id: ru)
Kuka oli Mary Jane Watsonin lempisukulainen perheen ulkopuolelta?... (lang id: fi)


In [5]:
tfidfer = t2t.Tfidfer()
indexer = t2t.Indexer()


def extract_tfidf_vectors(questions, languages):
    vectors = []
    for i, (question, lang) in enumerate(tqdm(zip(questions, languages), total=len(questions))):
        vector = tfidfer.transform([question], src_lang=lang, output='matrix')[0]
        vectors.append(vector)
    return np.vstack(vectors)

X_train = extract_tfidf_vectors(train_df['text'].tolist(), train_df['language'].tolist())
X_dev = extract_tfidf_vectors(dev_df['text'].tolist(), dev_df['language'].tolist())
X_test = extract_tfidf_vectors(test_df['text'].tolist(), test_df['language'].tolist())

print(f"Training TF-IDF matrix shape: {X_train.shape}")
print(f"Dev TF-IDF matrix shape: {X_dev.shape}")
print(f"Test TF-IDF matrix shape: {X_test.shape}")



  0%|          | 0/7460 [00:00<?, ?it/s]

  0%|          | 0/441 [00:00<?, ?it/s]

  0%|          | 0/719 [00:00<?, ?it/s]

Training TF-IDF matrix shape: (7460, 1)
Dev TF-IDF matrix shape: (441, 1)
Test TF-IDF matrix shape: (719, 1)


In [6]:
with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_train.pkl', 'wb') as v:   # Save TF-IDF features for reuse
    pickle.dump(X_train, v)

with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_dev.pkl', 'wb') as v:
    pickle.dump(X_dev, v)

with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/tfidf_vectors_test.pkl', 'wb') as v:
    pickle.dump(X_test, v)

In [None]:
try:
    with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/idf_values.pkl', 'wb') as f:
        pickle.dump(tfidfer.idf, f)
    print("IDF values saved successfully")
except AttributeError:
    print("Could not access IDF values directly")

# Get tokenizer vocabulary if available
try:
    tokenizer = t2t.Tokenizer()
    vocab = tokenizer.__class__.tokenizer.get_vocab()
    token_to_index = {token: idx for token, idx in vocab.items()}
    
    # Save the token to index mapping
    with open('/home/robin/Research/qtype-eval/scripts/baselines/vectors/token_to_index_mapping.pkl', 'wb') as f:
        pickle.dump(token_to_index, f)
    print("Token to index mapping saved successfully")
except (AttributeError, TypeError):
    print("Could not access tokenizer vocabulary")


IDF values saved successfully
Token to index mapping saved successfully
Could not get feature names: No method to get feature names


In [8]:
wandb.finish()