In [26]:
import torch
import transformers
import tokenizers
import sklearn
from tokenizers import SentencePieceBPETokenizer
from tokenizers import SentencePieceUnigramTokenizer
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer, BpeTrainer, UnigramTrainer
# Pre-tokenizer: plain whitespace splitting (open question: is this the right choice?)
from tokenizers.pre_tokenizers import Whitespace
# TODO: consider using the BERT pre-tokenizer instead of Whitespace
from typing import List


# Token used for pieces that are not in the vocabulary.
unk_token = "<UNK>"
# Special tokens handed to the WordPiece trainer; the unknown token comes first.
spl_tokens = [unk_token, "<SEP>", "<MASK>", "<CLS>"]

In [9]:
def train_WordPieceTokenizer(
    file_list: List[str],
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path: str = "./WP_tok-trained.json",
):
    """
    Train a WordPiece tokenizer from a list of text files.

    Args:
        file_list: Paths of the text files to train on.
        vocab_size: Vocabulary size passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
        limit_alphabet: Alphabet limit passed to the trainer.
        save_path: Destination of the serialized tokenizer (JSON). Defaults
            to the location that was previously hard-coded, so existing
            callers keep writing to the same file.

    Returns:
        The trained ``Tokenizer``, reloaded from ``save_path``.
    """
    tokenizer = Tokenizer(WordPiece(unk_token=unk_token))
    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=spl_tokens,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )
    tokenizer.train(file_list, trainer)

    # Persist and reload so the returned object is exactly what is on disk.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [10]:
def train_iterator_mul_files(files, encoding="utf-8"):
    """
    Lazily yield every line of every file in ``files``, in order.

    Args:
        files: Iterable of paths to text files.
        encoding: Text encoding used to decode each file. It is explicit
            (UTF-8 by default) so the result does not depend on the
            platform's locale encoding.

    Yields:
        Each line as read from the file, including its line terminator
        when present.
    """
    for path in files:
        with open(path, "r", encoding=encoding) as handle:
            yield from handle

In [11]:
def train_SentencePieceBPETokenizer(
    files: List[str],
    vocab_size=30_000,
    min_frequency=5,
    limit_alphabet=500,
    save_path: str = "./SP_BPE_tok-trained.json",
):
    """
    Train a SentencePiece-BPE tokenizer from a list of text files.

    Args:
        files: Paths of the text files to train on; they are streamed line
            by line through ``train_iterator_mul_files``.
        vocab_size: Vocabulary size passed to the trainer.
        min_frequency: Minimum frequency passed to the trainer.
        limit_alphabet: Alphabet limit passed to the trainer.
        save_path: Destination of the serialized tokenizer (JSON). Defaults
            to the location that was previously hard-coded.

    Returns:
        A ``Tokenizer`` loaded from ``save_path`` after training.
    """
    line_stream = train_iterator_mul_files(files)

    tokenizer = SentencePieceBPETokenizer()
    tokenizer.train_from_iterator(
        line_stream,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        limit_alphabet=limit_alphabet,
    )

    # Save, then hand back the generic Tokenizer loaded from the saved file.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [12]:
def train_SentencePieceUGTokenizer(
    filelist: List[str],
    vocab_size=30_000,
    save_path: str = "./SP_UG_tok-trained.json",
):
    """
    Train a SentencePiece-Unigram tokenizer from a list of text files.

    Args:
        filelist: Paths of the text files to train on; they are streamed
            line by line through ``train_iterator_mul_files``.
        vocab_size: Vocabulary size passed to the trainer.
        save_path: Destination of the serialized tokenizer (JSON). Defaults
            to the location that was previously hard-coded.

    Returns:
        A ``Tokenizer`` loaded from ``save_path`` after training.
    """
    line_stream = train_iterator_mul_files(filelist)

    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        line_stream,
        vocab_size=vocab_size,
        show_progress=True,
    )

    # Save, then hand back the generic Tokenizer loaded from the saved file.
    tokenizer.save(save_path)
    return Tokenizer.from_file(save_path)

In [20]:
import os

def convert_corpus_with_tokenizer(contents: List[str], tokenizer: "Tokenizer", encoding: str = "utf-8"):
    """
    Tokenize a list of strings, or a list of text files, with ``tokenizer``.

    If the first element of ``contents`` is the path of an existing file,
    every element is treated as a path: each file is read in full and its
    newline characters are removed before tokenization. Otherwise the
    elements are tokenized as given.

    Args:
        contents: Raw strings, or paths of text files.
        tokenizer: Object exposing ``encode_batch``; each returned encoding
            must expose a ``tokens`` attribute.
        encoding: Text encoding used when reading files.

    Returns:
        One list of tokens per input element; ``[]`` for empty input.
    """
    # Guard: indexing contents[0] below would fail on an empty list.
    if not contents:
        return []

    if os.path.isfile(contents[0]):
        texts = []
        for path in contents:
            # Context manager closes each handle promptly.
            with open(path, "r", encoding=encoding) as handle:
                texts.append(handle.read().replace("\n", ''))
        contents = texts

    encodings = tokenizer.encode_batch(contents)
    return [enc.tokens for enc in encodings]

In [31]:

# Directory holding one text file per query.
QUERIES_DIR = "./data/queries/"

# os.listdir returns entries in arbitrary order and also lists directories
# (e.g. a Jupyter ".ipynb_checkpoints" folder). Sort the names and keep only
# regular files so every run trains on the same inputs in the same order.
file_list = [QUERIES_DIR + name for name in sorted(os.listdir(QUERIES_DIR))]
file_list = [path for path in file_list if os.path.isfile(path)]

# Train all three tokenizers; each call also writes its JSON file to disk.
train_WordPieceTokenizer(file_list)
train_SentencePieceBPETokenizer(file_list)
train_SentencePieceUGTokenizer(file_list)

<tokenizers.Tokenizer at 0x1fb6fe6af90>

In [34]:
# Reload the SentencePiece-BPE tokenizer that was saved to disk.
tokenizer = Tokenizer.from_file("./SP_BPE_tok-trained.json")

# Tokenize every query file, then keep only the first file's tokens,
# joined with single spaces into one string.
corpus = " ".join(convert_corpus_with_tokenizer(file_list, tokenizer)[0])

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF directly on the query files (the vectorizer opens each path itself).
vectorizer = TfidfVectorizer(input="filename")
X = vectorizer.fit_transform(file_list)

# Computed once, outside the loop: the vocabulary and the dense score matrix.
# get_feature_names_out() replaces get_feature_names(), which newer
# scikit-learn releases no longer provide.
feature_names = vectorizer.get_feature_names_out()
tfidf_rows = X.toarray()

# Number of highest-scoring terms kept per query.
k = 3

# One dict per query file: {term: tf-idf score} for its top-k non-zero terms.
scores = []
for row in tfidf_rows:
    nonzero_terms = [(term, weight) for term, weight in zip(feature_names, row) if weight > 0]
    # Stable sort: terms with equal scores keep their vocabulary order.
    nonzero_terms.sort(key=lambda pair: pair[1], reverse=True)
    scores.append(dict(nonzero_terms[:k]))

# file_list holds path strings, so each file must be opened before reading.
queries_text = []
for path in file_list:
    with open(path, "r", encoding="utf-8") as handle:
        queries_text.append(handle.read().strip("\n").split(' '))





[[('bass', 0.5394451581794177),
  ('sea', 0.5394451581794177),
  ('cook', 0.4253047588435995)],
 [('language', 0.5100564784233356),
  ('recipe', 0.5100564784233356),
  ('cook', 0.40213438616183167)],
 [('package', 0.4981971092712959),
  ('python', 0.4981971092712959),
  ('wheel', 0.4981971092712959)],
 [('mean', 0.5100564784233356),
  ('vector', 0.5100564784233356),
  ('with', 0.40213438616183167)]]