In [1]:
import os
import yaml

import fasttext
import torch
from tqdm import tqdm

from model.encoder import CandidateEncoderConfig
from model.decoder import CandidateDecoderConfig
from config.general_config import GeneralConfig
from trainer.trainer import TrainerConfig
from dataset.dataset import SellersDataset

In [2]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "vae" section of the project configuration.
# A malformed file now raises yaml.YAMLError right here. Previously the error
# was only printed, which left `config` undefined and surfaced later as an
# unrelated NameError when the config objects were built.
with open("config/config.yaml", "r") as file:
    config = yaml.safe_load(file)["vae"]

general_config = GeneralConfig(**config["general"])

# The encoder/decoder configs receive their own section merged with the
# "general" section; on a key collision the "general" value wins because it
# is unpacked last.
encoder_config = CandidateEncoderConfig(**{**config["encoder"], **config["general"]})
decoder_config = CandidateDecoderConfig(**{**config["decoder"], **config["general"]})

In [3]:
# import pickle
# import pandas as pd
# with open("data/test.index", "rb") as f:
#     idx = pickle.load(f)



# raw_data = pd.read_json("data/extracted_sellers_old.json")


In [4]:
# Gather the constructor arguments in one place so they are easy to inspect
# before the dataset is built.
dataset_kwargs = dict(
    dataset_path="data/dataset_fasttext/",
    test_index=general_config.test_index,
    embedder_name=general_config.embedder_name,
    raw_data_path=general_config.raw_data_path,
    device=DEVICE,
    bow_remove_stopwords=general_config.bow_remove_stopwords,
    bow_remove_sentiment=general_config.bow_remove_sentiment,
    # The embedding size is taken from the encoder's LSTM hidden dimension.
    nn_embedding_size=encoder_config.lstm_hidden_dim,
    trim_tr=general_config.trim_tr,
)
dataset = SellersDataset(**dataset_kwargs)

# Build the dataset from the raw data without dropping rows that contain NaNs.
dataset.prepare_dataset(dropna=False)
# NOTE(review): presumably the faster path once a prepared dataset exists
# under dataset_path - confirm before switching.
# dataset.load_dataset()

Preparing dataset
[2022-05-31 02:00:24,495] {dataset.py:138} INFO - Preparing dataset
Detecting languages:
[2022-05-31 02:00:24,496] {dataset.py:289} INFO - Detecting languages:


100%|██████████| 56083/56083 [09:07<00:00, 102.47it/s]

Detected languages:
[2022-05-31 02:09:31,816] {dataset.py:294} INFO - Detected languages:
lang
UNKNOWN       17
af             4
ar             1
ca             4
cy             6
da             6
de            80
en         55243
es           192
et             1
fr            86
he             2
id             7
it           113
ja             1
lt             1
nl            12
no             7
pl             2
pt            13
ro             2
ru             6
sk             2
sl             1
so             5
sw             1
tl             4
tr             8
vi             4
NaN            0
Name: lang, dtype: int64
[2022-05-31 02:09:31,823] {dataset.py:295} INFO - lang
UNKNOWN       17
af             4
ar             1
ca             4
cy             6
da             6
de            80
en         55243
es           192
et             1
fr            86
he             2
id             7
it           113
ja             1
lt             1
nl            12
no             7
pl       


100%|██████████| 55243/55243 [00:01<00:00, 43675.24it/s]
100%|██████████| 55243/55243 [00:00<00:00, 86618.43it/s]
100%|██████████| 55243/55243 [00:00<00:00, 67037.19it/s]
100%|██████████| 55243/55243 [00:02<00:00, 19349.83it/s]

Adding bow for skills_str
[2022-05-31 02:09:39,364] {dataset.py:607} INFO - Adding bow for skills_str





Adding bow for education_str
[2022-05-31 02:09:47,147] {dataset.py:607} INFO - Adding bow for education_str
Adding language for languages_str
[2022-05-31 02:09:51,847] {dataset.py:597} INFO - Adding language for languages_str


100%|██████████| 55243/55243 [00:00<00:00, 137262.16it/s]

Adding language for education_str
[2022-05-31 02:09:52,251] {dataset.py:597} INFO - Adding language for education_str



100%|██████████| 55243/55243 [00:00<00:00, 81152.40it/s]

Adding language for skills_str
[2022-05-31 02:09:52,934] {dataset.py:597} INFO - Adding language for skills_str



100%|██████████| 55243/55243 [00:01<00:00, 53578.42it/s]

Adding language for description_str
[2022-05-31 02:09:53,967] {dataset.py:597} INFO - Adding language for description_str



100%|██████████| 55243/55243 [00:03<00:00, 15825.56it/s]

keep_words 28331 / 64987 = 0.4359
[2022-05-31 02:09:57,466] {language.py:67} INFO - keep_words 28331 / 64987 = 0.4359



100%|██████████| 55243/55243 [00:16<00:00, 3288.10it/s]
100%|██████████| 55243/55243 [00:09<00:00, 5659.86it/s]


Saving dataset...
[2022-05-31 02:10:25,961] {dataset.py:194} INFO - Saving dataset...
Sampling 1000 examples for testing using data/test.index file...
[2022-05-31 02:10:26,891] {dataset.py:206} INFO - Sampling 1000 examples for testing using data/test.index file...
Removing 1000 test examples from train dataset...
[2022-05-31 02:10:26,914] {dataset.py:217} INFO - Removing 1000 test examples from train dataset...
Done! Removed 1000 test examples from train dataset
[2022-05-31 02:10:26,941] {dataset.py:221} INFO - Done! Removed 1000 test examples from train dataset
Done: Saved dataset in data/dataset_fasttext/
[2022-05-31 02:10:52,139] {dataset.py:248} INFO - Done: Saved dataset in data/dataset_fasttext/


In [5]:
# Plain-text corpus for fastText: one textual description per line.
train_file = "data/dataset_fasttext/train.txt"

# Render every dataset example to its textual description.
texts = [
    dataset.get_textual_description(idx)
    for idx in tqdm(range(len(dataset)))
]

# fastText expects UTF-8 input, so the encoding is set explicitly instead of
# relying on the platform default (which is not UTF-8 on every system).
with open(train_file, "w", encoding="utf-8") as file:
    file.writelines(text + "\n" for text in texts)

100%|██████████| 54243/54243 [00:49<00:00, 1104.41it/s]


In [6]:
# Train unsupervised fastText word vectors on the dumped corpus:
# character n-grams of length 3..6 (minn/maxn), 5 epochs, 100-dimensional vectors.
model = fasttext.train_unsupervised(train_file, minn=3, maxn=6, epoch=5, dim=100)

# Make sure the target directory exists before saving, so a fresh checkout
# does not lose the freshly trained model on a missing-directory error.
model_path = "model/fasttext/cv.en.100.bin"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save_model(model_path)

Read 4M words
Number of words:  14511
Number of labels: 0
Progress: 100.0% words/sec/thread:   51194 lr:  0.000000 avg.loss:  2.042001 ETA:   0h 0m 0s


In [7]:
# Sanity check of the trained vectors: list the nearest neighbours of a
# word that should be common in this corpus.
query_word = "excel"
model.get_nearest_neighbors(query_word)

[(0.8943033218383789, 'microsoft'),
 (0.8815516233444214, 'word'),
 (0.8810116052627563, 'msexcel'),
 (0.8755022883415222, 'office'),
 (0.8750035166740417, 'entry'),
 (0.8490250706672668, 'data'),
 (0.831983745098114, 'typing'),
 (0.82841956615448, 'msoffice'),
 (0.811237633228302, 'powerpoint'),
 (0.8015548586845398, 'spreadsheet')]