In [1]:
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import fasttext
from gensim.models.fasttext import FastText as FT_gensim
import fse
from fse import IndexedList
from fse.models import SIF, uSIF, Average
import pandas as pd
from tqdm.notebook import tqdm
from multiprocessing import Process, cpu_count, Manager, Pool
import numpy as np
import time
import pickle
import warnings
# Silence every warning category for the rest of the kernel session; no
# category or module filter is given, so this applies to all later cells.
warnings.filterwarnings('ignore')

In [2]:
# Directory prefixes used by the cells below (relative paths, trailing slash
# included because they are concatenated with file names via `+`).
data_path = '../../data_2020/'   # preprocessed CSV inputs are read from here
model_path = 'models/'           # trained models and tag2idx.pkl are written here
other_path = 'others/'           # not referenced by the cells visible in this section

# Thread count handed to the training calls: half of the CPUs reported by the
# OS, but never fewer than one. On a single-CPU machine `cpu_count()//2` is 0,
# which is not a usable worker/thread count.
workers = max(1, cpu_count()//2)

In [3]:
def _read_table(file_name):
    """Read one CSV from `data_path` and replace every missing cell with the string 'none'."""
    frame = pd.read_csv(data_path + file_name)
    return frame.fillna('none')


# Preprocessed inputs: train/test descriptions and the candidate papers.
train = _read_table('train_pre.csv')
test = _read_table('test_pre.csv')
candidate = _read_table('candidate_paper_pre.csv')

In [4]:
corpus = []          # list of token lists, one per text; for word2vec and fasttext
tagged_corpus = {}   # tag -> token list; for doc2vec
tag2idx = {}         # tag -> position of that text in `corpus`; for SIF
duplicate_tags = []  # tags produced more than once (the later row overwrites the earlier one)


def add_texts(frame, id_col, text_col, suffix):
    """Append every text of one DataFrame column to the shared corpus structures.

    For each row, the text in `text_col` is whitespace-tokenised once, appended
    to `corpus`, and registered under the tag `<id><suffix>` in both `tag2idx`
    (tag -> index into `corpus`) and `tagged_corpus` (tag -> tokens).

    Parameters
    ----------
    frame : DataFrame holding `id_col` and `text_col`.
    id_col : name of the column with string ids (concatenated with `suffix`).
    text_col : name of the column with whitespace-separated text.
    suffix : string appended to the id to form the tag.

    Notes
    -----
    `corpus` and `tagged_corpus` reference the same token-list object for a
    given row, so the text is split only once; treat those lists as read-only.
    If a tag already exists, the new row still gets its own `corpus` entry and
    the tag is re-pointed to it (same outcome as plain dict assignment); the
    tag is additionally recorded in `duplicate_tags` so collisions are visible.
    """
    for doc_id, text in tqdm(frame[[id_col, text_col]].values):
        tokens = text.split()
        tag = doc_id + suffix
        if tag in tag2idx:
            duplicate_tags.append(tag)
        tag2idx[tag] = len(corpus)   # index the row is about to occupy
        corpus.append(tokens)
        tagged_corpus[tag] = tokens


# key text
add_texts(train, 'description_id', 'key_text_pre', '_key_train')
add_texts(test, 'description_id', 'key_text_pre', '_key_test')

# description text
add_texts(train, 'description_id', 'description_text_pre', '_description_train')
add_texts(test, 'description_id', 'description_text_pre', '_description_test')

# candidate
# NOTE(review): every other text column ends in `_pre`; confirm that `title_pro`
# is the intended column name and not a typo for `title_pre`.
add_texts(candidate, 'paper_id', 'title_pro', '_title')
add_texts(candidate, 'paper_id', 'abstract_pre', '_abstract')

# Persist the tag -> corpus-index mapping next to the trained models.
with open(model_path+'tag2idx.pkl', 'wb') as f:
    pickle.dump(tag2idx, f)

# len(corpus) exceeds the number of tags whenever tags collide; report the
# collisions instead of letting the overwrite pass unnoticed.
if duplicate_tags:
    print('duplicate tags overwritten: %d, e.g. %s' % (len(duplicate_tags), duplicate_tags[:5]))

len(corpus), len(tagged_corpus), len(tag2idx)

HBox(children=(FloatProgress(value=0.0, max=62974.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=62974.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34428.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=838938.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=838938.0), HTML(value='')))




(1872680, 1872678, 1872678)

### word2vec (needed)

In [5]:
%%time
# 12m 48s
model = Word2Vec(corpus,
                 size=256,
                 min_count=2,
                 iter=15,
                 workers=workers)
model.save(model_path+'word2vec.model')

CPU times: user 1h 37min 29s, sys: 1min 48s, total: 1h 39min 17s
Wall time: 21min 56s


### SIF - word2vec (needed)

In [6]:
%%time
# 52s
sentences = IndexedList(corpus)
sif = SIF(model)
sif.train(sentences)
sif.save(model_path+'sif.model')

CPU times: user 1min 45s, sys: 42.2 s, total: 2min 27s
Wall time: 59.3 s


### uSIF - word2vec (useless)

In [7]:
# %%time
# # 43s
# model = Word2Vec.load(model_path+'word2vec.model')
# sentences = IndexedList(corpus)
# usif = uSIF(model)
# usif.train(sentences)
# usif.save(model_path+'usif.model')

### fasttext - FB (needed)

In [8]:
%%time
# 86m 18s
with open('corpus.txt', 'w') as f:
    for sent in tqdm(corpus):
        f.write(' '.join(sent)+'\n')
        
model = fasttext.train_unsupervised('corpus.txt',
                                    dim=256,
                                    minCount=2,
                                    thread=workers,
                                    epoch=15,
                                    wordNgrams=2)
model.save_model(model_path+"fasttext2.bin")

HBox(children=(FloatProgress(value=0.0, max=1872680.0), HTML(value='')))


CPU times: user 1d 8h 34min 27s, sys: 3min 11s, total: 1d 8h 37min 39s
Wall time: 1h 26min 18s


### fasttext - gensim (useless)

In [9]:
# %%time
# # 37m 58s
# model = FT_gensim(corpus,
#                   size=256,
#                   min_count=2,
#                   workers=workers,
#                   iter=15,
#                   word_ngrams=2)
# model.save(model_path+"fasttext_gensim.bin")

### SIF - fasttext bio (useless)

In [10]:
# %%time
# print('loading model...')
# model = FT_gensim.load(model_path+'fasttext_bio.bin')
# print('training fasttext...')
# model.train(corpus, epochs=5, total_examples=len(corpus), workers=workers)
# print('training SIF...')
# sentences = IndexedList(corpus)
# sif = SIF(model)
# sif.train(sentences)
# sif.save(model_path+'sif_bio.model')