In [None]:
import torch
import multiprocessing
# CPU count reported by the OS; passed as `workers=` to both Doc2Vec models further down.
cores = multiprocessing.cpu_count()
# Bare expression so the notebook displays the detected value.
cores

4

In [None]:
import os
from tqdm import tqdm_notebook
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from sklearn import utils
# Root folder for the pickled sequence data: inputs are read from its train/ and test/
# subfolders, and inferred vectors are written back under <name>/<split>/ inside it.
DIR = './data/sequences'

In [None]:
# Load the pickled training inputs and their labels from DIR/train.
X_train = pd.read_pickle(os.path.join(DIR, 'train/X_train.pkl'))
y_train = pd.read_pickle(os.path.join(DIR, 'train/y_train.pkl'))

# Pair every sequence with its label; the label becomes the document's only tag.
# NOTE(review): `words` is passed through exactly as stored in the pickle. If the
# entries are whitespace-separated strings rather than token lists, gensim will
# iterate them one character at a time (spaces included) -- confirm whether
# `sequence.split()` is intended, here and wherever documents are built for inference.
train_tagged = pd.Series(
    [
        TaggedDocument(words=sequence, tags=[label])
        for sequence, label in zip(X_train, y_train)
    ]
)
# Preview the first tagged document.
train_tagged[0]

TaggedDocument(words='MSAQ KP GLHP RN RHHS RYDL ATLC QVNP ELRQ FLTL TPAG EQ SVD FANP LA VKALN KALLA HFY AVAN WDIP DG FLC PPVP GR ADYI HHLA DLLAE ASGT IPAN ASIL DI GVGA NCI YPLI GV HEY GWR FTGS ET SS QAL SS AQAII SSN PG LN RAIR LRRQ KE SGA IFN GIIH KN EQ YDA TLCN PPFH DS AAAA RAG SE RKRR NL GL NK DDAL NF GGQQQ EL WCE GG EVTF IK K M I E E S K G F A K Q V M W F T S L V S R G E N L P P L Y R A L T D V G A V K V V K K E M A Q G Q K Q S R F I A W T F M N D E Q R R R F V N R Q R', tags=['methyltransferase superfamil'])

In [None]:
from tqdm import tqdm_notebook
# Distributed bag-of-words variant (dm=0), trained with negative sampling only (hs=0).
model_dbow = Doc2Vec(
    dm=0,
    vector_size=512,
    window=32,
    negative=4,
    min_count=2,
    workers=cores,
    alpha=0.008,
    min_alpha=0.0001,
    hs=0,
)
# Materialise the corpus through a progress bar before scanning it for the vocabulary.
model_dbow.build_vocab(list(tqdm_notebook(train_tagged)))

# Distributed-memory variant (dm=1) with dm_mean=0; all other hyperparameters match DBOW.
model_dm = Doc2Vec(
    dm=1,
    vector_size=512,
    window=32,
    negative=4,
    min_count=2,
    workers=cores,
    alpha=0.008,
    min_alpha=0.0001,
    hs=0,
    dm_mean=0,
)
model_dm.build_vocab(list(tqdm_notebook(train_tagged)))

In [None]:
# Training corpus as a plain list so it can be shuffled and measured.
tokens = list(train_tagged.values)

# Total passes over the corpus for each model.
n_epochs = 10

# Train each model with ONE call and let gensim decay the learning rate linearly
# from `alpha` (0.008) down to `min_alpha` (0.0001) across all epochs.
# The previous per-epoch loop subtracted 0.002 from `alpha` after every pass: that
# reaches 0 after four passes and turns negative for the rest, and the negative
# value then became the default learning rate for later `infer_vector` calls.
for d2v_model in tqdm_notebook((model_dbow, model_dm)):
    d2v_model.train(
        utils.shuffle(tokens),  # shuffled copy; `tokens` itself keeps its order
        total_examples=len(tokens),
        epochs=n_epochs,
    )

In [None]:
model_dir = './models'

# Build the targets with os.path.join: plain string concatenation dropped the
# separator and produced './modelsdbow/model_dbow' and './modelsdm/model_dm'.
dbow_path = os.path.join(model_dir, 'dbow', 'model_dbow')
dm_path = os.path.join(model_dir, 'dm', 'model_dm')

# Make sure the per-model folders exist before saving into them.
for target_path in (dbow_path, dm_path):
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

model_dbow.save(dbow_path)
model_dm.save(dm_path)

In [None]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer one vector per tagged document and pair it with the document's first tag.

    Args:
        model: object exposing ``infer_vector(words, epochs=...)``.
        tagged_docs: container exposing ``.values`` (e.g. a pandas Series) whose
            items have ``.words`` and ``.tags`` attributes.
        epochs: number of inference passes forwarded to ``infer_vector``;
            defaults to 20, the value that used to be hard-coded.

    Returns:
        A ``(targets, regressors)`` pair of equally long tuples: ``targets[i]`` is
        the first tag of document ``i`` and ``regressors[i]`` its inferred vector.
        Both tuples are empty when ``tagged_docs`` is empty.
    """
    targets = []
    regressors = []
    # Explicit accumulation instead of zip(*pairs): unpacking zip(*[]) into two
    # names raises ValueError when the corpus is empty.
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, epochs=epochs))
    return tuple(targets), tuple(regressors)

In [None]:
# Load the pickled held-out inputs and their labels from DIR/test.
X_test = pd.read_pickle(os.path.join(DIR, 'test/X_test.pkl'))
y_test = pd.read_pickle(os.path.join(DIR, 'test/y_test.pkl'))

# Pair every sequence with its label; the label becomes the document's only tag.
# NOTE(review): `words` is passed through exactly as stored in the pickle. If the
# entries are whitespace-separated strings rather than token lists, inference will
# see them one character at a time -- confirm whether `sequence.split()` is
# intended, consistently with however the training documents are built.
test_tagged = pd.Series(
    [
        TaggedDocument(words=sequence, tags=[label])
        for sequence, label in zip(X_test, y_test)
    ]
)
# Preview the first tagged document.
test_tagged[0]

TaggedDocument(words='MAAL FVS LLA LTSL VPVQ GAATV PQTD YAKR AERVL KS AP LIDG HNDL LY AIRR STND QIYD GKLP FE TSLK GHTD LPRM RKGR MGGQ FW SVFI ACPS DPNA PI NTPK FA TRDT LEQI DV ARRL VD KYSK DLM YCD NPGC AKRA FR EGKI GSFI GIEG GHQV GSS IA ALRQA FY AGA RY MTLT HN CD NAWA TAAS TVRA GKP DLG MTDF GPAL IKE MNRL GM LV DLSH VSH Q T M R D V L K I T K A P V I F S H S S A Y E V S K H L R N V P D D V L K T V A K N N G V V M V T F V S S F V K V D D P D S A D V N T V V K H I F H I A E V A G W D H V G L G G D Y D G T T E L P K G L E D V S K Y P Y L I E K V L E A G A T E E Q A R K L V G E N V L R V W T E V E Q I A K K I Q R S G V L P V E E V W K G R N G T A L S E R S T F I E G P A P L E Y G C D', tags=['metallo-dependent hydrolases superfamil'])

In [None]:
import pickle
def get_vectors(model, train_tagged, name, training=True):
    """Infer vectors for a tagged corpus and pickle them under DIR/<name>/<split>/.

    Args:
        model: model forwarded to ``vec_for_learning`` for inference.
        train_tagged: tagged documents to embed (despite the name, this is also
            used for the test split).
        name: sub-folder identifying the model variant, e.g. ``'dbow'`` or ``'dm'``.
        training: ``True`` writes into the ``train`` split folder, ``False``
            into ``test``.

    Side effects:
        Creates ``DIR/<name>/<split>/`` if missing, writes ``regressors.pkl`` and
        ``targets.pkl`` into it, and prints a completion marker.
    """
    targets, regressors = vec_for_learning(model, train_tagged)

    # `split` rather than `dir`, so the builtin dir() is not shadowed.
    split = 'train' if training else 'test'
    out_dir = os.path.join(DIR, name, split)
    # open(..., 'wb') does not create parent folders, so make sure they exist.
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(out_dir, 'regressors.pkl'), 'wb') as fp:
        pickle.dump(regressors, fp)
    with open(os.path.join(out_dir, 'targets.pkl'), 'wb') as fp:
        pickle.dump(targets, fp)

    # The previous f-string nested single quotes inside single quotes and did not
    # parse; this reproduces the '[TRAIN-<name>-<split>-COMPLETE]' marker.
    print(f'[TRAIN-{name}-{split}-COMPLETE]')
# Export train and test vectors for each model variant, DBOW first, then DM.
for d2v_model, variant in ((model_dbow, 'dbow'), (model_dm, 'dm')):
    get_vectors(d2v_model, train_tagged, variant, True)
    get_vectors(d2v_model, test_tagged, variant, False)

[TRAIN-dbow-train-COMPLETE]
[TRAIN-dbow-test-COMPLETE]
[TRAIN-dm-train-COMPLETE]
[TRAIN-dm-test-COMPLETE]
