In [1]:
import gensim

In [2]:
# Observation: as the output below shows, gensim's stopword removal drops 'no'
# (both occurrences vanish from the second sentence). For text as dense as a
# research paper, that probably obscures the meaning of a sentence too much.
stopword_sample = "Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. "
gensim.parsing.preprocessing.remove_stopwords(stopword_sample)

'Cyclin-dependent kinases (CDKs) regulate variety fundamental cellular processes. CDK10 stands orphan CDKs activating cyclin identified kinase activity revealed.'

In [3]:
# The gensim stemmer is less sophisticated than some other stemmers: as far as I recall,
# it has no built-in dictionary of words and only applies algorithmic rules for reducing
# words (such as removing '-ing'). The two examples below suggest it may still be useful for us.
# Note in the output that words directly followed by punctuation (e.g. "processes.") stay unstemmed.
stemmer_samples = (
    "Cyclin-dependent kinases (CDKs) regulate a variety of fundamental cellular processes. CDK10 stands out as one of the last orphan CDKs for which no activating cyclin has been identified and no kinase activity revealed. ",
    " Previous work has shown that CDK10 silencing increases ETS2 (v-ets erythroblastosis virus E26 oncogene homolog 2)-driven activation of the MAPK pathway, which confers tamoxifen resistance to breast cancer cells.",
)
for sample_text in stemmer_samples:
    print(gensim.parsing.preprocessing.stem(sample_text))

cyclin-depend kinas (cdks) regul a varieti of fundament cellular processes. cdk10 stand out as on of the last orphan cdk for which no activ cyclin ha been identifi and no kinas activ revealed.
previou work ha shown that cdk10 silenc increas ets2 (v-et erythroblastosi viru e26 oncogen homolog 2)-driven activ of the mapk pathway, which confer tamoxifen resist to breast cancer cells.


In [4]:
# Sentence splitting appears to work well on this sample (four sentences in, four out).
sentence_sample = "The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10."
gensim.summarization.textcleaner.split_sentences(sentence_sample)

['The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive.',
 'Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin.',
 'Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations.',
 'We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10.']

In [5]:
# Disabled experiment (kept for reference): print every token that gensim's
# tokenize_by_word yields for the sample abstract.
# for word in gensim.summarization.textcleaner.tokenize_by_word("The precise mechanisms by which CDK10 modulates ETS2 activity, and more generally the functions of CDK10, remain elusive. Here we demonstrate that CDK10 is a cyclin-dependent kinase by identifying cyclin M as an activating cyclin. Cyclin M, an orphan cyclin, is the product of FAM58A, whose mutations cause STAR syndrome, a human developmental anomaly whose features include toe syndactyly, telecanthus, and anogenital and renal malformations. We show that STAR syndrome-associated cyclin M mutants are unable to interact with CDK10."):
#    print(word)

In [6]:
class MySentences:
    """Re-iterable stream of tokenized sentences read from a ``label||text`` file.

    Each call to ``iter()`` reopens the file and starts from the top, so one
    instance can be consumed several times (as happens when it is passed to
    ``gensim.models.Word2Vec`` as the training corpus).

    Parameters
    ----------
    fname : str
        Path of the input file. The first line is treated as a header and
        skipped; every other non-blank line must look like ``<label>||<text>``.
    stem : bool, optional
        Run gensim's stemmer over each document before it is split into
        sentences. Defaults to ``True`` (the original behaviour); pass
        ``False`` to build an unstemmed corpus for comparison.
    encoding : str, optional
        Encoding used to read the file. Defaults to ``"utf-8"`` so the result
        does not depend on the platform's default encoding.

    Yields
    ------
    list of str
        The non-empty tokens of one sentence.

    Raises
    ------
    ValueError
        If a non-blank data line does not contain the ``||`` separator.
    """

    def __init__(self, fname, stem=True, encoding="utf-8"):
        self.fname = fname
        self.stem = stem
        self.encoding = encoding

    def __iter__(self):
        preprocessing = gensim.parsing.preprocessing
        split_sentences = gensim.summarization.textcleaner.split_sentences

        # The context manager closes the handle once iteration finishes or the
        # generator is closed; the file is reopened on every new iteration.
        with open(self.fname, "r", encoding=self.encoding) as handle:
            for k, line in enumerate(handle):
                if k == 0:  # skip first line, containing header
                    continue
                if not line.strip():  # tolerate blank lines (e.g. at end of file)
                    continue

                # Split on the FIRST separator only, so that a "||" occurring
                # inside the text itself does not break the unpacking.
                _label, separator, text = line.partition("||")
                if not separator:
                    raise ValueError(
                        "line %d of %r has no '||' separator" % (k + 1, self.fname)
                    )

                # NOTE(review): stemming happens before punctuation is stripped, so
                # words glued to punctuation (e.g. "processes.") are left unstemmed.
                # The order is kept deliberately so the vocabulary stays comparable.
                if self.stem:
                    text = preprocessing.stem(text)

                for sentence in split_sentences(text):
                    sentence = preprocessing.strip_punctuation(sentence)
                    sentence = preprocessing.strip_multiple_whitespaces(sentence)
                    # split() without an argument drops the empty strings that
                    # split(" ") produced for leading/trailing blanks.
                    tokens = sentence.split()
                    if tokens:
                        yield tokens

                if k % 100 == 0:  # coarse progress indicator
                    print(k)

In [7]:
# Train a draft word2vec model on the streamed sentences and persist it to disk.
# The corpus is iterated several times during training, which is why the
# progress counter printed by MySentences restarts repeatedly in the output.
training_corpus = MySentences("data/training_text")
w2v_settings = dict(min_count=10, size=100, workers=2, window=10)
model = gensim.models.Word2Vec(training_corpus, **w2v_settings)
model.save("models/w2v_draft")

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300


In [8]:
# Pairwise similarity of query words. Each query goes through the same stemmer used
# for the training corpus so that it matches the (lower-cased, stemmed) vocabulary.
# The lookups use model.wv, like the analogy queries further down; calling
# similarity() directly on the Word2Vec model is deprecated in gensim.
print(model.wv.similarity(gensim.parsing.preprocessing.stem("phenotype"), gensim.parsing.preprocessing.stem("mutation")))
print(model.wv.similarity(gensim.parsing.preprocessing.stem("phenotype"), gensim.parsing.preprocessing.stem("supplementary")))
print(model.wv.similarity(gensim.parsing.preprocessing.stem("MECP2"), gensim.parsing.preprocessing.stem("FAM58A"))) # two genes

0.253435122235
0.0180694409598
0.391597545294


In [9]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("MECP2"))

[('coregulators', 0.6096699237823486),
 ('tgif', 0.5812826752662659),
 ('tcf1', 0.5306548476219177),
 ('multipurpos', 0.5202922821044922),
 ('soxcore', 0.5097544193267822),
 ('polyomaviru', 0.501846432685852),
 ('sterol', 0.49452584981918335),
 ('coactivators', 0.4882708489894867),
 ('corepressors', 0.4862823188304901),
 ('deltex1', 0.4826664924621582)]

In [10]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("FAM58A"))

[('c‐myc', 0.5921616554260254),
 ('0•0001', 0.5278559923171997),
 ('pcite4a', 0.510960042476654),
 ('s387', 0.5101921558380127),
 ('chehab', 0.5014484524726868),
 ('rsbweb', 0.4954987168312073),
 ('leder', 0.4905494451522827),
 ('translationt', 0.4891403913497925),
 ('ddg', 0.4890759587287903),
 ('medecine', 0.4860764443874359)]

In [11]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("proliferation"))

[('growth', 0.7840457558631897),
 ('proliferation', 0.7407917380332947),
 ('viabil', 0.718413233757019),
 ('viability', 0.679102897644043),
 ('apoptosi', 0.6767241954803467),
 ('motil', 0.6754286289215088),
 ('migrat', 0.6420519351959229),
 ('prolif', 0.6386007070541382),
 ('migration', 0.5674900412559509),
 ('apoptosis', 0.5645707249641418)]

In [12]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("malformation"))

[('overgrowth', 0.8914487957954407),
 ('malformations', 0.8556724190711975),
 ('malformation', 0.8470267057418823),
 ('skelet', 0.8389367461204529),
 ('meg', 0.8349440097808838),
 ('congenit', 0.829613447189331),
 ('anomalies', 0.8283162117004395),
 ('cortic', 0.8251320719718933),
 ('hypertroph', 0.82025146484375),
 ('hypoplasia', 0.818295419216156)]

In [13]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("mouse"))

[('murin', 0.7380586862564087),
 ('rodent', 0.6595398187637329),
 ('zebrafish', 0.6186248660087585),
 ('rat', 0.5817452669143677),
 ('mammari', 0.5809231996536255),
 ('knockout', 0.5788918733596802),
 ('transgen', 0.5592449903488159),
 ('mice', 0.549843966960907),
 ('embryo', 0.5462746620178223),
 ('neut', 0.5361408591270447)]

In [14]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("cyclin"))

[('cdk2', 0.600689172744751),
 ('p21', 0.5478947162628174),
 ('cdk6', 0.5254380702972412),
 ('cyclins', 0.5233792066574097),
 ('p27kip1', 0.5157514810562134),
 ('prb', 0.5146927833557129),
 ('cdc25a', 0.5141868591308594),
 ('p27', 0.5025893449783325),
 ('p21cip1', 0.49756577610969543),
 ('labaer', 0.4946333169937134)]

In [15]:
# Nearest neighbours of the stemmed query; uses model.wv because calling
# most_similar() directly on the Word2Vec model is deprecated in gensim.
model.wv.most_similar(gensim.parsing.preprocessing.stem("cdk10"))

[('ets2', 0.5947672128677368),
 ('rybp', 0.5735360980033875),
 ('grα', 0.5735219120979309),
 ('skp2', 0.5581691861152649),
 ('p27', 0.5579502582550049),
 ('cdc20', 0.552959680557251),
 ('cdc25a', 0.5506589412689209),
 ('catcs428c', 0.546405553817749),
 ('malt1', 0.5431875586509705),
 ('strepii', 0.5429892539978027)]

In [16]:
# Analogy-style query: vectors closest to (fam58a + genes - mecp2).
# The tokens are given literally (already lower-case), not passed through the stemmer.
gene_analogy = {"positive": ['fam58a', 'genes'], "negative": ['mecp2']}
model.wv.most_similar(**gene_analogy)

[('gene', 0.5157846212387085),
 ('d24', 0.4165746569633484),
 ('ccnd2', 0.41274768114089966),
 ('n561d', 0.40859490633010864),
 ('mett992i', 0.384358286857605),
 ('signif', 0.37893012166023254),
 ('uacc257', 0.37855881452560425),
 ('“high”', 0.37765616178512573),
 ('gen', 0.37401083111763),
 ('rb1', 0.37137362360954285)]

In [17]:
# Analogy-style query: vectors closest to (fam58a + cdk10 - mecp2).
# The tokens are given literally (already lower-case), not passed through the stemmer.
kinase_analogy = {"positive": ['fam58a', 'cdk10'], "negative": ['mecp2']}
model.wv.most_similar(**kinase_analogy)

[('fzr1', 0.4606555104255676),
 ('tgfbr2r537p', 0.4507219195365906),
 ('l704n', 0.446124404668808),
 ('δegsss', 0.4413014054298401),
 ('7egfr', 0.43945831060409546),
 ('strepii', 0.43881285190582275),
 ('c‐myc', 0.4317334294319153),
 ('de2', 0.42876535654067993),
 ('epitope–tag', 0.4284369945526123),
 ('p271–170', 0.42751091718673706)]