# variant2vec

Learn an embedding of variant annotations (HGVS, BIC, etc.), HUGO gene symbols, and classifications (pathogenic, benign, ...) from the text of PubMed articles.

The goal is to see whether an approach like [this](https://www.nature.com/articles/s41586-019-1335-8) paper can be applied to determine characteristics of a variant, or at least to identify the gene associated with a variant when a paper mentions multiple variants with no canonical gene or transcript qualifier.

In [1]:
import os
import sys
import re
import sqlite3
import pandas as pd
import gzip
import glob

def log(text):
    """Echo a message to the notebook output and to the process's original stdout.

    Args:
        text: The message to emit. Non-string values are converted with
            ``str()`` so both destinations receive the same text.
    """
    message = str(text)
    # Print to jupyter notebook and console so we can monitor progress in k8s
    print(message)
    console = sys.__stdout__
    # sys.__stdout__ may be None when the interpreter has no console attached
    if console is not None:
        console.write(message + "\n")
        console.flush()
    
# Syntactic sugar for debug vs. train parameters
def debug(debug_param, no_debug_param):
    """Choose between a debug value and a full-run value.

    Returns ``debug_param`` only when the ``DEBUG`` environment variable is
    exactly the string "True"; in every other case (unset, or any other
    value) returns ``no_debug_param``.
    """
    if os.environ.get("DEBUG") == "True":
        return debug_param
    return no_debug_param
# Announce which mode this run is in
log(debug("DEBUG: ON", "DEBUG: OFF"))

# Root directory of the literature-search crawl output. Set the CRAWL_PATH
# environment variable to point the notebook at a different copy; the default
# is the shared location used so far.
crawl_path = os.environ.get(
    "CRAWL_PATH", "/public/groups/brcaexchange/literature-search/crawl")

DEBUG: ON


## Ingest
Load the text from the pubMunch [pubStore](https://github.com/maximilianh/pubMunch/blob/master/lib/pubStore.py) generated by a BRCA Exchange literature search crawl.

In [2]:
# Open the article metadata database read-only (the URI form is what allows mode=ro)
connection = sqlite3.connect(f"file:{crawl_path}/text/articles.db?mode=ro", uri=True)
try:
    articles = pd.read_sql_query("SELECT * FROM articles", connection)
finally:
    # Release the database handle as soon as the table is in memory
    connection.close()
# Treat PubMed ids as string identifiers rather than numbers
articles.pmid = articles.pmid.astype(str)
print("{} articles loaded from the articles sqlite database".format(articles.shape[0]))
articles.head()

15079 articles loaded from the articles sqlite database


Unnamed: 0,articleId,externalId,source,publisher,origFile,journal,printIssn,eIssn,journalUniqueId,year,...,issue,page,pmid,pmcId,doi,fulltextUrl,time,offset,size,chunkId
0,5011585672,PMID11585672,,download,,Trends in genetics : TIG,0168-9525,0168-9525,8507085,2001,...,10,S18,11585672,,10.1016/s0168-9525(01)02451-9,https://linkinghub.elsevier.com/retrieve/pii/S...,2018-11-14T16:29:09+0000,78,5258,0_00000
1,5019688261,PMID19688261,,download,,Breast cancer research and treatment,0167-6806,1573-7217,8111104,2010,...,3,575,19688261,,10.1007/s10549-009-0501-3,,2018-11-14T16:29:16+0000,5048,100825,0_00000
2,5012228710,PMID12228710,,download,,"Science (New York, N.Y.)",0036-8075,1095-9203,404511,2002,...,5588,1837,12228710,,10.1126/science.297.5588.1837,,2018-11-14T16:29:40+0000,1436851129,148950,0_00000
3,5029369605,PMID29369605,,download,,Genetika,0016-6758,0016-6758,47354,2016,...,10,1215,29369605,,10.1134/s102279541609012x,https://link.springer.com/article/10.1134%2FS1...,2018-11-14T16:30:43+0000,4458079378,22264,0_00000
4,5022084640,PMID22084640,,download,,Therapeutic advances in medical oncology,1758-8340,1758-8359,101510808,2011,...,6,257,22084640,3210467.0,10.1177/1758834011417039,,2018-11-14T16:30:58+0000,6079002966,93229,0_00000


In [3]:
%%time
# Read every per-chunk file table first and concatenate once. Growing a frame
# with DataFrame.append inside the loop re-copies all accumulated rows on each
# pass, and DataFrame.append was removed in pandas 2.0.
# Paths are sorted so that row order (and positional lookups such as
# articles.iloc[42]) is the same on every run.
frames = [
    pd.read_csv(path, sep="\t", encoding="utf-8")
    for path in sorted(glob.glob(f"{crawl_path}/text/*.files.gz"))
]
# pd.concat raises on an empty list; keep the empty frame the loop used to leave
articles = pd.concat(frames) if frames else pd.DataFrame()

print("{} article text blocks consuming {:.2f} megabytes".format(
    articles.shape[0], articles.memory_usage(index=True).sum() / 2**20))

28938 article text blocks consuming 2.43 megabytes
CPU times: user 28.1 s, sys: 2.99 s, total: 31.1 s
Wall time: 31.1 s


In [4]:
# Preview the first five text-block rows
articles.head(n=5)

Unnamed: 0,#fileId,externalId,articleId,url,desc,fileType,time,mimeType,locFname,content
0,5011585672001,PMID11585672,5011585672,https://api.elsevier.com/content/article/pii/S...,supplemental file (.pdf),main.pdf,2018-11-20T17:13:51+0000,application/pdf,/crawl/download/files/11585672.main.pdf,Review|A TRENDS Guide to Mouse Models of H...
1,5019688261001,PMID19688261,5019688261,https://link.springer.com/article/10.1007/s105...,supplemental file (.html),main.html,2018-11-20T17:13:51+0000,text/html,/crawl/download/files/19688261.main.html,Breast Cancer Research and TreatmentJune 201...
2,5019688261002,PMID19688261,5019688261,https://link.springer.com/content/pdf/10.1007%...,supplemental file (.pdf),main.pdf,2018-11-20T17:13:51+0000,application/pdf,/crawl/download/files/19688261.main.pdf,PRECLINICAL STUDYAn integrative genomic and t...
3,5012228710001,PMID12228710,5012228710,http://science.sciencemag.org/content/297/5588...,supplemental file (.html),main.html,2018-11-20T17:13:51+0000,text/html,/crawl/download/files/12228710.main.html,Skip to main contentScience Logo * Home ...
4,5012228710002,PMID12228710,5012228710,http://science.sciencemag.org/content/sci/297/...,supplemental file (.pdf),main.pdf,2018-11-20T17:13:51+0000,application/pdf,/crawl/download/files/12228710.main.pdf,RESEARCH ARTICLESReferences and Notes1. S....


In [5]:
%%time
# Collect every distinct mention of the form c.<position><bases>><bases>
# (e.g. c.68A>G) found in any text block
variants = set()
# Raw string: \. and \d must reach the regex engine, not be parsed as string escapes
pattern = re.compile(r"c\.\d+[atcgATCG]+>[atcgATCG]+")
for article in articles.itertuples():
    # update() grows the set in place instead of rebuilding it on every row
    variants.update(pattern.findall(article.content))
print("Found {} variants".format(len(variants)))

Found 6460 variants
CPU times: user 7.33 s, sys: 556 ms, total: 7.89 s
Wall time: 7.89 s


## Clean
https://machinelearningmastery.com/clean-text-machine-learning-python/

In [6]:
!pip3 install --user --upgrade --quiet unidecode
import unidecode

!pip3 install --user --upgrade --quiet nltk
import nltk
import nltk.tokenize
import nltk.stem.porter

nltk.download("stopwords")
stop_words = nltk.corpus.stopwords.words('english')

stemmer = nltk.stem.porter.PorterStemmer()

[nltk_data] Downloading package stopwords to /tf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def clean(text):
    """Turn raw article text into a list of stemmed, lower-case tokens.

    The text is transliterated to ASCII, runs of the 0x07 control character
    are replaced by a single space (the crawl text appears to use it where
    line breaks were - TODO confirm against pubMunch), and the result is
    lower-cased and tokenized with NLTK. Tokens of length one and English
    stop words are discarded, which in practice also removes punctuation,
    and the remaining tokens are Porter-stemmed.

    Args:
        text: Raw text of one article block.

    Returns:
        List of stemmed tokens in their original order.
    """
    ascii_text = unidecode.unidecode(text)
    normalized = re.sub(r"[\x07]+", ' ', ascii_text).lower()

    tokens = nltk.tokenize.word_tokenize(normalized)

    # Filter and stem in one pass
    return [
        stemmer.stem(token)
        for token in tokens
        if len(token) > 1 and token not in stop_words
    ]

# Sanity check: clean the first 2000 characters of one arbitrary text block
sample_text = articles.iloc[42].content[0:2000]
print(clean(sample_text))

['cancer', 'breast', 'introduct', 'genet', 'test', 'famili', 'multipl', 'case', 'breast', 'and/or', 'ovarian', 'cancer', 'often', 'target', 'youngest', 'affect', 'woman', 'index', 'case', 'famili', 'clinic', 'genet', 'test', 'context', 'larg', 'limit', 'brca1', 'brca2', 'gene', 'unless', 'addit', 'indic', 'present', 'recent', 'women', 'breast', 'cancer', 'test', 'uninform', 'identifi', 'clearli', 'pathogen', 'mutat', 'either', 'gene.1', 'mani', 'put', 'breast', 'cancer', 'suscept', 'gene', 'identifi', 'vari', 'level', 'evid', 'associ', 'breast', 'cancer', 'today', 'diagnost', 'test', 'facil', 'includ', 'larg', 'number', 'gene', 'singl', 'panel', 'test', 'use', 'massiv', 'parallel', 'next', 'gener', 'sequenc', 'consider', 'reduc', 'cost', 'howev', 'gene', 'panel', 'test', 'pose', 'consider', 'challeng', 'clinic', 'genet', 'servic', 'mani', 'gene', 'valid', 'breast', 'cancer', 'suscept', 'gene', 'even', 'risk', 'associ', 'differ', 'type', 'mutat', 'poorli', 'defined.2', 'test', 'use', 'e

In [None]:
%%time
# In DEBUG mode only the first 250 text blocks are cleaned; otherwise all of them
limit = debug(250, None)
texts = [clean(content) for content in articles.iloc[0:limit].content]
print("Cleaned {} texts".format(len(texts)))

In [None]:
# Show one raw text block next to its cleaned tokens (same position in both)
example = 2

print("Before:")
print(articles.iloc[example].content[:2000])

print("After:")
print(texts[example][:250])

## Train

References:

https://radimrehurek.com/gensim/tut1.html

https://machinelearningmastery.com/develop-word-embeddings-python-gensim/

In [None]:
!pip3 install --user --upgrade --quiet gensim

import gensim
import gensim.models

In [None]:
# Train word2vec on the cleaned token lists with gensim's default hyper-parameters
model = gensim.models.Word2Vec(sentences=texts)

In [None]:
print(model)
# Look vectors up through model.wv: indexing the model object directly is
# deprecated in gensim 3.x and removed in gensim 4
print(model.wv['brca1'])

## Visualize

https://machinelearningmastery.com/calculate-principal-component-analysis-scratch-python/

In [None]:
import sklearn.decomposition
import matplotlib.pyplot as plt

In [None]:
# gensim 4 exposes the vocabulary as wv.key_to_index; gensim 3 as wv.vocab
vocab = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
# One row per vocabulary word, in vocabulary iteration order. Vectors are read
# through model.wv because indexing the model directly is deprecated.
X = model.wv[vocab]

In [None]:
# fit a 2d PCA model to the vectors
# gensim 4 exposes the vocabulary as wv.key_to_index; gensim 3 as wv.vocab
vocab = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
# Read vectors through model.wv; indexing the model directly is deprecated
X = model.wv[vocab]
pca = sklearn.decomposition.PCA(n_components=2)
result = pca.fit_transform(X)

In [None]:
# gensim 4 exposes the vocabulary as wv.key_to_index; gensim 3 as wv.vocab
vocab = model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
words = list(vocab)
targets = ["brca1", "brca2", "tp53", "her2", "breast", "cancer", "ovarian"]

# Row position of each word in X (X was built by iterating the same vocabulary)
row_of = {word: row for row, word in enumerate(words)}
# A target can be absent from the vocabulary, e.g. on a small DEBUG corpus;
# skip it with a notice instead of failing inside list.index
plotted = [w for w in targets if w in row_of]
missing = [w for w in targets if w not in row_of]
if missing:
    print("Not in vocabulary, skipped: {}".format(", ".join(missing)))
if len(plotted) < 2:
    raise ValueError("Need at least two target words in the vocabulary to fit a 2d PCA")
indexes = [row_of[w] for w in plotted]

# Project only the selected words' vectors onto their first two principal components
pca = sklearn.decomposition.PCA(n_components=2)
result = pca.fit_transform(X[indexes])

# create a scatter plot of the projection
plt.figure(figsize=(10,10))
plt.scatter(result[:, 0], result[:, 1])

# Rows of `result` follow the order of `plotted`
for i, word in enumerate(plotted):
    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
plt.show()