# 1 Extracting genes and variants

In [1]:
import pandas as pd
import math
from pathlib import Path
import numpy as np
import re

In [None]:
text = pd.read_csv("data/model_output/extracted_snippets.csv")
text = text.to_numpy() # 'WBPaper ID', 'Method', '* Genes', '* Gene-Variant combo', 'Mutation', 'Sentence'

OPENING_CLOSING_REGEXES = [r'((?:^|[\s\(\[\'"/,;\-])', r'(?:^|[\s\(\[\'"/,;\-]))']
wb_genes = np.load('data/gsoc/wbtools/all_gene_names.npy')
all_genes = Path('data/gsoc/wbtools/genes.txt').read_text().split('\n')
for g in wb_genes: all_genes.append(g)
all_genes = [g for g in all_genes if len(g) > 1]
all_genes = list(set(all_genes))
all_genes = OPENING_CLOSING_REGEXES[0] + '|'.join(all_genes) + OPENING_CLOSING_REGEXES[1]
all_genes = [re.compile(r,re.IGNORECASE) for r in [all_genes]]

# the allele regex and db idea was stolen from wbtools
allele_designations = np.load('data/gsoc/wbtools/wb_allele_designations.npy').astype('U6')
alleles_variations = np.load('data/gsoc/wbtools/wb_alleles_variations.npy').astype('U6')
DB_VAR_REGEX = r'({designations}|m|p|ts|gf|lf|d|sd|am|cs)([0-9]+)'
var_regex_1 = OPENING_CLOSING_REGEXES[0] + DB_VAR_REGEX.format(designations="|".join(allele_designations)) + OPENING_CLOSING_REGEXES[1]
all_var = OPENING_CLOSING_REGEXES[0] + '|'.join(alleles_variations) + '|' + var_regex_1 + OPENING_CLOSING_REGEXES[1]
all_var = [re.compile(r,re.IGNORECASE) for r in [all_var]]

# 'WBPaper ID', 'Method', '* Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
data_rows = []
total = len(text)
print('Total sentences: {}, processed count: '.format(total), end=' ')
for i, sent in enumerate(text[:, -1]):
    if (i+1) % 50 == 0: print(f"{i+1}", end = " ")
    genes = []
    for regex in all_genes:      
        for m in regex.finditer(sent):
            span = (m.start(0), m.end(0))    
            raw = (sent[span[0]:span[1]]).strip()
            raw = raw[1:] if not raw[0].isalnum() else raw
            raw = raw[:-1] if not raw[-1].isalnum() else raw
            if len(raw.strip()) > 1: genes.append(raw.strip())
    variants = []
    for regex in all_var:      
        for m in regex.finditer(sent):
            span = (m.start(0), m.end(0))    
            raw = (sent[span[0]:span[1]]).strip()
            raw = raw[1:] if not raw[0].isalnum() else raw
            raw = raw[:-1] if not raw[-1].isalnum() else raw
            if len(raw.strip()) > 1: variants.append(raw.strip())
    if genes:
        genes  = list(set(genes))
        genes = "'" + "', '".join(genes) + "'"
    else:
        genes = ''
    if variants:
        variants  = list(set(variants))
        variants = "'" + "', '".join(variants) + "'"
    else:
        variants = ''
    data_rows.append([text[i,0], text[i,1], genes, variants, text[i,2], text[i,3], text[i,4]])

In [3]:
# above cell takes a while to complete, so saving the data temporarily
data = pd.DataFrame(data_rows[:], columns=['WBPaper ID', 'Method', 'Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'])
data.to_csv("data/model_output/extracted_snippets_1.csv", index=False, encoding='utf-8')

# 2 Normalizing genes to WB dictionary

In [2]:
data = pd.read_csv("data/model_output/extracted_snippets_1.csv")
data = data.to_numpy() # 'WBPaper ID', 'Method', 'Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'

In [3]:
wb_genes_1 = Path('data/gsoc/Gene_alias.1.txt').read_text().split('\n')
wb_genes_2 = Path('data/gsoc/Gene_alias.2.txt').read_text().split('\n')
wb_genes_3 = Path('data/gsoc/Gene_alias.3.txt').read_text().split('\n')

wb_genes_1 = [r.split('\t') for r in wb_genes_1]
wb_genes_2 = [r.split(' ') for r in wb_genes_2]
wb_genes_3 = [r.split(' ') for r in wb_genes_3]

Inefficient way to do this. Have to work on better search algo.

In [4]:
all_wb_genes = dict()

for row in wb_genes_1+wb_genes_2+wb_genes_3:
    if row[0] not in all_wb_genes.keys():
        all_wb_genes[row[0]] = []
    for gene in row[1:]: 
        if len(gene) and gene.lower() not in all_wb_genes[row[0]]: 
            all_wb_genes[row[0]].append(gene.lower())
len(all_wb_genes)

306123

In [5]:
print('Total sentences: {}, processed count: '.format(len(data)), end=' ')
temp = []

for i, genes in enumerate(data[:, 2]):
    if (i+1) % 10 == 0: print(f"{i+1}", end = " ")
    # checking if nan
    if type(genes) == float:
        col_genes = ''
    else:
        genes = genes[1:-1].split("', '")
        col_genes = []
        
        for gene in genes:
            for key, value in all_wb_genes.items():
                if gene.lower() in value:
                    col_genes.append(key)
                    break
        if col_genes:
            col_genes = list(set(col_genes))
            col_genes = "'" + "', '".join(col_genes) + "'"
        else: 
            col_genes = ''
    temp.append([data[i,0], data[i,1], data[i,2], col_genes, data[i,3], data[i,4], data[i,5], data[i,6]])
    
data = temp # 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
temp = None

Total sentences: 60, processed count:  10 20 30 40 50 60 

Checking if any detected gene was NOT in the WB gene dictionary

In [6]:
data = np.array(data)
data[len(data[:,2]) != len(data[:,3])] 

array([], shape=(0, 60, 8), dtype='<U1900')

# 3 Normalizing mutations to  one-letter amino acid codes

These code imports would be doing the same thing done in notebook 2.   
TODO later, not code breaking: This additional metadata of how the mutation was extracted should be inside notebook 2.

In [7]:
import configparser

from utils.misc.regex_block import mutation_finder_from_regex_filepath, TmVar, CustomWBregex, normalize_mutations

In [8]:
db_config = configparser.ConfigParser()
db_config.read('utils/all_config.cfg')

custom_mut_extract = CustomWBregex(db_config, locus_only=True)
mf_mut_extract = mutation_finder_from_regex_filepath('data/regexs/mutationfinder_regex/seth_modified.txt')
tmvar_mut_extract = TmVar('data/regexs/tmvar_regex/final_regex_path')

  self._regular_expressions.append(re.compile(reg))


In [9]:
def point_mut_block(sentence, span_size=150):
    mut_and_snippets = []
    
    # MutationFinder
    for mutation, snip in mf_mut_extract(raw_text=sentence, span_size=span_size).items():
        mut_and_snippets.append([mutation.OriginalMention, snip])
    # tmVar
    mut_and_snippets = mut_and_snippets + tmvar_mut_extract(sentence, span_size=span_size)
    # Custom patterns
    mut_and_snippets = mut_and_snippets + custom_mut_extract(sentence, span_size=span_size)
    
    if mut_and_snippets:
        mut_and_snippets = mut_and_snippets[:][0]
        mut_and_snippets = list(set(mut_and_snippets))
    return mut_and_snippets

In [10]:
normalize_mutations('G1110 to E')

'G1110E'

In [11]:
normalize_mutations('Phe230Leu')

'F230L'

Working with the protein mutations from regex block for now

In [14]:
# old - 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
# new - 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'
temp = []
for i, row in enumerate(data):
    if row[1] != 'Regex':
        temp.append(np.insert(data[i], -1, '').tolist())
    else:
        norm_mutations = []
        mutations = data[i, -2][1:-1].split("', '")
        for mut in mutations: 
            mut = point_mut_block(mut)
            if mut:
                norm_mut = normalize_mutations(mut[0])
                norm_mutations.append(norm_mut)
        if norm_mutations:
            norm_mutations = list(set(norm_mutations))
            norm_mutations = "'" + "', '".join(norm_mutations) + "'"
        else: 
            norm_mutations = ''
        temp.append(np.insert(data[i], -1, norm_mutations).tolist())
        
data = temp
temp = None

In [15]:
# saving things
data = pd.DataFrame(data[:], columns=['WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'])
data.to_csv("data/model_output/extracted_snippets_2.csv", index=False, encoding='utf-8')

# 4 Validation

In [2]:
data = pd.read_csv("data/model_output/extracted_snippets_2.csv")
data = data.to_numpy() # 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'

In [3]:
proteinfa = Path('data/gsoc/proteinfa/c_elegans.PRJNA13758.WS281.protein.fa').read_text().split('>')[1:]

In [4]:
wb_gene_and_prot = dict() # {wbgene: [transcript, protein]}

for row in proteinfa:
    wbgene = re.findall("WBGene[0-9]+", row)[0]
    protein = "".join(re.findall("\n.*", row)).replace('\n','')
    transcript = row.split(' ')[0]
    if wbgene not in wb_gene_and_prot.keys():
        wb_gene_and_prot[wbgene] = []
    wb_gene_and_prot[wbgene].append([transcript, protein])
    
len(wb_gene_and_prot)

19987

In [5]:
paper_raw_info_compiled = dict()

# 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'
for row in data:
    ppr_id = row[0]
    norm_muts = row[-2]
    wbgenes = row[3]
    variants = row[4]
    # filtering out nan values
    if type(norm_muts) == float:
        continue
    if ppr_id not in paper_raw_info_compiled.keys():
        paper_raw_info_compiled[ppr_id] = {'Mutations':[], 'WBGenes':[], 'Variants':[]}
        
    norm_muts = norm_muts[1:-1].split("', '")
    for m in norm_muts: paper_raw_info_compiled[ppr_id]['Mutations'].append(m)
    if type(wbgenes) != float:
        wbgenes = wbgenes[1:-1].split("', '")
        for w in wbgenes: paper_raw_info_compiled[ppr_id]['WBGenes'].append(w)
    if type(variants) != float:
        variants = variants[1:-1].split("', '")
        for v in variants: paper_raw_info_compiled[ppr_id]['Variants'].append(v)

In [6]:
def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))

In [8]:
matches = [] 
final_sheet = [] # ppr_id, gene, transcript

for ppr_id, info_from_ppr in paper_raw_info_compiled.items():
    wbgenes = info_from_ppr['WBGenes']
    mutations = info_from_ppr['Mutations']
    for gene in wbgenes:
        for row in wb_gene_and_prot[gene]:
            transcript, protein_string = row
            for mut in mutations:
                wt_res = mut[0]
                pos = int(''.join(n for n in mut if n.isdigit()))
                mut_res = mut[-1]
                if protein_string[pos-1] == wt_res:
                    matches.append([ppr_id, gene + ' ' + mut + ' ' + transcript])
matches = unique_rows(matches)
for r in matches:
    p = r[0]
    g, m, t = r[1].split()
    final_sheet.append([p,g,m,t])

In [9]:
final_sheet

[['WBPaper00006391', 'WBGene00000901', 'E162K', 'W01G7.1'],
 ['WBPaper00040140', 'WBGene00000123', 'A363V', 'F36A4.7']]

In [44]:
# saving things
data = pd.DataFrame(final_sheet[:], columns=['WBPaper ID', 'WBGenes', 'Mutations', 'Transcript'])
data.to_csv("data/model_output/final_matches.csv", index=False, encoding='utf-8')