# 1 Extracting genes and variants

In [1]:
import pandas as pd
import math
from pathlib import Path
import numpy as np
import re
import glob
import configparser

from utils.misc.regex_block import MutationFinder, TmVar, CustomWBregex, normalize_mutations

In [2]:
# incase you've ran the prev notebook on splits of papers
data = []
for file in glob.glob("data/model_output/*.csv"):
    print(file)
    # 'WBPaper ID', 'Method', '* Genes', '* Gene-Variant combo', 'Mutation', 'Sentence'
    text = pd.read_csv(file).to_numpy().tolist()
    data = data + text 
data = np.array(data)

data/model_output/extracted_snippets.csv
data/model_output/extracted_snippets_part2.csv
data/model_output/extracted_snippets_part3.csv


In [3]:
OPENING_CLOSING_REGEXES = [r'((?:^|[\s\(\[\'"/,;\-])', r'(?:^|[\s\(\[\'"/,;\-]))']

# the allele regex and db idea was stolen from wbtools
allele_designations = np.load('data/gsoc/wbtools/wb_allele_designations.npy').astype('U6')
alleles_variations = np.load('data/gsoc/wbtools/wb_alleles_variations.npy').astype('U6')
DB_VAR_REGEX = r'({designations}|m|p|ts|gf|lf|d|sd|am|cs)([0-9]+)'
var_regex_1 = OPENING_CLOSING_REGEXES[0] + DB_VAR_REGEX.format(designations="|".join(allele_designations)) + OPENING_CLOSING_REGEXES[1]
all_var = OPENING_CLOSING_REGEXES[0] + '|'.join(alleles_variations) + '|' + var_regex_1 + OPENING_CLOSING_REGEXES[1]
all_var = [re.compile(r,re.IGNORECASE) for r in [all_var]]

# 'WBPaper ID', 'Method', '* Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
updated_data = []
total = len(data)
print('Total sentences: {}, processed count: '.format(total), end=' ')
for i, sent in enumerate(data[:, -1]):
    if (i+1) % 100 == 0: print(f"{i+1}", end = " ")
    variants = []
    for regex in all_var:      
        for m in regex.finditer(sent):
            span = (m.start(0), m.end(0))    
            raw = (sent[span[0]:span[1]]).strip()
            raw = raw[1:] if not raw[0].isalnum() else raw
            raw = raw[:-1] if not raw[-1].isalnum() else raw
            if len(raw.strip()) > 1: variants.append(raw.strip())
    if variants:
        variants  = list(set(variants))
        variants = "'" + "', '".join(variants) + "'"
    else:
        variants = ''
    updated_data.append([data[i,0], data[i,1], data[i,2], variants, data[i,-3], data[i,-2], data[i,-1]])

Total sentences: 19001, processed count:  100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 17100 17200 17300 17400 17500 17600 17700 1780

In [4]:
# above cell takes a while to complete, so saving the data temporarily
updated_data = pd.DataFrame(updated_data[:], columns=['WBPaper ID', 'Method', '*Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'])
updated_data.to_csv("data/model_output/processed/snippets_1.csv", index=False, encoding='utf-8')
updated_data = None

# 2 Normalizing genes to WB dictionary

In [5]:
data = pd.read_csv("data/model_output/processed/snippets_1.csv")
data = data.to_numpy() # 'WBPaper ID', 'Method', 'Genes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'

In [6]:
wb_genes_1 = Path('data/gsoc/Gene_alias.1.txt').read_text().split('\n')
wb_genes_2 = Path('data/gsoc/Gene_alias.2.txt').read_text().split('\n')
wb_genes_3 = Path('data/gsoc/Gene_alias.3.txt').read_text().split('\n')

wb_genes_1 = [r.split('\t') for r in wb_genes_1]
wb_genes_2 = [r.split(' ') for r in wb_genes_2]
wb_genes_3 = [r.split(' ') for r in wb_genes_3]

Inefficient way to do this. Have to work on better search algo.

In [7]:
all_wb_genes = dict()

for row in wb_genes_1+wb_genes_2+wb_genes_3:
    if row[0] not in all_wb_genes.keys():
        all_wb_genes[row[0]] = []
    for gene in row[1:]: 
        if len(gene) and gene.lower() not in all_wb_genes[row[0]]: 
            all_wb_genes[row[0]].append(gene.lower())
len(all_wb_genes)

306123

In [8]:
print('Total sentences: {}, processed count: '.format(len(data)), end=' ')
updated_data = []

for i, genes in enumerate(data[:, 2]):
    if (i+1) % 100 == 0: print(f"{i+1}", end = " ")
    # checking if nan
    if type(genes) == float:
        col_genes = ''
    else:
        genes = genes[1:-1].split("', '")
        col_genes = []
        
        for gene in genes:
            for key, value in all_wb_genes.items():
                if gene.lower() in value:
                    col_genes.append(key)
                    break
        if col_genes:
            col_genes = list(set(col_genes))
            col_genes = "'" + "', '".join(col_genes) + "'"
        else: 
            col_genes = ''
    updated_data.append([data[i,0], data[i,1], data[i,2], col_genes, data[i,3], data[i,4], data[i,5], data[i,6]])
    
data = updated_data # 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
updated_data = None

Total sentences: 19001, processed count:  100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 13100 13200 13300 13400 13500 13600 13700 13800 13900 14000 14100 14200 14300 14400 14500 14600 14700 14800 14900 15000 15100 15200 15300 15400 15500 15600 15700 15800 15900 16000 16100 16200 16300 16400 16500 16600 16700 16800 16900 17000 17100 17200 17300 17400 17500 17600 17700 1780

Checking if any detected gene was NOT in the WB gene dictionary

In [9]:
data = np.array(data)
data[len(data[:,2]) != len(data[:,3])] 

array([], shape=(0, 19001, 8), dtype='<U11773')

In [10]:
# above cell takes a while to complete, so saving the data temporarily
data = data.tolist()
data = pd.DataFrame(data[:], columns=['WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'])
data.to_csv("data/model_output/processed/snippets_2.csv", index=False, encoding='utf-8')
data = None

# 3 Normalizing mutations to  one-letter amino acid codes

These code imports would be doing the same thing done in notebook 2, but on a much small subset of data.   
TODO later, not code breaking: This additional metadata of how the mutation was extracted should be inside notebook 2.

In [2]:
data = pd.read_csv("data/model_output/processed/snippets_2.csv")
data = data.to_numpy() # 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'

In [3]:
db_config = configparser.ConfigParser()
db_config.read('utils/all_config.cfg')

custom_mut_extract = CustomWBregex(db_config, locus_only=True)
mf_mut_extract = MutationFinder('data/regexs/mutationfinder_regex/seth_modified.txt')
tmvar_mut_extract = TmVar('data/regexs/tmvar_regex/final_regex_path')

  self._regular_expressions.append(re.compile(reg))


In [4]:
def point_mut_block(sentence, span_size=150):
    mut_and_snippets = []
    
    # MutationFinder
    mut_and_snippets = mut_and_snippets + mf_mut_extract(sentence, span_size=span_size)
    # tmVar
    mut_and_snippets = mut_and_snippets + tmvar_mut_extract(sentence, span_size=span_size)
    # Custom patterns
    mut_and_snippets = mut_and_snippets + custom_mut_extract(sentence, span_size=span_size)

    if mut_and_snippets:
        mut_and_snippets = np.array(mut_and_snippets)
        mut_and_snippets = mut_and_snippets[:, 0].tolist()
        mut_and_snippets = list(set(mut_and_snippets))
    return mut_and_snippets

In [5]:
point_mut_block('ad465 , nucleotide 2862 of the coding region to 400 bp downstream G-to-A change at nucleotide 673 resulting in a stop codon from the stop site was amplified using primers ATGGATGAAC at amino acid 107; ad692 , T-to-G change at nucleotide 811 TATACAA')

['G-to-A change at nucleotide 673', 'T-to-G change at nucleotide 811']

In [6]:
normalize_mutations('T-to-G change at nucleotide 811'), normalize_mutations('Phe230Leu'), \
normalize_mutations('C to T at nucleotide 4539'), normalize_mutations('methionine for lysine-856'), \
normalize_mutations('glycine-118 is replaced by an arginine'), normalize_mutations('1247 (valine to leucine')

('T811G', 'F230L', 'C4539T', 'M856K', 'G118R', 'V1247L')

In [7]:
normalize_mutations('Phe230amber')

'F230AMBER'

Working with the protein mutations from regex block for now

In [8]:
# old - 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Sentence'
# new - 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'
temp = []
total_count = len(data)
ner_count = 0
regex_count = 0
print('Following mutations could NOT be normalized. Either a) normalize them manually and add in the csv file b) Make edits in the normalize_mutations fn')
for i, row in enumerate(data):
    if row[1] != 'Regex':
        if row[1] == 'NER':
            ner_count += 1
        temp.append(np.insert(data[i], -1, '').tolist())
    else:
        regex_count += 1
        norm_mutations = []
        mutations = data[i, -2][1:-1].split("', '")
        for raw_mut in mutations: 
            mut = point_mut_block(raw_mut)
            if mut:
                # helps filtering obvious ones
                for m in mut:
                    m = m.replace(",", "")
                    if m.find(')') != -1:
                        if m.find('(') == -1:
                            continue
                    try:
                        norm_mut = normalize_mutations(mut[0])
                        norm_mutations.append(norm_mut)
                    except KeyError:
                        print(m)
        if norm_mutations:
            norm_mutations = list(set(norm_mutations))
            norm_mutations = "'" + "', '".join(norm_mutations) + "'"
        else: 
            norm_mutations = ''
        temp.append(np.insert(data[i], -1, norm_mutations).tolist())
        
data = temp
temp = None

Following mutations could NOT be normalized. Either a) normalize them manually and add in the csv file b) Make edits in the normalize_mutations fn
Lys-82 to Lys
53 bp del
serine at position 377 to phenylalanine
arginine at position 551 with a histidine
glycine at position 573 with a serine
aspartic acid to asparagine at codon 652
glutamine at codon 13 to a stop
glutamic acid 230 to lysine
glycine at position 560 to an arginine
glycine at position 558 to an arginine
glycine at position 76 changed to glutamic acid
glycine at position 76 changed to glutamic acid
glutamic acid for glycine at codon 13


In [9]:
print('All', ner_count, 'NER data row was ignored. Only', regex_count, 'regex data rows were used.')

All 1400 NER data row was ignored. Only 626 regex data rows were used.


In [10]:
# saving things
data = pd.DataFrame(data[:], columns=['WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'])
data.to_csv("data/model_output/processed/snippets_3.csv", index=False, encoding='utf-8')

# 4 Validation

In [2]:
wb_genes_1 = Path('data/gsoc/Gene_alias.1.txt').read_text().split('\n')
wb_genes_2 = Path('data/gsoc/Gene_alias.2.txt').read_text().split('\n')
wb_genes_3 = Path('data/gsoc/Gene_alias.3.txt').read_text().split('\n')

wb_genes_1 = [r.split('\t') for r in wb_genes_1]
wb_genes_2 = [r.split(' ') for r in wb_genes_2]
wb_genes_3 = [r.split(' ') for r in wb_genes_3]

all_wb_genes = dict()
for row in wb_genes_1+wb_genes_2+wb_genes_3:
    if row[0] not in all_wb_genes.keys():
        all_wb_genes[row[0]] = []
    for gene in row[1:]: 
        if len(gene) and gene not in all_wb_genes[row[0]]: 
            if any(x.isupper() for x in gene):
                continue
            all_wb_genes[row[0]].append(gene)
len(all_wb_genes)

306123

In [3]:
data = pd.read_csv("data/model_output/processed/snippets_3.csv")
data = data.to_numpy() # 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'

In [4]:
proteinfa = Path('data/gsoc/proteinfa/c_elegans.PRJNA13758.WS281.protein.fa').read_text().split('>')[1:]

In [5]:
wb_gene_and_prot = dict() # {wbgene: [transcript, protein]}

for row in proteinfa:
    wbgene = re.findall("WBGene[0-9]+", row)[0]
    protein = "".join(re.findall("\n.*", row)).replace('\n','')
    transcript = row.split(' ')[0]
    if wbgene not in wb_gene_and_prot.keys():
        wb_gene_and_prot[wbgene] = []
    wb_gene_and_prot[wbgene].append([transcript, protein])
    
len(wb_gene_and_prot)

19987

In [6]:
def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))

### There are two ways to pair the genes and mutations - 
### A) Create a dictionary of genes and mutations mentioned in paper (within some constraint) and pair up one by one.
### B) Create a pair of gene and mutation only when BOTH are present in same sentence.
Results of each approach is present at end.

#### Approach A - 

In [7]:
# threshold of how close the neighboring sentence with mutation should be 
# putting 0 would mean genes would be considered only from sentence with mutation info
thres_neighboring_sent_count = 0

prev_line_with_mut = -9999999
paper_raw_info_compiled = dict()
# 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'
for row in data:
    ppr_id = row[0]
    norm_muts = row[-2]
    wbgenes = row[3]
    genes = row[2]
    variants = row[4]
    sentence = row[-1]
    line_number = int(row[-1].split()[1][:1])
    if ppr_id not in paper_raw_info_compiled.keys():
        paper_raw_info_compiled[ppr_id] = {'Mutations':[], 'WBGenes':[], 'Genes':[], 'Variants':[]}
        
    # filtering out nan values
    if type(norm_muts) != float:    
        prev_line_with_mut = line_number
        norm_muts = norm_muts[1:-1].split("', '")
        for m in norm_muts: 
            if m not in [row[0] for row in paper_raw_info_compiled[ppr_id]['Mutations']]:
                paper_raw_info_compiled[ppr_id]['Mutations'].append([m, sentence])
    elif not (line_number - prev_line_with_mut) <= thres_neighboring_sent_count:
        continue
        
    if type(wbgenes) != float:
        wbgenes = wbgenes[1:-1].split("', '")
        for w in wbgenes: 
            if w not in [row[0] for row in paper_raw_info_compiled[ppr_id]['WBGenes']]:
                paper_raw_info_compiled[ppr_id]['WBGenes'].append([w, sentence])
    if type(variants) != float:
        variants = variants[1:-1].split("', '")
        for v in variants: 
            if v not in [row[0] for row in paper_raw_info_compiled[ppr_id]['Variants']]:
                paper_raw_info_compiled[ppr_id]['Variants'].append([v, sentence])

In [8]:
matches = [] 
final_sheet = [] # ppr_id, gene, transcript

for i, (ppr_id, info_from_ppr) in enumerate(paper_raw_info_compiled.items()):  
    wbgenes = info_from_ppr['WBGenes']
    mutations = info_from_ppr['Mutations']
    for gene, gene_sent in wbgenes:
        if gene not in wb_gene_and_prot.keys():
            continue
        for row in wb_gene_and_prot[gene]:
            transcript, protein_string = row
            for mut, mut_sent in mutations:
                if not len(mut):
                    continue
                wt_res = mut[0]
                pos = int(''.join(n for n in mut if n.isdigit()))
                mut_res = mut[-1]
                try:
                    if protein_string[pos-1] == wt_res:
                        # adding these weird characters to make the splitting easier later
                        # hopefully no author ever decides put these exact sequence of characters in their paper
                        # because then the code will break hehe
                        matches.append([ppr_id, gene + ' ' + mut + ' ' + transcript + ' ?@#$' + gene_sent + ' ?@#$' + mut_sent])
                except IndexError:
                    pass
    
matches = unique_rows(matches)
for r in matches:
    p = r[0]
    temp, g_sent, m_sent = r[1].split(' ?@#$')
    wbg, m, t = temp.split()
    # Adding gene common names column, again
    # Current code doesn't keep any link between the WB gene name and the common name
    g_common_name = all_wb_genes[wbg]
    g_common_name = ', '.join(g_common_name)
    final_sheet.append([p,wbg,g_common_name,m,t,g_sent,m_sent])
final_sheet = np.array(final_sheet)

In [9]:
len(final_sheet)

1939

In [10]:
# saving things
final_sheet = pd.DataFrame(final_sheet[:], columns=['WBPaper ID', 'WBGene', 'Genes', 'Mutation', 'Transcript', 'Gene Sentence', 'Mutation Sentence'])
final_sheet.to_csv("data/model_output/processed/final_A.csv", index=False, encoding='utf-8')

#### Approach B - 

In [11]:
paper_raw_info_compiled = []
# 'WBPaper ID', 'Method', 'Genes', 'WBGenes', 'Variants', '*Gene-Variant combo ', 'Mutations', 'Normalized Mutations', 'Sentence'
for row in data:
    ppr_id = row[0]
    norm_muts = row[-2]
    wbgenes = row[3]
    sentence = row[-1]
        
    # filtering out nan values
    if type(norm_muts) != float and type(wbgenes) != float:    
        norm_muts = norm_muts[1:-1].split("', '")
        wbgenes = wbgenes[1:-1].split("', '")
        for m in norm_muts:
            for g in wbgenes:
                if len(m) and len(g):
                    paper_raw_info_compiled.append([ppr_id, g, m, sentence])

In [12]:
matches = [] 
final_sheet = [] # ppr_id, gene, transcript

for info_from_ppr in paper_raw_info_compiled:
    ppr_id = info_from_ppr[0]
    gene = info_from_ppr[1]
    mut = info_from_ppr[2]
    sent = info_from_ppr[3]
    if not len(mut):
        continue
    if gene not in wb_gene_and_prot.keys():
        continue
    for row in wb_gene_and_prot[gene]:
        transcript, protein_string = row
        wt_res = mut[0]
        pos = int(''.join(n for n in mut if n.isdigit()))
        mut_res = mut[-1]
        try:
            if protein_string[pos-1] == wt_res:
                # adding these weird characters to make the splitting easier later
                # hopefully no author ever decides put these exact sequence of characters in their paper
                # because then the code will break hehe
                matches.append([ppr_id, gene + ' ' + mut + ' ' + transcript + ' ?@#$' + sent])
        except IndexError:
            pass
    
matches = unique_rows(matches)
for r in matches:
    p = r[0]
    temp, sent = r[1].split(' ?@#$')
    wbg, m, t = temp.split()
    # Adding gene common names column, again
    # Current code doesn't keep any link between the WB gene name and the common name
    g_common_name = all_wb_genes[wbg]
    g_common_name = ', '.join(g_common_name)
    final_sheet.append([p,wbg,g_common_name,m,t, sent])

In [13]:
len(final_sheet)

941

In [14]:
# saving things
final_sheet = pd.DataFrame(final_sheet[:], columns=['WBPaper ID', 'WBGene', 'Gene', 'Mutation', 'Transcript', 'Sentence'])
final_sheet.to_csv("data/model_output/processed/final_B.csv", index=False, encoding='utf-8')

# 5 Verification

In [15]:
data_A = pd.read_csv("data/model_output/processed/final_A.csv")
data_A = data_A.to_numpy() 
paper_ids_processed_A = np.unique(data_A[:,0])
paper_ids_processed_A = np.sort(paper_ids_processed_A)

data_B = pd.read_csv("data/model_output/processed/final_B.csv")
data_B = data_B.to_numpy() 
paper_ids_processed_B = np.unique(data_B[:,0])
paper_ids_processed_B = np.sort(paper_ids_processed_B)

temp = pd.read_csv("data/model_output/processed/snippets_1.csv")
temp = temp.to_numpy()
total_paper_ids_processed = np.unique(temp[:,0])
temp = None

In [16]:
print('Total count of papers processed:', len(total_paper_ids_processed))
print('Count of papers from approach A:', len(paper_ids_processed_A))
print('Count of papers from approach B:', len(paper_ids_processed_B))

Total count of papers processed: 100
Count of papers from approach A: 68
Count of papers from approach B: 52


In [17]:
ground_truth = Path('data/gsoc/Variants_best_outcome.txt').read_text().split('\n')
ground_truth = [r.split('\t') for r in ground_truth][:-1]
ground_truth = np.array(ground_truth, dtype=object)

In [18]:
# Checking if any processed paper is not in the ground truth file
for id in paper_ids_processed_A:
    if id not in ground_truth[:,0]:
        print('Approach A', id)
for id in paper_ids_processed_B:
    if id not in ground_truth[:,0]:
        print('Approach B', id)

Approach A WBPaper00030864
Approach B WBPaper00030864


In [19]:
tp_col = []
for row in data_A:
    paper_id = row[0]
    gene = row[1]
    mutation = row[3]
    mutation = mutation.upper()
    transcript = row[4]
    bool_found = False
    for label in ground_truth[ground_truth[:,0] == paper_id]:
        label[-2] = label[-2].upper()
        if transcript == label[-1] and mutation == label[-2]:
            bool_found = True
            # continue bc we're storing all the labels from a paper
            continue
    if bool_found:
        tp_col.append('True Positive')
    else:
        tp_col.append('False Positive')

In [20]:
tp_col.count('True Positive'), tp_col.count('False Positive')

(399, 1540)

In [21]:
print('Precision ',tp_col.count('True Positive')*100/(tp_col.count('True Positive') + tp_col.count('False Positive')), '%')

Precision  20.577617328519857 %


In [22]:
tp_col = np.array(tp_col).T.reshape(-1, 1)
final_sheet = np.hstack((data_A,tp_col))
# saving things
final_sheet = pd.DataFrame(final_sheet[:], columns=['WBPaper ID', 'WBGene', 'Gene', 'Mutation', 'Transcript', 'Gene Sentence', 'Mutation Sentence', 'Result'])
final_sheet.to_csv("data/model_output/processed/final_A_verified.csv", index=False, encoding='utf-8')

In [23]:
tp_col = []
for row in data_B:
    paper_id = row[0]
    gene = row[1]
    mutation = row[3]
    mutation = mutation.upper()
    transcript = row[4]
    bool_found = False
    for label in ground_truth[ground_truth[:,0] == paper_id]:
        label[-2] = label[-2].upper()
        if transcript == label[-1] and mutation == label[-2]:
            bool_found = True
            # continue bc we're storing all the labels from a paper
            continue
    if bool_found:
        tp_col.append('True Positive')
    else:
        tp_col.append('False Positive')

In [24]:
tp_col.count('True Positive'), tp_col.count('False Positive')

(457, 484)

In [25]:
print('Precision ',tp_col.count('True Positive')*100/(tp_col.count('True Positive') + tp_col.count('False Positive')), '%')

Precision  48.565356004250795 %


In [26]:
tp_col = np.array(tp_col).T.reshape(-1, 1)
final_sheet = np.hstack((data_B,tp_col))
# saving things
final_sheet = pd.DataFrame(final_sheet[:], columns=['WBPaper ID', 'WBGene', 'Gene', 'Mutation', 'Transcript', 'Sentence', 'Result'])
final_sheet.to_csv("data/model_output/processed/final_B_verified.csv", index=False, encoding='utf-8')

## Results of approach A and B used in section 4:

Approach A-  
TP: 444, FP: 1501  
Precision: 22.82%  
Approach B-  
TP: 494, FP: 446  
Precision: 52.55%  

### Checking how many matches are present in the ground truth for the processed papers

In [27]:
all_from_truth = []
for ppr in paper_ids_processed_A:
    for label in ground_truth[ground_truth[:,0] == ppr]:
        label[-2] = label[-2].upper()
        all_from_truth.append(label)
len(all_from_truth)

2285

In [28]:
all_from_truth = []
for ppr in paper_ids_processed_B:
    for label in ground_truth[ground_truth[:,0] == ppr]:
        label[-2] = label[-2].upper()
        all_from_truth.append(label)
len(all_from_truth)

1953

Error in atexit._run_exitfuncs:
Traceback (most recent call last):
  File "/mnt/c/Users/Rishab/Documents/GitHub/genomic-info-from-papers/genomic-info-from-papers/env/lib/python3.8/site-packages/IPython/core/history.py", line 576, in end_session
    self.db.execute("""UPDATE sessions SET end=?, num_cmds=? WHERE
sqlite3.OperationalError: disk I/O error
