# For training BioBERT NER on nala
Run this in CLI -  

python train_ner.py --model_name_or_path dmis-lab/biobert-base-cased-v1.1 --train_file data/nala/train_dev.json --validation_file data/nala/devel.json --text_column_name tokens --label_column_name tags --pad_to_max_length --max_length 192 --per_device_train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 10 --output_dir models/nala --seed 1
  
Note:  Training took <30 mins on 1660 ti. You can decrease the num_train_epochs count to 5 without any substantial difference in accuracy.

# 1 Regex 
## Has high precision and low recall.    
## Consists of 3 parts - MutationFinder, tmVar and some custom patterns from WB papers 
### 1.1 Mutation Finder [Link](https://github.com/divyashan/MutationFinder), Modified regex from SETH [Link](https://github.com/rockt/SETH/blob/master/resources/mutations.txt)

In [1]:
import re
import os
import time
 
import numpy as np
import pandas as pd
import glob
from pathlib import Path

from sklearn.metrics import precision_recall_fscore_support

In [2]:
from utils.misc.regex_block import mutation_finder_from_regex_filepath, TmVar, BOWdictionary, CustomWBregex

In [3]:
mf_regex_path = 'data/regexs/mutationfinder_regex/seth_modified.txt'
mf_mut_extract = mutation_finder_from_regex_filepath(mf_regex_path)

In [4]:
text = 'A(1154)C'
for mutation, snip in mf_mut_extract(raw_text=text, span_size=150).items():
    mutation_entry = snip + ' : ' + mutation.OriginalMention
    print(mutation_entry)

A(1154)C : A(1154)C


### 1.2 tmVar [Link](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/tmvar/)

In [5]:
tmvar_mut_extract = TmVar('data/regexs/tmvar_regex/final_regex_path')

  self._regular_expressions.append(re.compile(reg))


In [6]:
tmvar_mut_extract(' n2923 (A347V), n2870(R429K), and n1163(S486F')

[['A347V', ' n2923 (A347V), n2870(R429K), and n1163(S486F'],
 ['R429K', ' n2923 (A347V), n2870(R429K), and n1163(S486F']]

### 1.3 Extra custom regexs 

In [7]:
import configparser

In [8]:
db_config = configparser.ConfigParser()
db_config.read('utils/all_config.cfg')

['utils/all_config.cfg']

In [9]:
custom_mut_extract = CustomWBregex(db_config)

- in the statement ced-3(n2888) 
n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT

In [10]:
custom_mut_extract('n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT')

[['n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT',
  'n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT']]

### 1.4 Bag of words 

In [11]:
bow_mut_extract = BOWdictionary()
# bow_mut_extract('This mutation deletes 471bp of the promoter region, the transcriptional start and 56 amino acids of the second exon.')

### 1.6 MF + tmVar + Custom regex + BOW

In [None]:
def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))


def regex_block(sentence, span_size=150):
    mut_and_snippets = []
    
    # MutationFinder
    for mutation, snip in mf_mut_extract(raw_text=sentence, span_size=span_size).items():
        mut_and_snippets.append([mutation.OriginalMention, snip])
    
    # tmVar
    mut_and_snippets = mut_and_snippets + tmvar_mut_extract(sentence, span_size=span_size)
    # Custom patterns
    mut_and_snippets = mut_and_snippets + custom_mut_extract(sentence, span_size=span_size)
    # Bag of words
    mut_and_snippets = mut_and_snippets + bow_mut_extract(sentence)
    
    if mut_and_snippets:
        mut_and_snippets = unique_rows(mut_and_snippets).tolist()
    return mut_and_snippets

In [13]:
regex_block(' asdf gpa-2 ::Tc1 asdf as')

[]

### 1.7 * Additional details  

In [14]:
def extra_info_block(sentence, span_size=150):
    info_and_snippets = []

    # look for gene and variant combo
    info_and_snippets = info_and_snippets + custom_mut_extract.var_and_gene_close(sentence, span_size=span_size)
    
    if info_and_snippets:
        info_and_snippets = unique_rows(info_and_snippets).tolist()
    return info_and_snippets

In [15]:
extra_info_block('in the statement ced-3(n2888)')

[['ced-3(n2888', 'Gene & Variant']]

## Changes in scores
MF + tmVar only: on remarks - (0.9459459459459459, 0.39923954372623577, 0.5614973262032086, None)   
on remarks + nala - (0.8671477079796265, 0.6698360655737705, 0.755826859045505, None)  
    
MF + tmVar + Custom + BOW: on remarks - (0.9606741573033708, 0.6501901140684411, 0.7755102040816326, None)  
on remarks + nala - (0.8680868496517821, 0.6947540983606557, 0.7718084137679839, None)  

# 2 BioBERT NER

In [16]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from transformers import TokenClassificationPipeline

import re
import pandas as pd
import numpy as np
import sklearn as sk
import math 
import string
import time
import json
import csv
import shutil
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/risubu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
model_name_or_path = 'models/nala'
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
)
# LABEL_0 - B-mut, LABEL_1 - I-mut, LABEL_2 - O
nala_ner  = TokenClassificationPipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='first')

In [18]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words = [w for w in stop_words if len(w) > 1]

def ner_score(sentence):
    mutations = []
    try:
        ner_output = nala_ner(sentence)
        for i, grp in enumerate(ner_output):
            if grp['entity_group'] == 'LABEL_0':
                mut = grp['word']
                for j in range(i+1, len(ner_output)):
                    if ner_output[j]['entity_group'] == 'LABEL_1':
                        mut  = mut + ' ' + ner_output[j]['word']
                    else:
                        # NER would be handling only data in NL form
                        if len(mut.split()) > 3 and any(word in mut.split() for word in stop_words):
                            mutations.append([mut, sentence])
                        break
    except:
        pass
    return mutations

# 3 Testing 
## 3.2 On scored curator remarks and nala mutation corpus 

In [19]:
# all_texts = []
# issues_count = 0

# df = pd.read_csv(r"data\gsoc\Remarks_scored.csv")
# df = df.to_numpy()
# # extract sentences
# text = []
# y_true = []
# for idx, row in enumerate(df):
#     loc = str(row[2]).find('Paper_evidence')
#     if loc != -1:
#         if row[0].split()[0] == 'Yes':
#             y_true.append(1)
#         elif row[0].split()[0] == 'No':
#             y_true.append(0)
#         else:
#             continue
#         temp_str = str(row[2][1:loc-2]).replace("\"", "'")
#         text.append(temp_str)

# assert len(text) == len(y_true)
# print('Count from Remarks_scored = {}'.format(len(text)))

# y_pred = []
# for sentence in text:
#     if regex_block(sentence):
#         y_pred.append(1)
#     elif ner_score(sentence):
#         y_pred.append(1)
#     else:
#         y_pred.append(0)

# assert len(y_pred) == len(y_true)
# all_texts = all_texts + text


# df = pd.read_csv(r"data\nala\binary_nala_NOT_NER.csv")
# df = df.to_numpy()
# print('Count from nala = {}'.format(len(df)))

# print('Entries processed: ', end=' ')
# for i, row in enumerate(df):
#     if i%500 == 0: print(i, end=' ')
#     sentence = row[0]
#     label = row[1]
#     try:
#         if regex_block(sentence):
#             y_pred.append(1)
#         elif ner_score(sentence):
#             y_pred.append(1)
#         else:
#             y_pred.append(0)

#         if label == 1:
#             y_true.append(1)
#         else:
#             y_true.append(0)
        
#         all_texts.append(sentence)
#     except:
#         issues_count += 1
#         pass

# assert len(y_pred) == len(y_true) == len(all_texts)
# print('\nTotal count = {}'.format(len(y_pred)))
# if issues_count: print('Note: Could not process {} sentences'.format(issues_count))

# precision_recall_fscore_support(y_true, y_pred, average='binary')

In [20]:
# # Manually inspecting incorrect preds
# bad = []
# assert len(y_pred) == len(y_true) == len(all_texts)
# for i, (t, p,sent) in enumerate(zip(y_true, y_pred, all_texts)):
#     if t != p:
#         if t:
#             bad.append([sent,i])
# len(bad)
# # print(bad[:10])

## Changes in scores (find the test cell block below)
  
Regex only: on remarks - (0.9606741573033708, 0.6501901140684411, 0.7755102040816326, None)  
on remarks + nala - (0.8680868496517821, 0.6947540983606557, 0.7718084137679839, None) 
  
Regex + NER: on remarks - (0.8592057761732852, 0.9049429657794676, 0.8814814814814815, None)   
on remarks + nala - (0.7570694087403599, 0.9655737704918033, 0.8487031700288185, None)  
  
### Retraining on NL and SST  
Regex + NER: on remarks - (0.9627906976744186, 0.7870722433460076, 0.8661087866108786, None)      
on remarks + nala - (0.8461816865725661, 0.7665573770491804, 0.804403922243248, None)  

## 3.2 On WB papers
### Get the paper texts from textpresso API and wbtools

### How it works - First paper ID is searched through textpresso API. If the recieved output is blank, then wbtools is used.  

In [21]:
import sys
import os.path
import argparse
import subprocess
import requests
import csv
import json
import nltk.data
import os
from xml.dom import minidom
from bs4 import BeautifulSoup
from wbtools.literature.corpus import CorpusManager
import platform

Uncomment and run the cell below only once to save the paper sentences in your local computer to avoid pulling paper text everytime during dev  

In [None]:
'''
Create a separate temporary numpy file to store the paper sentences  
to avoid spending time on pulling sentences while in development 
Format - [paper_id, sentence]
'''
# final_data = []
# remove_sections = []
# # random 100 papers mentioned the remarks ace file in data/gsoc
# paper_ids = np.load('data\top100.npy')
# cm = CorpusManager()
# for i, paper_id in enumerate(paper_ids):
#     paper_id = paper_id[7:]
#     cm.load_from_wb_database(db_name=config['wb_database']['db_name'], db_user=config['wb_database']['db_user'], db_password=config['wb_database']['db_password'],
#         db_host=config['wb_database']['db_host'], paper_ids=[paper_id],
#         ssh_host=config['wb_database']['ssh_host'], ssh_user=config['wb_database']['ssh_user'], ssh_passwd=config['wb_database']['ssh_passwd'],
#         load_bib_info=False, load_afp_info=False, load_curation_info=False)
#     sentences = cm.get_paper(paper_id).get_text_docs(remove_sections=remove_sections,split_sentences=True)
#     for sent in sentences:
#       final_data.append([paper_id, sent])
#     print(i, end = " ")
# final_data = np.array(final_data)
# np.save('id_and_sentence.npy', final_data)

In [22]:
def textpresso_paper_text(wbpid, path, token):
    """This sub takes a wbpid eg WBPaper00056731 and returns the fulltext paper in sentences"""
    
    ft=[0];
    # Check that wbpid is a valid WBPaper
    if not re.match( 'WBPaper', wbpid):
        print (wbpid, "is not a valid WBPaper ID")
        return ft
    # Download paper if it doesn't exist
    fn = path + '/temp/' + wbpid + '.json'

    if os.path.exists(fn) and os.path.getsize(fn) > 16:
        pass
    else:
        com1 = '-o '+fn +'\n-k '+ '\n'+'-d "{\\"token\\":\\"'+ token + '\\", \\"query\\": {\\"accession\\": \\"' + wbpid +'\\", \\"type\\": \\"document\\", \\"corpora\\": [\\"C. elegans\\"]}, \\"include_fulltext\\": true}"'
        configf= path +'/temp/' + wbpid + '.tmp.config'
        curlf = open(configf,'w')
        print (com1, file=curlf)
        curlf.close()
        command = 'curl -o '+ fn +' -K '+ configf+' https://textpressocentral.org:18080/v1/textpresso/api/search_documents' 
        comlist = command.split()
        os.system(command)

    # Read the paper, and split into sentences
    if os.path.exists(fn) and os.path.getsize(fn) > 20:
        # Open our JSON file and load it into python
        input_file = open (fn)
        json_array = json.load(input_file)
        for item in json_array:
            abs = item["abstract"]
            fullt =  item["fulltext"]
            tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            ft = tokenizer.tokenize(abs)
            ftt=tokenizer.tokenize(fullt)
            ft = ft +ftt
    else:
        # some paper texts are blank for some reason
        # pipeline uses wbtools to get sentences in such case
        pass

    outfilen = os.path.join(path, 'text_flatfiles', wbpid+'.txt')
    outf = open(outfilen, 'w')
    for sen in ft:
        sen =str(sen)
        print(sen, file=outf)
    outf.close()
    return outfilen


'''
Sentences out of wbtools are sometimes weird, especially for the old papers
This increases the false neg rate so this is used only when the textpresso api provides bad text
Most of the issues have band aid fixes in the text preprocessing cell below but there are some 
with no easy fix or need to be worked on - 
1. Table content is extracted column wise, not row wise 
2. Some sentences have white space between every character and are also somehow inverted (???)
    e.g. Check out the sentences of WBPaper00002018 (1994)
    Line 154 -  '0 7 1 u ( 7 - c e m    ) F (   .'
    inverted and without the white space is (mec-7)u170 which is extremely useful and will get missed 
    by the pipeline unless processed correctly.
TODO: Not sure how to solve point 1 but point 2 is easy to solve and also helps a LOT.
    Rishab, work on this after you complete the project. 
    Not high priority as this might be only for the >10 year old papers (which are already manually curated)
'''
cm = CorpusManager()
def wbtools_paper_text(wbpid, db_name, db_user, db_password, db_host, ssh_host,\
    ssh_user, ssh_passwd):
    # sectioning might not be always correct, text processing is done separately in the pipeline
    # remove_sections = [PaperSections.ACKNOWLEDGEMENTS, PaperSections.REFERENCES, PaperSections.RELATED_WORK, PaperSections.INTRODUCTION]
    remove_sections = []
    paper_id = wbpid[7:]
    cm.load_from_wb_database(db_name=db_name, db_user=db_user, db_password=db_password,
        db_host=db_host, paper_ids=[paper_id],
        ssh_host=ssh_host, ssh_user=ssh_user, ssh_passwd=ssh_passwd,
        load_bib_info=False, load_afp_info=False, load_curation_info=False)
    sentences = cm.get_paper(paper_id).get_text_docs(remove_sections=remove_sections,split_sentences=True)
    return sentences


def get_paper_sentences(wbpids, config, store_ppr_path):
    '''
    Takes WB Paper IDs and returns a list of sentences from those papers after filtering
    Arg:
    wbpids - List of wb papers ids 
        e.g. ['WBPaper00002379']
    config_path - Config file path
    store_ppr_path - Folder path to store the paper flatfiles retrieved from TextPresso for future use
    Returns:
    paperid_sentence_list: List of paper ID and sentence
        e.g. [['WBPaper00002379', 'First sentence'], ['WBPaper00002379', 'Second sentence'], ....]
    '''
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stop_words = [w for w in stop_words if len(w) > 1]

    all_special_chars = []
    with open('data/nala/train_dev.json') as f:
        for jsonObj in f:
            nala_json = json.loads(jsonObj)['tokens']
            for word in nala_json:
                if not word.isalnum():
                    all_special_chars.append(word)
    # list of special characters to keep during inference
    # helps with clearing out the bad characters from old papers
    all_special_chars = list(set(all_special_chars))
    
    token = config['textpresso']['token']
    db_name=config['wb_database']['db_name']
    db_user=config['wb_database']['db_user']
    db_password=config['wb_database']['db_password']
    db_host=config['wb_database']['db_host']
    ssh_host=config['wb_database']['ssh_host']
    ssh_user=config['wb_database']['ssh_user']
    ssh_passwd=config['wb_database']['ssh_passwd']

    temp_paperid_sentence = np.array([])
    if os.path.isfile('data/id_and_sentence.npy'):
        temp_paperid_sentence = np.load('data/id_and_sentence.npy')
    paperid_sentence_list = np.array([['WBPaperID', 'Sentence']])

    for id in wbpids:
        # textpresso_paper_text() also saves the text in flatfiles for future use 
        paper_path = textpresso_paper_text(id, store_ppr_path, token)
        txt = Path(paper_path).read_text().split('\n')
        # deals with empty text files with only "0"
        if len(txt) == 2:
            if platform.system() != 'Windows':
                txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                    ssh_user, ssh_passwd)
            elif temp_paperid_sentence.size != 0:
                txt = temp_paperid_sentence[temp_paperid_sentence[:, 0] == id[7:]][:, 1]
            
        for row in txt: 
            if row.find('fifi') == -1:
                if platform.system() != 'Windows':
                    txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                        ssh_user, ssh_passwd)
                elif temp_paperid_sentence.size != 0:
                    txt = temp_paperid_sentence[temp_paperid_sentence[:, 0] == id[7:]][:, 1]
                break
            
        count_total_rows = len(txt)
        for current_i, row in enumerate(txt):
            if row.lower().find("we thank") == 0 or row.lower().find("this work was supported") == 0 \
                or row.lower().find("references") == 0 or row.lower().find("we also thank") == 0 \
                or row.lower().find("this research was supported") == 0 or row.lower().find("we acknowledge") == 0 \
                or row.lower().find("acknowledgments") == 0 or row.lower().find('literature cited') != -1:
                if current_i > count_total_rows/2:
                    break

            # usually is bad sentence
            if len(row) < 40 or not any(word in row.lower().split() for word in stop_words):
                continue
            # remove sentences with links and email ids
            if re.search('\S+@\S+\.', row) or re.search('www.\S+\.', row):
                continue
            # filters one word sentences
            if len(row.split()) == 1:
                continue
            # sentences comprised of only single characters 
            # ^ seems to be issue with wbtools extraction pipeline 
            if all(len(word) < 5 for word in row.split()):
                continue
            row = re.sub("\( *cid *: *\d+ *\)", " ", row)
            temp_row = row
            for c in temp_row:
                if (not c.isalnum() and not c == ' ') and c not in all_special_chars:
                        row = row.replace(c, "")
            # fixes bad space between each character of flanking sequence from old papers
            flanking_regex = re.compile('([ACTG]( +)){4,}', re.IGNORECASE)
            for m in flanking_regex.finditer(row):
                span = (m.start(0), m.end(0))   
                span = row[span[0]:span[1]-1]
                correct_flank = re.sub('([ACTG])( +)', r'\1', row, flags=re.I)
                row = row.replace(span, correct_flank)

            # filters out repeated lines, e.g. check out WBPaper00028727.txt in flatfiles folder
            if row not in paperid_sentence_list[paperid_sentence_list[:,0]==id][:,1]:
                paperid_sentence_list = np.vstack((paperid_sentence_list, [id, row]))
    return paperid_sentence_list[1:]

In [23]:
# papers mentioned the remarks ace file in data/gsoc
ids_to_extract = np.load('data/top100.npy').tolist()[-2:]
paperid_sentence_list = get_paper_sentences(ids_to_extract, db_config, store_ppr_path='data/wbpapers')

In [24]:
print('Number of sentences and characters: ', end=' ')
print(len(paperid_sentence_list), sum([len(sent[1]) for sent in paperid_sentence_list]))

Number of sentences and characters:  492 82385


This cell takes a while to run - mainly due to the huge regex blocks   
~ 2 seconds per sentence   
Can't really switch them off though. There might be a smarter way to work with the regex block but this pipeline would be probably running every month on 50 or so papers (around 8 hours) so low priority.   

In [30]:
start_time = time.time()
final = [
    ['temporary', 'temporary', 'temporary', 'temporary', 'temporary'],\
    ['WBPaper ID', 'Method', '*Gene-Variant combo', 'Mutation', 'Sentence']]
total_sentences = len(paperid_sentence_list)
print('Total sentences to process ', len(paperid_sentence_list))
print('{total sentences processed}>{sentences with mutation}')
for ppr_sen_idx, row in enumerate(paperid_sentence_list):
    if (ppr_sen_idx+1) % 50 == 0: print(f"{ppr_sen_idx+1}>{len(final)-1}", end = " ")
    paper_id = row[0]
    sentence = str()
    limit = min(ppr_sen_idx+2, total_sentences)
    # some sentences - mostly table content are super long
    # temp fix, need to have a nice sentence splitter to minimize manual verification time
    not_single_sentence = False
    for i in range(ppr_sen_idx, limit):

        sentence = sentence + paperid_sentence_list[i][1] + ' '

        if (len(sentence) > 250 and not_single_sentence):
            break
        if paper_id != paperid_sentence_list[i][0]:
            break
        
        var_plus_genes = ''
        # Look for gene-variant combo e.g 'ced-3(n2888)' only on single sentences 
        if not not_single_sentence:
            var_plus_genes = []
            for data_and_cat in custom_mut_extract.var_and_gene_close(sentence.strip()):
                var_plus_genes.append(data_and_cat[0])
            if var_plus_genes:
                var_plus_genes = "'" + "', '".join(var_plus_genes) + "'"
            else:
                var_plus_genes = ''
                
        output = regex_block(sentence.strip())
        if output:
            mutations = []
            for mut_and_snip in output:
                # temp fix to deal with same mutation getting detected due to stiching multiple sentences
                if (mut_and_snip[0] not in final[-1][3][1:-1].split(", ") and mut_and_snip[0] not in final[-2][3][1:-1].split(", ")) \
                            and mut_and_snip[0] not in mutations:
                    mutations.append(mut_and_snip[0])
            if mutations:
                mutations = "'" + "', '".join(mutations) + "'"
                print(1, mutations)
                final.append([paper_id, 'Regex', var_plus_genes, mutations, 'Line '+str(ppr_sen_idx)+': '+sentence.strip()])
            break

        output = ner_score(sentence.strip())
        if output:
            mutations = []
            for mut_and_snip in output:
                # temp fix to deal with same mutation getting detected due to stiching multiple sentences
                if (mut_and_snip[0] not in final[-1][3][1:-1].split(", ") and mut_and_snip[0] not in final[-2][3][1:-1].split(", ")) \
                        and not all(len(word) < 4 for word in mut_and_snip[0].split())\
                    and mut_and_snip[0] not in mutations:
                    mutations.append(mut_and_snip[0])
            if mutations:
                mutations = "'" + "', '".join(mutations) + "'"
                print(2, mutations)
                final.append([paper_id, 'NER', var_plus_genes, mutations, 'Line '+str(ppr_sen_idx)+': '+sentence.strip()])
            break
        
        # these data, if found, are going to be important if no mutations are in that sentence
        if var_plus_genes:
            final.append([paper_id, '', var_plus_genes, '', 'Line '+str(ppr_sen_idx)+': '+sentence.strip()])
        
        not_single_sentence = True
print('Total time for processing in seconds: ', int(time.time() - start_time))

Total sentences to process  492
{total sentences processed}>{sentences with mutation}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


2 'mutations in conserved regions'
1 'C777Y'
1 'R739H'
1 'R741H', 'R749P'
1 'G785E'
2 'alteration of the corre'
50>11 1 '1051 N/I', '1086 G/E', '1110 G/E', '1336 S/F', '1406 G/R', '363 A/V', '636 G/R', '739 R/H', '746 A /V', '777 C/Y', '785 G/E', '869 V/M', 'A/V 355', 'C/Y 764', 'E 1073 G', 'E 1097 G', 'E 772 G', 'F 1318 T', 'G/E 1073', 'G/E 1097', 'G/E 772', 'G/R 1388', 'G/R 623', 'H 726 R', 'I 1038 T', 'M 856 T', 'N/I 1038', 'R 1388 G', 'R 623 G', 'R/H 726', 'S/F 1318', 'V 355 G', 'V 733 A', 'V/M 856', 'Y 764 C'
1 'C777Y', 'G785E'
1 'G636R'
1 'A746V'
1 'S678N'
2 'mutations in the funnel domain'
1 'S747L'
2 'amino acid corresponding to m322'
1 'V869M'
1 'G1086E', 'G1110E'
2 'substitutions at either glycine'
1 'G1110 to E'
100>27 1 'G1406R'
1 'S1336F'
1 'A363V'
2 'a363 is also predicted'
1 'N1051I'
2 'yeast amino acid position corresponding to n1051 [ t1038'
2 'n to i substitution'
2 'prox - imity to bridge helix ( bottom a -'
150>38 200>38 250>43 2 'missense mutants were found in a 16

In [31]:
temp = final[2:] # removing the temporary first row and header
np.save('data/model_output/results.npy', temp)

# columns with asterisk contain data which is useful regardless of whether the sentence has  mutation info
data = pd.DataFrame(temp[:], columns=['WBPaper ID', 'Method', '* Gene-Variant combo ', 'Mutation', 'Sentence'])
data.to_csv("data/model_output/extracted_snippets.csv", index=False, encoding='utf-8')

## Checking which papers had zero mutations

In [32]:
results = np.load('data/model_output/results.npy')
for i in np.unique(ids_to_extract):
    if i not in results[:, 0]:
        print(i, end= ' ')
        continue