# For training BioBERT NER on nala
Run this in CLI -  

python train_ner.py --model_name_or_path dmis-lab/biobert-base-cased-v1.1 --train_file data/nala/train_dev.json --validation_file data/nala/devel.json --text_column_name tokens --label_column_name tags --pad_to_max_length --max_length 192 --per_device_train_batch_size 8 --learning_rate 2e-5 --num_train_epochs 10 --output_dir models/nala --seed 1
  
Note:  Training took <30 mins on 1660 ti. You can decrease the num_train_epochs count to 5 without any substantial difference in accuracy.

# 1 Regex 
### Has high precision and low recall.    
### Consists of 3 parts - MutationFinder, tmVar and some custom patterns from WB papers 
### 1.1 Mutation Finder [Link](https://github.com/divyashan/MutationFinder), Modified regex from SETH [Link](https://github.com/rockt/SETH/blob/master/resources/mutations.txt)

In [1]:
import re
import os
import time
 
import numpy as np
import pandas as pd
import glob
from pathlib import Path

from sklearn.metrics import precision_recall_fscore_support

In [2]:
from utils.misc.regex_block import MutationFinder, TmVar, BOWdictionary, CustomWBregex

In [3]:
mf_mut_extract = MutationFinder('data/regexs/mutationfinder_regex/seth_modified.txt')

In [4]:
mf_mut_extract('wa7 ) is a null allele, because a 1-bp deletion')

[]

### 1.2 tmVar [Link](https://www.ncbi.nlm.nih.gov/research/bionlp/Tools/tmvar/)

In [5]:
tmvar_mut_extract = TmVar('data/regexs/tmvar_regex/final_regex_path')

  self._regular_expressions.append(re.compile(reg))


In [6]:
tmvar_mut_extract('wa7 ) is a null allele, because a 1-bp deletion')

[]

### 1.3 Extra custom regexs 

In [7]:
import configparser

In [8]:
db_config = configparser.ConfigParser()
db_config.read('utils/all_config.cfg')

['utils/all_config.cfg']

In [9]:
custom_mut_extract = CustomWBregex(db_config, extra_regex=True)

- in the statement ced-3(n2888) 
n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT

In [10]:
custom_mut_extract('n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT')

[['n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT',
  'n2888 bp AAAAAAAAAAAAAAAAA TTTTTTTTTTTTTTTTTT']]

In [11]:
custom_mut_extract.get_genes('Our results suggest that in C. elegans ,the trimethylation of histone H3 lysine 36 by MET-1/Set2 promotes a transcriptional repression cascade mediated by a NuRD-like complex and by the trimethylation of histone H3K9 by a SETDB1-like HMT.')

[['MET-1', 'Just gene']]

### 1.4 Bag of words 

In [12]:
bow_mut_extract = BOWdictionary()
# bow_mut_extract('This mutation deletes 471bp of the promoter region, the transcriptional start and 56 amino acids of the second exon.')

### 1.5 MF + tmVar + Custom regex + BOW

In [13]:
def unique_rows(a):
    a = np.ascontiguousarray(a)
    unique_a = np.unique(a.view([('', a.dtype)]*a.shape[1]))
    return unique_a.view(a.dtype).reshape((unique_a.shape[0], a.shape[1]))


def regex_block(sentence, span_size=150):
    mut_and_snippets = []
    
    # MutationFinder
    mut_and_snippets = mut_and_snippets + mf_mut_extract(sentence, span_size=span_size)
    # tmVar
    mut_and_snippets = mut_and_snippets + tmvar_mut_extract(sentence, span_size=span_size)
    # Custom patterns
    mut_and_snippets = mut_and_snippets + custom_mut_extract(sentence, span_size=span_size)
    # Bag of words
    mut_and_snippets = mut_and_snippets + bow_mut_extract(sentence)
    
    if mut_and_snippets:
        mut_and_snippets = unique_rows(mut_and_snippets).tolist()
    return mut_and_snippets

In [14]:
regex_block(' asdf gpa-2 ::Tc1 asdf as')

[]

### 1.6 * Additional details  
These will get extracted regardless of whether the sentence has genomic information

In [15]:
def extra_info_block(sentence, span_size=150):
    info_and_snippets = []

    # look for gene and variant combo
    info_and_snippets = info_and_snippets + custom_mut_extract.var_and_gene_close(sentence, span_size=span_size)
    
    if info_and_snippets:
        info_and_snippets = unique_rows(info_and_snippets).tolist()
    return info_and_snippets

In [16]:
extra_info_block('in the statement ced-3(n2888)')

[['ced-3(n2888', 'Gene & Variant']]

# 2 BioBERT NER

In [17]:
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from transformers import TokenClassificationPipeline

import re
import pandas as pd
import numpy as np
import sklearn as sk
import math 
import string
import time
import json
import csv
import shutil
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /home/risubu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/risubu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
model_name_or_path = 'models/nala'
config = AutoConfig.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(
    model_name_or_path,
    from_tf=bool(".ckpt" in model_name_or_path),
    config=config,
)
# LABEL_0 - B-mut, LABEL_1 - I-mut, LABEL_2 - O
nala_ner  = TokenClassificationPipeline(model=model, tokenizer=tokenizer, task='ner', aggregation_strategy='first')

In [19]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words = [w for w in stop_words if len(w) > 1]

def ner_mutations(sentence):
    mutations = []
    try:
        ner_output = nala_ner(sentence)
        for i, grp in enumerate(ner_output):
            if grp['entity_group'] == 'LABEL_0':
                mut = grp['word']
                for j in range(i+1, len(ner_output)):
                    if ner_output[j]['entity_group'] == 'LABEL_1':
                        mut  = mut + ' ' + ner_output[j]['word']
                    else:
                        # NER would be handling only data in NL form
                        if len(mut.split()) > 3 and any(word in mut.split() for word in stop_words):
                            mutations.append([mut, sentence])
                        break
    except:
        pass
    return mutations

# 3 Testing
Using the IDP4+ dataset downloaded and setup in notebook 1.

In [20]:
nala_db = pd.read_csv('data/nala/nala_binary.csv').to_numpy()

### 3.1 RegEx

In [23]:
y_true = []
y_pred = []
print('Total sentences to process: ', len(nala_db))
for i, row in enumerate(nala_db):
    if (i+1) % 500 == 0: print(f"{i+1}", end = " ")
    sentence = row[0]
    true = row[1]
    if regex_block(sentence):
        pred = 1
    else:
        pred = 0
    y_true.append(true)
    y_pred.append(pred)

Total sentences to process:  19235
500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500 10000 10500 11000 11500 12000 12500 13000 13500 14000 14500 15000 15500 16000 16500 17000 17500 18000 18500 19000 

In [24]:
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.9758194519075766, 0.6515966989594546, 0.7814113597246128, None)

### 3.2 BioBERT NER

In [25]:
y_true = []
y_pred = []
print('Total sentences to process: ', len(nala_db))
for i, row in enumerate(nala_db):
    if (i+1) % 500 == 0: print(f"{i+1}", end = " ")
    sentence = row[0]
    true = row[1]
    if ner_mutations(sentence):
        pred = 1
    else:
        pred = 0
    y_true.append(true)
    y_pred.append(pred)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Total sentences to process:  19235
500 1000 1500 2000 2500 3000 3500 4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500 10000 10500 11000 11500 12000 12500 13000 13500 14000 14500 15000 15500 16000 16500 17000 17500 18000 18500 19000 

In [26]:
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.6675148430873622, 0.28238249013275923, 0.39687342410489157, None)

### 3.3 RegEx (custom) + NER

In [30]:
y_true = []
y_pred = []
start_time = time.time()
print('Total sentences to process: ', len(nala_db))
for i, row in enumerate(nala_db):
    if (i+1) % 500 == 0: print(f"{i+1}", end = " ")
    if (i+1) % 1000 == 0:
        print('Time for 1000 lines:', int((time.time() - start_time)/60))
        start_time = time.time()
    sentence = row[0]
    true = row[1]
    if regex_block(sentence) or ner_mutations(sentence):
        pred = 1
    else:
        pred = 0
    y_true.append(true)
    y_pred.append(pred)

Total sentences to process:  19235
100 200 300 400 500 600 700 800 900 1000 Time for 1000 lines:  3
1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 Time for 1000 lines:  4
2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 Time for 1000 lines:  3
3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 Time for 1000 lines:  2
4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 Time for 1000 lines:  3
5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 Time for 1000 lines:  3
6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 Time for 1000 lines:  3
7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 Time for 1000 lines:  3
8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 Time for 1000 lines:  3
9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 Time for 1000 lines:  4
10100 10200 10300 10400 10500 10600 10700 10800 10900 11000 Time for 1000 lines:  3
11100 11200 11300 11400 11500 11600 11700 11800 11900 12000 Time for 1000 lines:  3
12100 12200 12300 12400 12500 12600 12700 12800 12900 13000 Time 

In [32]:
precision_recall_fscore_support(y_true, y_pred, average='binary')

(0.8377769432970334, 0.800502332256907, 0.8187155963302752, None)

## 4 Run on WB papers
### Get the paper texts from textpresso API and wbtools

### How it works - First paper ID is searched through textpresso API. If the recieved output is blank, then wbtools is used.  

In [20]:
import sys
import os.path
import argparse
import subprocess
import requests
import csv
import json
import nltk.data
import os
from xml.dom import minidom
from bs4 import BeautifulSoup
from wbtools.literature.corpus import CorpusManager
import platform

Uncomment and run the cell below only once to save the paper sentences in your local computer to avoid pulling paper text everytime during dev  

In [23]:
'''
Create a temporary cvs file to store the paper sentences  
to avoid spending time on pulling sentences while in development 
Format - [paper_id, sentence]
'''
# final_data = []
# # remove_sections = [PaperSections.ACKNOWLEDGEMENTS, PaperSections.REFERENCES, PaperSections.RELATED_WORK, PaperSections.INTRODUCTION]
# remove_sections = []
# # random 100 papers mentioned the remarks ace file in data/gsoc
# paper_ids = np.load('data/top100.npy').tolist()
# cm = CorpusManager()
# for i, paper_id in enumerate(paper_ids):
#     paper_id = paper_id[7:]
#     cm.load_from_wb_database(db_name=db_config['wb_database']['db_name'], db_user=db_config['wb_database']['db_user'], db_password=db_config['wb_database']['db_password'],
#         db_host=db_config['wb_database']['db_host'], paper_ids=[paper_id],
#         ssh_host=db_config['wb_database']['ssh_host'], ssh_user=db_config['wb_database']['ssh_user'], ssh_passwd=db_config['wb_database']['ssh_passwd'],
#         load_bib_info=False, load_afp_info=False, load_curation_info=False)
#     sentences = cm.get_paper(paper_id).get_text_docs(remove_sections=remove_sections,split_sentences=True)
#     for sent in sentences:
#         final_data.append([paper_id, sent])
#     print(i, end = " ")
# final_data = pd.DataFrame(final_data[:], columns=['WBPaper ID', 'Sentence'])
# final_data.to_csv("data/id_and_sentence.csv", index=False, encoding='utf-8')

'\nCreate a separate temporary numpy file to store the paper sentences  \nto avoid spending time on pulling sentences while in development \nFormat - [paper_id, sentence]\n'

In [21]:
def textpresso_paper_text(wbpid, path, token):
    """This sub takes a wbpid eg WBPaper00056731 and returns the fulltext paper in sentences"""
    ft=[0];
    # Check that wbpid is a valid WBPaper
    if not re.match( 'WBPaper', wbpid):
        print (wbpid, "is not a valid WBPaper ID")
        return ft
    # Download paper if it doesn't exist
    fn = path + '/temp/' + wbpid + '.json'

    if os.path.exists(fn) and os.path.getsize(fn) > 16:
        pass
    else:
        com1 = '-o '+fn +'\n-k '+ '\n'+'-d "{\\"token\\":\\"'+ token + '\\", \\"query\\": {\\"accession\\": \\"' + wbpid +'\\", \\"type\\": \\"document\\", \\"corpora\\": [\\"C. elegans\\"]}, \\"include_fulltext\\": true}"'
        configf= path +'/temp/' + wbpid + '.tmp.config'
        curlf = open(configf,'w')
        print (com1, file=curlf)
        curlf.close()
        command = 'curl -o '+ fn +' -K '+ configf+' https://textpressocentral.org:18080/v1/textpresso/api/search_documents' 
        comlist = command.split()
        os.system(command)

    # Read the paper, and split into sentences
    if os.path.exists(fn) and os.path.getsize(fn) > 20:
        # Open our JSON file and load it into python
        input_file = open (fn)
        json_array = json.load(input_file)
        for item in json_array:
            abs = item["abstract"]
            fullt =  item["fulltext"]
            tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
            ft = tokenizer.tokenize(abs)
            ftt=tokenizer.tokenize(fullt)
            ft = ft +ftt
    else:
        # some paper texts are blank for some reason
        # pipeline uses wbtools to get sentences in such case
        pass

    outfilen = os.path.join(path, 'text_flatfiles', wbpid+'.txt')
    outf = open(outfilen, 'w')
    for sen in ft:
        sen =str(sen)
        print(sen, file=outf)
    outf.close()
    return outfilen

In [22]:
'''
Sentences out of wbtools are sometimes weird, especially for the old papers
This increases the false neg rate so this is used only when the textpresso api provides bad text
Most of the issues have band aid fixes in the text preprocessing cell below but there are some 
with no easy fix or need to be worked on - 
1. Some sentences have white space between every character and are also somehow inverted (???)
    e.g. Check out the sentences of WBPaper00002018 (1994)
    Line 154 -  '0 7 1 u ( 7 - c e m    ) F (   .'
    inverted and without the white space is (mec-7)u170 which is extremely useful and will get missed 
    by the pipeline unless processed correctly.
    Only a problem on very old papers.
'''
cm = CorpusManager()
def wbtools_paper_text(wbpid, db_name, db_user, db_password, db_host, ssh_host,\
    ssh_user, ssh_passwd):
    # sectioning might not be always correct, text processing is done separately in the pipeline
    # remove_sections = [PaperSections.ACKNOWLEDGEMENTS, PaperSections.REFERENCES, PaperSections.RELATED_WORK, PaperSections.INTRODUCTION]
    remove_sections = []
    paper_id = wbpid[7:]
    cm.load_from_wb_database(db_name=db_name, db_user=db_user, db_password=db_password,
        db_host=db_host, paper_ids=[paper_id],
        ssh_host=ssh_host, ssh_user=ssh_user, ssh_passwd=ssh_passwd,
        load_bib_info=False, load_afp_info=False, load_curation_info=False)
    sentences = cm.get_paper(paper_id).get_text_docs(remove_sections=remove_sections,split_sentences=True)
    return sentences

In [23]:
def get_paper_sentences(wbpids, config, store_ppr_path):
    '''
    Takes WB Paper IDs and returns a list of sentences from those papers after filtering
    Arg:
    wbpids - List of wb papers ids 
        e.g. ['WBPaper00002379']
    config_path - Config file path
    store_ppr_path - Folder path to store the paper flatfiles retrieved from TextPresso for future use
    Returns:
    paperid_sentence_list: List of paper ID and sentence
        e.g. [['WBPaper00002379', 'First sentence'], ['WBPaper00002379', 'Second sentence'], ....]
    '''
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stop_words = [w for w in stop_words if len(w) > 1]

    all_special_chars = []
    with open('data/nala/train_dev.json') as f:
        for jsonObj in f:
            nala_json = json.loads(jsonObj)['tokens']
            for word in nala_json:
                if not word.isalnum():
                    all_special_chars.append(word)
    # list of special characters to keep during inference
    # helps with clearing out the bad characters from old papers
    all_special_chars = list(set(all_special_chars))
    
    token = config['textpresso']['token']
    db_name=config['wb_database']['db_name']
    db_user=config['wb_database']['db_user']
    db_password=config['wb_database']['db_password']
    db_host=config['wb_database']['db_host']
    ssh_host=config['wb_database']['ssh_host']
    ssh_user=config['wb_database']['ssh_user']
    ssh_passwd=config['wb_database']['ssh_passwd']

    temp_paperid_sentence = np.array([])
    if os.path.isfile('data/id_and_sentence.csv'):
        temp_paperid_sentence = pd.read_csv("data/id_and_sentence.csv", lineterminator='\n', dtype = str).to_numpy() # WBPaper ID, Sentence
    paperid_sentence_list = []
    
    for curr_ppr_i, id in enumerate(wbpids):
        print(f"{curr_ppr_i+1}", end = " ")
        # textpresso_paper_text() also saves the text in flatfiles for future use 
        start_time = time.time()
        paper_path = textpresso_paper_text(id, store_ppr_path, token)
        txt = Path(paper_path).read_text().split('\n')
        # deals with empty text files with only "0"
        if len(txt) == 2:
            if temp_paperid_sentence.size != 0:
                txt = temp_paperid_sentence[temp_paperid_sentence[:, 0] == id[7:]][:, 1]
                # incase the loaded numpy file didn't have the required paper 
                if len(txt) == 0 and platform.system() != 'Windows':
                    txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                        ssh_user, ssh_passwd)
            elif platform.system() != 'Windows' :
                txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                    ssh_user, ssh_passwd)
            
        for row in txt: 
            if row.find('fifi') != -1:
                if temp_paperid_sentence.size != 0:
                    txt = temp_paperid_sentence[temp_paperid_sentence[:, 0] == id[7:]][:, 1]
                    # incase the loaded numpy file didn't have the required paper 
                    if len(txt) == 0 and platform.system() != 'Windows':
                        txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                            ssh_user, ssh_passwd)
                elif platform.system() != 'Windows':
                    txt = wbtools_paper_text(id, db_name, db_user, db_password, db_host, ssh_host,\
                        ssh_user, ssh_passwd)
                break
            
        count_total_rows = len(txt)
        for current_i, row in enumerate(txt):
            if row.lower().find("we thank") == 0 or row.lower().find("this work was supported") == 0 \
                or row.lower().find("references") == 0 or row.lower().find("we also thank") == 0 \
                or row.lower().find("this research was supported") == 0 or row.lower().find("we acknowledge") == 0 \
                or row.lower().find("acknowledgments") == 0 or row.lower().find('literature cited') != -1:
                if current_i > count_total_rows/3:
                    break

            # usually is bad sentence
            if len(row) < 40 or not any(word in row.lower().split() for word in stop_words):
                continue
            # remove sentences with links and email ids
            if re.search('\S+@\S+\.', row) or re.search('www\.\S+\.', row) or re.search('http.?://', row):
                continue
            # filters one word sentences
            if len(row.split()) == 1:
                continue
            # sentences comprised of only single characters 
            # ^ seems to be issue with wbtools extraction pipeline 
            if all(len(word) < 5 for word in row.split()):
                continue
            row = re.sub("\( *cid *: *\d+ *\)", " ", row)
            # TODO: replace this block with a regex sub 
            temp_row = row
            for c in temp_row:
                if (not c.isalnum() and not c == ' ') and c not in all_special_chars:
                        row = row.replace(c, "")
                        
            # fixes bad space between each character of flanking sequence from old papers
            # Switching this off as it increases the processing time
            # also affects very small subset of old papers so not worth the extra time
            flanking_regex = re.compile('([ACTG]( +)){4,}')
            for m in flanking_regex.finditer(row):
                span = (m.start(0), m.end(0))   
                span = row[span[0]:span[1]-1]
                correct_flank = re.sub('([ACTG])( +)', r'\1', row)
                row = row.replace(span, correct_flank)
            row = 'Line '+ str(current_i) + ': ' + row.strip()
            paperid_sentence_list.append((id, row))
    return paperid_sentence_list[1:]

In [29]:
# papers mentioned the remarks ace file in data/gsoc
ids_to_extract = np.load('data/top100.npy').tolist()[70:]
paperid_sentence_list = get_paper_sentences(ids_to_extract, db_config, store_ppr_path='data/wbpapers')

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 

In [30]:
# remove duplicates keeping the order
seen = set()
paperid_sentence_list = np.array([x for x in paperid_sentence_list if x not in seen and not seen.add(x)])

In [31]:
print('Number of sentences and characters: ', end=' ')
print(len(paperid_sentence_list), sum([len(sent[1]) for sent in paperid_sentence_list]))

Number of sentences and characters:  8539 1444637


This cell takes a while to run - mainly due to the huge regex blocks   
~1 hour per 10 papers   
Can't really switch them off though. There might be a smarter way to work with the regex block but this pipeline would be probably running every month on 50 or so papers so low priority.   
That being said, if you want to reduce the processing time NOW, remove the travesing of lines below  
by changing the limit value to 'min(ppr_sen_idx+1, total_sentences)'  
Traversing was originally implemented due to bad sentence extraction from super old papers. If paper text extraction works well (which it usually does on new papers), traversing can be removed.

In [32]:
start_time = time.time()
final = [
    ['temporary', 'temporary', 'temporary', 'temporary', 'temporary', 'temporary'],\
    ['WBPaper ID', 'Method', '*Genes', '*Gene-Variant combo', 'Mutation', 'Sentence']]
total_sentences = len(paperid_sentence_list)
print_snips = False
print('Total sentences to process ', len(paperid_sentence_list))
# there would be lots of duplicates in the gene col which might cause high count of
# snippets with useful info. they'll get filtered out in the third notebook
print('{total sentences processed}>{snippets with useful info}')
for ppr_sen_idx, row in enumerate(paperid_sentence_list):
    if (ppr_sen_idx+1) % 50 == 0: print(f"{ppr_sen_idx+1}>{len(final)-1}", end = " ")
    paper_id = row[0]
    sentence = str()
    # traverse upto 2 sentences at a time (if they aren't super long)
    # this should be removed if table content thing from wbtools is ever fixed
    # or else, it might group lines of table which will result in higher false positive count
    limit = min(ppr_sen_idx+2, total_sentences)
    # some sentences - mostly table content are all in a single sentence
    # temp fix, need to have a nice sentence splitter to minimize manual verification time
    not_single_sentence = False
    for i in range(ppr_sen_idx, limit):

        sentence = sentence + paperid_sentence_list[i][1] + ' '
        raw_sentence = sentence

        if (len(sentence) > 250 and not_single_sentence):
            break
        if paper_id != paperid_sentence_list[i][0]:
            break
        
        var_plus_genes = ''
        # Look for the special data e.g. gene-variant combo (e.g 'ced-3(n2888)') only on single sentences 
        if not not_single_sentence:
            var_plus_genes = []
            all_genes = []
            
            for data_and_cat in custom_mut_extract.var_and_gene_close(sentence.strip()):
                var_plus_genes.append(data_and_cat[0])
            if var_plus_genes:
                var_plus_genes  = list(set(var_plus_genes))
                var_plus_genes = "'" + "', '".join(var_plus_genes) + "'"
            else:
                var_plus_genes = ''
                
            for data_and_cat in custom_mut_extract.get_genes(sentence.strip()):
                all_genes.append(data_and_cat[0])
            if all_genes:
                all_genes  = list(set(all_genes))
                # Removing gene mentions from sentence
                # e.g. "lysine 36 by MET-1/Set2" regex will classify this as protein mutation
                # but MET-1 is a gene name so the mutation isn't valid
                for gene in all_genes:
                    sentence = sentence.replace(gene, "")
                all_genes = "'" + "', '".join(all_genes) + "'"
            else:
                all_genes = ''   
                
        output = regex_block(sentence.strip())
        if output:
            mutations = []
            for mut_and_snip in output:
                # temp fix to deal with same mutation getting detected due to stiching multiple sentences
                if (mut_and_snip[0] not in final[-1][4][1:-1].split(", ") and mut_and_snip[0] not in final[-2][4][1:-1].split(", ")) \
                            and mut_and_snip[0] not in mutations:
                    mutations.append(mut_and_snip[0])
            if mutations:
                mutations = "'" + "', '".join(mutations) + "'"
                if print_snips: print(1, mutations)
                final.append([paper_id, 'Regex', all_genes, var_plus_genes, mutations, raw_sentence.strip()])
            break

        output = ner_mutations(sentence.strip())
        if output:
            mutations = []
            for mut_and_snip in output:
                # temp fix to deal with same mutation getting detected due to stiching multiple sentences
                if (mut_and_snip[0] not in final[-1][4][1:-1].split(", ") and mut_and_snip[0] not in final[-2][4][1:-1].split(", ")) \
                        and not all(len(word) < 4 for word in mut_and_snip[0].split())\
                    and mut_and_snip[0] not in mutations:
                    mutations.append(mut_and_snip[0])
            if mutations:
                mutations = "'" + "', '".join(mutations) + "'"
                if print_snips: print(2, mutations)
                final.append([paper_id, 'NER', all_genes, var_plus_genes, mutations, raw_sentence.strip()])
            break
        
        # these data, if found, are going to be important if no mutations are in that sentence
        if var_plus_genes or all_genes:
            final.append([paper_id, '', all_genes, var_plus_genes, '', raw_sentence.strip()])
        
        not_single_sentence = True
print('Total time for processing in minutes: ', int((time.time() - start_time)/60))

Total sentences to process  8539
{total sentences processed}>{snippets with useful info}
50>21 100>62 150>95 200>139 250>180 300>205 350>227 400>249 450>283 500>316 550>341 600>361 650>382 700>409 750>451 800>495 850>536 900>572 950>605 1000>652 1050>721 1100>781 1150>835 1200>888 1250>916 1300>940 1350>971 1400>1000 1450>1030 1500>1053 1550>1091 1600>1108 1650>1126 1700>1161 1750>1197 1800>1234 1850>1256 1900>1272 1950>1299 2000>1320 2050>1346 2100>1394 2150>1423 2200>1455 2250>1479 2300>1508 2350>1549 2400>1587 2450>1620 2500>1670 2550>1723 2600>1769 2650>1797 2700>1799 2750>1799 2800>1799 2850>1800 2900>1803 2950>1821 3000>1839 3050>1880 3100>1929 3150>1973 3200>2015 3250>2034 3300>2082 3350>2119 3400>2157 3450>2179 3500>2217 3550>2263 3600>2287 3650>2319 3700>2357 3750>2403 3800>2449 3850>2495 3900>2535 3950>2568 4000>2576 4050>2588 4100>2617 4150>2644 4200>2663 4250>2686 4300>2712 4350>2739 4400>2779 4450>2801 4500>2817 4550>2833 4600>2852 4650>2877 4700>2901 4750>2927 4800>2950 4

In [33]:
temp = final[2:] # removing the temporary first row and header

# this sheet will contain high number of duplicates - which will get filtered in 3rd notebook
# columns with asterisk contain data which are useful regardless of whether the sentence has  mutation info
temp = pd.DataFrame(temp[:], columns=['WBPaper ID', 'Method', '* Genes', '* Gene-Variant combo', 'Mutation', 'Sentence'])
temp.to_csv("data/model_output/extracted_snippets_20_50.csv", index=False, encoding='utf-8')