In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification,pipeline
import pandas as pd
import os
import xmltodict
import requests
import xml.etree.ElementTree as ET
import json
import pickle 
import random

# Nature Text Mining

## The documentation is the same as for the Cell journals notebook; see that notebook for details.

In [None]:
# Articles to process: keep only the rows of NatureArticles.csv whose
# "Open Access" column is empty (NaN).
input_df = pd.read_csv('NatureArticles.csv')
open_access_missing = input_df["Open Access"].isna()
input_df = input_df[open_access_missing]

# Each checkpoint gets its tokenizer, its token-classification model and an
# "ner" pipeline wrapping the two.

# BioBERT checkpoints.
disease_tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner")
disease_model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_diseases_ner")
disease_nlp = pipeline("ner", model=disease_model, tokenizer=disease_tokenizer)

genetic_tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_genetic_ner")
genetic_model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_genetic_ner")
genetic_nlp = pipeline("ner", model=genetic_model, tokenizer=genetic_tokenizer)

# PubMedBERT checkpoints; their tokenizers are capped at 512 tokens.
pubmedbert_gene = AutoTokenizer.from_pretrained("pruas/BENT-PubMedBERT-NER-Gene", model_max_length=512)
pubmedbert_gene_model = AutoModelForTokenClassification.from_pretrained("pruas/BENT-PubMedBERT-NER-Gene")
pubmedbert_gene_nlp = pipeline("ner", model=pubmedbert_gene_model, tokenizer=pubmedbert_gene)

pubmedbert_disease = AutoTokenizer.from_pretrained("pruas/BENT-PubMedBERT-NER-Disease",model_max_length=512)
pubmedbert_disease_model = AutoModelForTokenClassification.from_pretrained("pruas/BENT-PubMedBERT-NER-Disease")
pubmedbert_disease_nlp = pipeline("ner", model=pubmedbert_disease_model, tokenizer=pubmedbert_disease)

# Pipelines grouped by entity type; the keys are used later as the entity label.
nlps = {
    'disease': [disease_nlp, pubmedbert_disease_nlp],
    'genetic': [genetic_nlp, pubmedbert_gene_nlp],
}
disease = nlps['disease']
genetic = nlps['genetic']

In [None]:
def flatten(container):
    """Lazily yield the leaves of arbitrarily nested lists/tuples, depth-first.

    Only ``list`` and ``tuple`` instances are descended into; any other
    object (strings, dicts, ...) is yielded unchanged.
    """
    for item in container:
        if not isinstance(item, (list, tuple)):
            yield item
            continue
        yield from flatten(item)
def df_cleaning(df):
    """Merge word-piece tokens and adjacent tokens of NER output into spans.

    ``df`` is read row by row through its ``word``, ``start``, ``end`` and
    ``entity`` columns, in two passes:

    1. A row whose ``word`` starts with ``'##'`` is glued (without the
       ``'##'`` prefix) onto the previous word and extends that word's
       ``end``. Such a row is skipped when there is no previous word.
    2. Consecutive words are joined with a single space when the next word
       starts at the current span's ``end`` or at ``end + 1``. A merged span
       carries the ``entity`` of the last word merged into it.

    Returns
    -------
    pandas.DataFrame
        One row per span with columns ``entity``, ``word``, ``start``,
        ``end``; an empty, column-less DataFrame when no word was collected.
    """
    words, starts, ends, entities = [], [], [], []
    for _, row in df.iterrows():
        token = row['word']
        if token.startswith('##'):
            # Continuation piece with nothing before it: there is no word to
            # attach it to, so drop it (explicit guard instead of a bare except).
            if not words:
                continue
            words[-1] = words[-1].strip() + token[2:].strip()
            ends[-1] = row['end']
        else:
            words.append(token.strip())
            ends.append(row['end'])
            starts.append(row['start'])
            entities.append(row['entity'])

    if not words:
        return pd.DataFrame()

    spans = []
    current_text = words[0]
    current_start = starts[0]
    current_end = ends[0]
    current_ent = entities[0]
    for i in range(1, len(words)):
        touches_current = starts[i] == current_end + 1 or starts[i] == current_end
        if touches_current:
            # Extend the running span; the label of the newest word wins.
            current_text = current_text + " " + words[i]
            current_end = ends[i]
            current_ent = entities[i]
        else:
            spans.append((current_ent, current_text, current_start, current_end))
            current_text = words[i]
            current_start = starts[i]
            current_end = ends[i]
            current_ent = entities[i]
    # Flush the span that was still open when the loop ended.
    spans.append((current_ent, current_text, current_start, current_end))
    return pd.DataFrame(spans, columns=['entity', 'word', 'start', 'end'])


def getKey(d, key):
    """Collect the truthy values stored under ``key`` in a nested structure.

    Lookup rules, tried in order:

    * if ``d[key]`` succeeds, that value is the only candidate (the children
      of ``d`` are not searched);
    * otherwise, if ``d`` has ``keys()``, every value of ``d`` is searched
      recursively;
    * otherwise, if ``d`` is exactly a ``list``, each element that is exactly
      a ``dict`` is searched recursively (other elements are ignored).

    Returns
    -------
    list
        The candidates that are truthy. Every recursive call contributes its
        own list, so the result is nested rather than flat.
    """
    candidates = []
    try:
        candidates.append(d[key])
    except (LookupError, TypeError):
        # ``key`` cannot be read directly from ``d`` (missing key, or ``d`` is
        # not subscriptable by it): search inside ``d`` instead.
        try:
            for child_key in d.keys():
                candidates.append(getKey(d[child_key], key))
        except AttributeError:
            # ``d`` is not a mapping; only lists are descended into.
            if type(d) is list:
                for element in d:
                    if type(element) is dict:
                        candidates.append(getKey(element, key))
    # Drop empty results (empty lists from fruitless branches, empty strings, ...).
    return [found for found in candidates if found]

def NER_results(nlps,string_result):
    """Run every NER pipeline in ``nlps`` on ``string_result`` and pool the hits.

    The output of each pipeline is wrapped in a DataFrame, passed through
    ``df_cleaning`` and stripped of the rows whose ``entity`` is ``'0'``.

    Parameters
    ----------
    nlps : iterable of callables
        Each one is called as ``nlp(string_result)``; its return value is
        handed to ``pd.DataFrame``.
    string_result
        The text passed to each pipeline.

    Returns
    -------
    pandas.DataFrame
        All cleaned rows, sorted by ``start`` (original row labels are kept).
        When no pipeline produced a row with a ``start`` column, the pooled
        (empty) frame is returned unsorted.
    """
    frames = []
    for nlp in nlps:
        cleaned = df_cleaning(pd.DataFrame(nlp(string_result)))
        # df_cleaning returns a column-less frame when it collected nothing,
        # so only filter when the column is actually there.
        if 'entity' in cleaned.columns:
            cleaned = cleaned[cleaned['entity'] != '0']
        frames.append(cleaned)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    result_df = pd.concat(frames)
    if 'start' not in result_df.columns:
        return result_df
    # sort_values returns a new frame; the sorted result must be the one returned.
    return result_df.sort_values(by=['start'])

def getNatureArticles(doi, api_key=None, timeout=60):
    """Fetch the JATS XML record for ``doi`` and return the text of its body.

    Parameters
    ----------
    doi : str
        DOI interpolated into the ``q=doi:...`` query.
    api_key : str, optional
        API key for the endpoint. When omitted, the ``SPRINGER_API_KEY``
        environment variable is used, falling back to the legacy key.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled request cannot hang
        the whole batch.

    Returns
    -------
    list
        The flattened ``'#text'`` values found under
        ``response -> records -> article -> body`` of the parsed XML.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status.
    KeyError
        If the parsed response lacks the expected ``response/records/article/body`` path.
    """
    if api_key is None:
        # NOTE(review): the literal below is a credential committed to the
        # notebook. It is kept only as a fallback so existing runs keep
        # working; it should be rotated and supplied via SPRINGER_API_KEY.
        api_key = os.environ.get('SPRINGER_API_KEY', '499d84c073f0ade469f211fd37104d7d')
    query = f'https://spdi.public.springernature.app/xmldata/jats?q=doi:{doi}&api_key={api_key}/wustl-api'
    response = requests.get(query, timeout=timeout)
    # Fail on HTTP errors here rather than while parsing an error page.
    response.raise_for_status()
    dict_data = xmltodict.parse(response.content)
    # Only the article body is mined; front and back matter are ignored.
    body = dict_data['response']['records']['article']['body']
    nature_article = list(flatten(getKey(body, '#text')))
    return nature_article

def collision_cleanup(model_type,full_output):
    """Label NER hits with ``model_type`` and collapse duplicate/overlapping spans.

    Steps:

    1. The first column of a *copy* of ``full_output`` is overwritten with
       ``model_type`` (the caller's frame is left untouched).
    2. Every string cell is lower-cased and exact duplicate rows are dropped.
    3. Rows are sorted by ``start``/``end`` and cut into groups: a new group
       starts when a row's ``word`` equals the previous row's ``word`` or when
       the previous ``end`` is not after the row's ``start``. From each
       group the row with the largest ``end - start`` is kept.
    4. Step 3 is repeated until the frame stops changing.

    Parameters
    ----------
    model_type : str
        Label written into the first column.
    full_output : pandas.DataFrame
        Frame with at least ``word``, ``start`` and ``end`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh 0..n-1 index and an extra
        ``interval`` column (``end - start``).
    """
    def _lower_if_str(value):
        return value.lower() if type(value) == str else value

    # Work on a copy so relabelling does not leak into the caller's frame.
    process_df = full_output.copy()
    process_df[process_df.columns[0]] = model_type
    # Element-wise lower-casing via Series.map, which exists in every pandas
    # version (DataFrame.applymap is deprecated).
    for column in process_df.columns:
        process_df[column] = process_df[column].map(_lower_if_str)
    df = process_df.drop_duplicates()  # exact duplicates need no overlap logic

    output = pd.DataFrame()
    while not output.equals(df):
        output = df
        df = df.sort_values(['start','end'])
        same_word_as_previous = df['word'].shift() == df['word']
        starts_after_previous = df['end'].shift() - df['start'] <= 0
        df['interval'] = df['end'] - df['start']
        # Each True starts a new group; rows in between overlap the previous row.
        df['overlap'] = (same_word_as_previous | starts_after_previous).cumsum()
        # Longest span first, so ``first()`` keeps the widest row of each group.
        df = df.sort_values(['interval'], ascending=False).groupby('overlap').first()
        df = df.reset_index(drop=True)
    return df

In [None]:
# Mine every DOI: fetch the article text, run both groups of NER pipelines on
# it and keep the cleaned entity table.
#   success[doi] -> [article text, entity DataFrame sorted by `start`]
#   fail         -> [doi, stage] pairs for articles that raised an error
success = {}
fail = []
for doi in input_df['DOI']:
    print(f'\r{doi}', end='',)
    # Stage being processed when an error occurs: None while the article is
    # being fetched, otherwise the key of `nlps` that was running. Resetting
    # it per DOI avoids reporting a stale (or undefined) value.
    stage = None
    try:
        string_result = ''.join(getNatureArticles(doi))
        per_type = []
        for stage in nlps:
            per_type.append(collision_cleanup(stage, NER_results(nlps[stage], string_result)))
        result = pd.concat(per_type).sort_values(by=['start']).reset_index(drop=True)
        success[doi] = [string_result, result]
    except Exception:
        # Best-effort batch run: remember what failed and move on. Catching
        # Exception (not everything) keeps Ctrl-C / kernel interrupt working.
        fail.append([doi, stage])


In [None]:
# Collapse spaced hyphens (" - ") inside every recognised word to a plain "-".
for doi in success:
    entity_table = success[doi][1]
    entity_table['word'] = [word.replace(" - ", "-") for word in entity_table['word']]

In [None]:
# Load the reader / writer / eraser gene tables and tag every row with the
# name of the table it came from.
reader = pd.read_csv('readertbl.csv').assign(classification='reader')
writer = pd.read_csv('writertbl.csv').assign(classification='writer')
eraser = pd.read_csv('erasertbl.csv').assign(classification='eraser')
classification_df = pd.concat([reader, writer, eraser]).reset_index(drop=True)
# Lower-case the gene names before they are compared with the `word` column.
classification_df['gene'] = classification_df['gene'].str.lower()

for doi in success:
    entities_df = success[doi][1]
    # Default label for every row; rows matching a gene are overwritten below.
    entities_df['classification'] = 'NULL'
    matched_genes = set(list(entities_df['word'])) & set(classification_df['gene'])
    for gene in matched_genes:
        gene_rows = classification_df[classification_df['gene'] == gene].index.values
        # A gene may appear in more than one table, hence a list of labels.
        labels = list(classification_df.loc[gene_rows, 'classification'])
        word_rows = list(entities_df[entities_df['word'] == gene].index.values)
        for row in word_rows:
            # Wrap the list in a one-element Series so it is stored as a
            # single cell value instead of being spread over rows.
            entities_df.loc[[row], 'classification'] = pd.Series([labels], index=entities_df.index[[row]])

In [None]:
#dict_data['response']['records']['article']['@article-type'] ##Could be useful for screening

In [None]:
#dict_data['response']['records']['article']['front'] ## Author/institutions etc

In [None]:
#dict_data['response']['records']['article']['back'] ## supplementary/acknowledgements/citations

In [None]:
# with open('savedNatureArticles.pkl', 'wb') as f:
#     pickle.dump(success, f)

In [3]:
# Reload previously saved results into `success`.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('savedNatureArticles.pkl', 'rb') as pkl_file:
    success = pickle.load(pkl_file)

# Scratch

In [4]:
# Number of entries in `success`.
len(success)

401

In [7]:
# Load the saved Cell-journal results into `success1`.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('savedCellArticles.pkl', 'rb') as pkl_file:
    success1 = pickle.load(pkl_file)

In [8]:
# Number of entries in `success1`.
len(success1)

924

In [6]:
# One entry per entity row whose `classification` holds a list, so a DOI
# appears once for every such row.
article_list = [
    doi
    for doi in success
    for label in success[doi][1]['classification']
    if isinstance(label, list)
]
# De-duplicate, then sort: set iteration order of strings can differ between
# interpreter runs, which would make even a seeded draw irreproducible.
result = sorted(set(article_list))
# Local seeded RNG keeps the draw reproducible without touching the global
# `random` state; min() avoids a ValueError when fewer than 5 DOIs qualify.
sample = random.Random(0).sample(result, min(5, len(result)))
            

    

In [30]:
# Fixed list of DOIs; rebinding `sample` here replaces the random draw made in
# the previous cell.
# NOTE(review): this cell's execution count (30) is later than that of the
# cell that consumes `sample` (11), so the saved subset may have been built
# from a different `sample` - confirm by re-running top to bottom.
sample = ['10.1016/j.bbagrm.2018.10.019',
 '10.1016/j.beha.2004.08.011',
 '10.1016/j.jbior.2012.04.003',
 '10.1016/j.dnarep.2009.04.003',
 '10.1016/j.tig.2006.09.007',
 '10.1016/j.mce.2017.03.016',
 '10.1016/j.dnarep.2011.01.012',
 '10.1016/j.jmb.2008.09.011',
 '10.1016/j.currproblcancer.2018.03.001',
 '10.1016/j.ejmg.2019.103739',
 '10.1007/s11010-010-0586-3',
 '10.1038/s41418-022-00992-3',
 '10.1038/ni1046',
 '10.1038/s41388-019-1081-2',
 '10.1007/s00412-004-0311-7']

In [9]:
# Union of both result dicts; for a key present in both, the entry from
# `success` (the right-hand operand) wins.
merged = success1 | success

In [10]:
# Number of entries in the combined dict.
len(merged)

1325

In [11]:
# Entries of `merged` for the sampled keys. Values are read from `merged`,
# the same dict the membership test uses, so a sampled key that exists only
# in `success1` no longer raises KeyError.
annotation_subset = {k: merged[k] for k in sample if k in merged}


In [12]:
# Write `annotation_subset` to disk.
with open('annotation_subset.pkl', 'wb') as out_file:
    pickle.dump(annotation_subset, out_file)