Run following in terminal:
   ```
   cd ~
/bin/bash
perl -MNet::FTP -e \
    '$ftp = new Net::FTP("ftp.ncbi.nlm.nih.gov", Passive => 1);
    $ftp->login; $ftp->binary;
    $ftp->get("/entrez/entrezdirect/edirect.tar.gz");'
gunzip -c edirect.tar.gz | tar xf -
rm edirect.tar.gz
builtin exit
export PATH=$PATH:$HOME

   ```
Then run

```
esearch -db pubmed -query "cancer" |   efetch -format abstract >abstracts.txt
```

In [1]:
with open('abstracts.txt') as f:
    abstracts = f.read()

In [3]:
all_abstracts = abstracts.split("\n\n\n")

In [6]:
len(all_abstracts)

3120043

In [7]:
import csv
import pandas as pd

with open("abstracts.csv", "wt") as abstracts_file, open ("partial_abstracts.csv", "wt") as partial_abstracts:
    # csv writer for full abstracts
    abstract_writer = csv.writer(abstracts_file)
    abstract_writer.writerow(['Journal', 'Title', 'Authors', 'Author_Information', 'Abstract', 'DOI', 'Misc'])
    # csv writer for partial abstracts
    partial_abstract_writer = csv.writer(partial_abstracts)
    #For each abstract, split into categories and write it to the csv file
    for abstract in all_abstracts:
        processed = []
        #To obtain categories, split every double newline.
        split_abstract = abstract.split("\n\n")
        for i in split_abstract:
            i = i.replace('\n','')
            processed.append(i)
        if len(split_abstract) > 5:
            abstract_writer.writerow(processed)
        else:
            partial_abstract_writer.writerow(processed)

In [2]:
df = pd.read_csv('abstracts.csv', on_bad_lines='skip')

In [8]:
df

Unnamed: 0,Journal,Title,Authors,Author_Information,Abstract,DOI,Misc
0,1. Arzneimittelforschung. 1975 Sep;25(9):1369-79.,[Demonstration of tumor inhibiting properties ...,[Article in German],"Ardenne M, Reitnauer PG.",A report is given on the recent discovery of o...,PMID: 22 [Indexed for MEDLINE],
1,13. J Gynecol Obstet Biol Reprod (Paris). 1975...,[Use of isotopes in the diagnosis of malignant...,[Article in French],"Destailleur G, Vernaillen P, Pluygers E.","The authors, with 67 Gallium, have obtained po...",PMID: 439 [Indexed for MEDLINE],
2,25. Nouv Presse Med. 1975 Oct 11;4(33):2377-81.,[Zollinger-Ellison syndrome treated medically ...,[Article in French],"Bonfils S, Bernier JJ, Mignon M, Hautefeuille ...",Metiamide an histamine H2-receptors antagonist...,PMID: 653 [Indexed for MEDLINE],
3,39. Arch Geschwulstforsch. 1975;45(2):135-45.,[Tumour hyperacidulation through intravenous g...,[Article in German],"Ardenne M, Reitnauer PG.",Tumour peracidity in otherwise moderately hype...,PMID: 979 [Indexed for MEDLINE],
4,40. Arkh Patol. 1975;37(4):60-5.,[Morphological manifestations and morphogenesi...,[Article in Russian],"Zarudin VV, Shcherbakov VIa.",It was established that intracerebral introduc...,PMID: 984 [Indexed for MEDLINE],
...,...,...,...,...,...,...,...
1959389,994. Ned Tijdschr Geneeskd. 2015;159:A8729.,[Starting and stopping palliative tumour treat...,[Article in Dutch],"Vreugdenhil GA(1), van den Beuken-van Everding...","Author information:(1)MUMC+, Maastricht.",Although palliative treatment options are incr...,PMID: 26131748 [Indexed for MEDLINE]
1959390,995. Ned Tijdschr Geneeskd. 2015;159:A8732.,[A woman with a symptomatic abdominal swelling].,[Article in Dutch],"Gray SA(1), Raber MH, Klaase JM.","Author information:(1)Medisch Spectrum Twente,...",A 51-year-old woman visited the surgery outpat...,PMID: 26131749 [Indexed for MEDLINE]
1959391,996. Ned Tijdschr Geneeskd. 2015;159:A8886.,[An infant with a bowed lower leg].,[Article in Dutch],"Vanhooymissen IJ(1), Ouwendijk R, Snijder PM.","Author information:(1)Erasmus MC, Rotterdam.",The parents of a 5-month-old boy noticed bowin...,PMID: 26131751 [Indexed for MEDLINE]
1959392,997. Pediatr Blood Cancer. 2015 Oct;62(10):168...,Biology of childhood acute lymphoblastic leuke...,Ravindranath Y(1).,Author information:(1)Georgie Ginopolis Chair ...,Comment on Pediatr Blood Cancer. 2015 Oct;6...,DOI: 10.1002/pbc.25639PMID: 26131757 [Indexed ...,


In [7]:
df.to_csv('abstracts.csv')  

# Full Text

In [307]:
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
import pandas as pd
import os

def read_text_files_to_df(folder_path, encoding='UTF-8'):
    file_list = []
    for file in os.listdir(folder_path):
        if file.endswith(".txt"):
            file_list.append(os.path.join(folder_path, file))
    
    data = []
    for file in file_list:
        try:
            with open(file, 'r', encoding=encoding) as f:
                data.append(f.read())
        except UnicodeDecodeError:
            with open(file, 'r', encoding='ISO-8859-1') as f:
                data.append(f.read())
    
    df = pd.DataFrame({'content': data})
    return df

def check_keywords(row):
    for keyword in keywords:
        if keyword not in row['content']:
            return False
    return True

def NER_per_text(nlp, i, threshold):
    start = 0 
    end = 512

    sentence = df['content'][i][start:end]
    result = nlp(sentence)
    final = pd.DataFrame(result)

    while end < len(df['content'][i]):
        start = end +1
        end = start + 512
        if end > len(df['content'][i]):
            end = len(df['content'][i])
        sentence = df['content'][i][start:end]
        result = pd.DataFrame(nlp(sentence))
        if not result.empty:
            result['start'] +=  start
            result['end'] += start
        final = final.append(result, ignore_index=True)
    final = final[final['score'] >= threshold]
    final = final.sort_values(by=['start'])
    return final

def df_cleaning(df):
    new_entries = []
    ends = []
    starts = []
    entities = []
    for i, row in df.iterrows():
        # if the entry starts with '##', combine it with the previous entry
        if row['word'].startswith('##'):
            new_entries[-1] = new_entries[-1].strip() + row['word'][2:].strip()
            ends[-1] = row['end']
        else:
            new_entries.append(row['word'].strip())
            ends.append(row['end'])
            starts.append(row['start'])
            entities.append(row['entity'])
    concatenated_text = []
    current_text = new_entries[0]
    current_start = starts[0]
    current_end = ends[0]
    current_ent = entities[0]
    for i in range(1, len(new_entries)):
        if starts[i] == current_end+1 or starts[i] == current_end:
            current_text = current_text + " " +new_entries[i]
            current_end = ends[i]
            current_ent = entities[i]
        else:
            concatenated_text.append((current_ent, current_text, current_start, current_end))
            current_text = new_entries[i]
            current_start = starts[i]
            current_end = ends[i]
            current_ent = entities[i]
    concatenated_text.append((current_ent, current_text, current_start, current_end))
    return pd.DataFrame(concatenated_text, columns=['entity','word', 'start', 'end'])


In [6]:
##Corpus Aggregation
df1 = read_text_files_to_df("PMC001xxxxxx")
df2 = read_text_files_to_df("PMC002xxxxxx")
df3 = read_text_files_to_df("PMC003xxxxxx")
df4 = read_text_files_to_df("PMC004xxxxxx")
df = pd.concat([df1, df2, df3, df4])
keywords = ['cancer', 'histone modifier']
df = df[df.apply(check_keywords, axis=1)]


In [311]:
result_dict = dict()
#tokenizer = AutoTokenizer.from_pretrained("drAbreu/bioBERT-NER-BC2GM_corpus")
disease_tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_diseases_ner")
disease_model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_diseases_ner")
genetic_tokenizer = AutoTokenizer.from_pretrained("alvaroalon2/biobert_genetic_ner")
genetic_model = AutoModelForTokenClassification.from_pretrained("alvaroalon2/biobert_genetic_ner")
disease_nlp = pipeline("ner", model=disease_model, tokenizer=disease_tokenizer)
genetic_nlp = pipeline("ner", model=genetic_model, tokenizer=genetic_tokenizer)

for i in df.index[0:5]:
    temp_df = NER_per_text(disease_nlp,i,0.85)
    temp_df = temp_df[temp_df['entity'] != '0']
    temp_df = df_cleaning(temp_df)
    temp_df = temp_df.append(df_cleaning(NER_per_text(genetic_nlp,i,0.85)))
    temp_df.sort_values(by=['start'])
    result_dict[i] = temp_df


In [317]:
result_dict[10146][60:120]

Unnamed: 0,entity,word,start,end
55,B-GENETIC,PA3,13399,13402
56,B-GENETIC,Fpg,15142,15145
57,I-GENETIC,endonuclea,15231,15241
58,B-GENETIC,Fpg,15246,15249
59,I-GENETIC,Fpg protein,15370,15381
60,B-GENETIC,Fpg,15462,15465
61,B-GENETIC,Fpg,15590,15593
62,I-GENETIC,anti - GFP,16765,16773
63,I-GENETIC,anti - mouse - Alexa488,16862,16881
64,I-GENETIC,8 - oxoG glycosylase,17052,17070
