In [1]:
# conda env:  /mnt/scratch2/Maggie/miniconda3/envs/scispacy
import scispacy
import spacy
import pandas as pd
import numpy as np
from collections import defaultdict

# get webscraping output
geo = pd.read_csv('geo_webscrap.csv')

In [2]:
# load spacy models
ner_bio = spacy.load("en_ner_bionlp13cg_md")
ner_bc5 = spacy.load('en_ner_bc5cdr_md')
ner_jnl = spacy.load('en_ner_jnlpba_md')

models = [ner_bio, ner_bc5, ner_jnl]

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [12]:
# add new columns to geo file
new_geo = geo.copy(deep=True)

# minimum score for label assignment to be added to geo
threshold = 0.5

for i in range(geo.shape[0]):
    # get Summary and Overall Design as text
    text = [geo['Summary'][i], geo['Overall design'][i]]
    text = " ".join(text)

    # iterate over all available models
    for model in models:
        doc = model(text)
        beams = model.get_pipe('ner').beam_parse([ doc ], beam_width = 16, beam_density = 0.0001)
        entity_scores = defaultdict(float)
        
        for beam in beams:
            for score, ents in model.get_pipe('ner').moves.get_beam_parses(beam):
                for start, end, label in ents:
                    entity_scores[(start, end, label)] += score
    
        for key in entity_scores:
            start, end, label = key
            score = entity_scores[key]
            ent_text = str(doc[start:end])
            
            if (score > threshold):
                if label in new_geo.columns and new_geo.loc[i, label] not in ['nan', 'na']:
                    if ent_text not in new_geo.loc[i, label]:
                        new_geo.loc[i, label].append(ent_text)
                else:
                    new_geo.loc[i, label] = [ent_text]
            

# convert 'nan' and 'na' to np.nan to match existing columns
new_geo = new_geo.mask(new_geo == 'nan')
new_geo = new_geo.mask(new_geo == 'na')

In [13]:
new_geo

Unnamed: 0,Title,Summary,Overall design,Contact name,Organization name,City,State/province,Country,Platforms,Samples,...,PROTEIN,SIMPLE_CHEMICAL,ORGANISM_SUBSTANCE,CELLULAR_COMPONENT,MULTI_TISSUE_STRUCTURE,RNA,PATHOLOGICAL_FORMATION,IMMATERIAL_ANATOMICAL_ENTITY,ANATOMICAL_SYSTEM,AMINO_ACID
0,Characterizing the molecular spatial and tempo...,Gene expression alterations in response to cig...,Bronchial brushings and biopsies were obtained...,Humam Kadara,University of Texas MD Anderson Cancer Center,Houston,TX,USA,GPL6244,391,...,,,,,,,,,,
1,Agilent-013282 array CGH on NCI-60 cancer cell...,The NCI-60 cancer cell lines have been used fo...,60 cell lines (including PR:DU145 and its drug...,Sudhir Varma,HiThru Analytics,Princeton,NJ,USA,GPL11068,129,...,[PR],,,,,,,,,
2,Microbial community structure and functions ar...,"Despite the global importance of forests, it i...",12 samples were collected from two long-term p...,Hamed Azarbad,Jagiellonian University,Krakow,,Poland,GPL18982,12,...,"[microbial community structure (p = 0.037), bu...",[Firmicutes],,,,,,,,
3,Transcriptional profiling of lung tumour cell ...,Microarray expression data generated to determ...,Murine tumour cell lines were isolated from in...,Emma Kerr,Queen's University Belfast,Belfast,,United Kingdom,GPL17543,72,...,"[p53, WT p53, p53R270H/ER, p53R172H/ER, p53 WT...","[4-hydroxytamoxifen, 4-OHT, 2hr, 2mM L-Glutami...",[FBS],,,,,,,
4,Transcriptional profiling of lung tumour cell ...,Microarray expression data generated to determ...,Murine tumour cell lines were isolated from in...,Emma Kerr,Queen's University Belfast,Belfast,,United Kingdom,GPL17543,72,...,"[p53, WT p53, p53R270H/ER, p53R172H/ER, p53 WT...","[4-hydroxytamoxifen, 4-OHT, 2hr, 2mM L-Glutami...",[FBS],,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,Spatial control of m6A deposition on enhancer ...,Interaction between the m6A methyltransferase ...,Chromatin-associated RNA m6A profiles of METTL...,meijun ye,SUN YAT-SEN UNIVERSITY,GuangZhou,-- Please Select --,China,GPL30209,6,...,"[m6A methyltransferase METTL3, METTL14, METTL3...",[m6A],,[chromatin],,[chromatin-associated RNAs],,,,
232,Spatial control of m6A deposition on enhancer ...,Interaction between the m6A methyltransferase ...,The mRNA expression profile of METTL3-WT or ME...,meijun ye,SUN YAT-SEN UNIVERSITY,GuangZhou,-- Please Select --,China,GPL30209,6,...,"[m6A methyltransferase METTL3, METTL14, METTL3...",[m6A],,[chromatin],,[chromatin-associated RNAs],,,,
233,Spatial control of m6A deposition on enhancer ...,Interaction between the m6A methyltransferase ...,CUT&Tag in METTL3-WT or METTL3-3KR A549 cells ...,meijun ye,SUN YAT-SEN UNIVERSITY,GuangZhou,-- Please Select --,China,GPL34633,8,...,"[m6A methyltransferase METTL3, METTL14, METTL3...",[m6A],,[chromatin],,[chromatin-associated RNAs],,,,
234,DDX54 downregulation enhances anti-PD1 therapy...,High tumor mutational burden (TMB) is a predic...,This study employs a syngeneic mouse model of ...,Jeong-Ryeol Gong,KAIST,Yuseong-gu,Daejeon,South Korea,GPL33896,4,...,"[anti-PD1, Ddx54, isotype control antibody, an...",[anti-PD1],[Xenium Mouse],,,,,,,


In [14]:
# save new geo file
new_geo.to_csv('new_geo_scispacy.csv', index=False)