### Input

detected pattern patternlist.xlsx

tagged named entities (chemical, gene and fuzzy"O") train0_truth.tsv

### Output

confidence of patterns and named entities

Then evaluate them by the matching them to train1.ner.txt

### data location

tagged file from fuzzy-ner: /shared/data/shizhi2/data_fuzzyCRF/case/train0_all.tsv, train1_all.tsv (edited) **train0_gene.tsv**, **train1_chem.tsv**

ground truth: /shared/data/shizhi2/data_fuzzyCRF/bio-ner/**train0_all.tsv**, **train1_all.tsv** contain all the gene and chemical labels

patternlist: **patternlist.xlsx**

In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from textblob import TextBlob

In [3]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/flyingturtle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

S-GENE single

B-GENE begining of the gene

E-GENE end of the gene

# Context Patterns

In [21]:
named_entity_list = {}
with open("train1_all.tsv", "r") as f:
    for line in f:
        segs = line[:-1].split("\t")
        named_entity = " ".join(segs[:-1])
        entity_label = segs[-1]
        
        # check if there is a collision
        try:
            if(named_entity_list[named_entity]):
                pass
        except:
            named_entity_list[named_entity] = [entity_label]
            
        if(entity_label != "O" and entity_label not in named_entity_list[named_entity]):
            named_entity_list[named_entity].append(entity_label)

print("number of named entities with labels: ", len(named_entity_list.keys()))

number of named entities with labels:  43135


In [23]:
pattern_list = pd.read_excel("patternlist.xlsx", header=None, names=["id", "pattern"])
print(pattern_list.head())
print(len(pattern_list))

            id                           pattern
0    PATTERN10                  CHEMICAL therapy
1   PATTERN100  transcriptional activity of GENE
2  PATTERN1001               100 microm CHEMICAL
3  PATTERN1003               activity in DISEASE
4  PATTERN1005             the DISEASE cell line
4780


In [24]:
with open("train1.ner.txt") as f:
    whole_context = f.read()
    context_segs = whole_context.split("\n")

In [25]:
context_segs[:5]

['Effect of CHEMICAL_D002994_clofibrate on plasma proteins including components of the hemostatic mechanism.',
 'The effect of CHEMICAL_D002994_clofibrate (1.5 g/day) on different plasma proteins and on components of the hemostatic system was studied in eight SPECIES_9606_men with either mild DISEASE_D003920_diabetes_mellitus or cardiosclerosis.',
 'Before treatment, the subjects were investigated weekly on five occasions.',
 'The means of these determinations were compared with the values observed after 2, 6 and 14 weeks of treatment.',
 'During the treatment albumin and GENE_7018_transferrin increased significantly while orosomucoid, ceruloplasmin, beta1 E-globulin, IgA, IgM and fibrinogen decreased significantly.']

In [26]:
def pmatch(pattern, document):
    """
    This function matchs a pattern containing "CHEMICAL", "GENE", "DISEASE" to a given document
    """
    
    res = {"content": pattern, "matches":[]}
    
    # generate re query for search
    match_query = re.sub('CHEMICAL|GENE|DISEASE', "(.*?)", pattern)
    if(match_query[-5:]=="(.*?)"):
        match_query = match_query[:-5] + "(.*)"
        
    # search pattern in the document
    matches = re.search(match_query, document)
    pos_pattern = list(re.finditer("CHEMICAL|GENE|DISEASE", pattern))
    
    index = 1
    for p in pos_pattern[:-1]:
        res["matches"].append((p.group(0), matches.group(index)))
        index += 1
    tail = matches.group(index).split(" ")
    if(tail[0] in ["the", "a", "in"]):
        last_word = tail[1]
    else:
        last_word = tail[0]
    res["matches"].append((pos_pattern[-1].group(0), last_word))
    
    return res

    
    
pmatch("development of DISEASE such as GENE will DISEASE", "The development of the last proteins such as we will king of all the issue")

{'content': 'development of DISEASE such as GENE will DISEASE',
 'matches': [('DISEASE', 'the last proteins'),
  ('GENE', 'we'),
  ('DISEASE', 'king')]}

In [27]:
from textblob import TextBlob
txt = """king of all the issue"""
blob = TextBlob(txt)
print(blob.noun_phrases)

[]


In [30]:
for inex, row in pattern_list.iterrows():
    pattern = row["pattern"]
    print(pattern)
    try: # in case pattern == "nan"
        print(re.split('CHEMICAL|GENE|DISEASE|\n',pattern))
    except:
        pass

CHEMICAL therapy
['', ' therapy']
transcriptional activity of GENE
['transcriptional activity of ', '']
100 microm CHEMICAL
['100 microm ', '']
activity in DISEASE
['activity in ', '']
the DISEASE cell line
['the ', ' cell line']
transcription of GENE
['transcription of ', '']
a metabolite of CHEMICAL
['a metabolite of ', '']
similar DISEASE
['similar ', '']
nuclear CHEMICAL
['nuclear ', '']
SPECIES disease
['SPECIES disease']
aged SPECIES
['aged SPECIES']
CHEMICAL ( pb )
['', ' ( pb )']
CHEMICAL pump
['', ' pump']
SPECIES heart
['SPECIES heart']
enos and GENE
['enos and ', '']
GENE protein abundance
['', ' protein abundance']
DISEASE lung
['', ' lung']
GENE activation
['', ' activation']
regulation of CHEMICAL
['regulation of ', '']
GENE neuron
['', ' neuron']
GENE messenger rna ( mrna )
['', ' messenger rna ( mrna )']
CHEMICAL injection (
['', ' injection (']
relative to GENE
['relative to ', '']
CHEMICAL side
['', ' side']
the regulation of GENE
['the regulation of ', '']
constituti

['a variety of ', '']
DISEASE cell type
['', ' cell type']
SPECIES b
['SPECIES b']
the inhibition of DISEASE
['the inhibition of ', '']
the initiation of DISEASE
['the initiation of ', '']
GENE ) gene
['', ' ) gene']
CHEMICAL product
['', ' product']
expression of SPECIES
['expression of SPECIES']
injection of CHEMICAL
['injection of ', '']
mutant SPECIES
['mutant SPECIES']
preincubation with CHEMICAL
['preincubation with ', '']
the mrna level of GENE
['the mrna level of ', '']
SPECIES cell line
['SPECIES cell line']
GENE in cell
['', ' in cell']
lead CHEMICAL
['lead ', '']
the GENE protein
['the ', ' protein']
sex CHEMICAL
['sex ', '']
DISEASE challenge
['', ' challenge']
vitro DISEASE
['vitro ', '']
CHEMICAL induce apoptosis
['', ' induce apoptosis']
CHEMICAL donor
['', ' donor']
CHEMICAL transport
['', ' transport']
effect of CHEMICAL on expression
['effect of ', ' on expression']
specific CHEMICAL
['specific ', '']
CHEMICAL phosphatase
['', ' phosphatase']
contrast to GENE
['contra

after CHEMICAL administration
['after ', ' administration']
DISEASE inhibition
['', ' inhibition']
topical application of CHEMICAL
['topical application of ', '']
characterization of CHEMICAL
['characterization of ', '']
the regulation of DISEASE
['the regulation of ', '']
presence of GENE
['presence of ', '']
DISEASE and cell
['', ' and cell']
DISEASE genotype
['', ' genotype']
endothelial GENE
['endothelial ', '']
SPECIES exposure
['SPECIES exposure']
GENE , gene
['', ' , gene']
SPECIES preadipocyte
['SPECIES preadipocyte']
CHEMICAL x receptor
['', ' x receptor']
GENE and mapk
['', ' and mapk']
DISEASE accumulation
['', ' accumulation']
cultured DISEASE
['cultured ', '']
metastatic DISEASE
['metastatic ', '']
body CHEMICAL
['body ', '']
liver and DISEASE
['liver and ', '']
a significant increase in DISEASE
['a significant increase in ', '']
SPECIES mammary
['SPECIES mammary']
CHEMICAL ( ba )
['', ' ( ba )']
the rate of DISEASE
['the rate of ', '']
apoptotic and DISEASE
['apoptotic an

DISEASE and mortality
['', ' and mortality']
upregulate GENE
['upregulate ', '']
DISEASE and insulin
['', ' and insulin']
adipose tissue of SPECIES
['adipose tissue of SPECIES']
GENE enhancer
['', ' enhancer']
plasma concentration of CHEMICAL
['plasma concentration of ', '']
combination therapy with CHEMICAL
['combination therapy with ', '']
SPECIES melanocyte
['SPECIES melanocyte']
SPECIES ear
['SPECIES ear']
CHEMICAL alone or in combination
['', ' alone or in combination']
GENE ) activity
['', ' ) activity']
CHEMICAL phosphokinase
['', ' phosphokinase']
CHEMICAL ( aa ) ,
['', ' ( aa ) ,']
plasma GENE
['plasma ', '']
DISEASE metabolism
['', ' metabolism']
novel CHEMICAL
['novel ', '']
the potential of CHEMICAL
['the potential of ', '']
CHEMICAL turnover
['', ' turnover']
CHEMICAL substituent
['', ' substituent']
CHEMICAL 15
['', ' 15']
selective CHEMICAL receptor modulator ( serm )
['selective ', ' receptor modulator ( serm )']
plasma CHEMICAL
['plasma ', '']
ovx SPECIES
['ovx SPECIES

DISEASE and level
['', ' and level']
SPECIES dendritic cell
['SPECIES dendritic cell']
receptor , GENE
['receptor , ', '']
contact DISEASE
['contact ', '']
CHEMICAL oxidase
['', ' oxidase']
SPECIES cell
['SPECIES cell']
CHEMICAL export
['', ' export']
polymorphism of GENE
['polymorphism of ', '']
the GENE channel
['the ', ' channel']
depletion of GENE
['depletion of ', '']
CHEMICAL transporter 1
['', ' transporter 1']
enzyme and CHEMICAL
['enzyme and ', '']
increase of DISEASE
['increase of ', '']
CHEMICAL residue
['', ' residue']
apoptosis , DISEASE
['apoptosis , ', '']
the spleen of SPECIES
['the spleen of SPECIES']
SPECIES osteoblast
['SPECIES osteoblast']
pharmacological inhibition of GENE
['pharmacological inhibition of ', '']
intraperitoneal administration of CHEMICAL
['intraperitoneal administration of ', '']
DISEASE angiogenesis
['', ' angiogenesis']
nm CHEMICAL
['nm ', '']
dose of CHEMICAL
['dose of ', '']
SPECIES protein
['SPECIES protein']
natural and synthetic CHEMICAL
['na

['SPECIES renal proximal tubular cell']
effect of CHEMICAL on receptor
['effect of ', ' on receptor']
therapeutic potential of CHEMICAL
['therapeutic potential of ', '']
novo synthesis of CHEMICAL
['novo synthesis of ', '']
CHEMICAL administration (
['', ' administration (']
CHEMICAL strongly inhibit
['', ' strongly inhibit']
( CHEMICAL ) exposure
['( ', ' ) exposure']
the SPECIES cell
['the SPECIES cell']
damage by CHEMICAL
['damage by ', '']
type 2 DISEASE
['type 2 ', '']
CHEMICAL tolerance test
['', ' tolerance test']
regulation of GENE expression
['regulation of ', ' expression']
GENE protein stability
['', ' protein stability']
CHEMICAL dehydrogenase
['', ' dehydrogenase']
the reduction of CHEMICAL
['the reduction of ', '']
e.g. , CHEMICAL )
['e.g. , ', ' )']
reductase ( GENE )
['reductase ( ', ' )']
CHEMICAL ( bp )
['', ' ( bp )']
( CHEMICAL ) administration
['( ', ' ) administration']
the CHEMICAL ring
['the ', ' ring']
CHEMICAL ( ra
['', ' ( ra']
water CHEMICAL
['water ', '']
C

In [31]:
# re
re.search("%", "wh ehiu ) ().., %,")

<_sre.SRE_Match object; span=(16, 17), match='%'>

In [37]:
pattern = []
with open("train_CRAFT_cnt_N_p.tsv", "r") as f:
    for line in f:
        segs = line[:-1].split("\t")  # delete the '\n' at the end of each line
        if(len(segs)!=3):
            #print(line)
            pass
        else:
            pattern.append(segs)
print (pattern)
print ("finish loading structure pattern: ", len(pattern))

[['$W$$N$', 'GENE', '11849'], ['$W$ - $W$', 'GENE', '2959'], ['$W$ - $N$', 'GENE', '2844'], ['$W$$N$', 'Chemical', '2239'], ['$W$ - $W$', 'Chemical', '1789'], ['$W$$N$$W$', 'GENE', '1438'], ['$W$$N$ $W$', 'GENE', '1339'], ['$W$ - $W$ $W$', 'GENE', '1118'], ['$N$ - $W$', 'Chemical', '986'], ['$W$ - $N$ $W$', 'GENE', '892'], ['$W$$N$$W$$N$', 'GENE', '691'], ['$W$ - $N$', 'Chemical', '679'], ['$W$ $W$$N$', 'GENE', '664'], ['$W$ ( $N$ )', 'Chemical', '533'], ['$W$ ( $N$ + )', 'Chemical', '522'], ['$W$ ( $W$ )', 'Chemical', '406'], ['$W$ - $W$ $W$ $W$', 'GENE', '383'], ['( $N$ ) $W$', 'Chemical', '316'], ['$W$ - $W$ $W$', 'Chemical', '274'], ['$W$ $N$', 'GENE', '266'], ['$W$$N$$W$', 'Chemical', '265'], ['$W$$N$$W$$N$', 'Chemical', '258'], ['$W$$N$ $W$ $W$', 'GENE', '254'], ['$W$ $W$ - $W$ $W$', 'GENE', '250'], ['$W$ ( + )', 'Chemical', '236'], ['$W$ $N$', 'Chemical', '230'], ['$W$ $W$$N$', 'Chemical', '194'], ['$W$ - $N$$W$', 'GENE', '185'], ['$W$ - $W$ - $W$', 'Chemical', '182'], ['$W$ $W$

In [5]:
re.search(r"and/or", r"we have and/or")

<_sre.SRE_Match object; span=(8, 14), match='and/or'>

# Structure Patterns

In [43]:
re.search('[-+]?[0-9]+', "this is a -45 . 89 word")

<_sre.SRE_Match object; span=(10, 13), match='-45'>

In [3]:
re.search('\w+[-+]?[0-9]+', "Bl3")

<_sre.SRE_Match object; span=(0, 3), match='Bl3'>

In [3]:
with open("train1_all.tsv", "r") as f:
    a = []
    for line in f:
        segs= line.split("\t")
        if(len(segs)>2):
            print(line)
        if("-" in segs[0]):
            a.append(segs[0])
set(a)

{'+/-', '-->', '-1', '-2', '-3', '19-Apr', '2-Oct', '4-Oct', '<--', '=-'}

In [4]:
with open("train0_all.tsv", "r") as f:
    a = []
    for line in f:
        segs= line.split("\t")
        if(len(segs)>2):
            print(line)
        if("-" in segs[0]):
            a.append(segs[0])
set(a)

{'1-Oct', '2-Oct', '3-Oct', '4-Oct', '=-', 'Oct-91'}

In [5]:
with open("train_CRAFT.tsv", "r") as f:
    a = []
    for line in f:
        segs= line.split("\t")
        if(len(segs)>2):
            print(line)
        if("-" in segs[0]):
            a.append(segs[0])
set(a)

{'+/-', '-', '--', '-->', '<-->'}

In [13]:
with open("train_CRAFT.tsv", "r") as f:
    tmp = []
    type_e = []
    for line in f:
        segs = line.split("\t")
        if(len(segs)==2):
            if(segs[1]=="O\n"):
                if(len(tmp)>0):
                    print(" ".join(tmp))
                tmp = []
            else:
                tmp.append(segs[0])
                type_e.append(segs[1][:-1])

        

carbonic anhydrase II
leptin receptor
tyrosinase
pigmented
Lepr
Tyr
molecular
Pigment
anesthetic
anesthetic
anesthetic
anesthetic
Myoc
myocilin
MYOC
mouse Myoc
Myoc
Myoc
Car2
Lepr
Car2
carbonic anhydrase II
Car2
Lepr
leptin receptor
Tyr
pigmented
tyrosinase
pigmented
pigmented
pigmented
Tyr
anesthetic agents
anesthetic
drug
terpenes
Terpene
terpenes
drug
anesthetic agents
leptin receptor
molecular
molecular
Car2
Bicarbonate
carbonic
CAII
CAII
Car2
CAIV
CAIV
CAII
CAII
mouse CAIV
CAII
Car2
Tyrosinase
Tyrosinase
pigment
Tyrosinase
tyrosinase
pigmented
pigmented
tyrosinase
pigmented
molecular
chow
water
chow
anesthetic agents
acid
alcohol
buffered
phosphate
Myoc
mouse Myoc
carbonic
Car2
Myoc
BRCA2
BRCA2
BRCA2
BRCA2
BRCA2
RAD51
BRCA2
RAD51
BRCA2 BRCA2
BRCA2 protein
protein
amino acids
protein
RAD51
BRCA1
BRCA1
BRCA2
BRCA1
BRCA2
BRCA1
BRCA2
BRCA2
RAD51 recombinase
protein
protein
BRCA1
RAD51
BRCA2
RAD51
BRCA1
RAD51
proteins
Brca1
BRCA1
BRCA2
BRCA1
BRCA2
BRCA2
BRCA1
BRCA2
protein
I - SceI
I -

Snail
Snail
E - cadherin
Snail
protein
E - cadherin
beta - catenin
LEF - 1
proteins
cyclin D
Snail
beta - catenin
beta - catenin
Snail
LEF - 1
beta - catenin
Ajuba
Snail
Ajuba
growth factor receptor - bound protein - 2
Grb - 2
sevenless
nucleotide
Ajuba
Snail
Ajuba
Grb - 2
protein
Snail
alpha - catenin
Grb - 2
Ajuba
protein
alpha - catenin
Snail
E - cadherin
alpha - catenin
Ajuba
Grb - 2
Sos
Grb - 2
Sos
alpha - catenin
Ajuba
Ajuba
Ajuba
K14
inhibitor
Grb - 2
Ajuba
Grb - 2
inhibitor
Grb - 2
Ajuba
Ajuba
Grb - 2
Sos
nucleotide
Snail
molecular
Snail
TGF - beta 2
Snail mRNA
Snail
Snail
BMPs
TGF - beta s
Snail
BMPs
TGF - beta
Snail
Wnt - 3a
BMP inhibitor noggin
LEF - 1
E - cadherin
LEF - 1
beta - catenin
Snail
noggin
Snail
TGF - beta 1
Snail
TGF - beta 1
TGF - beta
TGF - beta 2
TGF - beta 2
Snail
TGF - beta 2
Snail
Snail protein
Snail
TGF - beta 2
TGF - beta
effector SMAD2
Snail
Snail
SMAD2
Snail
mRNA
Snail
TGF - beta 2
mouse Snail
TGF - beta 2
Snail protein
TGF - beta 2
Snail
Snail
Snail
Sn

Annexin A7
Annexin A7
Annexin A7
Annexin A7
Annexin A7
Annexin A7
Annexin A7
ethanol
formaldehyde
mouse Annexin A7
Annexin A7
mouse Annexin A7
glial fibrillary acidic protein
GFAP
Emerin
Molecular
Annexin A7
GFAP
Molecular
Annexin A7
xylene
ethanol
NaCl
KCl
Na2HPO4
formaldehyde
BSA
water
gelvatol
annexin A7
Annexin A7
Annexin A7
sucrose
Tris
CaCl2
Mg
acetate
EDTA
DTT
PMSF
Hepes
MgCl2
KCl
EDTA
glycerine
DTT
PMSF
Hepes
MgCl2
KCl
EDTA
glycerine
DTT
PMSF
protease inhibitor
SDS
SDS
SDS
Annexin A7 mRNA
A
32P
dATP
annexin A7
beta - actin
Annexin A7
AnxA7
Molecular
Pea3
molecular
neurotrophin
Bax
neurotrophin
BMP
Apterous
Squeeze
Er81
Pea3
Er81
neurotrophin 3
NT - 3
proteins
Er81
Pea3
Er81
neurotrophin
proteins
neurotrophin
effectors
Er81
Er81
Er81
EWS
Pea3
Er81
Er81
Er81
Pea3
Erm
Pea3
amino acid
Er81
Pea3
Erm
EWS
Pea3
Ewing sarcoma
EWS
Pea3
Er81
EWS
Pea3
Er81
Pea3
EWS
Pea3
Er81
EWS
Pea3
Er81
Pea3
calcium
protein Parvalbumin
PV
Er81
Pea3
EWS
Pea3
Er81
EWS
Pea3
Pea3
EWS
Pea3
Er81
Er81
EWS
Pea3


MCAD
MCAD
MCAD
MCAD
Acadm mRNA
MCAD
human MCAD
MCAD
MCAD
human MCAD
MCAD
VLCAD
long chain fat
CPT - 1 and - 2
mouse MCAD
MCAD
acyl - CoA
MCAD
LCAD
VLCAD
SCAD
LCAD
MCAD
VLCAD
SCAD
MCAD
fatty acids
MCAD
neomycin
Acadm
MCAD
Acadm
Acadm
Acadm
Acadm
MCAD
MCAD
MCAD
MCAD
MCAD
guanidinium thiocyanate
Acadm
Acadm mRNA
MCAD
MCAD
MCAD
agarose
formaldehyde
32P
radiolabeled
mouse Acadm
formamide
dextran sulfate
reagent
SDS
SDS
SDS
agarose
formaldehyde
ethidium bromide
MCAD protein
MCAD protein
MCAD
MCAD
nitrogen
sodium deoxycholate
SDS
glycerol
Protease Inhibitor
phenylmethylsulfonylfluoride
sodium orthovanadate
protein
Protein
SDS
phosphate
buffered
MCAD
MCAD
MCAD
MCAD
octanoyl - CoA
palmitoyl - CoA
MCAD
MCAD
glucose
fatty acid
acid
carnitine
Glucose
fatty acids
reagents
acid
tetracosane
Acylcarnitine
MCAD
MCAD
MCAD
MCAD
Acadm
MCAD
medium - chain acyl - CoA dehydrogenase SCAD
short - chain acyl - CoA dehydrogenase
VLCAD
very long - chain acyl - CoA dehydrogenase
reagents
Medium - chain acyl - CoA 

molecular
RanBP2
HKI
Cox11
RanBP2
HKI
RanBP2
Cox11
urea
Cox11
RanBP2
Cox11
RanBP2
leucine
leucine
Ran
zinc
KIF5B
KIF5C
Cox 11
HKI
RanBP2
Cox11
HKI
Cox11
HKI
RanBP2
HKI
Cox11
HKI
Cox11
HKI
Cox11
HKI
Cox11
inhibitor
HKI
HKI
glucose
RanBP2
Cox11
glucose
Cox11
HKI
HKI
Cox11
HKI
HKI
Cox11
RanBP2
HKI
glucose
HKI
Cox11
HKI
Cox11
HKI
Cox11
glucose
Cox11
Cox11
inhibitor
HKI
HKI
glucose
HKI
glucose
Cox11
RanBP2
HKI
Cox11
RanBP2
HKI
Cox11
glucose
RanBP2
RanBP2
HKI
glucose
RanBP2
HK1
Cox11
mHsp70
RanBP2
RanBP2
RanBP2
Cox11
HKI
mHsp70
proteins
RanBP2
Molecular
HKI
RanBP2
RanBP2
HKI
HKI
RanBP2
mHsp70
RanBP2
mHsp70
Cox11
HKI
Cox11
RanBP2
mHsp70
Cox11
RanBP2
Cox11
mHsp70
HKI
Cox11
HKI
Cox11
RanBP2
HKI
mHsp70
RanBP2
HKI
HKI
RanBP2
HKI
RanBP2
RanBP2
RanBP2
RanBP2
RanBP2
RanBP2
RanBP2
RanBP2
RanBP2
proteins
RanBP2
RanBP2
human placental alkaline phophatase ( PLAP
RanBP2
HindIII
RanBP2
PLAP
PLAP
PLAP
PLAP
RanBP2
PLAP
PLAP
human placental alkaline phophatase
RanBP2
Cox11
HKI
HKI
RanBP2
Cox11
HKI
RanBP2
HKI

Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo2
Pygo2
Pygo1
Pygo2
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo2
Pygo1
Pygo2
Pygo1
Wnt1
Wnt9b
lithium chloride
LiCl
LiCl
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Cxcl13
Slc5a2
Slco1a4
Slc5a2
glucose
organic anion
Slco1a4
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Klk5
Klk6
Ren2
Timeless
Klk5
Klk6
serine
Ren2
Renin 1
Timeless
Pygo1
Pygo2
Pygo1
Pygo2
Ccnd1
cyclin D1
Wisp1
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Pygo1
Pygo2
Pygo1
Pygo2
Pygopus
Wnt
Wnt
Pygo
Wnt
Pygo
Pygo
Wg
Pygo1
Pygo2
Pygo1
Pygo2
Pygo2
Pygo1
Pygo1
Pygo2
Pygo1
Pygo2
Pygopus
Pygopus
Wg
amino acids
cysteines
histidine
zinc ions
proteins
protein
protein
amino acid
cysteine
tyrosine
Wnt
Pygo1
Pygo2
Pygo1
amino acids
Pygo2
amino acids
Wnt9b
Wnt9b
Wnt4
Wnt11
GDNF
Pygo2
Pygo1
Pygo2
Wnt11
Wnt11
Wnt11
GDNF
Pygo1
Pygo2
Pygo1
Wnt9b
Pygopus
Pygopus
Wnt
Pygo1
Pygo2
Pygo1
Pygo2
Pygo1
Py

In [14]:
set(type_e)

{'B-Chemical',
 'B-GENE',
 'E-Chemical',
 'E-GENE',
 'I-Chemical',
 'I-GENE',
 'S-Chemical',
 'S-GENE'}

# Pos Tagging

In [5]:
from pycorenlp import StanfordCoreNLP
nlp = StanfordCoreNLP('http://localhost:9000')
def pos_noun(sentence):
    sent2 = ""
    sent = sentence.replace("@", " ")
    res = nlp.annotate(sent, properties={'annotators': "pos", 'outputFormat': 'json'})
    res = res["sentences"][0]["tokens"]
    noun_list = []
    current_noun = []
    for word in res:
        print(word)
        if (word['pos'][0] == 'N'):
            current_noun.append(word['word'])
        elif(len(current_noun) != 0):
            n = "_".join(current_noun)
            sent2 = sent2 + " " + n + " " + word['word']
            current_noun = []
            noun_list.append(n)
        else:
            sent2 = sent2 + " " + word['word']
    return sent2, noun_list
pos_noun("Southern analysis on genomic DNA isolated from tissues and cell lines from several mouse strains using mCD22 cDNA demonstrated that the Cd22 locus encoding mCD22 is a single copy gene of < or = 30 kb .")

{'index': 1, 'word': 'Southern', 'originalText': 'Southern', 'characterOffsetBegin': 0, 'characterOffsetEnd': 8, 'pos': 'NN', 'before': '', 'after': ' '}
{'index': 2, 'word': 'analysis', 'originalText': 'analysis', 'characterOffsetBegin': 9, 'characterOffsetEnd': 17, 'pos': 'NN', 'before': ' ', 'after': ' '}
{'index': 3, 'word': 'on', 'originalText': 'on', 'characterOffsetBegin': 18, 'characterOffsetEnd': 20, 'pos': 'IN', 'before': ' ', 'after': ' '}
{'index': 4, 'word': 'genomic', 'originalText': 'genomic', 'characterOffsetBegin': 21, 'characterOffsetEnd': 28, 'pos': 'JJ', 'before': ' ', 'after': ' '}
{'index': 5, 'word': 'DNA', 'originalText': 'DNA', 'characterOffsetBegin': 29, 'characterOffsetEnd': 32, 'pos': 'NN', 'before': ' ', 'after': ' '}
{'index': 6, 'word': 'isolated', 'originalText': 'isolated', 'characterOffsetBegin': 33, 'characterOffsetEnd': 41, 'pos': 'VBN', 'before': ' ', 'after': ' '}
{'index': 7, 'word': 'from', 'originalText': 'from', 'characterOffsetBegin': 42, 'cha

(' Southern_analysis on genomic DNA isolated from tissues and cell_lines from several mouse_strains using mCD22_cDNA demonstrated that the Cd22_locus encoding mCD22 is a single copy_gene of < or = 30 kb .',
 ['Southern_analysis',
  'DNA',
  'tissues',
  'cell_lines',
  'mouse_strains',
  'mCD22_cDNA',
  'Cd22_locus',
  'mCD22',
  'copy_gene',
  'kb'])