## CONLL 2003 (English) - Pre-Trained Model

In [109]:
!pip install pandas sklearn nltk

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1316 sha256=a1e9ce17e702ee8a2150a7233085156366daf4d44fc4766f83da2ad994189d6c
  Stored in directory: /Users/aminov.sb/Library/Caches/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [1]:
import pandas as pd
from sklearn.metrics import f1_score
import nltk
from nltk.tag import StanfordNERTagger

In [2]:
!ls ../../datasets/conll/conll2003/

ls: ../../datasets/conll/conll2003/: No such file or directory


In [3]:
raw_conll_test = open("../../../datasets/conll/conll2003/test.txt").read()
split_conll_test = raw_conll_test.split()

In [6]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(split_conll_test):
    if i == "I-PER":
        split_conll_test[n] = "PERSON"
    if i == "I-ORG":
        split_conll_test[n] = "ORGANIZATION"
    if i == "I-LOC":
        split_conll_test[n] = "LOCATION"
    if i == "B-PER":
        split_conll_test[n] = "PERSON"
    if i == "B-ORG":
        split_conll_test[n] = "ORGANIZATION"
    if i == "B-LOC":
        split_conll_test[n] = "LOCATION"
    if i == "I-MISC":
        split_conll_test[n] = "MISC"
    if i == "B-MISC":
        split_conll_test[n] = "MISC"

In [7]:
reference_conll_test=list(zip(split_conll_test[0::4],split_conll_test[3::4]))

In [8]:
conll_tokens_test = split_conll_test[::4]

In [12]:
!wget https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip

--2021-04-08 01:52:03--  https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip [following]
--2021-04-08 01:52:04--  https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180437064 (172M) [application/zip]
Saving to: ‘stanford-ner-4.2.0.zip’


2021-04-08 01:58:41 (446 KB/s) - ‘stanford-ner-4.2.0.zip’ saved [180437064/180437064]



In [17]:
!unzip stanford-ner-4.2.0.zip

Archive:  stanford-ner-4.2.0.zip
replace stanford-ner-2020-11-17/lib/jollyday-0.4.9.jar? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [18]:
!ls stanford-ner-2020-11-17

LICENSE.txt                    [31mner.sh[m[m
NERDemo.java                   sample-conll-file.txt
README.txt                     sample-w-time.txt
build.xml                      sample.ner.txt
[34mclassifiers[m[m                    [31msample.txt[m[m
[34mlib[m[m                            stanford-ner-4.2.0-javadoc.jar
ner-gui.bat                    stanford-ner-4.2.0-sources.jar
[31mner-gui.command[m[m                stanford-ner-4.2.0.jar
[31mner-gui.sh[m[m                     stanford-ner.jar
[31mner.bat[m[m


In [19]:
!ls stanford-ner-2020-11-17/classifiers

english.all.3class.distsim.prop         english.muc.7class.distsim.prop
english.conll.4class.distsim.crf.ser.gz example.serialized.ncc.prop
english.conll.4class.distsim.prop


In [20]:
st = StanfordNERTagger('stanford-ner-2020-11-17/classifiers/english.conll.4class.distsim.crf.ser.gz',
					   'stanford-ner-2020-11-17/stanford-ner.jar',
					   encoding='utf-8')                 
conll_pred = st.tag(conll_tokens_test)

In [21]:
conll_pred_tags = [x[1] for x in conll_pred]

In [22]:
f1_score(split_conll_test[3::4], conll_pred_tags, average='micro',labels=['LOCATION'])

0.9007987632053595

In [23]:
f1_score(split_conll_test[3::4], conll_pred_tags, average='micro',labels=['PERSON'])

0.9497165843847138

In [25]:
f1_score(split_conll_test[3::4], conll_pred_tags, average='micro',labels=['ORGANIZATION'])

0.8582253240279163

In [26]:
f1_score(split_conll_test[3::4], conll_pred_tags, average='micro',labels=['MISC'])

0.7972399150743099

In [27]:
f1_score(split_conll_test[3::4], conll_pred_tags, average='micro')

0.9750782154030772

## CONLL 2003 (English) - Custom-Trained Model

In [None]:
!ls ../../../conll/conll2003

In [31]:
conll_train = pd.read_csv('../../../conll/conll2003/train.txt',sep=' ')

In [32]:
conll_train=conll_train.replace({'O':'O', 'B-ORG':'ORGANIZATION', 'B-PER':'PERSON', 'B-LOC':'LOCATION', 'B-MISC':'MISC','I-ORG':'ORGANIZATION', 'I-PER':'PERSON', 'I-LOC':'LOCATION', 'I-MISC':'MISC'}) 

In [33]:
conll_train.head()

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,EU,NNP,B-NP,ORGANIZATION
1,rejects,VBZ,B-VP,O
2,German,JJ,B-NP,MISC
3,call,NN,I-NP,O
4,to,TO,B-VP,O


In [34]:
conll_train['O'].value_counts() 

O               168345
PERSON           11128
ORGANIZATION     10025
LOCATION          8297
MISC              4593
Name: O, dtype: int64

In [35]:
conll_train = conll_train.drop(['-X-','-X-.1'],axis=1) 

In [36]:
conll_train=conll_train.dropna(subset=['-DOCSTART-']) 

In [226]:
conll_train.to_csv('../../../conll/conll2003/train.tsv', sep = '\t', index=False,header=False) 

In [228]:
!java -cp "stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop prop_conll.txt

Invoked on Mon Apr 05 03:44:46 ALMT 2021 with arguments: -prop prop_conll.txt
usePrevSequences=true
useClassFeature=true
useTypeSeqs2=true
useSequences=true
wordShape=chris2useLC
useTypeySequences=true
useDisjunctive=true
noMidNGrams=true
serializeTo=conll_ner.ser.gz
maxNGramLeng=3
useNGrams=true
usePrev=true
useNext=true
maxLeft=1
trainFile=train.tsv
map=word=0,answer=1
useWord=true
useTypeSeqs=true
numFeatures = 406281
Time to convert docs to feature indices: 3.4 seconds
Current memory used: 279m
numClasses: 5 [0=O,1=ORGANIZATION,2=MISC,3=PERSON,4=LOCATION]
numDocuments: 2154
numDatums: 202385
numFeatures: 406281
Time to convert docs to data/labels: 2.8 seconds
Current memory used: 378m
Running gradient on 8 threads
numWeights: 5260025
QNMinimizer called on double function of 5260025 variables, using M = 25.
               An explanation of the output:
Iter           The number of iterations
evals          The number of function evaluations
SCALING        <D> Diagonal scaling was use

In [44]:
import os
jar_engine = 'stanford-ner.jar'
#load your own ner model
model = 'conll_ner.ser.gz'

In [45]:
entity_tagger=StanfordNERTagger(model,jar_engine, encoding='utf8') 

In [46]:
conll_test = open("../../../conll/conll2003/test.txt").read()
conll_test_split = conll_test.split()

In [47]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(conll_test_split):
    if i == "I-PER":
        conll_test_split[n] = "PERSON"
    if i == "I-ORG":
        conll_test_split[n] = "ORGANIZATION"
    if i == "I-LOC":
        conll_test_split[n] = "LOCATION"
    if i == "B-PER":
        conll_test_split[n] = "PERSON"
    if i == "B-ORG":
        conll_test_split[n] = "ORGANIZATION"
    if i == "B-LOC":
        conll_test_split[n] = "LOCATION"
    if i == "I-MISC":
        conll_test_split[n] = "MISC"
    if i == "B-MISC":
        conll_test_split[n] = "MISC"

In [48]:
conll_test=list(zip(conll_test_split[0::4],conll_test_split[3::4]))

In [49]:
conll_words = conll_test_split[0::4]
conll_tags = conll_test_split[3::4]

In [50]:
conll_pred = entity_tagger.tag(conll_words)

In [51]:
conll_pred = [x[1] for x in conll_pred]

In [52]:
f1_score(conll_tags, conll_pred, average='micro',labels=['LOCATION'])

0.8553326293558606

In [53]:
f1_score(conll_tags, conll_pred, average='micro',labels=['PERSON'])

0.9105895000895897

In [54]:
f1_score(conll_tags, conll_pred, average='micro',labels=['ORGANIZATION'])

0.8175241157556271

In [55]:
f1_score(conll_tags, conll_pred, average='micro',labels=['MISC'])

0.7560064068339563

In [56]:
f1_score(conll_tags, conll_pred, average='micro',labels=['O'])

0.9907150452576705

In [57]:
f1_score(conll_tags, conll_pred, average='micro')

0.9664852355033643

## SEC-Filings - Pre-Trained

In [18]:
!ls ../../../datasets/SEC-filings/CONLL-format/data/test/

FIN3.txt


In [58]:
sec_test = open("../../../datasets/SEC-filings/CONLL-format/data/test/FIN3.txt").read()
sec_test_split = sec_test.split()

In [59]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(sec_test_split):
    if i == "I-PER":
        sec_test_split[n] = "PERSON"
    if i == "I-ORG":
        sec_test_split[n] = "ORGANIZATION"
    if i == "I-LOC":
        sec_test_split[n] = "LOCATION"
    if i == "B-PER":
        sec_test_split[n] = "PERSON"
    if i == "B-ORG":
        sec_test_split[n] = "ORGANIZATION"
    if i == "B-LOC":
        sec_test_split[n] = "LOCATION"
    if i == "I-MISC":
        sec_test_split[n] = "MISC"
    if i == "B-MISC":
        sec_test_split[n] = "MISC"

In [61]:
sec_test=list(zip(sec_test_split[0::4],sec_test_split[3::4]))

In [63]:
sec_words = sec_test_split[0::4]
sec_tags = sec_test_split[3::4]

In [67]:
st = StanfordNERTagger('/stanford-ner-2020-11-17/classifiers/english.conll.4class.distsim.crf.ser.gz',
					   '/stanford ner/stanford-ner-2020-11-17/stanford-ner.jar',
					   encoding='utf-8')              
sec_pred = st.tag(sec_words)

In [68]:
sec_pred = [x[1] for x in sec_pred]

In [69]:
f1_score(sec_test_split[3::4], sec_pred, average='micro',labels=['LOCATION'])

0.39999999999999997

In [70]:
f1_score(sec_test_split[3::4], sec_pred, average='micro',labels=['PERSON'])

0.2967741935483871

In [71]:
f1_score(sec_test_split[3::4], sec_pred, average='micro',labels=['ORGANIZATION'])

0.2608695652173913

In [72]:
f1_score(sec_test_split[3::4], sec_pred, average='micro',labels=['MISC'])

0.04307692307692307

In [73]:
f1_score(sec_test_split[3::4], sec_pred, average='micro')

0.9269378821043097

## SEC-Filings - Custom-Trained

In [118]:
sec_train = open("../../../datasets/SEC-filings/CONLL-format/data/test/FIN5.txt").read()
sec_train_split = sec_train.split()

In [119]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(sec_train_split):
    if i == "I-PER":
        sec_train_split[n] = "PERSON"
    if i == "I-ORG":
        sec_train_split[n] = "ORGANIZATION"
    if i == "I-LOC":
        sec_train_split[n] = "LOCATION"
    if i == "B-PER":
        sec_train_split[n] = "PERSON"
    if i == "B-ORG":
        sec_train_split[n] = "ORGANIZATION"
    if i == "B-LOC":
        sec_train_split[n] = "LOCATION"
    if i == "I-MISC":
        sec_train_split[n] = "MISC"
    if i == "B-MISC":
        sec_train_split[n] = "MISC"

In [120]:
sec_train=list(zip(sec_train_split[0::4],sec_train_split[3::4]))

In [122]:
sec_words = sec_test_split[0::4]
sec_tags = sec_test_split[3::4]

In [123]:
import pandas as pd
df = pd.DataFrame(sec_train)

In [125]:
df.to_csv('sec_train.tsv', sep = '\t', index=False,header=False)

In [126]:
!java -cp "stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop prop_sec.txt

Invoked on Thu Apr 08 03:29:56 ALMT 2021 with arguments: -prop prop_sec.txt
usePrevSequences=true
useClassFeature=true
useTypeSeqs2=true
useSequences=true
wordShape=chris2useLC
useTypeySequences=true
useDisjunctive=true
noMidNGrams=true
serializeTo=sec_custom.ser.gz
maxNGramLeng=3
useNGrams=true
usePrev=true
useNext=true
maxLeft=1
trainFile=sec_train.tsv
map=word=0,answer=1
useWord=true
useTypeSeqs=true
numFeatures = 64044
Time to convert docs to feature indices: 0.9 seconds
Current memory used: 144m
numClasses: 5 [0=O,1=ORGANIZATION,2=LOCATION,3=PERSON,4=MISC]
numDocuments: 1
numDatums: 41015
numFeatures: 64044
Time to convert docs to data/labels: 0.5 seconds
Current memory used: 175m
Running gradient on 8 threads
numWeights: 800520
QNMinimizer called on double function of 800520 variables, using M = 25.
               An explanation of the output:
Iter           The number of iterations
evals          The number of function evaluations
SCALING        <D> Diagonal scaling was used; <I

In [128]:
import os
jar_engine = 'stanford-ner.jar'
#load your own ner model
model = 'sec_custom.ser.gz'

In [129]:
entity_tagger=StanfordNERTagger(model,jar_engine, encoding='utf8')

In [131]:
sec_test = open("../../../datasets/SEC-filings/CONLL-format/data/test/FIN3.txt").read()
sec_test_split = sec_test.split()

In [132]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(sec_test_split):
    if i == "I-PER":
        sec_test_split[n] = "PERSON"
    if i == "I-ORG":
        sec_test_split[n] = "ORGANIZATION"
    if i == "I-LOC":
        sec_test_split[n] = "LOCATION"
    if i == "B-PER":
        sec_test_split[n] = "PERSON"
    if i == "B-ORG":
        sec_test_split[n] = "ORGANIZATION"
    if i == "B-LOC":
        sec_test_split[n] = "LOCATION"
    if i == "I-MISC":
        sec_test_split[n] = "MISC"
    if i == "B-MISC":
        sec_test_split[n] = "MISC"

In [133]:
sec_test=list(zip(sec_test_split[0::4],sec_test_split[3::4]))

In [134]:
sec_words = sec_test_split[0::4]
sec_tags = sec_test_split[3::4]

In [136]:
sec_pred = entity_tagger.tag(sec_words)

In [137]:
sec_pred = [x[1] for x in sec_pred]

In [138]:
f1_score(sec_tags, sec_pred, average='micro',labels=['LOCATION'])

0.6814814814814815

In [139]:
f1_score(sec_tags, sec_pred, average='micro',labels=['PERSON'])

0.9023255813953489

In [140]:
f1_score(sec_tags, sec_pred, average='micro',labels=['ORGANIZATION'])

0.3636363636363636

In [141]:
f1_score(sec_tags, sec_pred, average='micro',labels=['MISC'])

0.25

In [142]:
f1_score(sec_tags, sec_pred, average='micro')

0.986942410747981

## NCBI-Disease Custom-Trained

In [146]:
ncbi_train = open("../../../datasets/ncbi/train.txt").read()
ncbi_train_split = ncbi_train.split()

In [147]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(ncbi_train_split):
    if i == "B-Disease":
        ncbi_train_split[n] = "DISEASE"
    if i == "I-Disease":
        ncbi_train_split[n] = "DISEASE"

In [149]:
ncbi_train=list(zip(ncbi_train_split[0::4],ncbi_train_split[3::4]))

In [151]:
ncbi_words = ncbi_train_split[0::4]
ncbi_tags = ncbi_train_split[3::4]

In [153]:
import pandas as pd
df = pd.DataFrame(ncbi_train)

In [154]:
df.to_csv('ncbi_train.tsv', sep = '\t', index=False,header=False)

In [155]:
!java -cp "stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop prop_ncbi.txt

Invoked on Thu Apr 08 03:56:37 ALMT 2021 with arguments: -prop prop_ncbi.txt
usePrevSequences=true
useClassFeature=true
useTypeSeqs2=true
useSequences=true
wordShape=chris2useLC
useTypeySequences=true
useDisjunctive=true
noMidNGrams=true
serializeTo=ncbi_ner.ser.gz
maxNGramLeng=3
useNGrams=true
usePrev=true
useNext=true
maxLeft=1
trainFile=ncbi_train.tsv
map=word=0,answer=1
useWord=true
useTypeSeqs=true
numFeatures = 189623
Time to convert docs to feature indices: 2.5 seconds
Current memory used: 143m
numClasses: 2 [0=O,1=DISEASE]
numDocuments: 1
numDatums: 136012
numFeatures: 189623
Time to convert docs to data/labels: 2.1 seconds
Current memory used: 563m
Running gradient on 8 threads
numWeights: 536518
QNMinimizer called on double function of 536518 variables, using M = 25.
               An explanation of the output:
Iter           The number of iterations
evals          The number of function evaluations
SCALING        <D> Diagonal scaling was used; <I> Scaled Identity
LINESEARCH 

In [156]:
import os
jar_engine = 'stanford-ner.jar'

#load your own ner model
model = 'ncbi_ner.ser.gz'

In [157]:
entity_tagger=StanfordNERTagger(model,jar_engine, encoding='utf8')

In [158]:
ncbi_test = open("../../../datasets/ncbi/test.txt").read()
ncbi_test_split = ncbi_test.split()

In [159]:
# Amend class annotations to reflect Stanford's NERTagger
for n,i in enumerate(ncbi_test_split):
    if i == "B-Disease":
        ncbi_test_split[n] = "DISEASE"
    if i == "I-Disease":
        ncbi_test_split[n] = "DISEASE"

In [160]:
ncbi_test=list(zip(ncbi_test_split[0::4],ncbi_test_split[3::4]))

In [162]:
ncbi_words = ncbi_test_split[0::4]
ncbi_tags = ncbi_test_split[3::4]

In [163]:
ncbi_pred = entity_tagger.tag(ncbi_words)

In [164]:
ncbi_pred = [x[1] for x in ncbi_pred]

In [165]:
f1_score(ncbi_tags, ncbi_pred, average='micro',labels=['DISEASE'])

0.8613630474216117

In [166]:
f1_score(ncbi_tags, ncbi_pred, average='micro',labels=['O'])

0.9881435188263192

In [167]:
f1_score(ncbi_tags, ncbi_pred, average='micro')

0.9781552407006655