### Stanford NER Tagger

In [None]:
#install and unzip files
!wget 'https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip'
!unzip stanford-ner-4.2.0.zip

--2023-03-20 09:54:01--  https://nlp.stanford.edu/software/stanford-ner-4.2.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip [following]
--2023-03-20 09:54:01--  https://downloads.cs.stanford.edu/nlp/software/stanford-ner-4.2.0.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 180437064 (172M) [application/zip]
Saving to: ‘stanford-ner-4.2.0.zip’


2023-03-20 09:54:31 (5.75 MB/s) - ‘stanford-ner-4.2.0.zip’ saved [180437064/180437064]

Archive:  stanford-ner-4.2.0.zip
   creating: stanford-ner-2020-11-17/
   creating: stanford-ner-2020-11-17/lib/
  inflating: stanford-ner-

In [None]:
# Import NLTK and its tagger class for the Stanford NER jar unpacked above
import nltk
from nltk.tag.stanford import StanfordNERTagger

# Fetch the 'punkt' tokenizer data
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# English classifiers referenced here, by number of entity classes:
#   3 class: Location, Person, Organization
#   4 class: Location, Person, Organization, Misc
#   7 class: Location, Person, Organization, Money, Percent, Date, Time

# Folder created by the unzip step, relative to the working directory the
# archive was extracted into (so the cell is not tied to Colab's /content).
NER_HOME = 'stanford-ner-2020-11-17'

# Load the 3-class model
st = StanfordNERTagger(f'{NER_HOME}/classifiers/english.all.3class.distsim.crf.ser.gz',
                       f'{NER_HOME}/stanford-ner.jar',
                       encoding='utf-8')


In [None]:
# Example sentence to run through the pretrained tagger
sentence = (
    "I live in Paris and I like to visit "
    "the Versailles Palace of Louis 14"
)

In [None]:
# Split the sentence into word tokens.
# The 'punkt' data is already downloaded in the import cell that provides
# `nltk`, so no second nltk.download('punkt') call is needed here.
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sentence)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tag each token with the pretrained 3-class model.
# Try both capitalisations in the sentence (versailles / Versailles) to compare the tags.
st.tag(tokens)

[('I', 'O'),
 ('live', 'O'),
 ('in', 'O'),
 ('Paris', 'LOCATION'),
 ('and', 'O'),
 ('I', 'O'),
 ('like', 'O'),
 ('to', 'O'),
 ('visit', 'O'),
 ('the', 'O'),
 ('Versailles', 'ORGANIZATION'),
 ('Palace', 'ORGANIZATION'),
 ('of', 'ORGANIZATION'),
 ('Louis', 'ORGANIZATION'),
 ('14', 'O')]

### Train a custom NER model

In [None]:
import os

# Folder for the training data and the properties file. exist_ok keeps the
# cell re-runnable instead of raising FileExistsError on the second run.
os.makedirs("train", exist_ok=True)

In [None]:
fileName = "own_ner.txt"

# Training data as (token, label) pairs, written one "token label" per line.
# All tokens must be tagged; 'O' is used for tokens that are not an entity.
tagged_tokens = [
    ('I', 'O'),
    ('live', 'O'),
    ('in', 'O'),
    ('Paris', 'Location'),
    ('and', 'O'),
    ('I', 'O'),
    ('like', 'O'),
    ('to', 'O'),
    ('visit', 'O'),
    ('the', 'O'),
    ('Versailles', 'CITY'),
    ('Palace', 'CHATEAU'),
    ('of', 'O'),
    ('Louis', 'KING'),
    ('14', 'KING_NUMBER'),
]

# Open the file exactly once, inside `with`, so the handle is always closed
# (a separate bare open() before the `with` would leave a handle unclosed).
with open("train/"+fileName, "w") as f:
    for token, label in tagged_tokens:
        f.write(f'{token} {label}\n')

In [None]:
fileName = "params.txt"

# Properties passed to CRFClassifier via -prop: where to read the training
# data, where to save the model, the column layout of the training file,
# followed by the feature switches.
crf_properties = [
    'trainFile = train/own_ner.txt',
    'serializeTo = my-model.ser.gz',
    'map = word=0,answer=1',
    'useClassFeature=true',
    'useWord=true',
    'useNGrams=true',
    'noMidNGrams=true',
    'maxNGramLeng=6',
    'usePrev=true',
    'useNext=true',
    'useSequences=true',
    'usePrevSequences=true',
    'maxLeft=1',
    'useTypeSeqs=true',
    'useTypeSeqs2=true',
    'useTypeySequences=true',
    'wordShape=chris2useLC',
    'useDisjunctive=true',
]

# Open the file exactly once, inside `with`, so the handle is always closed
# (a separate bare open() before the `with` would leave a handle unclosed).
with open("train/"+fileName, 'w') as f:
    f.write('\n'.join(crf_properties) + '\n')

In [None]:
!java -cp "stanford-ner-2020-11-17/stanford-ner.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop train/params.txt
ner_tagger = StanfordNERTagger("my-model.ser.gz","stanford-ner-2020-11-17/stanford-ner.jar" )

Invoked on Mon Mar 20 10:51:36 UTC 2023 with arguments: -prop train/params.txt
useTypeSeqs2=true
noMidNGrams=true
trainFile=train/own_ner.txt
maxNGramLeng=6
maxLeft=1
serializeTo=my-model.ser.gz
wordShape=chris2useLC
useDisjunctive=true
useClassFeature=true
useNGrams=true
useNext=true
usePrev=true
useTypeySequences=true
usePrevSequences=true
useTypeSeqs=true
useSequences=true
map=word=0,answer=1
useWord=true
numFeatures = 341
Time to convert docs to feature indices: 0.1 seconds
Current memory used: 6m
numClasses: 6 [0=O,1=Location,2=CITY,3=CHATEAU,4=KING,5=KING_NUMBER]
numDocuments: 1
numDatums: 15
numFeatures: 341
Time to convert docs to data/labels: 0.0 seconds
Current memory used: 6m
Running gradient on 2 threads
numWeights: 4596
QNMinimizer called on double function of 4596 variables, using M = 25.
               An explanation of the output:
Iter           The number of iterations
evals          The number of function evaluations
SCALING        <D> Diagonal scaling was used; <I> S

In [None]:
# Same sentence as the one written to the training file, to check the custom labels
sentence2 = (
    "I live in Paris and I like to visit "
    "the Versailles Palace of Louis 14"
)

In [None]:
# Split the second sentence into word tokens.
# The 'punkt' data is already downloaded in the import cell that provides
# `nltk`, so no further nltk.download('punkt') call is needed here.
from nltk.tokenize import word_tokenize
tokens2 = word_tokenize(sentence2)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tag the tokens with the custom-trained model loaded from my-model.ser.gz
ner_tagger.tag(tokens2)

[('I', 'O'),
 ('live', 'O'),
 ('in', 'O'),
 ('Paris', 'Location'),
 ('and', 'O'),
 ('I', 'O'),
 ('like', 'O'),
 ('to', 'O'),
 ('visit', 'O'),
 ('the', 'O'),
 ('Versailles', 'CITY'),
 ('Palace', 'CHATEAU'),
 ('of', 'O'),
 ('Louis', 'KING'),
 ('14', 'KING_NUMBER')]