In [164]:
import pandas as pd
import numpy as np

path = '/Users/maddie/Downloads/ner.csv'
training_data = pd.read_csv(path, encoding="latin1")

In [165]:
training_data = training_data.fillna(method="ffill")

In [166]:
# lessen the amount of data so the computer can handle it; caused kernel to repeatedly crash otherwise

training_data = training_data.head(5000)
words = list(set(training_data["Sentence"].values))
n_words = len(words); n_words
pos = list(set(training_data["POS"].values))

In [167]:
# create a list of tags used in the training data

labels = list(set(training_data["Tag"].values))

In [168]:
import collections
label_counts = collections.Counter(list(training_data["Tag"].values))

In [6]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Sentence"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [7]:
getter = SentenceGetter(training_data)

In [8]:
sent = getter.get_next()
print(sent)

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', "['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']", "['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']")]


In [9]:
# ensure that the sentences were split properly
sentences = getter.sentences
print(sentences[0])

[('Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .', "['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'IN', 'NNP', 'CC', 'VB', 'DT', 'NN', 'IN', 'JJ', 'NNS', 'IN', 'DT', 'NN', '.']", "['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']")]


In [138]:
def word2features(sent, i):
    word = sent[i][0]
    print(word)
    postag = sent[i][1]
    
    # data structure consisting of a feature name and value for the token
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(), # lower case variant of the token
        'word[-3:]': word[-3:], #suffix of 3 characters
        'word[-2:]': word[-2:], #suffix of 2 characters
        'word.isupper()': word.isupper(), # initial captial
        'word.istitle()': word.istitle(), # all words ini caps
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2], #first two characters of the PoS Tag
    }
    if i > 0:
        # adding features for the word based on the previous word
        word1 = sent[i-1][0] # previous word
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True # Beginning of sentence as a feature

    if i < len(sent)-1:
        # adding features for the word based on the next word
        word1 = sent[i+1][0] # next word
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True # end of sentence as a feature

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [139]:
# define X and y for the training data
# once again limit the data set so as to prevent crashes

X = [sent2features(s) for s in sentences[:1000]]
y = [sent2labels(s) for s in sentences[:1000]]

It
took
eight
years
for
Warner
Brothers
to
recover
from
the
disaster
that
was
this
movie
.
All
the
New
York
University
students
love
this
diner
in
Soho
so
it
makes
for
a
fun
young
atmosphere
.
This
Italian
place
is
really
trendy
but
they
have
forgotten
about
the
most
important
part
of
a
restaurant
,
the
food
.
In
conclusion
,
my
review
of
this
book
would
be
:
I
like
Jane
Austen
and
understand
why
she
is
famous
.
The
story
of
this
movie
is
focused
on
Carl
Brashear
played
by
Cuba
Gooding
Jr.
who
wants
to
be
the
first
African
American
deep
sea
diver
in
the
Navy
.
Chris
O'Donnell
stated
that
while
filming
for
this
movie
,
he
felt
like
he
was
in
a
toy
commercial
.
My
husband
and
I
moved
to
Amsterdam
6
years
ago
and
for
as
long
as
we
have
lived
here
,
Blauwbrug
has
been
our
favorite
place
to
eat
!
Dame
Maggie
Smith
performed
her
role
excellently
,
as
she
does
in
all
her
movies
.
The
new
movie
by
Mr.
Kruno
was
shot
in
New
York
,
but
the
story
takes
place
in
Los
Angeles
.
I
always
have
loved
E

In [140]:
print(X[0])

[{'bias': 1.0, 'word.lower()': 'it', 'word[-3:]': 'It', 'word[-2:]': 'It', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', 'BOS': True, '+1:word.lower()': 'took', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'O', '+1:postag[:2]': 'O'}, {'bias': 1.0, 'word.lower()': 'took', 'word[-3:]': 'ook', 'word[-2:]': 'ok', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', '-1:word.lower()': 'it', '-1:word.istitle()': True, '-1:word.isupper()': False, '-1:postag': 'O', '-1:postag[:2]': 'O', '+1:word.lower()': 'eight', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'O', '+1:postag[:2]': 'O'}, {'bias': 1.0, 'word.lower()': 'eight', 'word[-3:]': 'ght', 'word[-2:]': 'ht', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', '-1:word.lower()': 'took', '-1:word.istitle()': False, '-1:wor

In [141]:
import sklearn_crfsuite

from sklearn_crfsuite import CRF

# initialize the model

crf = CRF(algorithm='lbfgs',
          c1=0.1, .
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)                         

In [142]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report
import sklearn_crfsuite

In [163]:
# train the model on the training data

try:
    crf.fit(X, y)
except AttributeError:
    pass

In [144]:
# upload the test data

test_path = '/Users/maddie/Downloads/NER-final-test.tsv'
test_data = pd.read_table(test_path, on_bad_lines='skip')
test_data.head()

Unnamed: 0,sentence id,token id,token,BIO NER tag
0,0,0,It,O
1,0,1,took,O
2,0,2,eight,O
3,0,3,years,O
4,0,4,for,O


In [145]:
# observe the words and tags of the test data set

words = list(set(test_data["token"].values))
print(words)
n_words = len(test_data); n_words
print(n_words)
pos = list(set(test_data["BIO NER tag"].values))
print(pos)

['deep', 'has', 'diver', 'Gooding', 'Chris', 'as', 'was', 'The', 'new', 'Smith', 'conclusion', 'in', 'he', 'husband', 'wants', 'be', 'commercial', 'all', 'review', 'All', 'been', 'she', 'my', 'lived', 'movie', 'famous', 'York', 'young', 'first', 'Los', 'atmosphere', 'they', 'I', 'played', 'one', 'Cuba', 'My', 'her', 'sea', 'Mr.', 'place', 'Kruno', 'Blauwbrug', 'part', 'In', 'moved', 'but', 'here', 'performed', 'recover', 'is', 'could', 'story', 'focused', 'that', 'excellently', 'forgotten', 'loved', 'into', 'It', 'diner', 'and', 'by', 'Navy', 'our', 'students', 'a', 'University', ':', 'we', 'this', 'English', 'Jr.', 'Carl', 'Angeles', 'movies', 'felt', 'novels', 'get', 'Amsterdam', 'restaurant', 'would', 'it', 'from', 'filming', 'eat', 'just', 'so', 'really', 'eight', 'Austen', 'African', "n't", 'Soho', 'Jane', 'Italian', 'important', '.', 'makes', 'disaster', 'who', 'Maggie', ',', 'on', 'understand', 'like', 'does', 'American', 'of', 'This', 'love', 'years', 'why', 'trendy', 'the', 'f

In [146]:
# redefine SentenceGetter to fit the indices of the test data

class SentenceGetterTest(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["token"].values.tolist(),
                                                     s["BIO NER tag"].values.tolist())]
        self.grouped = self.data.groupby("sentence id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None

In [147]:
test_getter = SentenceGetterTest(test_data)

In [148]:
sentences = test_getter.sentences

In [149]:
print(sentences[3])

[('In', 'O'), ('conclusion', 'O'), (',', 'O'), ('my', 'O'), ('review', 'O'), ('of', 'O'), ('this', 'O'), ('book', 'O'), ('would', 'O'), ('be', 'O'), (':', 'O'), ('I', 'O'), ('like', 'O'), ('Jane', 'B-PER'), ('Austen', 'I-PER'), ('and', 'O'), ('understand', 'O'), ('why', 'O'), ('she', 'O'), ('is', 'O'), ('famous', 'O'), ('.', 'O')]


In [150]:
# redefine word2features to match the format of test_data

def word2features(sent, i):
    word = sent[i][0]
    print(word)
    postag = sent[i][1]
    
    # data structure consisting of a feature name and value for the token
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(), # lower case variant of the token
        'word[-3:]': word[-3:], #suffix of 3 characters
        'word[-2:]': word[-2:], #suffix of 2 characters
        'word.isupper()': word.isupper(), # initial captial
        'word.istitle()': word.istitle(), # all words ini caps
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2], #first two characters of the PoS Tag
    }
    if i > 0:
        # adding features for the word based on the previous word
        word1 = sent[i-1][0] # previous word
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True # Beginning of sentence as a feature

    if i < len(sent)-1:
        # adding features for the word based on the next word
        word1 = sent[i+1][0] # next word
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True # end of sentence as a feature

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [158]:
# use the sentences of the test data to obtain X_test and y_test

X_test = [sent2features(s) for s in sentences]
y_test = [sent2labels(s) for s in sentences]

It
took
eight
years
for
Warner
Brothers
to
recover
from
the
disaster
that
was
this
movie
.
All
the
New
York
University
students
love
this
diner
in
Soho
so
it
makes
for
a
fun
young
atmosphere
.
This
Italian
place
is
really
trendy
but
they
have
forgotten
about
the
most
important
part
of
a
restaurant
,
the
food
.
In
conclusion
,
my
review
of
this
book
would
be
:
I
like
Jane
Austen
and
understand
why
she
is
famous
.
The
story
of
this
movie
is
focused
on
Carl
Brashear
played
by
Cuba
Gooding
Jr.
who
wants
to
be
the
first
African
American
deep
sea
diver
in
the
Navy
.
Chris
O'Donnell
stated
that
while
filming
for
this
movie
,
he
felt
like
he
was
in
a
toy
commercial
.
My
husband
and
I
moved
to
Amsterdam
6
years
ago
and
for
as
long
as
we
have
lived
here
,
Blauwbrug
has
been
our
favorite
place
to
eat
!
Dame
Maggie
Smith
performed
her
role
excellently
,
as
she
does
in
all
her
movies
.
The
new
movie
by
Mr.
Kruno
was
shot
in
New
York
,
but
the
story
takes
place
in
Los
Angeles
.
I
always
have
loved
E

In [159]:
# ensure that the X_test data looks correct
print(X_test[0])

[{'bias': 1.0, 'word.lower()': 'it', 'word[-3:]': 'It', 'word[-2:]': 'It', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', 'BOS': True, '+1:word.lower()': 'took', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'O', '+1:postag[:2]': 'O'}, {'bias': 1.0, 'word.lower()': 'took', 'word[-3:]': 'ook', 'word[-2:]': 'ok', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', '-1:word.lower()': 'it', '-1:word.istitle()': True, '-1:word.isupper()': False, '-1:postag': 'O', '-1:postag[:2]': 'O', '+1:word.lower()': 'eight', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'O', '+1:postag[:2]': 'O'}, {'bias': 1.0, 'word.lower()': 'eight', 'word[-3:]': 'ght', 'word[-2:]': 'ht', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'O', 'postag[:2]': 'O', '-1:word.lower()': 'took', '-1:word.istitle()': False, '-1:wor

In [160]:
# make a prediction based on the test data

predictions = crf.predict(X_test)

In [170]:
predictions

[['O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'B-ORG',
  'I-ORG',
  'I-ORG',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-LOC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'B-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'O',
  'O',
  'B-PER',
  'I-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-MISC',
  'I-MISC',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-ORG',
  'O'],
 ['B-PER',
  'I-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O']

In [171]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# calculate precision, recall, and F1 based on the predictions and the actual
print("precision-score: {:.1%}".format(precision_score(y_test, predictions)))
print("recall-score: {:.1%}".format(recall_score(y_test, predictions)))
print("F1-score: {:.1%}".format(f1_score(y_test, predictions)))

precision-score: 100.0%
recall-score: 100.0%
F1-score: 100.0%


In [172]:
# make a report based on the above

print(classification_report(y, predictions))

              precision    recall  f1-score   support

         LOC       1.00      1.00      1.00         4
        MISC       1.00      1.00      1.00         3
         ORG       1.00      1.00      1.00         4
         PER       1.00      1.00      1.00         6

   micro avg       1.00      1.00      1.00        17
   macro avg       1.00      1.00      1.00        17
weighted avg       1.00      1.00      1.00        17

