In [1]:
##space for importing libraries
import nltk
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Perceptron
import pickle

from collections import defaultdict
import random
import os


# DATA READ

In [2]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/ritesh.ratti/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [3]:
tagged_sentences = nltk.corpus.brown.tagged_sents()
print(tagged_sentences[2])


[('The', 'AT'), ('September-October', 'NP'), ('term', 'NN'), ('jury', 'NN'), ('had', 'HVD'), ('been', 'BEN'), ('charged', 'VBN'), ('by', 'IN'), ('Fulton', 'NP-TL'), ('Superior', 'JJ-TL'), ('Court', 'NN-TL'), ('Judge', 'NN-TL'), ('Durwood', 'NP'), ('Pye', 'NP'), ('to', 'TO'), ('investigate', 'VB'), ('reports', 'NNS'), ('of', 'IN'), ('possible', 'JJ'), ('``', '``'), ('irregularities', 'NNS'), ("''", "''"), ('in', 'IN'), ('the', 'AT'), ('hard-fought', 'JJ'), ('primary', 'NN'), ('which', 'WDT'), ('was', 'BEDZ'), ('won', 'VBN'), ('by', 'IN'), ('Mayor-nominate', 'NN-TL'), ('Ivan', 'NP'), ('Allen', 'NP'), ('Jr.', 'NP'), ('.', '.')]


In [4]:
print("Tagged sentences: ", len(tagged_sentences))
print("Tagged words:", len(nltk.corpus.brown.tagged_words()))

Tagged sentences:  57340
Tagged words: 1161192


# FEATURE EXTRACTION

In [5]:
def features(sentence, index):
    """ sentence: [w1, w2, ...], index: the index of the word """
    return {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
    }
 

In [6]:
def untag(tagged_sentence):
    return [w for w, t in tagged_sentence]



In [8]:
tagged_sentences[0]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [10]:
b= features(untag(tagged_sentences[0]),5)
b2= features(untag(tagged_sentences[2]),5)
print(b,b2)
print(len(b))

{'word': 'said', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 's', 'prefix-2': 'sa', 'prefix-3': 'sai', 'suffix-1': 'd', 'suffix-2': 'id', 'suffix-3': 'aid', 'prev_word': 'Jury', 'next_word': 'Friday', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False} {'word': 'been', 'is_first': False, 'is_last': False, 'is_capitalized': False, 'is_all_caps': False, 'is_all_lower': True, 'prefix-1': 'b', 'prefix-2': 'be', 'prefix-3': 'bee', 'suffix-1': 'n', 'suffix-2': 'en', 'suffix-3': 'een', 'prev_word': 'had', 'next_word': 'charged', 'has_hyphen': False, 'is_numeric': False, 'capitals_inside': False}
17


# VECTORIZATION

In [11]:
f1=DictVectorizer(sparse=False)
f1.fit_transform([b,b2])

array([[0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 1.,
        1., 0., 1., 0., 0., 1., 1., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0.,
        0., 1., 0., 1., 1., 0., 0., 1., 1., 0.]])

In [12]:
cutoff = int(.75 * len(tagged_sentences)) # Specifying the ratio for train and test
training_sentences = tagged_sentences[:cutoff]
test_sentences = tagged_sentences[cutoff:]
 
print(len(training_sentences))  
print(len(test_sentences))     
 
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        for index in range(len(tagged)):
            X.append(features(untag(tagged), index))
            y.append(tagged[index][1])
 
    return X, y
 
X, y = transform_to_dataset(training_sentences)


43005
14335


# MODEL TRAINING

In [13]:

 
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', Perceptron(n_jobs=-1))
])


clf.fit(X, y)   
 
print('Training completed')


KeyboardInterrupt: 

In [17]:
clf = pickle.load(open('perceptronpos.pickle', 'rb'))

# MODEL EVALUATION

In [18]:
 
X_test, y_test = transform_to_dataset(test_sentences)

print("Accuracy:", clf.score(X_test, y_test)) 

Accuracy: 0.9401518335375069


In [19]:
print(clf.predict(X_test[10]))
s = pickle.dump(clf, open('perceptronpos.pickle', 'wb'))
y_test[10]

['VBD']


'VBD'

In [20]:
 clf.score(X_test[1:100000], y_test[1:100000]) 
    

0.9393893938939389

# TEST ON SINGLE SENTENCE

In [21]:
print(untag(test_sentences[0]))

['When', 'the', 'Plymouth', 'neared', ',', 'it', 'veered', 'toward', 'him', 'and', 'seemed', 'about', 'to', 'run', 'him', 'down', '.']


In [22]:
print(test_sentences[0])

[('When', 'WRB'), ('the', 'AT'), ('Plymouth', 'NP'), ('neared', 'VBD'), (',', ','), ('it', 'PPS'), ('veered', 'VBD'), ('toward', 'IN'), ('him', 'PPO'), ('and', 'CC'), ('seemed', 'VBD'), ('about', 'RB'), ('to', 'TO'), ('run', 'VB'), ('him', 'PPO'), ('down', 'RP'), ('.', '.')]


In [23]:
feats=features(untag(test_sentences[0]),-2)
clf.predict(feats)

array(['RP'], dtype='<U11')

In [24]:
features(untag(test_sentences[0]),0)

{'word': 'When',
 'is_first': True,
 'is_last': False,
 'is_capitalized': True,
 'is_all_caps': False,
 'is_all_lower': False,
 'prefix-1': 'W',
 'prefix-2': 'Wh',
 'prefix-3': 'Whe',
 'suffix-1': 'n',
 'suffix-2': 'en',
 'suffix-3': 'hen',
 'prev_word': '',
 'next_word': 'the',
 'has_hyphen': False,
 'is_numeric': False,
 'capitals_inside': False}

In [27]:
clf.predict(features(untag(test_sentences[0]),10))

array(['VBD'], dtype='<U11')