# Importing basic libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

In [2]:
#Importing Train file here

filename = 'train.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

In [3]:
text[:500]

"\n\n\tThe/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.\n\n\n\tThe/at jury/nn further/rbr said/vbd in/in term-end/nn presentments/nns that/cs the/at City/nn-tl Executive/jj-tl Committee/nn-tl ,/, which/wdt had/hvd over-all/jj charge/nn of/in the/at election/nn ,/, ``/`` deserves/vbz the/at praise/nn and/c"

# Cleaning File

In [4]:
text=re.sub("\s+",' ',text)
text=re.sub("``/``",'',text)
text=re.sub("''/''",'',text)
text=re.sub("/",' ',text)
text=re.sub("''/''",'',text)
text=re.sub(" , , ",'',text)
text=re.sub(" . . ",'',text)
text=re.sub(" -- -- ",'',text)
text[:1000]

" The at Fulton np-tl County nn-tl Grand jj-tl Jury nn-tl said vbd Friday nr an at investigation nn of in Atlanta's np$ recent jj primary nn election nn produced vbd  no at evidence nn  that cs any dti irregularities nns took vbd place nnThe at jury nn further rbr said vbd in in term-end nn presentments nns that cs the at City nn-tl Executive jj-tl Committee nn-tlwhich wdt had hvd over-all jj charge nn of in the at election nn deserves vbz the at praise nn and cc thanks nns of in the at City nn-tl of in-tl Atlanta np-tl  for in the at manner nn in in which wdt the at election nn was bedz conducted vbnThe at September-October np term nn jury nn had hvd been ben charged vbn by in Fulton np-tl Superior jj-tl Court nn-tl Judge nn-tl Durwood np Pye np to to investigate vb reports nns of in possible jj  irregularities nns  in in the at hard-fought jj primary nn which wdt was bedz won vbn by in Mayor-nominate nn-tl Ivan np Allen np Jr. np Only rb a at relative jj handful nn of in such jj repo

In [5]:
tokens = word_tokenize(text) # Tokenizing the file
print(tokens[:100])

['The', 'at', 'Fulton', 'np-tl', 'County', 'nn-tl', 'Grand', 'jj-tl', 'Jury', 'nn-tl', 'said', 'vbd', 'Friday', 'nr', 'an', 'at', 'investigation', 'nn', 'of', 'in', 'Atlanta', "'s", 'np', '$', 'recent', 'jj', 'primary', 'nn', 'election', 'nn', 'produced', 'vbd', 'no', 'at', 'evidence', 'nn', 'that', 'cs', 'any', 'dti', 'irregularities', 'nns', 'took', 'vbd', 'place', 'nnThe', 'at', 'jury', 'nn', 'further', 'rbr', 'said', 'vbd', 'in', 'in', 'term-end', 'nn', 'presentments', 'nns', 'that', 'cs', 'the', 'at', 'City', 'nn-tl', 'Executive', 'jj-tl', 'Committee', 'nn-tlwhich', 'wdt', 'had', 'hvd', 'over-all', 'jj', 'charge', 'nn', 'of', 'in', 'the', 'at', 'election', 'nn', 'deserves', 'vbz', 'the', 'at', 'praise', 'nn', 'and', 'cc', 'thanks', 'nns', 'of', 'in', 'the', 'at', 'City', 'nn-tl', 'of', 'in-tl']


In [6]:
pos_tags=nltk.pos_tag(tokens) # Generating Pos Tags using nltk library
pos_tags[:15]

[('The', 'DT'),
 ('at', 'IN'),
 ('Fulton', 'NNP'),
 ('np-tl', 'JJ'),
 ('County', 'NNP'),
 ('nn-tl', 'JJ'),
 ('Grand', 'NNP'),
 ('jj-tl', 'JJ'),
 ('Jury', 'NNP'),
 ('nn-tl', 'NN'),
 ('said', 'VBD'),
 ('vbd', 'JJ'),
 ('Friday', 'NNP'),
 ('nr', 'FW'),
 ('an', 'DT')]

In [7]:
def features(sentence, index):
    return {
    'word': sentence,
    'is_first': index == 0,
    'is_last': index == len(sentence) - 1,
    'is_capitalized': sentence[0].upper() == sentence[0],
    'is_all_caps': sentence.upper() == sentence,
    'is_all_lower': sentence.lower() == sentence,
    'prefix-1': sentence[0],
    'prefix-2': sentence[:2],
    'prefix-3': sentence[:3],
    'suffix-1': sentence[-1],
    'suffix-2': sentence[-2:],
    'suffix-3': sentence[-3:],
#     'prev_word': '' if index == 0 else sentence[index - 1],
#     'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
    'has_hyphen': '-' in sentence,
    'is_numeric': sentence.isdigit(),
    'capitals_inside': sentence[1:].lower() != sentence[1:]
    }

In [8]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
    for tagged in tagged_sentences:
        X.append(features(tagged[0], range(len(tagged_sentences))))
        y.append(tagged[1])
    return X, y

In [9]:
cutoff = int(.75 * len(pos_tags))
training_sentences = pos_tags[:cutoff]
test_sentences = pos_tags[cutoff:]

# Converting pos tags to features

In [10]:
X, y = transform_to_dataset(training_sentences)

# Intializing Decision Tree Classifier

In [11]:
clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', DecisionTreeClassifier(criterion='entropy'))
])

# Training the data

In [12]:
clf.fit(X[:20000],y[:20000]) # This can take huge time and system crash, proceed from smaller set of records

Pipeline(memory=None,
     steps=[('vectorizer', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('classifier', DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

# Accuracy

In [13]:
X_test, y_test = transform_to_dataset(test_sentences)
print ("Accuracy:{:.3%}".format(clf.score(X_test, y_test)))

Accuracy:77.203%


# Examples to test

In [14]:
sentence_1 = "This is basic POS Tagging assignment."
sentence_1_tokens = word_tokenize(sentence_1)
tags = clf.predict([features(tokens, range(len(sentence_1_tokens))) for tokens in sentence_1_tokens])

In [15]:
print("{:<10}{:>10}".format("Tokens","POS Tags"))
print()
for i in range(len(sentence_1_tokens)):
    print("{:<10}{:>10}".format(sentence_1_tokens[i],tags[i]))
#     print(tags[i])

Tokens      POS Tags

This              DT
is               VBZ
basic             JJ
POS              NNP
Tagging          NNP
assignment        NN
.                  .


In [16]:
sentence_2 = "For almost one-sixth of the national population discrimination in the free selection of residence casts a considerable shadow upon these values assumed as self-evident by most Americans ."
sentence_2_tokens = word_tokenize(sentence_2)
tags_2 = clf.predict([features(tokens, range(len(sentence_2_tokens))) for tokens in sentence_2_tokens])

In [17]:
print("{:<10}{:>10}".format("Tokens","POS Tags"))
print()
for i in range(len(sentence_2_tokens)):
    print("{:<10}{:>10}".format(sentence_2_tokens[i],tags_2[i]))
#     print(tags[i])

Tokens      POS Tags

For              NNP
almost            RB
one-sixth         RB
of                IN
the               DT
national          JJ
population        NN
discrimination        NN
in                IN
the               DT
free              JJ
selection         NN
of                IN
residence         NN
casts            NNS
a                 DT
considerable        NN
shadow            VB
upon              IN
these             DT
values           NNS
assumed          VBN
as                IN
self-evident        NN
by                IN
most             JJS
Americans       NNPS
.                  .
