# Natural Language Processing

## Ruthu S Sanketh

The objective of this tutorial is to experiment with part-of-speech (POS) tagging, a standard sequence labeling task, using a Conditional Random Field (CRF).

In [1]:
#importing all the needed libraries
import pandas as pd       
import nltk
import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re

from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Load both corpus splits into a dict keyed by split name, then show them.
# NOTE(review): the train file is read with the default comma separator while
# the test file is read as tab-separated -- confirm the two files really differ
# in format.
data = {
    'train': pd.read_csv('/Users/ruthu/Desktop/hi-ud-train.conllu'),
    'test': pd.read_csv('/Users/ruthu/Desktop/hi-ud-test.conllu', sep='\t'),
}

print(data['train'], data['test'], sep='\n\n')

        ID    WORD POS_TAG
0      1.0    yaha     DET
1      2.0   eSiyA   PROPN
2      3.0      kI     ADP
3      4.0  sabase     ADV
4      5.0   badZI     ADJ
...    ...     ...     ...
8105   9.0   TaMdI     ADJ
8106  10.0      ho    VERB
8107  11.0    jAwI     AUX
8108  12.0      hE     AUX
8109  13.0       .   PUNCT

[8110 rows x 3 columns]

        ID      WORD    TAG
0      1.0  rAmAyaNa  PROPN
1      2.0      kAla  PROPN
2      3.0       meM    ADP
3      4.0  BagavAna   NOUN
4      5.0      rAma  PROPN
...    ...       ...    ...
1552  10.0     ISAna  PROPN
1553  11.0        kA    ADP
1554  12.0   maMxira   NOUN
1555  13.0        hE    AUX
1556  14.0         .  PUNCT

[1557 rows x 3 columns]


### Features Used
1.   The word
2.   The word in lowercase
3.   Prefixes and suffixes of the word of varying lengths
4.   If the word is a digit
5.   If the word is a punctuation mark
6.   If the word is at the beginning of the sentence (BOS) or the end of the sentence (EOS) or neither
7.   The length of the word, i.e. its number of characters (shorter words are expected to be more likely to belong to particular POS classes, e.g. prepositions and pronouns)
8.   A stemmed version of the lowercased word, obtained by stripping the trailing run of vowels and the letters g, y, n from the end of the word while keeping a stem of at least 2 characters
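9.   A subset of the features above (the word, its lowercase form, its length, 2- and 3-character prefixes and suffixes, and the digit and punctuation flags) for the previous word, the following word, and the words two places before and after; the stemmed form is additionally included for the previous word and for the word two places after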

In [3]:
#function to extract features
# Matches a trailing run of vowels / g / y / n that follows a stem of at least
# two characters; substituting group 1 keeps the stem and drops the run.
_STEM_SUFFIX_RE = re.compile(r'(.{2,}?)([aeiougyn]+$)')


def _stem(token):
    """Return the lowercased token with its trailing vowel/g/y/n run removed."""
    return _STEM_SUFFIX_RE.sub(r'\1', token.lower())


def _is_punctuation(token):
    """Return True when token is non-empty and made up only of ASCII punctuation.

    A plain ``token in string.punctuation`` is a substring test: it reports
    True for the empty string and False for tokens such as '...', so each
    character is checked individually instead.
    """
    return bool(token) and all(ch in string.punctuation for ch in token)


def _context_features(token, prefix, with_stem):
    """Build the features of a neighbouring token, with keys starting with prefix.

    with_stem controls whether the '<prefix>word.stemmed' feature is emitted.
    """
    feats = {
        prefix + 'word': token,
        prefix + 'len(word)': len(token),
        prefix + 'word.lower()': token.lower(),
        prefix + 'word[:3]': token[:3],
        prefix + 'word[:2]': token[:2],
        prefix + 'word[-3:]': token[-3:],
        prefix + 'word[-2:]': token[-2:],
        prefix + 'word.isdigit()': token.isdigit(),
        prefix + 'word.ispunctuation': _is_punctuation(token),
    }
    if with_stem:
        feats[prefix + 'word.stemmed'] = _stem(token)
    return feats


def word2features(sent, i):
    """Return the feature dict for the token at position i of a sentence.

    Parameters
    ----------
    sent : sequence of (word, tag) pairs; only the word (index 0) is read.
    i : position of the token inside sent.

    Returns
    -------
    dict mapping feature names to values. It holds the features of the token
    itself, a reduced feature set for the tokens at offsets -2, -1, +1 and +2
    when they exist, and the flags 'BOS' / 'EOS' for the first / last token.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word': word,
        'len(word)': len(word),
        'word[:4]': word[:4],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-4:]': word[-4:],
        'word.lower()': word.lower(),
        'word.stemmed': _stem(word),
        'word.ispunctuation': _is_punctuation(word),
        'word.isdigit()': word.isdigit(),
    }

    last = len(sent) - 1

    # NOTE(review): only the -1 and +2 neighbours carry a stemmed feature. The
    # asymmetry is kept as-is so previously reported scores stay comparable.
    if i > 0:
        features.update(_context_features(sent[i - 1][0], '-1:', with_stem=True))
    else:
        features['BOS'] = True

    if i > 1:
        features.update(_context_features(sent[i - 2][0], '-2:', with_stem=False))

    if i < last:
        features.update(_context_features(sent[i + 1][0], '+1:', with_stem=False))
    else:
        features['EOS'] = True

    if i < last - 1:
        features.update(_context_features(sent[i + 2][0], '+2:', with_stem=True))

    return features


def sent2features(sent):
    """Build one feature dict per token position of the sentence."""
    feature_rows = []
    for position, _ in enumerate(sent):
        feature_rows.append(word2features(sent, position))
    return feature_rows

def sent2labels(sent):
    """Collect the tag (second field) of every token in the sentence."""
    tags = []
    for token in sent:
        tags.append(token[1])
    return tags

def sent2tokens(sent):
    """Collect the surface form (first field) of every token in the sentence."""
    surface_forms = []
    for token in sent:
        surface_forms.append(token[0])
    return surface_forms

In [4]:
#formatting the data into sentences
def format_data(csv_data):
    """Group the rows of a token table into sentences.

    Parameters
    ----------
    csv_data : pandas.DataFrame whose first three columns are, in order, the
        token ID within its sentence, the word, and its tag.

    Returns
    -------
    list of sentences, each a list of [word, tag] pairs.

    Rows whose ID is NaN are skipped. A row with ID 1.0 starts a new sentence.
    Rows whose word is not a string (e.g. NaN) are dropped.
    """
    sents = []
    # One pass over plain tuples instead of per-cell .iloc lookups.
    for token_id, word, tag in csv_data.iloc[:, :3].itertuples(index=False, name=None):
        if math.isnan(token_id):
            continue
        # Also open a sentence when the data does not begin with ID 1.0, so the
        # first append below never hits an empty list.
        if token_id == 1.0 or not sents:
            sents.append([])
        # Filter while building: deleting from a list during enumerate() skips
        # the element that follows each deletion.
        if isinstance(word, str):
            sents[-1].append([word, tag])
    return sents

In [5]:
# Turn each split into sentences, then into per-token feature dicts (X) and
# per-token tag sequences (y).
train_sents = format_data(data['train'])
test_sents = format_data(data['test'])

Xtrain = list(map(sent2features, train_sents))
ytrain = list(map(sent2labels, train_sents))

Xtest = list(map(sent2features, test_sents))
ytest = list(map(sent2labels, test_sents))

In [7]:
%%time                                  
crf = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    c1 = 0.25,
    c2 = 0.3,
    max_iterations = 100,
    all_possible_transitions=True
)
crf.fit(Xtrain, ytrain)                  #training the model

Wall time: 3.5 s


CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.25, c2=0.3,
    keep_tempfiles=None, max_iterations=100)

In [9]:
#obtaining metrics such as accuracy, etc. on the train set
# Score every learned tag except 'X'. Filtering (rather than list.remove)
# avoids a ValueError when the training data contains no 'X' tag.
labels = [label for label in crf.classes_ if label != 'X']

ypred = crf.predict(Xtrain)
print('F1 score on the train set = {}\n'.format(metrics.flat_f1_score(ytrain, ypred, average='weighted', labels=labels)))
# Unlike the F1 score above, accuracy is computed over all tags, including 'X'.
print('Accuracy on the train set = {}\n'.format(metrics.flat_accuracy_score(ytrain, ypred)))

# Order the report rows by the tag name without its first character, then by
# that first character.
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print('Train set classification report: \n\n{}'.format(metrics.flat_classification_report(
    ytrain, ypred, labels=sorted_labels, digits=3
)))

F1 score on the train set = 0.9989320264718571

Accuracy on the train set = 0.9989329064959317

Train set classification report: 

              precision    recall  f1-score   support

        PART      1.000     1.000     1.000       163
       CCONJ      1.000     1.000     1.000       150
       SCONJ      1.000     1.000     1.000        61
         ADJ      1.000     1.000     1.000       570
         ADP      1.000     1.000     1.000      1387
         ADV      1.000     0.991     0.995       111
        VERB      1.000     0.991     0.995       640
         DET      1.000     0.996     0.998       231
        NOUN      0.999     1.000     1.000      1597
        PRON      0.998     1.000     0.999       431
       PROPN      1.000     1.000     1.000       708
         NUM      1.000     1.000     1.000       152
       PUNCT      1.000     1.000     1.000       564
         AUX      0.992     1.000     0.996       730

   micro avg      0.999     0.999     0.999      7495
   

In [10]:
# Evaluate the fitted CRF on the held-out test sentences, using the label list
# prepared for the train-set evaluation.
ypred = crf.predict(Xtest)

test_f1 = metrics.flat_f1_score(ytest, ypred, average='weighted', labels=labels)
print('F1 score on the test set = {}\n'.format(test_f1))

test_accuracy = metrics.flat_accuracy_score(ytest, ypred)
print('Accuracy on the test set = {}\n'.format(test_accuracy))

# Report rows are ordered by the tag name without its first character, then by
# that first character.
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
test_report = metrics.flat_classification_report(
    ytest, ypred, labels=sorted_labels, digits=3
)
print('Test set classification report: \n\n{}'.format(test_report))

F1 score on the test set = 0.8674205655336537

Accuracy on the test set = 0.8683127572016461

Test set classification report: 

              precision    recall  f1-score   support

        PART      1.000     0.879     0.935        33
       CCONJ      1.000     1.000     1.000        25
       SCONJ      0.667     0.667     0.667         3
         ADJ      0.676     0.777     0.723        94
         ADP      0.967     0.955     0.961       309
         ADV      0.583     0.333     0.424        21
        VERB      0.935     0.869     0.901        99
         DET      0.795     0.861     0.827        36
        NOUN      0.785     0.863     0.822       329
        PRON      0.929     0.800     0.860        65
       PROPN      0.692     0.621     0.655       145
         NUM      1.000     0.880     0.936        25
       PUNCT      1.000     0.993     0.996       135
         AUX      0.965     0.978     0.971       139

    accuracy                          0.868      1458
   mac

In [11]:
#obtaining the most likely and the least likely transitions 
from collections import Counter

def print_transitions(transition_features):
    """Print one aligned 'from -> to weight' line per transition.

    transition_features is an iterable of ((label_from, label_to), weight)
    items.
    """
    row_format = "%-6s -> %-7s %0.6f"
    for label_pair, weight in transition_features:
        label_from, label_to = label_pair
        print(row_format % (label_from, label_to, weight))

# Rank all learned transitions once, from highest to lowest weight, and show
# both ends of the ranking.
ranked_transitions = Counter(crf.transition_features_).most_common()

print("Top 10 likely transitions - \n")
print_transitions(ranked_transitions[:10])

print("\nTop 10 unlikely transitions - \n")
print_transitions(ranked_transitions[-10:])

Top 10 likely transitions - 

VERB   -> AUX     3.496273
PROPN  -> PROPN   1.951488
AUX    -> AUX     1.816472
ADJ    -> NOUN    1.766149
AUX    -> SCONJ   1.578886
NUM    -> NOUN    1.525581
PART   -> NUM     1.421199
DET    -> NOUN    1.412161
VERB   -> SCONJ   1.334200
PRON   -> ADP     1.142136

Top 10 unlikely transitions - 

VERB   -> ADJ     -0.897578
DET    -> PROPN   -0.964973
PROPN  -> NOUN    -1.016041
PROPN  -> PART    -1.050191
PROPN  -> AUX     -1.053615
DET    -> ADP     -1.121233
PROPN  -> DET     -1.279584
ADJ    -> PRON    -1.281203
VERB   -> VERB    -1.401497
ADJ    -> ADP     -2.176380
