**Assignment-4 NLP : POS TAGGING USING CRFSuite**

**Name     :  Kanishk Singh**

**Roll No  :  17CS30018**

In [1]:
#installing sklearn-crfsuite
!pip install sklearn-crfsuite



In [2]:
from collections import Counter
import sklearn_crfsuite
from sklearn_crfsuite import metrics,CRF

In [3]:
## Function to preprocess the TRAINING AND TEST DATA

def fileread(filename, delimiter):
    """Read a delimited token/tag file and group its rows into sentences.

    The first line of the file is treated as a header and skipped. Every
    other line is split on ``delimiter``: the second field (with surrounding
    double quotes removed) is the word and the third field is its tag.
    A line made of exactly two delimiters marks the end of a sentence.

    Args:
        filename: Path of the file to read.
        delimiter: Field separator used in the file (for example ',' or a tab).

    Returns:
        A list of sentences, each a non-empty list of (word, tag) tuples.

    Raises:
        IndexError: If a non-separator line has fewer than three fields.
    """
    separator = delimiter + delimiter
    sentences = []
    sentence = []
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which would make the parsed words machine-dependent.
    with open(filename, "r", encoding="utf-8") as f:
        next(f, None)  # skip the header line (no-op for an empty file)
        # Iterate lazily rather than loading every line with readlines().
        for line in f:
            # Compare without the newline so a separator on the very last
            # line is still recognised when the file has no trailing newline.
            if line.rstrip('\n') == separator:
                # Ignore separators that do not close any tokens, so no
                # empty sentence ends up in the result.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            fields = line.strip().split(delimiter)
            sentence.append((fields[1].strip('\"'), fields[2]))
    # Flush the final sentence only if it holds tokens: a file that ends with
    # a separator must not contribute a trailing empty sentence.
    if sentence:
        sentences.append(sentence)
    return sentences

**WordToFeatures(sentence, index)**

In [4]:
def WordToFeatures(sentence, index):
    """Build the feature dictionary for the token at position ``index``.

    ``sentence`` is a sequence of (word, tag) pairs; only the word part is
    read. The features cover the token itself (surface form, lower-cased
    form, casing and digit tests, 3-character prefix/suffix, hyphen test) and
    the lower-cased form and casing of its immediate neighbours. At the
    sentence boundaries a 'BOS' / 'EOS' flag replaces the missing neighbour.
    """
    token = sentence[index][0]
    # Prefix/suffix are only emitted for tokens of at least three characters.
    has_affixes = len(token) >= 3

    features = {}
    features['Word'] = token
    features['Word.Lower()'] = token.lower()
    features['Word.isTitle()'] = token.istitle()
    features['Word.isUpper()'] = token.isupper()
    features['Word.isDigit()'] = token.isdigit()
    features['Prefix-3'] = token[:3] if has_affixes else ''
    features['Suffix-3'] = token[-3:] if has_affixes else ''
    features['has_Hyphen'] = '-' in token

    # Left context: flag the sentence start, otherwise describe the previous word.
    if index <= 0:
        features['BOS'] = True
    else:
        left = sentence[index - 1][0]
        features['-1:Word.Lower()'] = left.lower()
        features['-1:Word.isTitle()'] = left.istitle()
        features['-1:Word.isUpper()'] = left.isupper()

    # Right context: flag the sentence end, otherwise describe the next word.
    last_position = len(sentence) - 1
    if index >= last_position:
        features['EOS'] = True
    else:
        right = sentence[index + 1][0]
        features['+1:Word.Lower()'] = right.lower()
        features['+1:Word.isTitle()'] = right.istitle()
        features['+1:Word.isUpper()'] = right.isupper()

    return features

In [5]:
def Sent_To_Features(sentence):
    """Return the list of feature dictionaries, one per token of ``sentence``."""
    feature_rows = []
    for position in range(len(sentence)):
        feature_rows.append(WordToFeatures(sentence, position))
    return feature_rows

def Sent_To_Labels(sentence):
    """Return the tag (second element) of every (word, tag) pair in ``sentence``."""
    labels = []
    for token in sentence:
        labels.append(token[1])
    return labels

In [6]:
# List the working directory; the dataset files read in the next cell appear in this listing.
ls

[0m[01;34mAssignment_4_17CS30018[0m/       hi-ud-test .conllu
Assignment_4_17CS30018.ipynb  hi-ud-train (1).conllu
Assignment_4_17CS30018.pdf    hi-ud-train.conllu
[01;31mAssignment_4_17CS30018.zip[0m    [01;35mModel_on_test.png[0m
Assignment4_NLP.pdf           [01;35mmodel_prediction_train.png[0m
Assignment4_YourRollNo.docx   [01;35mmost-and-least-common.png[0m
hi-ud-test  (1).conllu


In [7]:
# Load both corpora as lists of sentences of (word, tag) pairs.
# The training file is read as comma-delimited and the test file as
# tab-delimited; note the test filename contains a space before ".conllu".
Train_Set   = fileread('hi-ud-train.conllu', ',')
Test_Set    = fileread('hi-ud-test .conllu', '\t')

In [8]:
# Turn every sentence into its sequence of feature dicts (model input) and
# its sequence of tags (model target), for the training and test corpora.
x_train = list(map(Sent_To_Features, Train_Set))
y_train = list(map(Sent_To_Labels, Train_Set))

x_test = list(map(Sent_To_Features, Test_Set))
y_test = list(map(Sent_To_Labels, Test_Set))

**MODEL ON TRAINING_DATA**

In [9]:
# CRF tagger used for every evaluation cell below.
# Per the sklearn-crfsuite documentation, c1 and c2 are the L1 and L2
# regularisation coefficients of the 'lbfgs' trainer, and
# all_possible_transitions also generates transition features for label
# pairs that never occur in the training data.
Model = CRF(    
    algorithm      = 'lbfgs',
    c1             = 0.1,
    c2             = 0.1,
    max_iterations = 300,
    all_possible_transitions = True
)
# fit() is the last expression of the cell, so its return value is echoed as the cell output.
Model.fit(x_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=300,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

**RUNNING THE MODEL ON THE TRAINING_DATA**

In [10]:
# Score the model on the very sentences it was fitted on.
print("MODEL PREDICTION ON TRAINING DATA".center(50))
print("-"*50)

y_train_predicted = Model.predict(x_train)

# Per-tag precision / recall / F1 table.
print(metrics.flat_classification_report(y_train, y_train_predicted))

# Weighted-average summary scores, followed by plain token accuracy.
for caption, scorer in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(caption, scorer(y_train, y_train_predicted, average = 'weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(y_train, y_train_predicted))

        MODEL PREDICTION ON TRAINING DATA         
--------------------------------------------------
              precision    recall  f1-score   support

         ADJ       1.00      1.00      1.00       570
         ADP       1.00      1.00      1.00      1387
         ADV       0.97      0.98      0.98       111
         AUX       0.99      1.00      0.99       730
       CCONJ       0.99      1.00      1.00       150
       COMMA       1.00      1.00      1.00       114
         DET       1.00      0.99      0.99       231
        NOUN       1.00      1.00      1.00      1597
         NUM       1.00      1.00      1.00       152
        PART       1.00      1.00      1.00       163
        PRON       1.00      1.00      1.00       431
       PROPN       1.00      1.00      1.00       708
       PUNCT       1.00      1.00      1.00       564
       SCONJ       0.98      1.00      0.99        61
        VERB       1.00      0.98      0.99       640
           X       1.00      1.00

**Running the Model on the Testing Data**

In [11]:
# Score the model on the held-out test sentences.
print("MODEL PREDICTION ON TEST DATA".center(50))
print("-"*50)

y_test_predicted = Model.predict(x_test)

# Per-tag precision / recall / F1 table.
print(metrics.flat_classification_report(y_test, y_test_predicted))

# Weighted-average summary scores, followed by plain token accuracy.
for caption, scorer in (
    ('precision: ', metrics.flat_precision_score),
    ('recall:    ', metrics.flat_recall_score),
    ('f1-score:  ', metrics.flat_f1_score),
):
    print(caption, scorer(y_test, y_test_predicted, average = 'weighted'))
print('accuracy:  ', metrics.flat_accuracy_score(y_test, y_test_predicted))

          MODEL PREDICTION ON TEST DATA           
--------------------------------------------------
              precision    recall  f1-score   support

         ADJ       0.67      0.79      0.73        94
         ADP       0.95      0.98      0.96       309
         ADV       0.71      0.48      0.57        21
         AUX       0.94      0.95      0.95       139
       CCONJ       1.00      1.00      1.00        25
         DET       0.86      0.89      0.88        36
        NOUN       0.77      0.90      0.83       329
         NUM       1.00      0.92      0.96        25
        PART       1.00      0.97      0.98        33
        PRON       0.87      0.85      0.86        65
       PROPN       0.65      0.44      0.53       145
       PUNCT       1.00      0.84      0.92       135
       SCONJ       0.50      0.67      0.57         3
        VERB       0.86      0.82      0.84        99

    accuracy                           0.85      1458
   macro avg       0.84      0.8

In [12]:
def printTransitions(transitions):
    """Print one aligned line per (edge, weight) pair.

    Each edge is indexed as (source label, target label). The labels are
    left-justified to 6 and 7 characters and the weight is shown with five
    decimal places.
    """
    row_template = "{:<6} =>  {:<7} {:.5f}"
    for edge, weight in transitions:
        source, target = edge[0], edge[1]
        print(row_template.format(str(source), str(target), weight))

In [13]:
def _fit_transition_crf(x_data, y_data):
    """Fit and return a fresh CRF on (x_data, y_data) with the notebook's hyperparameters."""
    crf = CRF(
        algorithm      = 'lbfgs',
        c1             = 0.1,
        c2             = 0.1,
        max_iterations = 300,
        all_possible_transitions = True
    )
    crf.fit(x_data, y_data)
    return crf


def _print_transition_report(title, data_set, transitions):
    """Print the report title, the data-set banner and one row per transition."""
    print(title)
    print("-"*21,data_set,"-"*21)
    printTransitions(transitions)
    print("\n")


def print_10_most_common(x_data,y_data,data_set,model=None):
    """Print the 10 transition features with the highest learned weight.

    Args:
        x_data: Feature sequences used to fit the CRF when ``model`` is None.
        y_data: Label sequences used to fit the CRF when ``model`` is None.
        data_set: Name shown in the banner line of the report.
        model: Optional already-fitted CRF exposing ``transition_features_``.
            When given, it is used as-is and no new model is trained; when
            omitted, a fresh CRF is fitted on (x_data, y_data) as before.
    """
    if model is None:
        model = _fit_transition_crf(x_data, y_data)
    ranked = Counter(model.transition_features_)
    _print_transition_report(
        "Top 10 Most Common POS Transition Features:",
        data_set,
        ranked.most_common(10),
    )

def print_10_least_common(x_data,y_data,data_set,model=None):
    """Print the 10 transition features with the lowest learned weight.

    Rows are listed in descending weight order, so the lowest weight comes last.

    Args:
        x_data: Feature sequences used to fit the CRF when ``model`` is None.
        y_data: Label sequences used to fit the CRF when ``model`` is None.
        data_set: Name shown in the banner line of the report.
        model: Optional already-fitted CRF exposing ``transition_features_``.
            When given, it is used as-is and no new model is trained; when
            omitted, a fresh CRF is fitted on (x_data, y_data) as before.
    """
    if model is None:
        model = _fit_transition_crf(x_data, y_data)
    ranked = Counter(model.transition_features_)
    _print_transition_report(
        "Top 10 Least Common POS Transition Features:",
        data_set,
        ranked.most_common()[-10:],
    )
    

**Printing the 10-Most Common and Least-Common Transition Features in TRAINING-SET**

In [14]:
# Transition report for the training set.
# Each of the two calls fits its own CRF on (x_train, y_train) before printing.
print_10_most_common(x_train,y_train,"Training-set")
print_10_least_common(x_train,y_train,"Training-set")

Top 10 Most Common POS Transition Features:
--------------------- Training-set ---------------------
ADJ    =>  NOUN    3.99639
PROPN  =>  PROPN   3.91982
VERB   =>  AUX     3.88266
NOUN   =>  VERB    2.71304
NOUN   =>  ADP     2.63396
DET    =>  NOUN    2.54572
NUM    =>  NOUN    2.53846
ADJ    =>  VERB    2.33121
PROPN  =>  ADP     2.28136
NOUN   =>  NOUN    2.17695


Top 10 Least Common POS Transition Features:
--------------------- Training-set ---------------------
COMMA  =>  ADP     -1.34458
ADJ    =>  PRON    -1.41543
DET    =>  CCONJ   -1.47929
ADP    =>  AUX     -1.49491
ADP    =>  CCONJ   -1.62925
ADP    =>  COMMA   -1.68510
ADJ    =>  ADP     -1.80175
AUX    =>  ADP     -1.80568
CCONJ  =>  AUX     -1.92248
DET    =>  ADP     -2.49674




**Printing the 10-Most Common and Least-Common Transition Features in TEST-SET**

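**OPTIONAL: the transition weights below come from a separate CRF fitted on the test set, not from the model trained on the training set.**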

In [15]:
# Transition report for the test set.
# Each call fits a new CRF on (x_test, y_test), so these weights describe the
# test data itself rather than the model trained on the training set.
print_10_most_common(x_test,y_test, "Test-set")
print_10_least_common(x_test,y_test,"Test-set")

Top 10 Most Common POS Transition Features:
--------------------- Test-set ---------------------
VERB   =>  AUX     3.04655
PROPN  =>  PROPN   2.89451
ADJ    =>  NOUN    2.81582
NOUN   =>  ADP     2.17165
PROPN  =>  ADP     2.12235
DET    =>  NOUN    1.94831
NOUN   =>  VERB    1.91568
AUX    =>  AUX     1.83766
NUM    =>  NOUN    1.72544
ADJ    =>  VERB    1.66722


Top 10 Least Common POS Transition Features:
--------------------- Test-set ---------------------
PROPN  =>  ADJ     -0.93976
AUX    =>  ADP     -1.01738
PUNCT  =>  PUNCT   -1.01969
VERB   =>  PROPN   -1.04201
PROPN  =>  AUX     -1.06561
ADP    =>  CCONJ   -1.07404
DET    =>  PROPN   -1.14494
ADJ    =>  ADP     -1.23945
AUX    =>  VERB    -1.24320
ADP    =>  PUNCT   -1.55589


