In [1]:
#@ Importing Necessary dependencies and libraries:
from nltk.tag import pos_tag
!pip install sklearn-crfsuite
from sklearn_crfsuite import CRF, metrics
from sklearn.metrics import make_scorer, confusion_matrix
from pprint import pprint
from sklearn.metrics import f1_score, classification_report
from sklearn.pipeline import Pipeline
import string
import warnings
warnings.filterwarnings('ignore')
import nltk
nltk.download('averaged_perceptron_tagger_eng')

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [2]:
#@ Data Loading:
def load_data_conll(file_path):
  """Read a two-column, tab-separated CoNLL-style file into sentences.

  Every line that contains a tab is split into ``token<TAB>tag``. Any line
  without a tab (typically a blank line) marks the end of the current
  sentence.

  Args:
    file_path: path of the text file to read.

  Returns:
    A list with one ``[tokens, tags]`` pair per sentence, where ``tokens``
    and ``tags`` are parallel lists of strings. Empty sentences (caused by
    consecutive separator lines) are not emitted.

  Raises:
    ValueError: if a line does not split into exactly two tab-separated
      fields.
  """
  sentences = []
  tokens, tags = [], []
  with open(file_path, 'r') as fh:
    for line in fh:
      line = line.strip()  # remove leading and trailing whitespace characters
      if '\t' not in line:
        # Sentence boundary: only flush when something was collected, so
        # repeated separator lines do not produce empty sentences.
        if tokens:
          sentences.append([tokens, tags])
          tokens, tags = [], []
      else:
        token, tag = line.split('\t')
        tokens.append(token)
        tags.append(tag)
  # The file may not end with a separator line; keep the final sentence.
  if tokens:
    sentences.append([tokens, tags])
  return sentences

### Extracting features for every token in a sentence.

##### Features:
- **Token Context:** a window of 2 tokens on either side of current token, and current token.

- **POS Context:** the POS tags of the 2 tokens on either side of the current token, plus the current token's own POS tag.


In [9]:
def sentence2features(sentence, tagger=None):
  """Build one feature dict per token from a +/-2 token and POS-tag window.

  Each feature dict holds the current token and its POS tag together with
  the two preceding and two following tokens/tags. Positions that fall
  before the sentence start are filled with ``'<S>'`` and positions past
  the sentence end with ``'</S>'``.

  Args:
    sentence: list of token strings.
    tagger: optional callable taking the token list and returning a sequence
      of ``(token, tag)`` pairs. Defaults to NLTK's ``pos_tag``.

  Returns:
    A list, parallel to ``sentence``, of dicts with the keys ``token``,
    ``prevtoken``, ``prevsecondtoken``, ``nexttoken``, ``nextsecondtoken``,
    ``tag``, ``prevtag``, ``prevsecondtag``, ``nexttag``, ``nextsecondtag``.
  """
  n_tokens = len(sentence)
  if n_tokens == 0:
    return []
  if tagger is None:
    tagger = pos_tag
  # Keep only the tag (second element) of every (token, tag) pair.
  pos_tags = [pair[1] for pair in tagger(sentence)]

  def at(items, idx):
    """Return items[idx], or a boundary marker when idx is outside the sentence."""
    if idx < 0:
      return '<S>'
    if idx >= n_tokens:
      return '</S>'
    return items[idx]

  features = []
  for i in range(n_tokens):
    features.append({
      #@ token features:
      'token': sentence[i],
      'prevtoken': at(sentence, i - 1),
      'prevsecondtoken': at(sentence, i - 2),
      'nexttoken': at(sentence, i + 1),
      'nextsecondtoken': at(sentence, i + 2),
      #@ POS features:
      'tag': pos_tags[i],
      'prevtag': at(pos_tags, i - 1),
      'prevsecondtag': at(pos_tags, i - 2),
      'nexttag': at(pos_tags, i + 1),
      'nextsecondtag': at(pos_tags, i + 2),
    })
  return features

In [10]:
#@ Extracting features:
def get_features_conll(conll_data):
  """Turn loaded sentences into parallel feature and label sequences.

  Args:
    conll_data: iterable of items whose element 0 is the token list and
      element 1 is the tag list of one sentence.

  Returns:
    A ``(features, labels)`` tuple of two lists: ``features[k]`` is the
    output of ``sentence2features`` for sentence ``k`` and ``labels[k]`` is
    that sentence's tag list.
  """
  # Single pass over the input so one-shot iterables are handled as well.
  paired = [(sentence2features(item[0]), item[1]) for item in conll_data]
  features = [feature_seq for feature_seq, _ in paired]
  labels = [label_seq for _, label_seq in paired]
  return features, labels

### Training Model

In [21]:
def train_seq(X_train, Y_train, X_val, Y_val):
  """Fit a CRF on the training sequences and print validation metrics.

  Prints, in order: the weighted flat F1 score, the flat classification
  report, and the confusion matrix for the validation data.

  Args:
    X_train: training feature sequences passed to ``CRF.fit``.
    Y_train: training label sequences passed to ``CRF.fit``.
    X_val: validation feature sequences passed to ``CRF.predict``.
    Y_val: gold label sequences the predictions are scored against.

  Returns:
    None. When the fitted model exposes no labels, a warning is printed and
    evaluation is skipped.
  """
  model = CRF(algorithm='lbfgs', c1=0.1, c2=10, max_iterations=50)
  model.fit(X_train, Y_train)
  known_labels = list(model.classes_)

  # Nothing to evaluate if the model learned no labels.
  if len(known_labels) == 0:
    print("Warning: No labels found in the trained model. Check training data and model parameters.")
    return

  #testing:
  predictions = model.predict(X_val)

  def label_order(name):
    # Sort on everything after the first character, then on the first character.
    return (name[1:], name[0])

  ordered_labels = sorted(known_labels, key=label_order)

  weighted_f1 = metrics.flat_f1_score(Y_val, predictions, average='weighted', labels=known_labels)
  print(weighted_f1)
  report = metrics.flat_classification_report(Y_val, predictions, labels=ordered_labels, digits=3)
  print(report)
  get_confusion_matrix(Y_val, predictions, labels=ordered_labels)

### Confusion matrix helper functions:
- Note: the two functions below were copied from an external source.

In [22]:
def print_cm(cm, labels):
    """Pretty-print a confusion matrix followed by a per-row total.

    Args:
        cm: matrix indexed as ``cm[i, j]`` with one row and one column per
            entry of ``labels``.
        labels: label strings, used as both column headers and row names.

    The output starts with two newlines, then a header row of labels, then
    one row per label holding the cell values (formatted with no decimals)
    and the sum of those formatted values.
    """
    print("\n")
    width = max([5] + [len(name) for name in labels])  # 5 is value length
    label_fmt = "%{0}s".format(width)
    value_fmt = "%{0}.0f".format(width)
    indent = "    "

    # Header: blank corner cell followed by the right-aligned labels.
    header = indent + " " * width + " " + "".join(label_fmt % name + " " for name in labels)
    print(header)

    # Rows: label, formatted cells, then the total of the formatted cells.
    n_labels = len(labels)
    for row, row_label in enumerate(labels):
        cells = [value_fmt % cm[row, col] for col in range(n_labels)]
        row_total = sum(int(cell) for cell in cells)
        line = indent + label_fmt % row_label + " " + "".join(cell + " " for cell in cells)
        print(line + str(row_total))  # total number of instances per category

In [23]:
#python-crfsuite does not have a confusion matrix function,
#so writing it using sklearn's confusion matrix and print_cm from github
def get_confusion_matrix(y_true,y_pred,labels):
    """Flatten per-sentence label sequences and print their confusion matrix.

    Args:
        y_true: iterable of gold label sequences, one per sentence.
        y_pred: iterable of predicted label sequences, paired with ``y_true``
            by position.
        labels: label order used for the matrix rows/columns and for the
            printed headers.
    """
    trues, preds = [], []
    for yseq_true, yseq_pred in zip(y_true, y_pred):
        trues.extend(yseq_true)
        preds.extend(yseq_pred)
    # `labels` must be passed by keyword: scikit-learn's confusion_matrix
    # accepts only y_true and y_pred positionally.
    print_cm(confusion_matrix(trues, preds, labels=labels), labels)

In [26]:
#@ Calling all our functions inside main method:

def main():
  """Load the CoNLL train/test files, extract features, and train the CRF.

  On Google Colab the files are obtained through an upload prompt and read
  from the working directory; everywhere else (or if the upload fails) they
  are read from ``Data/conlldata``.
  """
  local_train_path = 'Data/conlldata/train.txt'
  local_test_path = 'Data/conlldata/test.txt'

  try:
    from google.colab import files
    files.upload()

    #files in Data/conlldata
    # NOTE(review): the recorded output shows Colab saving a re-uploaded
    # file under a new name (e.g. "train (5).txt"), so these fixed names
    # may point at an older upload - confirm.
    train_path = 'train.txt'
    test_path = 'test.txt'
  except ImportError:
    # Not running on Colab: use the repository's data directory.
    train_path = local_train_path
    test_path = local_test_path
  except Exception as exc:
    # Keep the best-effort fallback, but say why it happened. Unlike a bare
    # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
    print('Colab upload failed (%r); falling back to local data files.' % (exc,))
    train_path = local_train_path
    test_path = local_test_path

  conll_train = load_data_conll(train_path)
  conll_val = load_data_conll(test_path)

  print('Training a sequence classification model with CRF:')
  features, labels = get_features_conll(conll_train)
  valfeatures, vallabels = get_features_conll(conll_val)
  train_seq(features, labels, valfeatures, vallabels)
  print('Done with sequence model')

if __name__=='__main__':
  main()

Saving test.txt to test (4).txt
Saving train.txt to train (5).txt
Training a sequence classification model with CRF:
Done with sequence model
