In [3]:
import nltk
from nltk.stem import PorterStemmer #for stem of a word
import numpy as np
import pandas as pd
import random
from collections import defaultdict, Counter
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import time
import pycrfsuite #Using python-crfsuite for CRF model training

# 1. Dataset

In [2]:
# Quick check of accuracy_score on toy labels. The import cell must have been
# executed first; otherwise this raises NameError.
accuracy_score([1, 2, 3], [2, 3, 3])

NameError: name 'accuracy_score' is not defined

In [4]:
# Make sure the Brown corpus and the universal tagset mapping are available locally.
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Load every tagged sentence of the Brown corpus, with tags mapped to the universal tagset.
nltk_data = list(nltk.corpus.brown.tagged_sents(tagset='universal'))

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# 2. Feature extraction

### Features used:
##### Features based on the word itself  
- Stem
- Suffix
##### Contextual features  
- Previous word
- Next word
##### Word position  
- Is the first word (boolean value)
- Is the last word (boolean value)

Other potential features: 1. Whether the first letter is capitalized (useful for identifying nouns)

### Stem extractor function
Using nltk porter stemmer

In [9]:
# Porter stemmer instance; the feature extractor calls its .stem() method on each word.
stemmer = PorterStemmer()

In [11]:
# Example of the stem function: 'throwing' is reduced to the stem 'throw'.
stemmer.stem('throwing')

'throw'

### Feature extractor function
- This function will take the statement and the word index as the input. 
- The reason for giving the entire statement as the input is that we need to capture the positional and contextual features too

In [20]:
def features(statement, word_idx):
    """Build the named-feature dictionary for one word of a sentence.

    Args:
        statement: Sequence of word strings making up the sentence.
        word_idx: Index in ``statement`` of the word to describe.

    Returns:
        dict with the keys 'word_stem', 'word_suffix', 'prev_word',
        'next_word', 'is_first_word' and 'is_last_word'. The two position
        flags are the integers 0 or 1.
    """
    token = statement[word_idx]
    stem = stemmer.stem(token)

    at_start = int(word_idx == 0)
    at_end = int(word_idx == len(statement) - 1)

    # Sentence boundaries have no neighbour, so placeholder symbols are used.
    if at_start:
        previous_token = '^'
    else:
        previous_token = statement[word_idx - 1]

    # NOTE(review): '.' is also a plausible real token, so this placeholder may
    # be indistinguishable from an actual following full stop -- confirm intent.
    if at_end:
        following_token = '.'
    else:
        following_token = statement[word_idx + 1]

    return {
        'word_stem': stem,
        # Whatever remains of the word after the first len(stem) characters.
        'word_suffix': token[len(stem):],
        'prev_word': previous_token,
        'next_word': following_token,
        'is_first_word': at_start,
        'is_last_word': at_end,
    }

#### Testing the feature extractor function

In [42]:
# Smoke test: extract the features for 'jumped' (index 1) in a four-word sentence.
tokenized_sentence = ['I', 'jumped', 'the', 'signal']
features(tokenized_sentence, 1)

{'word_stem': 'jump',
 'word_suffix': 'ed',
 'prev_word': 'I',
 'next_word': 'the',
 'is_first_word': 0,
 'is_last_word': 0}

# 3. Data preparation

- List of feature dictionaries for each word of each sentence

In [61]:
# Drop the tags so that each sentence is a plain list of words.
sentences_brown = [[word for word, _tag in tagged] for tagged in nltk_data]

# One feature dictionary per word, grouped by sentence
# (same nesting as sentences_brown).
features_brown = [
    [features(words, position) for position in range(len(words))]
    for words in sentences_brown
]

- List of lists of tags, one tag for each word of each sentence

In [77]:
# Tag sequences, one list per sentence, in the same order as the words.
tags_brown = [
    [pos_tag for _word, pos_tag in tagged_sentence]
    for tagged_sentence in nltk_data
]

### Train-Test Split

In [80]:
# Hold out 20% of the sentences for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_brown,
    tags_brown,
    test_size=0.2,
    random_state=11,
)

# 4. Model training

In [85]:
# Create the CRF trainer and register one (feature sequence, tag sequence)
# pair per training sentence.
trainer = pycrfsuite.Trainer()
for sentence_features, sentence_tags in zip(X_train, y_train):
    trainer.append(sentence_features, sentence_tags)

# Optional training parameters (left disabled, so no parameters are overridden):
# trainer.set_params({
#     'c1': 1.0,  # L1 regularization coefficient
#     'c2': 1e-3,  # L2 regularization coefficient
#     'max_iterations': 100,  # Maximum number of training iterations
#     'feature.possible_transitions': True  # Include possible transitions
# })

# Fit the model and write it to the file 'crf_model.crfsuite'.
trainer.train('crf_model.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 260039
Seconds required: 4.079

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 2078849.714066
Feature norm: 1.000000
Error norm: 207606.710749
Active features: 260039
Line search trials: 1
Line search step: 0.000004
Seconds required for this iteration: 4.657

***** Iteration #2 *****
Loss: 1527664.553673
Feature norm: 8.771993
Error norm: 230825.754663
Active features: 260039
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 1.916

***** Iteration #3 *****
Loss: 1282414.585881
Feature norm: 9.599130
Error norm: 145978.045745
Active features: 260039
Line search trials: 1
Line search step: 1.000000


In [86]:
# Load the model file written by the training cell into a tagger for inference.
tagger = pycrfsuite.Tagger()
tagger.open('crf_model.crfsuite')

<contextlib.closing at 0x2539507eb40>

In [102]:
# Example sentence used to try out the trained tagger.
tokenized_sentence = ['It', 'is', 'not', 'advisable', 'to', 'buffalo', 'someone']

# Derive the loop bound from the sentence itself rather than a hard-coded 7,
# so editing the sentence cannot cause an IndexError or drop trailing words.
features_test = [
    features(tokenized_sentence, position)
    for position in range(len(tokenized_sentence))
]

In [103]:
# Predict the tag sequence for the example sentence's feature dictionaries.
tagger.tag(features_test)

['PRON', 'VERB', 'ADV', 'ADJ', 'PRT', 'VERB', 'NOUN']

In [105]:
import re
def regex_tokenize(text):
    """Split raw text into word tokens, discarding punctuation and whitespace.

    Apostrophes and hyphens that sit *between* word characters are kept, so
    "don't" and "well-known" each come back as a single token rather than
    being broken into fragments such as 'don' and 't'. Leading/trailing
    quotes and free-standing dashes are still dropped.

    Args:
        text: The raw input string.

    Returns:
        list of str: the tokens in order of appearance; an empty list when
        the text contains no word characters.
    """
    # \w+            a run of word characters
    # (?:['-]\w+)*   optionally continued by an apostrophe/hyphen + more word chars
    return re.findall(r"\w+(?:['-]\w+)*", text)

In [109]:
def final_func(sentence):
    """Tag a raw sentence with part-of-speech labels.

    The text is split with ``regex_tokenize``, every token is turned into a
    feature dictionary by ``features``, and the resulting sequence is handed
    to the module-level ``tagger``.

    Args:
        sentence: Raw input text.

    Returns:
        The tag sequence produced by ``tagger.tag`` for the tokens.
    """
    words = regex_tokenize(sentence)
    token_features = [features(words, position) for position, _ in enumerate(words)]
    return tagger.tag(token_features)

### Final output function

In [110]:
import gradio as gr

# Gradio UI: a single text box in, the predicted tags rendered as text out.
interface = gr.Interface(
    fn=final_func,
    inputs="text",
    outputs="text",
    title="CRF based Part-of-Speech Tagger",
    description="Enter a sentence and get the Part-of-Speech tags.",
)

# Start the local web server that hosts the UI.
interface.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


