In [1]:
import nltk
import random
from src.hmm import HMM, process_inputs, save_hmm, load_hmm

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/stark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/stark/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Fetch the NLTK resources used below; per the logged output, packages that are
# already present locally are reported as up-to-date rather than re-downloaded.
nltk.download("brown")  # corpus read via nltk.corpus.brown in load_brown_corpus
# presumably the tag mapping needed when requesting tagset="universal" - confirm
nltk.download('universal_tagset')

[nltk_data] Downloading package brown to /Users/stark/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /Users/stark/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


True

In [3]:
def load_brown_corpus(train_size=0.9, tagset="universal", seed=42, join=False):
    """Load the tagged Brown corpus, shuffle it, and split it into train/test sets.

    Args:
        train_size: Fraction of sentences, in [0, 1], assigned to the training
            split; the remaining sentences form the test split.
        tagset: Tagset name forwarded to ``nltk.corpus.brown.tagged_sents``.
        seed: Value passed to ``random.seed`` before shuffling, so the same
            seed reproduces the same split.
        join: If True, each input sentence is returned as a single
            space-joined string; otherwise as a list of word tokens.

    Returns:
        A ``(train_x, train_y, test_x, test_y)`` tuple. Each ``*_x`` entry holds
        the words of one sentence and the matching ``*_y`` entry holds that
        sentence's tags, one tag per word.

    Raises:
        ValueError: If ``train_size`` is outside ``[0, 1]``.
    """
    # Fail fast, before the corpus is read: a fraction outside [0, 1] would
    # otherwise slice silently into an empty or wrongly sized split.
    if not 0 <= train_size <= 1:
        raise ValueError(
            "train_size must be between 0 and 1, got {!r}".format(train_size)
        )

    def split_words_and_tags(tagged_sents):
        """Turn sentences of (word, tag) pairs into parallel word/tag lists."""
        words = [[word for word, _ in sent] for sent in tagged_sents]
        tags = [[tag for _, tag in sent] for sent in tagged_sents]
        if join:
            words = [" ".join(sent_words) for sent_words in words]
        return words, tags

    # NOTE: this reseeds the module-level RNG, so it also affects any later use
    # of `random` in the same session; kept so existing splits stay unchanged.
    random.seed(seed)
    corpus = list(nltk.corpus.brown.tagged_sents(tagset=tagset))
    random.shuffle(corpus)

    split_size = int(len(corpus) * train_size)
    train_x, train_y = split_words_and_tags(corpus[:split_size])
    test_x, test_y = split_words_and_tags(corpus[split_size:])
    return train_x, train_y, test_x, test_y

In [4]:
# Build the 90/10 train/test split with universal tags; seed and join are left
# at the load_brown_corpus defaults (seed=42, join=False).
trainX, trainY, testX, testY = load_brown_corpus(train_size=0.9, tagset="universal")

In [5]:
# Preview the first training example: its tokens, then the aligned tags.
# Slicing to one element prints nothing (instead of failing) on an empty split.
for sent, tar in zip(trainX[:1], trainY[:1]):
    print(sent, tar, sep="\n")

['He', 'let', 'her', 'tell', 'him', 'all', 'about', 'the', 'church', '.']
['PRON', 'VERB', 'PRON', 'VERB', 'PRON', 'PRT', 'ADP', 'DET', 'NOUN', '.']


In [6]:
# Preview the first test example: its tokens, then the aligned tags.
# Slicing to one element prints nothing (instead of failing) on an empty split.
for sent, tar in zip(testX[:1], testY[:1]):
    print(sent, tar, sep="\n")

['Assumption', '3', '.']
['NOUN', 'NUM', '.']


In [7]:
# Report how many sentences ended up in each split.
n_train_sents, n_test_sents = len(trainX), len(testX)
print("Number of samples in train and test respectively: ", n_train_sents, n_test_sents)

Number of samples in train and test respectively:  51606 5734


In [8]:
# Collect every distinct tag that occurs in either split's tag sequences.
all_pos_tags = set()
for tag_sequences in (trainY, testY):
    for sentence_tags in tag_sequences:
        all_pos_tags.update(sentence_tags)

In [9]:
# Show the tag inventory. all_pos_tags is a set, so the order in which the
# tags are printed is not meaningful.
print("All Pos Tags: ")
print(all_pos_tags)

All Pos Tags: 
{'ADP', 'DET', 'NOUN', 'ADJ', 'X', 'VERB', 'NUM', 'PRT', 'ADV', '.', 'CONJ', 'PRON'}


In [10]:
# Number of distinct tags found across the train and test splits.
print("Number of POS Tags: ", len(all_pos_tags))

Number of POS Tags:  12


In [11]:
# Preprocess the token lists of both splits with process_inputs. Judging by the
# previews printed before and after this cell, tokens come back lowercased
# ('He' -> 'he'); any further effect is not visible from this notebook.
# NOTE(review): trainX/testX are overwritten with their processed versions, so
# re-running this cell feeds already-processed data back in - confirm that
# process_inputs is idempotent, or keep the raw lists under separate names.
# split=True: meaning not visible here - TODO confirm in src.hmm.
trainX = process_inputs(trainX, split=True)
testX = process_inputs(testX, split=True)

Processing Input data..: 100%|██████████| 51606/51606 [00:00<00:00, 322827.40it/s]
Processing Input data..: 100%|██████████| 5734/5734 [00:00<00:00, 576686.63it/s]


In [12]:
# First training sentence after process_inputs, for comparison with the raw
# preview printed earlier.
trainX[0]

['he', 'let', 'her', 'tell', 'him', 'all', 'about', 'the', 'church', '.']

In [13]:
# First test sentence after process_inputs, for comparison with the raw
# preview printed earlier.
testX[0]

['assumption', '3', '.']

In [14]:
# Create the HMM tagger with its default constructor arguments.
model = HMM()

In [15]:
# Fit the HMM on the processed training sentences and their tag sequences,
# then write the fitted model to "hmm.pkl" (a relative path).
# NOTE(review): the "(14, 11648)" line in this cell's output is emitted during
# fit - looks like a leftover debug print of a shape; confirm in src.hmm.
model.fit(X=trainX, y=trainY)
save_hmm(model, "hmm.pkl")

(14, 11648)


Fitting HMM..: 100%|██████████| 51606/51606 [00:00<00:00, 87458.30it/s]


In [16]:
# Rebind `model` to the copy read back from "hmm.pkl" and predict tags for
# every processed test sentence with it.
# NOTE(review): the .pkl extension suggests pickle - if so, only load files
# from a trusted source; confirm against load_hmm in src.hmm.
model = load_hmm("hmm.pkl")
test_predictions = model.predict(testX)

Predicting..: 100%|██████████| 5734/5734 [00:08<00:00, 680.39it/s]


In [17]:
from src.metrics import accuracy, confusion_matrix

In [18]:
# Score the predicted tag sequences against the reference test tags.
# NOTE(review): presumably per-token accuracy - confirm in src.metrics.
accuracy(test_predictions, testY)

0.9488374903951583