In [69]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [70]:
def read_conllu(path):
    sentences = []
    sentence = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # Sentence boundary
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # Skip comments
            if line.startswith("#"):
                continue

            cols = line.split("\t")

            # Skip multi-word tokens like "1-2 don't"
            if "-" in cols[0]:
                continue

            word = cols[1].lower()
            upos = cols[3]

            sentence.append((word, upos))

    if sentence:
        sentences.append(sentence)

    return sentences


In [71]:
#update to local paths to test locally
# TRAIN_PATH = "/kaggle/input/nlp-datasets/en_ewt-ud-train.conllu"
# TEST_PATH  = "/kaggle/input/nlp-datasets/en_ewt-ud-test.conllu"
TRAIN_PATH = "en_ewt-ud-train.conllu"
TEST_PATH  = "en_ewt-ud-test.conllu"
train_sentences = read_conllu(TRAIN_PATH)
test_sentences  = read_conllu(TEST_PATH)


In [72]:
counter = 0
while counter < 2:
    print(train_sentences[counter])
    counter += 1
print()
counter = 0
while counter < 2:
    print(test_sentences[counter])
    counter += 1

[('al', 'PROPN'), ('-', 'PUNCT'), ('zaman', 'PROPN'), (':', 'PUNCT'), ('american', 'ADJ'), ('forces', 'NOUN'), ('killed', 'VERB'), ('shaikh', 'PROPN'), ('abdullah', 'PROPN'), ('al', 'PROPN'), ('-', 'PUNCT'), ('ani', 'PROPN'), (',', 'PUNCT'), ('the', 'DET'), ('preacher', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('mosque', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('town', 'NOUN'), ('of', 'ADP'), ('qaim', 'PROPN'), (',', 'PUNCT'), ('near', 'ADP'), ('the', 'DET'), ('syrian', 'ADJ'), ('border', 'NOUN'), ('.', 'PUNCT')]
[('[', 'PUNCT'), ('this', 'DET'), ('killing', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('respected', 'ADJ'), ('cleric', 'NOUN'), ('will', 'AUX'), ('be', 'AUX'), ('causing', 'VERB'), ('us', 'PRON'), ('trouble', 'NOUN'), ('for', 'ADP'), ('years', 'NOUN'), ('to', 'PART'), ('come', 'VERB'), ('.', 'PUNCT'), (']', 'PUNCT')]

[('what', 'PRON'), ('if', 'SCONJ'), ('google', 'PROPN'), ('morphed', 'VERB'), ('into', 'ADP'), ('googleos', 'PROPN'), ('?', 'PUNCT')]
[('what', 'PRON'), ('if', 'SCO

In [73]:
from collections import Counter

word_counts = Counter(
    word for sent in train_sentences for word, _ in sent
)

print(word_counts.most_common(10))


[('the', 9075), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3380), ('in', 3112), ('is', 2241)]


In [74]:
#Initial Probabilities (P(tag/START))
from collections import Counter

initial_tag_counts = Counter()
total_sentences = len(train_sentences)

for sent in train_sentences:
    #check the tag of first word of each sentence
    first_tag = sent[0][1]
    initial_tag_counts[first_tag] += 1
# print(initial_tag_counts)
# print()
#calculate prob that a tag occurs at the start
initial_probs = {
    tag: count / total_sentences
    for tag, count in initial_tag_counts.items()
}

sorted_data = dict(sorted(initial_probs.items(), key=lambda item: item[1]))
print(sorted_data)
# print(initial_probs)

{'X': 7.971938775510203e-05, 'PART': 0.00470344387755102, 'SYM': 0.0074936224489795915, 'CCONJ': 0.02311862244897959, 'AUX': 0.028858418367346938, 'INTJ': 0.03228635204081633, 'PUNCT': 0.035235969387755105, 'SCONJ': 0.035634566326530615, 'NUM': 0.039142219387755105, 'ADJ': 0.04097576530612245, 'ADP': 0.04320790816326531, 'VERB': 0.06050701530612245, 'NOUN': 0.06194196428571429, 'ADV': 0.07653061224489796, 'DET': 0.10044642857142858, 'PROPN': 0.12771045918367346, 'PRON': 0.28212691326530615}


In [75]:
#Transition probabilities
transition_counts = Counter()
tag_counts = Counter()   # denominator

for sent in train_sentences:
    for i in range(len(sent) - 1):
        tag_i = sent[i][1]
        tag_j = sent[i + 1][1]

        transition_counts[(tag_i, tag_j)] += 1
        tag_counts[tag_i] += 1
transition_probs = {}

for (tag_i, tag_j), count in transition_counts.items():
    transition_probs[(tag_i, tag_j)] = count / tag_counts[tag_i]

# print(transition_probs)

In [76]:
#Emission probabilities
emission_counts = Counter()
tag_counts_emission = Counter()

for sent in train_sentences:
    for word, tag in sent:
        emission_counts[(tag, word)] += 1
        tag_counts_emission[tag] += 1
# print(len(tag_counts_emission))
emission_probs = {}

for (tag, word), count in emission_counts.items():
    emission_probs[(tag, word)] = count / tag_counts_emission[tag]
# print(emission_probs)

In [77]:
#basic checks
print(sum(initial_probs.values()))
print(sum(
    prob for (t1, _), prob in transition_probs.items()
    if t1 == "NOUN"
)
)
print(sum(
    prob for (t, _), prob in emission_probs.items()
    if t == "NOUN"
)
)

# total
# for (tag, word), prob in emission_probs.items():
#     if tag == "PUNCT":
#         # print(f"word={word} prob={prob}")
#         total=total+prob
# print(total)

TypeError: 'float' object is not callable

In [68]:
#brute force way 
# if sentences are long there will be many possible tag sequences and overall many iterations to check so we use viterbi algorithm to optimize


import itertools


def sequence_probability(words, tag_seq,initial_probs,transition_probs,emission_probs):
#for computing probabilities of a tag sequence in brute force way
    prob = 1.0

    word = words[0]
    prob *= initial_probs[tag_seq[0]]
    prob *= emission_probs.get((tag_seq[0], word), 0)

        # example for a three letter word
    
    # P(t1)×P(w1∣t1)×P(t2∣t1)×P(w2∣t2)×P(t3∣t2)×P(w3∣t3)
    # p(t1)-initial_prob
    

    for i in range(1, len(words)):
        prev_tag = tag_seq[i-1]
        curr_tag = tag_seq[i]

        word = words[i]

        transition_probs.get((prev_tag, curr_tag), 0)
        prob *= emission_probs.get((curr_tag, word), 0)

    return prob


def brute_force_decode(words,all_tags,initial_probs,transition_probs,emission_probs):

    best_prob = -1
    best_seq = None

    # generate probability for every possible tag sequence
    for tag_seq in itertools.product(all_tags, repeat=len(words)):

        prob = sequence_probability(
            words,
            tag_seq,
            initial_probs,
            transition_probs,
            emission_probs
        )

        if prob > best_prob:
            best_prob = prob
            best_seq = tag_seq

    return list(best_seq)

    
all_tags = list(initial_probs.keys())
# print(all_tags) all possible tags
correct = 0
total = 0
c=0;
for sent in test_sentences:
    
    correct = 0
    total = 0
    if len(sent) < 5: 

        words = [w for w,_ in sent]
        crt_tag= [t for _,t in sent]

        pred = brute_force_decode(
            words,
            all_tags,
            initial_probs,
            transition_probs,
            emission_probs
        )


        for t, p in zip(crt_tag, pred):
            if t == p:
                correct += 1
            total += 1

        acc = correct / len(crt_tag)

        print("Sentence:", words)
        print("crt_tag:     ", gold)
        print("Pred:     ", pred)
        print("Accuracy: ", round(acc, 3))
        print()
        c+=1
        
        if c>5:
            break
        
        

    # print("Accuracy (short sentences only):", correct/total)


Sentence: ['see', 'http://www.gulf-news.com/articles/news.asp?articleid=97508']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['PROPN', 'PROPN']
Accuracy:  0.5

Sentence: ['i.e', '.']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['ADV', 'PUNCT']
Accuracy:  1.0

Sentence: ['...']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['PUNCT']
Accuracy:  0.0

Sentence: ['wtf', 'is', 'this', '?']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['PROPN', 'PROPN', 'PROPN', 'PROPN']
Accuracy:  0.0

Sentence: ['i', "'m", 'the', 'king']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['PRON', 'AUX', 'DET', 'PROPN']
Accuracy:  0.75

Sentence: ['yeah']
crt_tag:      ['VERB', 'PROPN']
Pred:      ['INTJ']
Accuracy:  1.0

