In [30]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-datasets/en_ewt-ud-test.conllu
/kaggle/input/nlp-datasets/en_ewt-ud-train.conllu


In [31]:
def read_conllu(path):
    """Read a CoNLL-U file into a list of tagged sentences.

    Each sentence is a list of ``(word, upos)`` tuples, where ``word`` is the
    lower-cased FORM column and ``upos`` is the UPOS column. Only syntactic
    words (plain integer IDs) are kept: multi-word token ranges such as
    ``1-2`` and empty nodes such as ``8.1`` are skipped, because neither is
    a tagged word of the surface sentence.

    Args:
        path: Path to a UTF-8 encoded ``.conllu`` file.

    Returns:
        A list of sentences in file order; sentences with no kept tokens
        are omitted.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a token line has fewer than four tab-separated columns.
    """
    sentences = []
    sentence = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()

            # A blank line marks the end of the current sentence.
            if not line:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue

            # Sentence-level metadata lines ("# sent_id = ...", "# text = ...").
            if line.startswith("#"):
                continue

            cols = line.split("\t")
            token_id = cols[0]

            # Skip multi-word token ranges ("1-2 don't") and empty nodes
            # ("8.1"), which exist only for enhanced dependencies; counting
            # them would add phantom words to the tag statistics.
            if "-" in token_id or "." in token_id:
                continue

            word = cols[1].lower()
            upos = cols[3]

            sentence.append((word, upos))

    # Flush the last sentence when the file does not end with a blank line.
    if sentence:
        sentences.append(sentence)

    return sentences


In [None]:
# Kaggle locations of the UD English-EWT splits; change both to run locally.
TRAIN_PATH = "/kaggle/input/nlp-datasets/en_ewt-ud-train.conllu"
TEST_PATH = "/kaggle/input/nlp-datasets/en_ewt-ud-test.conllu"

# Parse the training split first, then the test split.
train_sentences, test_sentences = map(read_conllu, (TRAIN_PATH, TEST_PATH))


In [33]:
# Preview the first two parsed sentences. Slicing (rather than indexing with
# a manual counter) also works when fewer than two sentences were loaded.
for sent in train_sentences[:2]:
    print(sent)

[('al', 'PROPN'), ('-', 'PUNCT'), ('zaman', 'PROPN'), (':', 'PUNCT'), ('american', 'ADJ'), ('forces', 'NOUN'), ('killed', 'VERB'), ('shaikh', 'PROPN'), ('abdullah', 'PROPN'), ('al', 'PROPN'), ('-', 'PUNCT'), ('ani', 'PROPN'), (',', 'PUNCT'), ('the', 'DET'), ('preacher', 'NOUN'), ('at', 'ADP'), ('the', 'DET'), ('mosque', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('town', 'NOUN'), ('of', 'ADP'), ('qaim', 'PROPN'), (',', 'PUNCT'), ('near', 'ADP'), ('the', 'DET'), ('syrian', 'ADJ'), ('border', 'NOUN'), ('.', 'PUNCT')]
[('[', 'PUNCT'), ('this', 'DET'), ('killing', 'NOUN'), ('of', 'ADP'), ('a', 'DET'), ('respected', 'ADJ'), ('cleric', 'NOUN'), ('will', 'AUX'), ('be', 'AUX'), ('causing', 'VERB'), ('us', 'PRON'), ('trouble', 'NOUN'), ('for', 'ADP'), ('years', 'NOUN'), ('to', 'PART'), ('come', 'VERB'), ('.', 'PUNCT'), (']', 'PUNCT')]


In [34]:
from collections import Counter

# Frequency of every (lower-cased) word form in the training data.
word_counts = Counter()
for sent in train_sentences:
    word_counts.update(word for word, _ in sent)

# Show the ten most frequent word forms with their counts.
top_words = word_counts.most_common(10)
print(top_words)


[('the', 9075), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3380), ('in', 3112), ('is', 2241)]


In [40]:
# Initial probabilities: P(tag | START)
from collections import Counter

# Count how often each tag labels the first word of a training sentence.
initial_tag_counts = Counter(sent[0][1] for sent in train_sentences)
total_sentences = len(train_sentences)

# Relative frequency of each tag in sentence-initial position.
initial_probs = {}
for tag, count in initial_tag_counts.items():
    initial_probs[tag] = count / total_sentences

# Display from least to most likely sentence-initial tag.
sorted_data = dict(sorted(initial_probs.items(), key=lambda pair: pair[1]))
print(sorted_data)

{'X': 7.971938775510203e-05, 'PART': 0.00470344387755102, 'SYM': 0.0074936224489795915, 'CCONJ': 0.02311862244897959, 'AUX': 0.028858418367346938, 'INTJ': 0.03228635204081633, 'PUNCT': 0.035235969387755105, 'SCONJ': 0.035634566326530615, 'NUM': 0.039142219387755105, 'ADJ': 0.04097576530612245, 'ADP': 0.04320790816326531, 'VERB': 0.06050701530612245, 'NOUN': 0.06194196428571429, 'ADV': 0.07653061224489796, 'DET': 0.10044642857142858, 'PROPN': 0.12771045918367346, 'PRON': 0.28212691326530615}


In [47]:
# Transition probabilities: P(next tag | current tag) from adjacent tag pairs.
transition_counts = Counter()
tag_counts = Counter()  # denominator: times a tag is followed by another tag

for sent in train_sentences:
    sent_tags = [tag for _, tag in sent]
    # Pair each tag with its successor; a sentence's final tag has none.
    for current_tag, next_tag in zip(sent_tags, sent_tags[1:]):
        transition_counts[(current_tag, next_tag)] += 1
        tag_counts[current_tag] += 1

# Normalise each pair count by how often its first tag had a successor.
transition_probs = {
    (current_tag, next_tag): count / tag_counts[current_tag]
    for (current_tag, next_tag), count in transition_counts.items()
}

In [50]:
# Emission probabilities: P(word | tag)
# Count every (tag, word) pair observed in the training sentences.
emission_counts = Counter(
    (tag, word) for sent in train_sentences for word, tag in sent
)

# Total occurrences of each tag, accumulated from the pair counts.
tag_counts_emission = Counter()
for (tag, _word), pair_count in emission_counts.items():
    tag_counts_emission[tag] += pair_count

# Normalise each pair count by the total count of its tag.
emission_probs = {
    pair: pair_count / tag_counts_emission[pair[0]]
    for pair, pair_count in emission_counts.items()
}


In [56]:
# Sanity checks: each printed total is the sum of one probability distribution.
initial_total = sum(initial_probs.values())

# Probability mass of all transitions leaving NOUN.
noun_transition_total = sum(
    prob for (src_tag, _), prob in transition_probs.items() if src_tag == "NOUN"
)

# Probability mass of all words emitted by NOUN.
noun_emission_total = sum(
    prob for (emit_tag, _), prob in emission_probs.items() if emit_tag == "NOUN"
)

print(initial_total)
print(noun_transition_total)
print(noun_emission_total)

1.0
1.0
1.0
