### Part 2

Estimate the emission parameters from the training set using MLE

In [29]:
import copy
class Emission:
    """Maximum-likelihood estimator of HMM emission parameters.

    After ``train`` the model holds e(x | y) = count(y -> x) / count(y) for
    every (label, word) pair seen in the training data. Words that occur
    fewer than ``k`` times are replaced by ``special_token`` before counting,
    and words outside the training vocabulary are looked up as
    ``special_token`` at prediction time.
    """

    def __init__(self):
        self.emission_p = {}         # label -> {word -> e(word | label)}
        self.y_count = {}            # label -> number of training tokens with that label
        self.y_labels = []           # labels in order of first appearance
        self.x_given_y_count = {}    # label -> {word -> co-occurrence count}
        self.tokens_list = []        # cleaned [word, label, ...] entries used for training
        self.special_token = '#UNK#'
        self.vocabulary = set()      # words present in the cleaned training data

    def clean_data(self, k = 1):
        """Replace words seen fewer than ``k`` times with the special token.

        The cleaned entries are stored in (and returned as) a new list of new
        lists, so the data originally assigned to ``self.tokens_list`` is
        never modified.

        Args:
            k: minimum number of occurrences for a word to be kept as-is.

        Returns:
            The cleaned token list (also available as ``self.tokens_list``).
        """
        token_freq = {}
        for token in self.tokens_list:
            token_freq[token[0]] = token_freq.get(token[0], 0) + 1

        cleaned = []
        for token in self.tokens_list:
            entry = list(token)  # copy: never write into the caller's data
            if token_freq[entry[0]] < k:
                entry[0] = self.special_token
            cleaned.append(entry)

        self.tokens_list = cleaned
        return self.tokens_list

    def train(self, tokens_list: list, k = 1, special_token = '#UNK#'):
        """Estimate the emission parameters from ``[word, label]`` pairs.

        Args:
            tokens_list: sequence of entries whose element 0 is the word and
                element 1 is the label. It is left unmodified.
            k: words occurring fewer than ``k`` times become ``special_token``.
            special_token: placeholder used for rare and unseen words.

        Returns:
            ``{label: {word: probability}}``, also stored in ``self.emission_p``.
        """
        self.tokens_list = tokens_list
        self.special_token = special_token
        self.clean_data(k)  # rebinds self.tokens_list to a cleaned private copy
        self.y_count = {}
        self.x_given_y_count = {}

        for token in self.tokens_list:
            word, label = token[0], token[1]
            self.y_count[label] = self.y_count.get(label, 0) + 1
            words_for_label = self.x_given_y_count.setdefault(label, {})
            words_for_label[word] = words_for_label.get(word, 0) + 1

        # calculate emission params
        self.emission_p = {
            label: {
                word: float(count) / self.y_count[label]
                for word, count in word_counts.items()
            }
            for label, word_counts in self.x_given_y_count.items()
        }
        self.y_labels = list(self.emission_p.keys())
        # Cached so that predict() is a set lookup instead of a scan over the
        # whole training list on every call.
        self.vocabulary = {
            word
            for word_counts in self.x_given_y_count.values()
            for word in word_counts
        }
        return self.emission_p

    def predict(self, y: str, x: str):
        """Return e(x | y).

        A word that does not occur in the cleaned training data is looked up
        as the special token. The result is 0 when the label is unknown or
        the (label, word) pair was never observed.
        """
        if x not in self.vocabulary:
            x = self.special_token
        return self.emission_p.get(y, {}).get(x, 0)

    def predict_tag(self, x: str):
        """Return the label y maximising e(x | y).

        Ties keep the label that appeared first in training. Returns ``None``
        when every label scores 0 (for example an unseen word when no special
        token exists in the training data).
        """
        score = 0.0
        y_tag = None
        for y in self.y_labels:
            y_score = self.predict(y, x)
            if y_score > score:
                y_tag = y
                score = y_score
        return y_tag
            

In [30]:
# Example usage
import os

# Toy corpus of [word, tag] pairs: 'a' occurs under both tags, 'b' and 'c' only under 'O'.
data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O']]
model = Emission()
# Show, one per line: the trained emission table, e('b' | 'O'),
# the known labels, and the most likely tag for 'a'.
print(
    model.train(tokens_list=data),
    model.predict(y='O', x='b'),
    model.y_labels,
    model.predict_tag('a'),
    sep='\n',
)

# with open('./EN/train') as train_file:
#     read_data = train_file.read()
#     read_data = os.linesep.join([s for s in read_data.splitlines() if s])
#     data = list(map(lambda x: x.split(' '),read_data.split('\n')))
# emission_params(tokens_list=data, y='O', x='a')

{'O': {'a': 0.3333333333333333, 'b': 0.3333333333333333, 'c': 0.3333333333333333}, 'I': {'a': 1.0}}
0.3333333333333333
['O', 'I']
I


Modify the training set to replace words that appear fewer than k times with a special token (`#UNK#`). Apply this to the emission parameter estimation and prediction functions with k = 3.

In [31]:
# Example usage
# 'a' occurs three times and is kept; 'b' and 'c' occur once each and fall below k = 3.
data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O'], ['a', 'O']]
model = Emission()
model.train(data, k=3)
# Last expression of the cell: e('a' | 'O') is displayed as the cell output.
model.predict('O', 'a')

0.5

A simple sentiment analysis system that produces, for each input word $x$, the tag $y^* = \arg\max_y e(x \mid y)$.

In [33]:
languages = ['EN', 'SG', 'CN', 'FR']

for lang in languages:
    model = Emission()
    # Read as UTF-8 explicitly: the default encoding is platform dependent
    # and the corpora contain non-ASCII text.
    with open("./{}/train".format(lang), encoding='utf-8') as train_file:
        # One "<word> <tag>" pair per non-blank line. Split on the LAST space
        # so the tag is always element 1, and use splitlines() directly so no
        # '\r' is left on the tag on platforms where os.linesep is '\r\n'.
        data = [
            line.rsplit(' ', 1)
            for line in train_file.read().splitlines()
            if line
        ]
    model.train(tokens_list=data, k=3)

    print("Finish training for {}".format(lang))

    with open("./{}/dev.in".format(lang), encoding='utf-8') as in_file, \
            open("./{}/dev.p2.out".format(lang), 'w', encoding='utf-8') as out_file:
        for line in in_file:
            word = line.strip()
            if word == '':
                # Blank lines separate sentences; keep them in the output.
                out_file.write("\n")
            else:
                out_file.write("{} {}\n".format(word, model.predict_tag(word)))
    print("Finished: {}".format(lang))

Finish training for EN
Finished: EN
Finish training for SG
Finished: SG
Finish training for CN
Finished: CN
Finish training for FR
Finished: FR


In [36]:
for l in languages:
    # Run the evaluation script on the gold file and this part's predictions,
    # capturing everything it writes to stdout.
    output = os.popen("python3 EvalScript/evalResult.py {0}/dev.out {0}/dev.p2.out".format(l)).read()
    # Header, script output and separator, each on its own line.
    print("Language: {}".format(l), output, "----------------------", sep="\n")

Language: EN

#Entity in gold data: 226
#Entity in prediction: 1201

#Correct Entity : 165
Entity  precision: 0.1374
Entity  recall: 0.7301
Entity  F: 0.2313

#Correct Sentiment : 71
Sentiment  precision: 0.0591
Sentiment  recall: 0.3142
Sentiment  F: 0.0995

----------------------
Language: SG

#Entity in gold data: 1382
#Entity in prediction: 6542

#Correct Entity : 780
Entity  precision: 0.1192
Entity  recall: 0.5644
Entity  F: 0.1969

#Correct Sentiment : 311
Sentiment  precision: 0.0475
Sentiment  recall: 0.2250
Sentiment  F: 0.0785

----------------------
Language: CN

#Entity in gold data: 362
#Entity in prediction: 3318

#Correct Entity : 183
Entity  precision: 0.0552
Entity  recall: 0.5055
Entity  F: 0.0995

#Correct Sentiment : 57
Sentiment  precision: 0.0172
Sentiment  recall: 0.1575
Sentiment  F: 0.0310

----------------------
Language: FR

#Entity in gold data: 223
#Entity in prediction: 1149

#Correct Entity : 182
Entity  precision: 0.1584
Entity  recall: 0.8161
Entity  F

### Part 3

Estimates the transition parameters from the training set using MLE

In [5]:
import copy
def transition_params(ordered_labels_list: list):
    """Estimate transition probabilities from an ordered label sequence.

    Args:
        ordered_labels_list: labels in the order they occur in the data.

    Returns:
        A two-level dict ``{previous_label: {label: probability}}`` where
        probability = count(previous_label followed by label) / count(previous_label).
        The denominator counts every occurrence of ``previous_label``,
        including one at the very end of the list that has no successor.
    """
    count = {}
    count_given = {} # 2 layer dictionary depth-0 key is the (i-1)-label, depth-1 key is the i-label
    last_index = len(ordered_labels_list) - 1

    # count frequency of all label and combinations of 2 labels in the dataset
    for idx, label in enumerate(ordered_labels_list):
        count[label] = count.get(label, 0) + 1
        followers = count_given.setdefault(label, {})
        if idx < last_index:
            next_label = ordered_labels_list[idx + 1]
            followers[next_label] = followers.get(next_label, 0) + 1

    # calculate trans_params
    return {
        given_label: {
            label: pair_count / count[given_label]
            for label, pair_count in followers.items()
        }
        for given_label, followers in count_given.items()
    }

def specific_transition_params(ordered_labels_list: list, y: str, y_given: str):
    """Return the transition probability from ``y_given`` to ``y``.

    Args:
        ordered_labels_list: labels in the order they occur in the data.
        y: the label at position i.
        y_given: the label at position i-1.

    Returns:
        The estimated probability, or 0 when ``y_given`` never occurs or is
        never followed by ``y``.
    """
    trans_params = transition_params(ordered_labels_list)
    # The outer key is the previous label and the inner key the current one;
    # the lookups must follow that order.
    return trans_params.get(y_given, {}).get(y, 0)

specific_transition_params(['a', 'b', 'b', 'c', 'b', 'a', 'd', 'h', 'b'], 'b', 'a')

0.5

Viterbi algorithm

In [6]:
def viterbi(training_tokens: list, sentence: str):
    """Unfinished draft of a Viterbi decoder.

    As written, the function splits ``sentence`` on whitespace, prints the
    transition table computed from those words, and then fails: the loop body
    is a placeholder and the returned name is never assigned.

    Args:
        training_tokens: passed straight to ``emission_params_cache``.
        sentence: whitespace-separated text to decode.
    """
    sentence = sentence.split()
    # sentence has both START and STOP words
    cache = {}
    y_predicted = []
    # NOTE(review): the transitions are estimated from the words of the input
    # sentence itself, not from label sequences in the training data —
    # presumably unintended; confirm before building on this.
    trans_params = transition_params(sentence)
    print(trans_params)
    # NOTE(review): emission_params_cache is not defined anywhere in the
    # visible part of this notebook — verify it exists or replace it.
    emission_params = emission_params_cache(training_tokens)
    # Placeholder for the dynamic-programming recursion; nothing is computed yet.
    for k in range(len(sentence)-1):
         pass
    # NOTE(review): `result` is never assigned in this function, so this line
    # raises NameError when reached.
    return result

# NOTE(review): viterbi is declared with two parameters (training_tokens,
# sentence) but only a single string is passed here, so with that signature
# this call raises TypeError before the function body runs.
viterbi("""
Hey I was doing just fine before I met you
Drink too much and that\'s an issue
But I\'m okay
Hey, you tell your friends it was nice to meet them
But I hope I never see them
Again
I know it breaks your heart
Moved to the city in a broke-down car
And four years, no calls
Now you're looking pretty in a hotel bar
And I, I, I, I, I can't stop
No, I, I, I, I, I can't stop
""")

ERROR:root:An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 0))



{'Hey': {'I': 1.0}, 'I': {'was': 0.14285714285714285, 'met': 0.14285714285714285, 'hope': 0.14285714285714285, 'never': 0.14285714285714285, 'know': 0.14285714285714285, "can't": 0.2857142857142857}, 'was': {'doing': 0.5, 'nice': 0.5}, 'doing': {'just': 1.0}, 'just': {'fine': 1.0}, 'fine': {'before': 1.0}, 'before': {'I': 1.0}, 'met': {'you': 1.0}, 'you': {'Drink': 0.5, 'tell': 0.5}, 'Drink': {'too': 1.0}, 'too': {'much': 1.0}, 'much': {'and': 1.0}, 'and': {"that's": 1.0}, "that's": {'an': 1.0}, 'an': {'issue': 1.0}, 'issue': {'But': 1.0}, 'But': {"I'm": 0.5, 'I': 0.5}, "I'm": {'okay': 1.0}, 'okay': {'Hey,': 1.0}, 'Hey,': {'you': 1.0}, 'tell': {'your': 1.0}, 'your': {'friends': 0.5, 'heart': 0.5}, 'friends': {'it': 1.0}, 'it': {'was': 0.5, 'breaks': 0.5}, 'nice': {'to': 1.0}, 'to': {'meet': 0.5, 'the': 0.5}, 'meet': {'them': 1.0}, 'them': {'But': 0.5, 'Again': 0.5}, 'hope': {'I': 1.0}, 'never': {'see': 1.0}, 'see': {'them': 1.0}, 'Again': {'I': 1.0}, 'know': {'it': 1.0}, 'breaks': {'yo

NameError: name 'result' is not defined