### Part 2

Estimate the emission parameters from the training set using MLE

In [9]:
import copy
class Emission:
    def __init__(self):
        self.emission_p = {}
        self.y_count = {}
        self.y_labels = []
        self.x_given_y_count = {}
        self.tokens_list = []
        self.special_token = '#UNK#'
    
    def clean_data(self, k = 1):
        token_freq = {}
        for token in self.tokens_list:
            if token[0] not in token_freq: 
                token_freq[token[0]] = 1
            else:
                token_freq[token[0]] += 1
        for i in range(len(self.tokens_list)):
            if token_freq[self.tokens_list[i][0]] < k:
                self.tokens_list[i][0] = self.special_token
        return self.tokens_list
    
    def train(self, tokens_list: list, k = 1, special_token = '#UNK#'):
        self.tokens_list = tokens_list
        self.special_token = special_token
        self.clean_data(k)
        self.y_count = {}
        self.x_given_y_count = {} 

        for token in tokens_list:
            if token[0] in ['START', 'STOP']:
                continue
            if token[1] not in self.y_count:
                self.y_count[token[1]] = 1
                self.x_given_y_count[token[1]] = {}
                self.x_given_y_count[token[1]][token[0]] = 1
            else:
                self.y_count[token[1]] += 1
                if token[0] not in self.x_given_y_count[token[1]]:
                    self.x_given_y_count[token[1]][token[0]] = 1
                else:
                    self.x_given_y_count[token[1]][token[0]] += 1

        # calculate emission params
        self.emission_p = copy.deepcopy(self.x_given_y_count)
        for label in self.emission_p:
            for word in self.emission_p[label]:
                self.emission_p[label][word] = float(self.x_given_y_count[label][word]) / self.y_count[label]
        self.y_labels = list(self.emission_p.keys())
        return self.emission_p
    
    def predict(self, y: str, x: str):
        x_inside = False
        for token in self.tokens_list:
            if token[0] == x:
                x_inside = True
                break

        if not x_inside:
            x = self.special_token

        if x not in self.emission_p[y]:
            return 0
        else:
            return self.emission_p[y][x]
        
    def predict_tag(self, x: str):
        score = 0.0
        y_tag = None
        for y in self.y_labels:
            y_score = self.predict(y, x)
            if y_score > score:
                y_tag = y
                score = y_score
        return y_tag
            

In [10]:
# Example ussage
import os

data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O']]
model = Emission()
print(model.train(tokens_list=data))
print(model.predict(y='O', x='b'))
print(model.y_labels)
print(model.predict_tag('a'))

# with open('./EN/train') as train_file:
#     read_data = train_file.read()
#     read_data = os.linesep.join([s for s in read_data.splitlines() if s])
#     data = list(map(lambda x: x.split(' '),read_data.split('\n')))
# emission_params(tokens_list=data, y='O', x='a')

{'O': {'a': 0.3333333333333333, 'b': 0.3333333333333333, 'c': 0.3333333333333333}, 'I': {'a': 1.0}}
0.3333333333333333
['O', 'I']
I


Modify training set to replace words that appear less than k times with special token. Apply this to the emission parameters prediction function with k = 3

In [11]:
# Example usage
data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O'], ['a', 'O']]
model = Emission()
model.train(tokens_list=data, k = 3)
model.predict(y='O', x='a')

0.5

Sentiment analysis system that produces the tag for input

In [39]:
languages = ['EN', 'SG', 'CN', 'FR']

for l in languages:
    model = Emission()
    with open("./{}/train".format(l)) as train_file:
        read_data = train_file.read()
        read_data = os.linesep.join([s if s else 'START START\nSTOP STOP' for s in read_data.splitlines()])
        data = list(map(lambda x: x.rsplit(' ',1),read_data.split('\n')))
        model.train(tokens_list=data, k=3)
    
    print("Finish training for {}".format(l))

    with open("./{}/dev.in".format(l)) as in_file, open("./{}/dev.p2.out".format(l), 'w+') as out_file:
        for line in in_file:
            word = line.strip()
            if (word == ''):
                out_file.write("\n")
            else:
                out_file.write("{} {}\n".format(word, model.predict_tag(word)))
    print("Finished: {}".format(l))

Finish training for EN
Finished: EN
Finish training for SG


KeyboardInterrupt: 

In [36]:
for l in languages:
    output = os.popen("python3 EvalScript/evalResult.py {0}/dev.out {0}/dev.p2.out".format(l)).read()
    print("Language: {}".format(l))
    print(output)
    print("----------------------")

Language: EN

#Entity in gold data: 226
#Entity in prediction: 1201

#Correct Entity : 165
Entity  precision: 0.1374
Entity  recall: 0.7301
Entity  F: 0.2313

#Correct Sentiment : 71
Sentiment  precision: 0.0591
Sentiment  recall: 0.3142
Sentiment  F: 0.0995

----------------------
Language: SG

#Entity in gold data: 1382
#Entity in prediction: 6542

#Correct Entity : 780
Entity  precision: 0.1192
Entity  recall: 0.5644
Entity  F: 0.1969

#Correct Sentiment : 311
Sentiment  precision: 0.0475
Sentiment  recall: 0.2250
Sentiment  F: 0.0785

----------------------
Language: CN

#Entity in gold data: 362
#Entity in prediction: 3318

#Correct Entity : 183
Entity  precision: 0.0552
Entity  recall: 0.5055
Entity  F: 0.0995

#Correct Sentiment : 57
Sentiment  precision: 0.0172
Sentiment  recall: 0.1575
Sentiment  F: 0.0310

----------------------
Language: FR

#Entity in gold data: 223
#Entity in prediction: 1149

#Correct Entity : 182
Entity  precision: 0.1584
Entity  recall: 0.8161
Entity  F

### Part 3

Estimates the transition parameters from the training set using MLE

In [6]:
import copy
def transition_params(ordered_labels_list: list):
    count = {}
    count_given = {} # 2 layer dictionary depth-0 key is the (i-1)-label, depth-1 key is the i-label
    
    # count frequency of all label and combinations of 2 labels in the dataset
    for idx, label in enumerate(ordered_labels_list):
        if label == 'STOP':
            continue
        if label not in count:
            count[label] = 1
            count_given[label] = {}
            if idx < len(ordered_labels_list) - 1:
                next_label = ordered_labels_list[idx + 1]
                count_given[label][next_label] = 1
        else:
            count[label] += 1
            if idx < len(ordered_labels_list) - 1:
                next_label = ordered_labels_list[idx + 1]
                if next_label not in count_given[label]:
                    count_given[label][next_label] = 1
                else:
                    count_given[label][next_label] += 1
    
    # calculate trans_params
    trans_params = copy.deepcopy(count_given)
    for given_label in trans_params:
        for label in trans_params[given_label]:
            trans_params[given_label][label] /= count[given_label]
    return trans_params

def specific_transition_params(ordered_labels_list: list, y: str, y_given: str):
    trans_params = transition_params(ordered_labels_list)
    if y not in trans_params:
        return 0;
    elif y_given not in trans_params[y]:
        return 0;
    else:
        return trans_params[y_given][y]
    
specific_transition_params(['START','a', 'b', 'b', 'STOP','START','c', 'b', 'a', 'd', 'h', 'b','STOP'], 'b', 'a')

0.5

Viterbi algo

In [84]:
def viterbi(sentence: str, labels: list, trans_p: dict, emission_p: dict):
    observed_words = sentence.split()
    cache = [{}]
    
    # handle first layer
    for l in labels:
        trans_param = trans_p['START'][l] if l in trans_p['START'] else 0
        emission_param = emission_p[l][observed_words[0]] if observed_words[0] in emission_p[l] else emission_p[l]['#UNK#']
        cache[0][l] = {"chance": trans_param * emission_param, "prev": None}
    
    # handle middle layers
    for i in range(1, len(observed_words)):
        cache.append({})
        max_trans_prob = 0
        max_prev_l = None
        for l in labels:
            for prev_l in labels:
                trans_param = trans_p[prev_l][l] if l in trans_p[prev_l] else 0
                trans_prob = cache[i-1][l]['chance'] * trans_param
                if trans_prob >= max_trans_prob:
                    max_trans_prob = trans_prob
                    max_prev_l = prev_l
            
            emission_param = emission_p[l][observed_words[i]] if observed_words[i] in emission_p[l] else emission_p[l]['#UNK#']
            cache[i][l] = {'chance': max_trans_prob * emission_param, 'prev': max_prev_l}
            
    # handle the end layer       
    cache.append({})
    max_end_prob = 0
    max_end_l = None
    for l in labels:
        trans_param = trans_p[l]['STOP'] if 'STOP' in trans_p[l] else 0
        end_prob = cache[len(observed_words) - 1][l]['chance'] * trans_param
        if end_prob >= max_end_prob:
            max_end_prob = end_prob
            max_end_l = l
    cache[len(observed_words)]['STOP'] = {'chance': max_end_prob, 'prev': max_end_l}
    
    # backtrack for optimal path
    optimal_prob = cache[len(observed_words)]['STOP']['chance']
    previous_l = cache[len(observed_words)]['STOP']['prev']
    optimal = [previous_l]
#     print(cache[-1])
#     print(cache[-1]['STOP'])
    for i in range(len(observed_words) - 1, 0, -1):
#         print(optimal)
        optimal.insert(0, cache[i][previous_l]['prev'])
        previous = cache[i][previous_l]['prev']
    print(cache)
    return (optimal, optimal_prob)
    
    
    
    

print(viterbi(
    'normal cold dizzy', 
    ['Healthy', 'Fever'],
    {
        'START': {'Healthy': 0.6, 'Fever': 0.4},
        'Healthy': {'Healthy': 0.69, 'Fever': 0.3, 'STOP': 0.01},
        'Fever': {'Healthy': 0.4, 'Fever': 0.59, 'STOP': 0.01}
    },
    {
        'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
        'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
    }
))

[{'Healthy': {'chance': 0.3, 'prev': None}, 'Fever': {'chance': 0.04000000000000001, 'prev': None}}, {'Healthy': {'chance': 0.0828, 'prev': 'Healthy'}, 'Fever': {'chance': 0.062099999999999995, 'prev': 'Healthy'}}, {'Healthy': {'chance': 0.0057132, 'prev': 'Healthy'}, 'Fever': {'chance': 0.034279199999999996, 'prev': 'Healthy'}}, {'STOP': {'chance': 0.00034279199999999997, 'prev': 'Fever'}}]
(['Healthy', 'Healthy', 'Fever'], 0.00034279199999999997)


In [85]:
for l in languages:
    model = Emission()
    with open("./{}/train".format(l)) as train_file:
        read_data = train_file.read()
        read_data = os.linesep.join([s if s else 'STOP STOP\nSTART START' for s in read_data.splitlines()])
        data = list(map(lambda x: x.rsplit(' ',1),read_data.split('\n')))
        model.train(tokens_list=data, k=3)
        emission_p = model.emission_p
        ordered_labels_list = list(map(lambda x: x[1], data))
        transition_p = transition_params(ordered_labels_list)
        labels = list(filter(lambda a: a != 'START', transition_p.keys()))
    
    print("Finish training for {}".format(l))

    with open("./{}/dev.in".format(l)) as in_file, open("./{}/dev.p3.out".format(l), 'w+') as out_file:
        read_data = in_file.read()
        sentences = list(filter(lambda x: len(x) > 0, read_data.split('\n\n')))
        sentences = list(map(lambda x: ' '.join(x.split('\n')), sentences))
        for sentence in sentences:
            sentence_labels, chance = viterbi(sentence=sentence, labels=labels, trans_p=transition_p, emission_p=emission_p)
            for idx,word in enumerate(sentence.split()):
                out_file.write("{} {}\n".format(word, sentence_labels[idx]))
    print("Finished: {}".format(l))

Finish training for EN
[{'O': {'chance': 0.0004263822614760147, 'prev': None}, 'B-neutral': {'chance': 0.0009035278656207647, 'prev': None}, 'B-positive': {'chance': 0.01061883227318853, 'prev': None}, 'B-negative': {'chance': 0.001956711941253917, 'prev': None}, 'I-negative': {'chance': 0.0, 'prev': None}, 'I-positive': {'chance': 0.0, 'prev': None}, 'I-neutral': {'chance': 0.0, 'prev': None}}, {'O': {'chance': 7.155669341848883e-06, 'prev': 'O'}, 'B-neutral': {'chance': 6.206359076390004e-05, 'prev': 'O'}, 'B-positive': {'chance': 0.00011963200756560824, 'prev': 'O'}, 'B-negative': {'chance': 9.038188231284807e-05, 'prev': 'O'}, 'I-negative': {'chance': 0.00012608806525019018, 'prev': 'O'}, 'I-positive': {'chance': 0.00017145117623394162, 'prev': 'O'}, 'I-neutral': {'chance': 0.00010722322684319242, 'prev': 'O'}}, {'O': {'chance': 1.777208159256383e-09, 'prev': 'O'}, 'B-neutral': {'chance': 1.0415689717881882e-06, 'prev': 'O'}, 'B-positive': {'chance': 1.492826690826764e-06, 'prev': 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [86]:
for l in languages:
    output = os.popen("python3 EvalScript/evalResult.py {0}/dev.out {0}/dev.p3.out".format(l)).read()
    print("Language: {}".format(l))
    print(output)
    print("----------------------")

Language: EN

#Entity in gold data: 226
#Entity in prediction: 1135

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000

----------------------
Language: SG

#Entity in gold data: 1382
#Entity in prediction: 11760

#Correct Entity : 1
Entity  precision: 0.0001
Entity  recall: 0.0007
Entity  F: 0.0002

#Correct Sentiment : 1
Sentiment  precision: 0.0001
Sentiment  recall: 0.0007
Sentiment  F: 0.0002

----------------------
Language: CN

#Entity in gold data: 362
#Entity in prediction: 238

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000

----------------------
Language: FR

#Entity in gold data: 223
#Entity in prediction: 499

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Co

forward-backward algo

In [97]:
import copy
def forward_backward(sentence: str, labels: list, trans_p: dict, emission_p: dict):
    observed_words = sentence.split()
    
    # forward part
    forward = []
    prev_forward = {}
    for i, word in enumerate(observed_words):
        curr_forward = {}
        for l in labels:
            prev_f_sum = 0
            if i == 0:
                trans_prob = trans_p['START'][l] if l in trans_p['START'] else 0
                prev_f_sum = trans_prob
            else:
                for prev_l in labels:
                    trans_prob = trans_p[prev_l][l] if l in trans_p[prev_l] else 0
                    prev_f_sum += prev_forward[prev_l] * trans_prob
            
            emission_prob = emission_p[l][word] if word in emission_p[l] else emission_p[l]['#UNK#']
            curr_forward[l] = emission_prob * prev_f_sum
        
        forward.append(curr_forward)
        prev_forward = copy.deepcopy(curr_forward)
    
    forward_prob = 0
    for l in labels:
        trans_prob = trans_p[l]['STOP'] if 'STOP' in trans_p[l] else 0
        forward_prob += curr_forward[l] * trans_prob
    
    # backward part
    backward = []
    prev_backward = {}
    for i, word in enumerate((observed_words[1:] + [None])[::-1]):
        curr_backward = {}
        for l in labels:
            curr_backward[l] = 0
            if i == 0:
                trans_prob = trans_p[l]['STOP'] if 'STOP' in trans_p[l] else 0
                curr_backward[l] = trans_prob
            else:
                for next_l in labels:
                    trans_prob = trans_p[l][next_l] if next_l in trans_p[l] else 0
                    emm_prob = emission_p[next_l][word] if word in emission_p[next_l] else emission_p[next_l]['#UNK#']
                    curr_backward[l] += trans_prob * emm_prob * prev_backward[next_l]
        
        backward.insert(0, curr_backward)
        prev_backward = copy.deepcopy(curr_backward)
    
    backward_prob = 0
    for l in labels:
        trans_prob = trans_p['START'][l] if l in trans_p['START'] else 0
        emm_prob = emission_p[l][observed_words[0]] if observed_words[0] in emission_p[l] else emission_p[l]['#UNK#']
        backward_prob += trans_prob * emm_prob * curr_backward[l]
        
    return forward, backward

forward_backward(
    'normal cold dizzy', 
    ['Healthy', 'Fever'],
    {
        'START': {'Healthy': 0.6, 'Fever': 0.4},
        'Healthy': {'Healthy': 0.69, 'Fever': 0.3, 'STOP': 0.01},
        'Fever': {'Healthy': 0.4, 'Fever': 0.59, 'STOP': 0.01}
    },
    {
        'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
        'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
    }
)

([{'Fever': 0.04000000000000001, 'Healthy': 0.3},
  {'Fever': 0.03408, 'Healthy': 0.0892},
  {'Fever': 0.028120319999999997, 'Healthy': 0.007518}],
 [{'Fever': 0.00109578, 'Healthy': 0.0010418399999999998},
  {'Fever': 0.00394, 'Healthy': 0.00249},
  {'Fever': 0.01, 'Healthy': 0.01}])

Max-Marginal Decoding with Forward-backward

In [98]:
def max_marginal(sentence: str, labels: list, trans_p: dict, emission_p: dict):
    forward_p, backward_p = forward_backward(sentence, labels, trans_p, emission_p)
    predictions = []
    for i in range(len(forward_p)):
        product_p = {l: forward_p[i][l] * backward_p[i][l] for l in labels}
        predictions.append(max(product_p, key=product_p.get))
    
    return predictions

max_marginal(
    'normal cold dizzy', 
    ['Healthy', 'Fever'],
    {
        'START': {'Healthy': 0.6, 'Fever': 0.4},
        'Healthy': {'Healthy': 0.69, 'Fever': 0.3, 'STOP': 0.01},
        'Fever': {'Healthy': 0.4, 'Fever': 0.59, 'STOP': 0.01}
    },
    {
        'Healthy' : {'normal': 0.5, 'cold': 0.4, 'dizzy': 0.1},
        'Fever' : {'normal': 0.1, 'cold': 0.3, 'dizzy': 0.6}
    }
)      

['Healthy', 'Healthy', 'Fever']

In [100]:
for l in languages:
    model = Emission()
    with open("./{}/train".format(l)) as train_file:
        read_data = train_file.read()
        read_data = os.linesep.join([s if s else 'STOP STOP\nSTART START' for s in read_data.splitlines()])
        data = list(map(lambda x: x.rsplit(' ',1),read_data.split('\n')))
        model.train(tokens_list=data, k=3)
        emission_p = model.emission_p
        ordered_labels_list = list(map(lambda x: x[1], data))
        transition_p = transition_params(ordered_labels_list)
        labels = list(filter(lambda a: a != 'START', transition_p.keys()))
    
    print("Finish training for {}".format(l))

    with open("./{}/dev.in".format(l)) as in_file, open("./{}/dev.p4.out".format(l), 'w+') as out_file:
        read_data = in_file.read()
        sentences = list(filter(lambda x: len(x) > 0, read_data.split('\n\n')))
        sentences = list(map(lambda x: ' '.join(x.split('\n')), sentences))
        for sentence in sentences:
            sentence_labels = max_marginal(sentence=sentence, labels=labels, trans_p=transition_p, emission_p=emission_p)
            for idx,word in enumerate(sentence.split()):
                out_file.write("{} {}\n".format(word, sentence_labels[idx]))
    print("Finished: {}".format(l))

Finish training for EN
Finished: EN
Finish training for SG
Finished: SG
Finish training for CN
Finished: CN
Finish training for FR
Finished: FR


In [101]:
for l in languages:
    output = os.popen("python3 EvalScript/evalResult.py {0}/dev.out {0}/dev.p4.out".format(l)).read()
    print("Language: {}".format(l))
    print(output)
    print("----------------------")

Language: EN

#Entity in gold data: 226
#Entity in prediction: 394

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000

----------------------
Language: SG

#Entity in gold data: 1382
#Entity in prediction: 3208

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000

----------------------
Language: CN

#Entity in gold data: 362
#Entity in prediction: 270

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000

----------------------
Language: FR

#Entity in gold data: 223
#Entity in prediction: 241

#Correct Entity : 0
Entity  precision: 0.0000
Entity  recall: 0.0000
Entity  F: 0.0000

#Corr