### Part 2

Estimate the emission parameters from the training set using MLE

In [2]:
def emission_params(tokens_list: list, y: str, x: str, special_token='#UNK#'):
    y_count = 0
    x_given_y_count = 0
    x_count = 0
    for token in tokens_list: 
        if token[1] == y:
            y_count += 1
            if token[0] == x:
                x_given_y_count += 1
        if token[0] == x:
            x_count += 1
    
    if x_count == 0:
        return emission_params(tokens_list, y, special_token)
    return float(x_given_y_count) / y_count

In [3]:
# Example ussage
import os

data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O']]
emission_params(tokens_list=data, y='O', x='a')

# with open('./EN/train') as train_file:
#     read_data = train_file.read()
#     read_data = os.linesep.join([s for s in read_data.splitlines() if s])
#     data = list(map(lambda x: x.split(' '),read_data.split('\n')))
# emission_params(tokens_list=data, y='O', x='a')

0.3333333333333333

Modify training set to replace words that appear less than k times with special token. Apply this to the emission parameters prediction function with k = 3

In [4]:
def clean_data(tokens_list, k = 1, special_token = '#UNK#'):
    token_freq = {}
    for token in tokens_list:
        if token[0] not in token_freq: 
            token_freq[token[0]] = 1
        else:
            token_freq[token[0]] += 1
    for i in range(len(tokens_list)):
        if token_freq[tokens_list[i][0]] < k:
            tokens_list[i][0] = special_token
    
    return tokens_list

def emission_params_clean_data(tokens_list: list, y: str, x: str):
    tokens_list = clean_data(tokens_list = tokens_list, k = 3)
    return emission_params(tokens_list, y, x)

In [5]:
# Example usage
data = [['a', 'O'], ['b', 'O'], ['a', 'I'], ['c', 'O'], ['a', 'O']]
emission_params_clean_data(tokens_list=data, y='O', x='a')

0.5

Sentiment analysis system that produces the tag for input

In [10]:
def predict_tag(tokens_list: list, x: str):
    y_tags = ['O', 'B-positive', 'I-positive', 'B-neutral', 'I-neutral', 'B-negative', 'I-negative']
    score = 0.0
    y_tag = None
    for y in y_tags:
        y_score = emission_params_clean_data(tokens_list, y, x)
        if (y_score > score):
            y_tag = y
            score = y_score 
    return y_tag

# clean data before predicting the tag
def predict_tag_optimized(tokens_list: list, x: str):
    y_tags = ['O', 'B-positive', 'I-positive', 'B-neutral', 'I-neutral', 'B-negative', 'I-negative']
    score = 0.0
    y_tag = None
    for y in y_tags:
        y_score = emission_params(tokens_list, y, x)
        if (y_score > score):
            y_tag = y
            score = y_score 
    return y_tag
languages = ['EN', 'SG', 'CN', 'FR']

for l in languages:
    with open("./{}/train".format(l)) as train_file:
        read_data = train_file.read()
        read_data = os.linesep.join([s for s in read_data.splitlines() if s])
        data = list(map(lambda x: x.split(' '),read_data.split('\n')))
        data_cleaned = clean_data(tokens_list = data, k = 3)

    with open("./{}/dev.in".format(l)) as in_file, open("./{}/dev.p2.out".format(l), 'w+') as out_file:
        for line in in_file:
            word = line.strip()
            if (word == ''):
                out_file.write("\n")
            else:
                out_file.write("{} {}\n".format(word, predict_tag_optimized(data_cleaned, word)))
    print("Finished: {}".format(l))

Finished: EN
Finished: SG
Finished: CN
Finished: FR


In [12]:
for l in languages:
    output = os.popen("python EvalScript/evalResult.py {0}/dev.out {0}/dev.p2.out".format(l)).read()
    print("Language: {}".format(l))
    print(output)
    print("----------------------")

Language: EN

#Entity in gold data: 226
#Entity in prediction: 1201

#Correct Entity : 165
Entity  precision: 0.1374
Entity  recall: 0.7301
Entity  F: 0.2313

#Correct Sentiment : 71
Sentiment  precision: 0.0591
Sentiment  recall: 0.3142
Sentiment  F: 0.0995

----------------------
Language: SG

#Entity in gold data: 1382
#Entity in prediction: 6599

#Correct Entity : 794
Entity  precision: 0.1203
Entity  recall: 0.5745
Entity  F: 0.1990

#Correct Sentiment : 315
Sentiment  precision: 0.0477
Sentiment  recall: 0.2279
Sentiment  F: 0.0789

----------------------
Language: CN

#Entity in gold data: 362
#Entity in prediction: 3318

#Correct Entity : 183
Entity  precision: 0.0552
Entity  recall: 0.5055
Entity  F: 0.0995

#Correct Sentiment : 57
Sentiment  precision: 0.0172
Sentiment  recall: 0.1575
Sentiment  F: 0.0310

----------------------
Language: FR

#Entity in gold data: 223
#Entity in prediction: 1149

#Correct Entity : 182
Entity  precision: 0.1584
Entity  recall: 0.8161
Entity  F

### Part 3

Estimates the transition parameters from the training set using MLE

In [2]:
import copy
def transition_params(ordered_words_list: list):
    count = {}
    count_given = {} # 2 layer dictionary depth-0 key is the (i-1)-word, depth-1 key is the i-word
    
    # count frequency of all word and combinations of 2 words in the dataset
    for idx, word in enumerate(ordered_words_list):
        if word not in count:
            count[word] = 1
            count_given[word] = {}
            if idx < len(ordered_words_list) - 1:
                next_word = ordered_words_list[idx + 1]
                count_given[word][next_word] = 1
        else:
            count[word] += 1
            if idx < len(ordered_words_list) - 1:
                next_word = ordered_words_list[idx + 1]
                if next_word not in count_given[word]:
                    count_given[word][next_word] = 1
                else:
                    count_given[word][next_word] += 1
    
    # calculate trans_params
    trans_params = copy.deepcopy(count_given)
    for given_word in trans_params:
        for word in trans_params[given_word]:
            trans_params[given_word][word] /= count[given_word]
            
    return trans_params

def specific_transition_params(ordered_words_list: list, y: str, y_given: str):
    trans_params = transition_params(ordered_words_list)
    if y not in trans_params:
        return 0;
    elif y_given not in trans_params[y]:
        return 0;
    else:
        return trans_params[y_given][y]
    
specific_transition_params(['a', 'b', 'b', 'c', 'b', 'a', 'd', 'h', 'b'], 'b', 'a')

0.5

Viterbi algo

In [3]:
def viterbi(sentence: str):
    words_list = ['START'] + sentence.split() + ['STOP']
    trans_params = transition_params(words_list)
    result = {} # key is given_word, value is the the maximum-likely next word
    
    for given_word in trans_params:
        max_arg = 0
        result[given_word] = ''
        for word in trans_params[given_word]:
            if trans_params[given_word][word] > max_arg:
                result[given_word] = word
        
    return result

viterbi("""
Hey I was doing just fine before I met you
Drink too much and that\'s an issue
But I\'m okay
Hey, you tell your friends it was nice to meet them
But I hope I never see them
Again
I know it breaks your heart
Moved to the city in a broke-down car
And four years, no calls
Now you're looking pretty in a hotel bar
And I, I, I, I, I can't stop
No, I, I, I, I, I can't stop
""")

{'Again': 'I',
 'And': 'I,',
 'But': 'I',
 'Drink': 'too',
 'Hey': 'I',
 'Hey,': 'you',
 'I': "can't",
 "I'm": 'okay',
 'I,': 'I',
 'Moved': 'to',
 'No,': 'I,',
 'Now': "you're",
 'START': 'Hey',
 'STOP': '',
 'a': 'hotel',
 'an': 'issue',
 'and': "that's",
 'bar': 'And',
 'before': 'I',
 'breaks': 'your',
 'broke-down': 'car',
 'calls': 'Now',
 "can't": 'stop',
 'car': 'And',
 'city': 'in',
 'doing': 'just',
 'fine': 'before',
 'four': 'years,',
 'friends': 'it',
 'heart': 'Moved',
 'hope': 'I',
 'hotel': 'bar',
 'in': 'a',
 'issue': 'But',
 'it': 'breaks',
 'just': 'fine',
 'know': 'it',
 'looking': 'pretty',
 'meet': 'them',
 'met': 'you',
 'much': 'and',
 'never': 'see',
 'nice': 'to',
 'no': 'calls',
 'okay': 'Hey,',
 'pretty': 'in',
 'see': 'them',
 'stop': 'STOP',
 'tell': 'your',
 "that's": 'an',
 'the': 'city',
 'them': 'Again',
 'to': 'the',
 'too': 'much',
 'was': 'nice',
 'years,': 'no',
 'you': 'tell',
 "you're": 'looking',
 'your': 'heart'}