# Machine Learning Design Project

Entity labelling of Tweets

In [None]:
from collections import defaultdict
from utils import load_file, get_entities, compare_result

## Part 2

### Estimation of emission parameters

In [None]:
def emission_estimate(train):
    """Estimate emission probabilities e(x | y) by maximum likelihood.

    ``train`` is an iterable of sentences, each an iterable of ``(x, y)``
    pairs (observed word, label).

    Returns a dict mapping ``(x, y)`` to ``Count(y -> x) / Count(y)``.
    Every combination of a seen word and a seen label gets an entry, so
    pairs never observed together are stored explicitly as ``0.0``.
    """
    label_totals = defaultdict(int)   # occurrences of each label y
    pair_totals = defaultdict(int)    # occurrences of each emission y -> x
    vocabulary = set()

    for sentence in train:
        for word, label in sentence:
            vocabulary.add(word)
            label_totals[label] += 1
            pair_totals[(label, word)] += 1

    # .get() keeps unseen pairs at 0 without growing the counter.
    return {
        (word, label): pair_totals.get((label, word), 0) / total
        for label, total in label_totals.items()
        for word in vocabulary
    }

### Handling unknowns

In [None]:
def emission_estimate2(train, k=3):
    """Estimate emission probabilities with rare words folded into '#UNK#'.

    ``train`` is an iterable of sentences, each an iterable of ``(x, y)``
    pairs. Words are lower-cased before counting. Any word whose total
    count (over all labels) is below ``k`` is replaced by the special
    token ``'#UNK#'``.

    Returns a dict mapping ``(x, y)`` to ``Count(y -> x) / Count(y)``;
    only pairs with a non-zero count are present.
    """
    label_totals = defaultdict(int)   # occurrences of each label y
    pair_totals = defaultdict(int)    # occurrences of each emission y -> x
    word_totals = defaultdict(int)    # occurrences of each word over all labels

    for sentence in train:
        for word, label in sentence:
            word = word.lower()
            word_totals[word] += 1
            label_totals[label] += 1
            pair_totals[(label, word)] += 1

    # words seen fewer than k times are treated as unknown
    rare_words = {word for word, total in word_totals.items() if total < k}

    # move the counts of every rare word onto the shared '#UNK#' token
    merged = defaultdict(int)
    for (label, word), count in pair_totals.items():
        target = '#UNK#' if word in rare_words else word
        merged[(label, target)] += count

    return {
        (word, label): count / label_totals[label]
        for (label, word), count in merged.items()
    }

### Naive estimator

In [None]:
def simple_analyse(emissions, dev, output):
    """Label every token with its most likely emitting label and write the result.

    ``emissions`` maps ``(x, y)`` to e(x | y). ``dev`` is an iterable of
    sentences whose tokens are indexable with the word at position 0.
    Words are lower-cased for lookup and replaced by ``'#UNK#'`` when not
    present in ``emissions``; the original spelling is what gets written.

    Output format (file ``output``): one ``"word label"`` line per token and
    an empty line after each sentence. Ties between labels are broken in
    favour of the lexicographically greatest label.
    """
    # Split the (x, y) keys into the set of known words and the set of labels.
    known_words, labels = (set(column) for column in zip(*emissions))

    def best_label(word):
        return max(
            labels,
            key=lambda label: (emissions.get((word, label), 0), label),
        )

    with open(output, 'w') as out:
        for sentence in dev:
            for tok in sentence:
                lookup = tok[0].lower()
                if lookup not in known_words:
                    lookup = '#UNK#'
                print("%s %s" % (tok[0], best_label(lookup)), file=out)
            print(file=out)

In [None]:
# Part 2 driver: for each language directory, train the emission-only
# baseline, label the dev input, and score the output against the gold file.
for lang in ('EN', 'SG', 'CN', 'FR'):
    print('Language: %s' % lang)
    print('Training...   ', end='')
    # load_file comes from utils; presumably it returns a list of sentences
    # of (word, label) tokens -- TODO confirm against utils.
    train = load_file('%s/train' % lang)
    # Emission table with rare words folded into '#UNK#' (default k=3).
    emissions = emission_estimate2(train)
    print('OK')
    
    print('Predicting... ', end='')
    dev = load_file('%s/dev.in' % lang)
    # Writes one "word label" line per token to <lang>/dev.p2.out.
    simple_analyse(emissions, dev, '%s/dev.p2.out' % lang)
    print('OK')
    
    print('Analysing...  ', end='')
    # Read back the predictions and the gold annotations with the same
    # utils parser so both are in the form compare_result expects.
    with open('%s/dev.p2.out' % lang) as f:
        pred = get_entities(f)
    with open('%s/dev.out' % lang) as f:
        gold = get_entities(f)
    print('OK')
    
    # compare_result is from utils; presumably prints precision/recall/F
    # -- verify against utils.
    compare_result(gold, pred)
    print()

## Part 3

### Estimating transition parameters

In [None]:
def transition_estimate(train):
    """Estimate HMM transition probabilities q(y_i | y_j) by maximum likelihood.

    ``train`` is an iterable of sentences, each an iterable of tokens whose
    last element is the label. Every sentence is implicitly wrapped as
    ``START y_1 ... y_n STOP``.

    Returns a dict keyed ``(y_i, y_j)`` -- destination first, source second --
    holding ``Count(y_j -> y_i) / Count(y_j)``. Only transitions that were
    observed at least once are present, so callers should use
    ``q.get(key, 0)``.
    """
    # count transition j -> i
    counts_ji = defaultdict(int)

    # count occurrence of states
    counts = defaultdict(int)

    for line in train:
        y_j = 'START'
        counts['START'] += 1

        for tok in line:
            y_i = tok[-1]
            counts[y_i] += 1
            counts_ji[(y_j, y_i)] += 1
            y_j = y_i

        counts_ji[(y_j, 'STOP')] += 1
        counts['STOP'] += 1

    # Normalise by the count of the SOURCE state y_j so that the outgoing
    # probabilities of every state sum to 1. (Dividing by the destination
    # count would instead give P(previous | current).)
    return {
        (y_i, y_j): count_ji / counts[y_j]
        for (y_j, y_i), count_ji in counts_ji.items()
    }

### Viterbi

In [None]:
def viterbi(e, q, sentence):
    """Return the most probable label sequence for ``sentence`` under the HMM.

    ``e`` maps ``(x, y)`` to the emission probability e(x | y).
    ``q`` maps ``(y_i, y_j)`` to the transition probability q(y_i | y_j);
    missing entries count as 0. ``sentence`` is a sequence of words; each
    word is lower-cased and replaced by ``'#UNK#'`` if absent from ``e``.

    Returns a list with one label per word (``[]`` for an empty sentence).
    Only labels that occur in ``e`` are ever returned: 'START' and 'STOP'
    are boundary states and are not candidates at word positions. If every
    path has probability 0, ties are broken towards the lexicographically
    greatest label.

    NOTE: scores are plain probabilities, not log-probabilities, so very
    long sentences can underflow to 0.
    """
    xs, labels = tuple(set(i) for i in zip(*e.keys()))

    n = len(sentence)
    if n == 0:
        return []

    # pis[i][v] = (best score of a path ending in label v at word i,
    #              label at word i-1 on that path). Column 0 is the START state.
    pis = [{'START': (1, None)}]

    for word in sentence:
        word = word.lower()
        if word not in xs:
            word = '#UNK#'
        prev = pis[-1]
        p_i = {}
        for v in labels:
            emit = e.get((word, v), 0)
            p_i[v] = max(
                (score * q.get((v, u), 0) * emit, u)
                for u, (score, _) in prev.items()
            )
        pis.append(p_i)

    # best final label, taking the transition into STOP into account
    last_pi = max(
        (score * q.get(('STOP', u), 0), u)
        for u, (score, _) in pis[n].items()
    )[1]

    # Follow backpointers from word n down to word 2; the backpointer stored
    # at word 1 is always START and is not part of the output.
    tags = [last_pi]
    for i in range(n, 1, -1):
        last_pi = pis[i][last_pi][1]
        tags.append(last_pi)

    tags.reverse()
    return tags

In [None]:
# Part 3 driver: for each language directory, train emission and transition
# tables, decode the dev input with Viterbi, and score against the gold file.
for lang in ('EN', 'SG', 'CN', 'FR'):
    print('Language: %s' % lang)
    print('Training...   ', end='')
    # load_file comes from utils; presumably it returns a list of sentences
    # of (word, label) tokens -- TODO confirm against utils.
    train = load_file('%s/train' % lang)
    emissions = emission_estimate2(train)
    transitions = transition_estimate(train)
    print('OK')
    
    print('Predicting... ', end='')
    dev = load_file('%s/dev.in' % lang)
    with open('%s/dev.p3.out' % lang, 'w') as f:
        for line in dev:
            # the word is the first field of every dev token
            words = tuple(l[0] for l in line)
            tags = viterbi(emissions, transitions, words)
            # one "word label" line per token, blank line between sentences
            for a in zip(words, tags):
                print(' '.join(a), file=f)
            print(file=f)
    print('OK')
    
    print('Analysing...  ', end='')
    # Parse predictions and gold annotations with the same utils helper.
    with open('%s/dev.p3.out' % lang) as f:
        pred = get_entities(f)
    with open('%s/dev.out' % lang) as f:
        gold = get_entities(f)
    print('OK')
    print()
    # compare_result is from utils; presumably prints precision/recall/F
    # -- verify against utils.
    compare_result(gold, pred)
    print()

## Part 4

### Forward Probabilities

In [None]:
def forward(e, q, sentence):
    """Compute the forward (alpha) table of the HMM for ``sentence``.

    ``e`` maps ``(x, y)`` to e(x | y); ``q`` maps ``(y_i, y_j)`` to
    q(y_i | y_j). Words are lower-cased and replaced by ``'#UNK#'`` when
    they do not occur in ``e``.

    Returns a list of ``len(sentence) + 1`` dicts keyed by state (all labels
    in ``e`` plus 'START' and 'STOP'). Entry 0 holds q(u | START); entry
    j+1 holds ``sum_v alpha_j[v] * q(u | v) * e(x_j | v)``, i.e. the
    emission of word j is attributed to the state being left.
    """
    vocab, states = (set(column) for column in zip(*e.keys()))
    states.update({'START', 'STOP'})

    # base case: transition out of START, nothing emitted yet
    previous = {state: q.get((state, 'START'), 0) for state in states}
    alphas = [previous]

    for token in sentence:
        token = token.lower()
        if token not in vocab:
            token = '#UNK#'
        current = {
            target: sum(
                previous[source] * q.get((target, source), 0) * e.get((token, source), 0)
                for source in states
            )
            for target in states
        }
        alphas.append(current)
        previous = current

    return alphas

In [None]:
def backward(e, q, sentence):
    """Compute the backward (beta) table of the HMM for ``sentence``.

    ``e`` maps ``(x, y)`` to e(x | y); ``q`` maps ``(y_i, y_j)`` to
    q(y_i | y_j). Words are lower-cased and replaced by ``'#UNK#'`` when
    they do not occur in ``e``.

    Returns a list of ``len(sentence)`` dicts keyed by state (all labels in
    ``e`` plus 'START' and 'STOP'), in sentence order. For the last word
    the entry is ``q(STOP | u) * e(x_n | u)``; for earlier words it is
    ``sum_v beta_next[v] * q(v | u) * e(x_j | u)``.
    """
    vocab, states = (set(column) for column in zip(*e.keys()))
    states.update({'START', 'STOP'})

    betas = []
    following = None  # beta column of the word to the right, once known

    # walk the sentence right-to-left, then flip the result at the end
    for token in reversed(sentence):
        token = token.lower()
        if token not in vocab:
            token = '#UNK#'

        if following is None:
            # last word: only the transition into STOP remains
            current = {
                state: q.get(('STOP', state), 0) * e.get((token, state), 0)
                for state in states
            }
        else:
            current = {
                state: sum(
                    following[nxt] * q.get((nxt, state), 0) * e.get((token, state), 0)
                    for nxt in states
                )
                for state in states
            }

        betas.append(current)
        following = current

    betas.reverse()
    return betas

In [None]:
# Part 4 driver: max-marginal (forward-backward) decoding of the EN dev set,
# followed by scoring against the gold file.
print('Training...   ', end='')
train = load_file('EN/train')
emissions = emission_estimate2(train)
transitions = transition_estimate(train)
xs, T = tuple(set(i) for i in zip(*emissions.keys()))
# Labels that actually emit words. Only these may be written as predictions:
# 'START' and 'STOP' always have a zero marginal at word positions, but with
# the (score, label) tie-break they would win whenever every marginal is 0.
labels = sorted(T)
T.update({'START', 'STOP'})
print('OK')

print('Predicting... ', end='')
dev = load_file('EN/dev.in')
with open('EN/dev.p4.out', 'w') as f:
    for line in dev:
        # the word is the first field of every dev token
        words = tuple(l[0] for l in line)
        als = forward(emissions, transitions, words)
        bts = backward(emissions, transitions, words)

        tags = []
        for i in range(len(words)):
            # label with the highest alpha * beta product at position i;
            # ties go to the lexicographically greatest label
            y_m = max((als[i].get(u, 0)*bts[i].get(u, 0), u) for u in labels)[1]
            tags.append(y_m)

        # one "word label" line per token, blank line between sentences
        for a in zip(words, tags):
            print(' '.join(a), file=f)
        print(file=f)
print('OK')
    
print('Analysing...  ', end='')
# Parse predictions and gold annotations with the same utils helper.
with open('EN/dev.p4.out') as f:
    pred = get_entities(f)
with open('EN/dev.out') as f:
    gold = get_entities(f)
print('OK')
print()
compare_result(gold, pred)
print()