# Part 1

### Estimate Emission Params

In [345]:
import pandas as pd

def build_emission_params(path):
    """Estimate unsmoothed emission probabilities from a tagged training file.

    Every non-empty line of the file is split at its last space into a word
    and a tag; empty lines are ignored.

    Args:
        path: Path of the training file.

    Returns:
        pandas.DataFrame indexed by word with one column per tag. Each column
        holds the word counts observed for that tag divided by the column
        total, so every column sums to 1. Word/tag pairs that never occur
        are 0.
    """
    # Decode explicitly instead of relying on the platform default encoding;
    # the data contains non-ASCII words. Assumes the files are UTF-8.
    with open(path, mode="r", encoding="utf-8") as fp:
        lines = fp.read().split("\n")

    counts = {}
    for line in lines:
        if line == "":
            continue
        # Split on the LAST space so that words containing spaces stay intact.
        word, tag = line.rsplit(" ", 1)
        per_tag = counts.setdefault(tag, {})
        per_tag[word] = per_tag.get(word, 0) + 1

    emission = pd.DataFrame(counts).fillna(0)
    # Column-wise normalisation: count(tag -> word) / count(tag).
    return emission / emission.sum()

In [346]:
# Estimate the emission table from the ES training file and display it.
build_emission_params("ES/train")

Unnamed: 0,O,B-positive,B-negative,B-neutral,I-neutral,I-positive,I-negative
Estuvimos,0.000207,0.0,0.0,0.0,0.0,0.0,0.000000
hace,0.000895,0.0,0.0,0.0,0.0,0.0,0.000000
poco,0.001894,0.0,0.0,0.0,0.0,0.0,0.000000
mi,0.002480,0.0,0.0,0.0,0.0,0.0,0.000000
pareja,0.000448,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...
500cc,0.000000,0.0,0.0,0.0,0.0,0.0,0.005848
Camarón,0.000000,0.0,0.0,0.0,0.0,0.0,0.005848
braseado,0.000000,0.0,0.0,0.0,0.0,0.0,0.005848
argentina,0.000000,0.0,0.0,0.0,0.0,0.0,0.005848


In [347]:
# Estimate the emission table from the RU training file and display it.
build_emission_params("RU/train")

Unnamed: 0,B-positive,O,I-positive,B-negative,I-negative,B-neutral,I-neutral
Еда,0.007539,0.000025,0.0,0.004505,0.0,0.009662,0.000000
сервировку,0.000539,0.000000,0.0,0.000000,0.0,0.000000,0.000000
Филадельфию,0.000539,0.000000,0.0,0.000000,0.0,0.000000,0.000000
лосося-филадельфии-чуток,0.000539,0.000000,0.0,0.000000,0.0,0.000000,0.000000
Десерт,0.001077,0.000025,0.0,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...
карбонара,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.014706
отдыху,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.014706
форелью,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.014706
Салмон,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.014706


### Modified Estimate Emission Params

In [348]:
def build_emission_params(path, k):
    """Estimate emission probabilities with an ``#UNK#`` row for unseen words.

    Every non-empty line of the file is split at its last space into a word
    and a tag; empty lines are ignored. After counting, a ``#UNK#`` row with
    pseudo-count ``k`` is added for every tag and each column is normalised.

    Args:
        path: Path of the training file.
        k: Pseudo-count given to ``#UNK#`` under every tag.

    Returns:
        pandas.DataFrame indexed by word (plus ``#UNK#``) with one column per
        tag. For a tag with total count ``c`` an observed word with count
        ``n`` gets ``n / (c + k)`` and ``#UNK#`` gets ``k / (c + k)``.
    """
    # Decode explicitly instead of relying on the platform default encoding;
    # the data contains non-ASCII words. Assumes the files are UTF-8.
    with open(path, mode="r", encoding="utf-8") as fp:
        lines = fp.read().split("\n")

    counts = {}
    for line in lines:
        if line == "":
            continue
        # Split on the LAST space so that words containing spaces stay intact.
        word, tag = line.rsplit(" ", 1)
        per_tag = counts.setdefault(tag, {})
        per_tag[word] = per_tag.get(word, 0) + 1

    emission = pd.DataFrame(counts).fillna(0)
    # Use the caller-supplied pseudo-count (it was previously ignored and the
    # row was always set to 1).
    emission.loc["#UNK#", :] = k
    # Column-wise normalisation; the #UNK# row is part of each column total.
    return emission / emission.sum()

In [349]:
# Emission table for the ES training file including the #UNK# row (k passed as 1).
build_emission_params("ES/train", 1)

Unnamed: 0,O,B-positive,B-negative,B-neutral,I-neutral,I-positive,I-negative
Estuvimos,0.000207,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
hace,0.000895,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
poco,0.001894,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
mi,0.002480,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
pareja,0.000448,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
Camarón,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005814
braseado,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005814
argentina,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005814
pizz,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.005814


In [350]:
# Emission table for the RU training file including the #UNK# row (k passed as 1).
build_emission_params("RU/train", 1)

Unnamed: 0,B-positive,O,I-positive,B-negative,I-negative,B-neutral,I-neutral
Еда,0.007535,0.000025,0.000000,0.004494,0.000000,0.009615,0.000000
сервировку,0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Филадельфию,0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
лосося-филадельфии-чуток,0.000538,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Десерт,0.001076,0.000025,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...
отдыху,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.014493
форелью,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.014493
Салмон,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.014493
кальмара,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.014493


### Simple Sentiment Analysis

In [351]:
def simple_sentiment_analysis(emission, path):
    """Tag every token with the tag that has the highest emission value for it.

    Each non-empty line of the input file is treated as one token. Tokens
    that are not in ``emission.index`` are looked up as ``#UNK#``. Empty
    lines are copied to the output as empty lines.

    Args:
        emission: DataFrame indexed by word with one column per tag; it must
            contain a ``#UNK#`` row if the input has unseen tokens.
        path: Path of the untagged input file.

    Returns:
        None. The result is written next to the input as ``<name>.p1.out``,
        where ``<name>`` is the input file name up to its last ``.``
        (``ES/dev.in`` -> ``ES/dev.p1.out``).
    """
    # Best tag per word: column label of the row-wise maximum.
    best_tag = emission.idxmax(axis=1).to_dict()

    # Assumes the data files are UTF-8; decode explicitly rather than using
    # the platform default.
    with open(path, mode="r", encoding="utf-8") as fp:
        lines = fp.read().split("\n")

    chunks = []
    for line in lines:
        if line == "":
            chunks.append("\n")
            continue
        word = line if line in best_tag else "#UNK#"
        chunks.append(f"{line} {best_tag[word]}\n")

    # rpartition copes with paths that have no directory part, and
    # rsplit(...)[0] with file names that have no extension.
    head, sep, tail = path.rpartition("/")
    name = tail.rsplit(".", 1)[0]
    path_out = f"{head}{sep}{name}.p1.out"
    with open(path_out, mode="w", encoding="utf-8") as fp_out:
        fp_out.write("".join(chunks))

In [352]:
# Fit the emission table on ES/train, then tag ES/dev.in
# (the tagger writes its result to ES/dev.p1.out).
emission = build_emission_params("ES/train", 1)
simple_sentiment_analysis(emission, "ES/dev.in")

In [353]:
# Fit the emission table on RU/train, then tag RU/dev.in
# (the tagger writes its result to RU/dev.p1.out).
emission = build_emission_params("RU/train", 1)
simple_sentiment_analysis(emission, "RU/dev.in")

Scores:

<img src="img/1_Score.jpg"/>

# Part 2

### Estimate Transition Params

In [354]:
def build_transition_params(path):
    """Estimate tag-to-tag transition probabilities from a tagged training file.

    Non-empty lines are split at their last space into word and tag. An empty
    line ends the current sentence: a transition to ``STOP`` is counted and
    the previous tag is reset to ``START``. Consecutive empty lines are
    ignored, and a sentence still open at end of file is closed with ``STOP``.

    Args:
        path: Path of the training file.

    Returns:
        pandas.DataFrame whose columns are the previous tag (including
        ``START``) and whose rows are the next tag (including ``STOP``), i.e.
        ``transition.loc[next_tag, prev_tag]``. Each column is normalised to
        sum to 1; unseen transitions are 0.
    """
    # Assumes the data files are UTF-8; decode explicitly rather than using
    # the platform default.
    with open(path, mode="r", encoding="utf-8") as fp:
        lines = fp.read().split("\n")

    counts = {}

    def _count(prev, nxt):
        # Increment the count of the transition prev -> nxt.
        row = counts.setdefault(prev, {})
        row[nxt] = row.get(nxt, 0) + 1

    prev_tag = "START"
    for line in lines:
        if line == "":
            if prev_tag == "START":
                # No sentence is open (leading or repeated blank line).
                continue
            tag = "STOP"
        else:
            _, tag = line.rsplit(" ", 1)
        _count(prev_tag, tag)
        prev_tag = "START" if tag == "STOP" else tag

    # File ended without a trailing blank line: close the last sentence.
    if prev_tag != "START":
        _count(prev_tag, "STOP")

    transition = pd.DataFrame(counts).fillna(0)
    # Column-wise normalisation: count(prev -> next) / count(prev).
    return transition / transition.sum()

In [355]:
# Estimate and display the transition table for the ES training file.
build_transition_params("ES/train")

Unnamed: 0,START,O,B-positive,B-negative,B-neutral,I-neutral,I-positive,I-negative
O,0.928918,0.88569,0.871552,0.811024,0.791667,0.348837,0.426752,0.397661
B-positive,0.052235,0.036508,0.002586,0.0,0.0,0.0,0.0,0.0
B-negative,0.014001,0.012227,0.0,0.0,0.0,0.0,0.0,0.0
B-neutral,0.004847,0.002135,0.000862,0.0,0.0,0.0,0.0,0.0
STOP,0.0,0.063441,0.008621,0.010499,0.0,0.0,0.003185,0.0
I-positive,0.0,0.0,0.116379,0.0,0.0,0.0,0.570064,0.0
I-negative,0.0,0.0,0.0,0.178478,0.0,0.0,0.0,0.602339
I-neutral,0.0,0.0,0.0,0.0,0.208333,0.651163,0.0,0.0


In [356]:
# Estimate and display the transition table for the RU training file.
build_transition_params("RU/train")

Unnamed: 0,START,B-positive,O,I-positive,B-negative,I-negative,B-neutral,I-neutral
B-positive,0.112345,0.001616,0.037235,0.001675,0.0,0.0,0.0,0.0
O,0.844873,0.80937,0.874451,0.582915,0.815315,0.574468,0.864734,0.411765
B-neutral,0.021228,0.0,0.003504,0.0,0.0,0.0,0.0,0.0
B-negative,0.021555,0.0,0.009327,0.0,0.0,0.0,0.0,0.0
I-positive,0.0,0.188476,0.0,0.413735,0.0,0.0,0.0,0.0
STOP,0.0,0.000539,0.075482,0.001675,0.0,0.007092,0.0,0.0
I-negative,0.0,0.0,0.0,0.0,0.184685,0.41844,0.0,0.0
I-neutral,0.0,0.0,0.0,0.0,0.0,0.0,0.135266,0.588235


### Viterbi Algorithm Implementation

In [357]:
def viterbi_alg(transition, emission, sequence):
    """Find the most probable tag path for ``sequence`` with the Viterbi algorithm.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long sequences do not underflow to 0.0 (which would
    make every candidate tie and the back-pointers arbitrary).

    Args:
        transition: DataFrame read as ``transition.loc[next_tag, prev_tag]``;
            its index must contain ``STOP`` and its columns ``START``.
        emission: DataFrame read as ``emission.loc[word, tag]``; words not in
            its index are looked up as ``#UNK#``.
        sequence: List of tokens of one sentence.

    Returns:
        List of ``len(sequence) + 2`` labels: ``"START"``, one tag per token,
        then ``"STOP"``.
    """
    import math  # local import: this cell does not otherwise need math

    neg_inf = float("-inf")

    def _log(prob):
        # log(0) is undefined; impossible events are scored as -inf.
        return math.log(prob) if prob > 0 else neg_inf

    n = len(sequence)
    # tree[i][tag] = [best previous tag, log-score of the best path that
    # ends in `tag` at position i]; position 0 is START, n + 1 is STOP.
    tree = [{"START": [None, 0.0]}]
    for i in range(1, n + 2):
        is_final = i == n + 1
        if not is_final:
            token = sequence[i - 1]
            emit = token if token in emission.index else "#UNK#"
        layer = {}
        for idx in transition.index:
            # STOP is only reachable at the final position and nothing else is.
            if (idx == "STOP") != is_final:
                continue
            log_emit = 0.0 if is_final else _log(emission.loc[emit, idx])
            scores = {
                state: entry[1] + log_emit + _log(transition.loc[idx, state])
                for state, entry in tree[i - 1].items()
            }
            # Ties resolve to the first state in insertion order.
            best_prev = max(scores, key=scores.get)
            layer[idx] = [best_prev, scores[best_prev]]
        tree.append(layer)

    # Follow the back-pointers from STOP down to START.
    pred_path = ["STOP"]
    for level in range(n + 1, 0, -1):
        pred_path.append(tree[level][pred_path[-1]][0])
    pred_path.reverse()
    return pred_path

In [358]:
def viterbi(train_path, test_path):
    """Train on ``train_path``, Viterbi-decode ``test_path`` and write the result.

    Sentences in the test file are separated by blank lines, one token per
    line. Each token is written back followed by a space and its predicted
    tag, and every sentence is followed by an empty line.

    Args:
        train_path: Path of the tagged training file.
        test_path: Path of the untagged input file.

    Returns:
        None. Output is written next to the input as ``<name>.p2.out``, where
        ``<name>`` is the input file name up to its last ``.``
        (``ES/dev.in`` -> ``ES/dev.p2.out``).
    """
    transition = build_transition_params(train_path)
    emission = build_emission_params(train_path, 1)

    # Assumes the data files are UTF-8; decode explicitly rather than using
    # the platform default.
    with open(test_path, mode="r", encoding="utf-8") as fp:
        data = fp.read()

    chunks = []
    for block in data.split("\n\n"):
        # Drop stray empty strings (e.g. from a single trailing newline)
        # instead of discarding the whole sentence that contains them.
        sequence = [token for token in block.split("\n") if token != ""]
        if not sequence:
            continue
        pred_path = viterbi_alg(transition, emission, sequence)
        # pred_path[0] is START, so the tag of token i is pred_path[i + 1].
        chunks.extend(
            f"{token} {tag}\n" for token, tag in zip(sequence, pred_path[1:])
        )
        chunks.append("\n")

    # rpartition copes with paths that have no directory part, and
    # rsplit(...)[0] with file names that have no extension.
    head, sep, tail = test_path.rpartition("/")
    name = tail.rsplit(".", 1)[0]
    path_out = f"{head}{sep}{name}.p2.out"
    with open(path_out, mode="w", encoding="utf-8") as fp_out:
        fp_out.write("".join(chunks))
        

In [359]:
# Viterbi-decode ES/dev.in with parameters estimated from ES/train (output: ES/dev.p2.out).
viterbi("ES/train", "ES/dev.in")

In [360]:
# Viterbi-decode RU/dev.in with parameters estimated from RU/train (output: RU/dev.p2.out).
viterbi("RU/train", "RU/dev.in")

<img src="img/2_Score.jpg"/>

# Part 4

### Estimate Relative Transition Position Params

In [361]:
import numpy as np
import copy

def build_position_params(path):
    """Estimate where in a sentence each tag transition tends to occur.

    Sentences are separated by blank lines. For a sentence of length ``n``
    the transition into the token at index ``i`` is recorded at relative
    position ``i / n``; the final transition into ``STOP`` is recorded at
    ``1.0`` and the first transition (out of ``START``) at ``0.0``.

    Args:
        path: Path of the tagged training file.

    Returns:
        Tuple ``(mean, spread)`` of DataFrames read as
        ``frame.loc[next_tag, prev_tag]``: the mean and the standard
        deviation (``np.std``) of the recorded positions per transition.
        Transitions that were never observed are NaN.
    """
    # Assumes the data files are UTF-8; decode explicitly rather than using
    # the platform default.
    with open(path, mode="r", encoding="utf-8") as fp:
        data = fp.read()

    positions = {}
    for block in data.split("\n\n"):
        # Drop stray empty strings (e.g. from a single trailing newline)
        # instead of discarding the whole sentence that contains them.
        sequence = [line for line in block.split("\n") if line != ""]
        if not sequence:
            continue
        length = len(sequence)
        prev_tag = "START"
        for i in range(length + 1):
            if i == length:
                tag = "STOP"
            else:
                _, tag = sequence[i].rsplit(" ", 1)
            positions.setdefault(prev_tag, {}).setdefault(tag, []).append(i / length)
            prev_tag = tag

    mean = pd.DataFrame({
        prev: {nxt: np.mean(values) for nxt, values in row.items()}
        for prev, row in positions.items()
    })
    spread = pd.DataFrame({
        prev: {nxt: np.std(values) for nxt, values in row.items()}
        for prev, row in positions.items()
    })
    return mean, spread

In [362]:
# Mean and standard deviation of the relative position of each transition in ES/train.
mean, spread = build_position_params("ES/train")

In [363]:
# Display the mean relative position per transition (NaN = transition never observed).
mean

Unnamed: 0,START,O,B-positive,B-negative,B-neutral,I-neutral,I-positive,I-negative
O,0.0,0.510247,0.431648,0.467382,0.363755,0.40171,0.567872,0.45313
B-positive,0.0,0.388977,0.258974,,,,,
B-negative,0.0,0.414229,,,,,,
B-neutral,0.0,0.327314,0.746988,,,,,
STOP,,1.0,1.0,1.0,,,1.0,
I-positive,,,0.449981,,,,0.540129,
I-negative,,,,0.33165,,,,0.431599
I-neutral,,,,,0.305162,0.474898,,


In [364]:
# Display the standard deviation of the relative position per transition.
spread

Unnamed: 0,START,O,B-positive,B-negative,B-neutral,I-neutral,I-positive,I-negative
O,0.0,0.2719,0.267615,0.261319,0.25291,0.20147,0.25435,0.255531
B-positive,0.0,0.263094,0.177683,,,,,
B-negative,0.0,0.257161,,,,,,
B-neutral,0.0,0.239042,0.0,,,,,
STOP,,0.0,0.0,0.0,,,0.0,
I-positive,,,0.252352,,,,0.256614,
I-negative,,,,0.230999,,,,0.21413
I-neutral,,,,,0.182225,0.156887,,


In [365]:
# Mean and standard deviation of the relative position of each transition in RU/train.
mean, spread = build_position_params("RU/train")

In [366]:
# Display the mean relative position per transition (NaN = transition never observed).
mean

Unnamed: 0,START,B-positive,O,I-positive,B-negative,I-negative,B-neutral,I-neutral
B-positive,0.0,0.519798,0.452112,0.863636,,,,
O,0.0,0.444942,0.507386,0.566843,0.434143,0.486562,0.346702,0.412621
B-neutral,0.0,,0.389362,,,,,
B-negative,0.0,,0.429681,,,,,
I-positive,,0.46055,,0.49076,,,,
STOP,,1.0,1.0,1.0,,1.0,,
I-negative,,,,,0.405254,0.529871,,
I-neutral,,,,,,,0.276246,0.376621


In [367]:
# Display the standard deviation of the relative position per transition.
spread

Unnamed: 0,START,B-positive,O,I-positive,B-negative,I-negative,B-neutral,I-neutral
B-positive,0.0,0.285572,0.251518,0.0,,,,
O,0.0,0.285775,0.268128,0.251915,0.279213,0.273896,0.277377,0.232913
B-neutral,0.0,,0.248512,,,,,
B-negative,0.0,,0.261276,,,,,
I-positive,,0.25763,,0.229569,,,,
STOP,,0.0,0.0,0.0,,0.0,,
I-negative,,,,,0.274989,0.237946,,
I-neutral,,,,,,,0.206003,0.184529


### Modified Viterbi Algorithm Implementation

In [368]:
def pos_viterbi_alg(transition, emission, sequence, mean, mean_bias, spread, spread_bias):
    """Viterbi decoding with a position-dependent weight on every transition.

    Each transition ``state -> idx`` taken at relative position ``pos`` is
    additionally multiplied by

        1 - |pos - mean[idx, state]| * mean_bias * (1 - spread[idx, state] * spread_bias)

    with negative or NaN weights replaced by 0. Scores are accumulated as
    sums of log-values rather than products, so long sequences do not
    underflow to 0.0 (which would make every candidate tie).

    Args:
        transition: DataFrame read as ``transition.loc[next_tag, prev_tag]``;
            its index must contain ``STOP`` and its columns ``START``.
        emission: DataFrame read as ``emission.loc[word, tag]``; words not in
            its index are looked up as ``#UNK#``.
        sequence: List of tokens of one sentence.
        mean: DataFrame of mean relative positions, indexed like ``transition``.
        mean_bias: Multiplier on the distance from the mean position.
        spread: DataFrame of position spreads, indexed like ``transition``.
        spread_bias: Multiplier on the spread term.

    Returns:
        List of ``len(sequence) + 2`` labels: ``"START"``, one tag per token,
        then ``"STOP"``.
    """
    import math  # local import: this cell does not otherwise need math

    neg_inf = float("-inf")

    def _log(value):
        # log(0) is undefined; impossible events are scored as -inf.
        return math.log(value) if value > 0 else neg_inf

    n = len(sequence)
    # tree[i][tag] = [best previous tag, log-score of the best path that
    # ends in `tag` at position i]; position 0 is START, n + 1 is STOP.
    tree = [{"START": [None, 0.0]}]
    for i in range(1, n + 2):
        is_final = i == n + 1
        # Relative position of this step; an empty sequence only has the
        # STOP step, which is placed at 1.0 to avoid dividing by zero.
        pos = (i - 1) / n if n else 1.0
        if not is_final:
            token = sequence[i - 1]
            emit = token if token in emission.index else "#UNK#"
        layer = {}
        for idx in transition.index:
            # STOP is only reachable at the final position and nothing else is.
            if (idx == "STOP") != is_final:
                continue
            log_emit = 0.0 if is_final else _log(emission.loc[emit, idx])
            scores = {}
            for state, entry in tree[i - 1].items():
                pos_bias = 1 - (
                    np.abs(pos - mean.loc[idx, state])
                    * mean_bias
                    * (1 - spread.loc[idx, state] * spread_bias)
                )
                # NaN means the transition was never observed in training.
                if pos_bias < 0 or np.isnan(pos_bias):
                    pos_bias = 0
                scores[state] = (
                    entry[1]
                    + log_emit
                    + _log(transition.loc[idx, state])
                    + _log(pos_bias)
                )
            # Ties resolve to the first state in insertion order.
            best_prev = max(scores, key=scores.get)
            layer[idx] = [best_prev, scores[best_prev]]
        tree.append(layer)

    # Follow the back-pointers from STOP down to START.
    pred_path = ["STOP"]
    for level in range(n + 1, 0, -1):
        pred_path.append(tree[level][pred_path[-1]][0])
    pred_path.reverse()
    return pred_path

In [369]:
def pos_viterbi(train_path, test_path, mean_bias, spread_bias):
    """Train on ``train_path``, decode ``test_path`` with the position-biased
    Viterbi variant and write the result.

    Sentences in the test file are separated by blank lines, one token per
    line. Each token is written back followed by a space and its predicted
    tag, and every sentence is followed by an empty line.

    Args:
        train_path: Path of the tagged training file.
        test_path: Path of the untagged input file.
        mean_bias: Passed through to ``pos_viterbi_alg``.
        spread_bias: Passed through to ``pos_viterbi_alg``.

    Returns:
        None. Output is written next to the input as ``<name>.p4.out``, where
        ``<name>`` is the input file name up to its last ``.``
        (``ES/dev.in`` -> ``ES/dev.p4.out``).
    """
    transition = build_transition_params(train_path)
    emission = build_emission_params(train_path, 1)
    mean, spread = build_position_params(train_path)

    # Assumes the data files are UTF-8; decode explicitly rather than using
    # the platform default.
    with open(test_path, mode="r", encoding="utf-8") as fp:
        data = fp.read()

    chunks = []
    for block in data.split("\n\n"):
        # Drop stray empty strings (e.g. from a single trailing newline)
        # instead of discarding the whole sentence that contains them.
        sequence = [token for token in block.split("\n") if token != ""]
        if not sequence:
            continue
        pred_path = pos_viterbi_alg(
            transition, emission, sequence, mean, mean_bias, spread, spread_bias
        )
        # pred_path[0] is START, so the tag of token i is pred_path[i + 1].
        chunks.extend(
            f"{token} {tag}\n" for token, tag in zip(sequence, pred_path[1:])
        )
        chunks.append("\n")

    # rpartition copes with paths that have no directory part, and
    # rsplit(...)[0] with file names that have no extension.
    head, sep, tail = test_path.rpartition("/")
    name = tail.rsplit(".", 1)[0]
    path_out = f"{head}{sep}{name}.p4.out"
    with open(path_out, mode="w", encoding="utf-8") as fp_out:
        fp_out.write("".join(chunks))

In [370]:
# Position-biased decoding of ES/dev.in with mean_bias=1 and spread_bias=1 (output: ES/dev.p4.out).
pos_viterbi("ES/train", "ES/dev.in", 1, 1)

In [371]:
# Position-biased decoding of RU/dev.in with mean_bias=0.6 and spread_bias=1 (output: RU/dev.p4.out).
pos_viterbi("RU/train", "RU/dev.in", 0.6, 1)