In [1]:
import numpy as np
import nltk
nltk.download("averaged_perceptron_tagger")
from scipy.spatial import distance

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/Pura/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
from nltk.tag import StanfordPOSTagger
from nltk import word_tokenize

jar = "stanford-postagger-full-2020-11-17/stanford-postagger.jar"
model = "stanford-postagger-full-2020-11-17/models/chinese-distsim.tagger"

st = StanfordPOSTagger(model, jar, encoding='utf8')

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import f1_score

In [4]:
def to_arr(f):
    """
    Returns the array representation of the data in the file ``f``. This is the representation that we mainly work with
    in this file. See example below to understand this intermediate representation.
    
    Parameters: ``f`` is an opened text file. 
    """
    i, arr, d= 0, [], {}
    for line in f:
        s = line.replace("\n", "")
        if i % 6 == 0:
            d["chinese"] = s
        elif i % 6 == 1:
            d["reference"] = s
        elif i % 6 == 2:
            d["candidate"] = s
        elif i % 6 == 3: 
            d["score"] = float(s)
        elif i % 6 == 4:
            d["label"] = 1 if s == "H" else -1
        else:
            arr.append(d)
            d= {}
        i= i + 1
    return arr

fTr, fTe = open("train.txt"), open("test.txt")
arrTr, arrTe= to_arr(fTr), to_arr(fTe)

In [5]:
# Example to show format of the array.
arrTr

[{'chinese': '巴林 公主 下 嫁 美 大兵 惊 世 婚姻 五 年 宣告 破裂',
  'reference': 'bahraini princess marries us soldier , astonishing 5 year bond comes to end',
  'candidate': 'bahraini princess marries a u.s. soldier ; astounding marriage dissolves in 5 years',
  'score': 0.3125,
  'label': 1},
 {'chinese': '巴林 公主 梅 丽 安 ・ 哈 里 发 下 嫁 美国 陆 战 队 大兵 强 生 , 曾 获 美国 电视台 广 赞 为 美 事 , 但是 这 桩 波折 重重 的 婚姻 才 持续 五 年 , 如今 却 已 在 沙漠 赌 城 拉 斯 维 加 斯 画 下 句 点 .',
  'reference': 'the star-crossed marriage between bahraini princess meriam al-khalifa and us marine johnson was once extensively feted on us television . however , it has come to an end in the desert gambling town of las vegas after only five years .',
  'candidate': 'u.s. television stations had once feted the marriage between bahraini princess meriam al-khalifa and u.s. marine johnson . however , after just five years , this star-crossed marriage has already come to an end in the desert casino city of las vegas .',
  'score': 0.6531,
  'label': 1},
 {'chinese': '梅 丽 安

In [6]:
# Dictionary that maps select symbols from the LDC Chinese Treebank tagset to the Universal Dependencies tagset.
cpt = {
    "AD":"ADV",
    "CC":"CCONJ",
    "CD":"NUM",
    "CS":"SCONJ",
    "DT":"DET",
    "IJ":"INTJ",
    "MSP":"PART",
    "NN":"NOUN",
    "NR":"PROPN",
    "NT":"NOUN",
    "OD":"NUM",
    "PN":"PRON",
    "PU":"PUNCT",
    "SP":"PART",
    "VA":"ADJ",
    "VE":"VERB",
    "VV":"VERB",
    "X":"SYM"
}
# Dictionary that maps select symbols from the Treebank English POS tag set tagset to the Universal Dependencies 
# tagset.
penn = {
    "CC":"CCONJ",
    "CD":"NUM",
    "DT":"DET",
    "IN":"SCONJ",
    "JJ":"ADJ",
    "JJR":"ADJ",
    "JJS":"ADJ",
    "NN":"NOUN",
    "NNS":"NOUN",
    "NNP":"PROPN",
    "NNPS":"PROPN",
    "PDT":"DET",
    "PRP":"PRON",
    "PRP$":"PRON",
    "RB":"ADV",
    "RBR":"ADV",
    "RBS":"ADV",
    "RP":"PART",
    "SYM":"SYM",
    "UH":"INTJ",
    "VB":"VERB",
    "VBD":"VERB",
    "VBG":"VERB",
    "VBN":"VERB",
    "VBP":"VERB",
    "VBZ":"VERB",
    "WDT":"DET",
    "WP":"PRON",
    "WP$":"PRON",
    "WRB":"ADV"
}

In [7]:
def rel_edit_distance(u, v):
    """
    Returns the edit distance between the two sequences ``u`` and ``v`` divided by their mean length. This serves 
    as a measure of sequence dissimilarity which is independent of the length of the sequences. 
    
    Parameters: ``u`` and ``v`` are lists.
    """
    len1, len2 = len(u), len(v)
    arr= [[0 if j != 0 else i for j in range(len2 + 1)] if i != 0 else list(range(len2 + 1)) for i in range(len1 + 1)] 
    for i in range(len1): 
        for j in range(len2):
            dA, dB, dC = arr[i][j], arr[i + 1][j], arr[i][j + 1] 
            d = min((0 if u[i] == v[j] else 1) + dA, 1 + dB, 1 + dC) 
            arr[i + 1][j + 1] = d
    return 2 * d / (len1 + len2)

def pos_translate(lst, dct):
    """
    Translates the part of speech tags from one set to another using a given dictionary.
    
    Parameters:
        ``lst`` is a list (sequence) of part of speech tags in either the Penn or the LDC Chinese Treebank tagset.
        ``dct`` is a dictionary containing the conversions of the source tagset to the target tagset. 
    """
    def f(x):
        try:
            return dct[x]
        except KeyError:
            return "O"
    return [f(x) for x in lst]

def vocab_diff(x):
    """
    Returns the cosine difference of the bag-of-words vectors of the reference and candidate texts, which serves
    as a measure of the difference in vocabulary used in the translations.
    
    Parameters: ``x`` is a dictionary element of the array representation of the data (see above).
    """
    u = [z for z in x["reference"].split(" ")]
    v = [z for z in x["candidate"].split(" ")]
    words = set(u + v)
    v1 = np.array([(1 if w in u else 0) for w in words])
    v2 = np.array([(1 if w in v else 0) for w in words])
    return distance.cosine(v1, v2)

def rc_pos_dist(x):
    """
    Returns the relative edit distance (see ``rel_edit_distance``) between the parts of speech tags of the reference
    and candidate translations.
    
    Parameters: ``x`` is a dictionary element of the array representation of the data (see above).
    """
    u = pos_translate([z[1] for z in nltk.pos_tag([z for z in x["reference"].split(" ") if z])], penn)
    v = pos_translate([z[1] for z in nltk.pos_tag([z for z in x["candidate"].split(" ") if z])], penn)
    return rel_edit_distance(u, v)

def cc_pos_dist(x):
    """
    Returns the relative edit distance (see ``rel_edit_distance``) between the parts of speech tags of the Chinese
    text and the candidate translation.
    
    Parameters: ``x`` is a dictionary element of the array representation of the data (see above).
    """
    u = pos_translate([z[1][z[1].index("#") + 1:] for z in st.tag([z for z in x["chinese"].split(" ") if z])], cpt)
    v = pos_translate([z[1] for z in nltk.pos_tag([z for z in x["candidate"].split(" ") if z])], penn)
    return rel_edit_distance(u, v)

def to_np(arr):
    """
    Returns the np.array form of ``arr``, where each row is a vector representing a dictionary in ``arr``. 
    In other words, this function converts ``arr`` to matrix form.
    
    Parameters: ``arr`` is the array represention of the data (see above).
    """
    return np.array([[x["score"], rc_pos_dist(x), vocab_diff(x), cc_pos_dist(x)] for x in arr])

def normalize(X):
    """
    Returns a copy of ``X`` with the columns normalized via standardization. 
    
    Parameters: ``X`` is a matrix of row vectors. 
    """
    dims = X.shape
    X_= np.zeros(dims)
    for j in range(dims[1]):
        X_[:,j] = (X[:,j] - X[:,j].mean()) / X[:,j].std()
    return X_

In [8]:
XTr, XTe = normalize(to_np(arrTr)), normalize(to_np(arrTe))
yTr, yTe = [x["label"] for x in arrTr], [x["label"] for x in arrTe]

In [9]:
# Training-validation split
n, _ = XTr.shape
XTr_, XVal_ = XTr[:int(0.7 * n)], XTr[int(0.7 * n):]
yTr_, yVal_ = yTr[:int(0.7 * n)], yTr[int(0.7 * n):]

In [10]:
def score(clf, X, y):
    """
    Calculates the mean of the F1 score of the classifier ``clf`` on the test set ``X`` and true labels ``y``.
    
    Parameters:
        ``clf`` is a SciKit-Learn classifier.
        ``X`` is an n x d numpy matrix.
        ``y`` is an n x 1 numpy array.
    """
    preds = clf.predict(X)
    return (f1_score(y, preds, pos_label=1) + f1_score(y, preds, pos_label=-1)) / 2

In [11]:
# Hyperparameter search for Random Forest.
for d in range(1, 8):
    clf = RandomForestClassifier(max_depth=d, criterion="entropy", n_estimators=10)
    clf.fit(XTr_, yTr_)
    print("d = " + str(d))
    print("    Training score: " + str(score(clf, XTr_, yTr_)))
    print("    Validation score: " + str(score(clf, XVal_, yVal_)))

d = 1
    Training score: 0.7374134790528234
    Validation score: 0.7026143790849673
d = 2
    Training score: 0.7617710636817651
    Validation score: 0.7312881832140872
d = 3
    Training score: 0.7808550702268868
    Validation score: 0.7427227286092325
d = 4
    Training score: 0.8307921251006318
    Validation score: 0.7367216117216118
d = 5
    Training score: 0.8578961840457059
    Validation score: 0.7369281045751634
d = 6
    Training score: 0.8757309941520468
    Validation score: 0.7249869041382924
d = 7
    Training score: 0.9008071574443255
    Validation score: 0.7590479937057435


In [12]:
# Training and applying the Random Forest model
clf1 = RandomForestClassifier(max_depth=2, criterion="entropy", n_estimators=10, random_state=55)
clf1.fit(XTr, yTr)
score(clf1, XTe, yTe)

0.783786522546867

In [13]:
# Hyperparameter search for SVM.
for C in [0.01, 0.05, 0.1, 0.5, 1, 5, 10]:
    clf = svm.SVC(kernel="linear", C=C)
    clf.fit(XTr_, yTr_)
    print("C = " + str(C))
    print("    Training score: " + str(score(clf, XTr_, yTr_)))
    print("    Validation score: " + str(score(clf, XVal_, yVal_)))

C = 0.01
    Training score: 0.7663742690058479
    Validation score: 0.7710622710622711
C = 0.05
    Training score: 0.7666415167814637
    Validation score: 0.776090023293199
C = 0.1
    Training score: 0.7687229905585049
    Validation score: 0.7764127764127764
C = 0.5
    Training score: 0.7731325043536708
    Validation score: 0.7701602311531389
C = 1
    Training score: 0.773961218836565
    Validation score: 0.7701602311531389
C = 5
    Training score: 0.7710716986655608
    Validation score: 0.7642051989878077
C = 10
    Training score: 0.7710716986655608
    Validation score: 0.7642051989878077


In [14]:
# Training and applying the SVM.
clf2 = svm.SVC(kernel="linear", C=1)
clf2.fit(XTr, yTr)
score(clf2, XTe, yTe)

0.7618599966426052