In [1]:
import sys
sys.path.append('..')

In [2]:
from sklearn.metrics import classification_report

from baseline_logisticregression import readInData
from typing import NamedTuple, List
from bert_utils import calc_entailment_prob
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm
import os

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [3]:
class RawInput(NamedTuple):
    twit0: str
    twit1: str

In [4]:
def load_data(fn: str)->(List[RawInput],List[bool]):
    print(f"Start to read '{fn}'")
    data, trends = readInData(fn)
    print("Total records:", len(data))
    print("True samples:", sum([1 for r in data if r[1]]))
    print("False samples:", sum([1 for r in data if not r[1]]))
    return [RawInput(r[2], r[3]) for r in data], [r[1] for r in data]

In [5]:
def featurize(x_raw: List[RawInput])->List[List[float]]:
    res = []
    for r in tqdm(x_raw):
        p = calc_entailment_prob(r.twit0, r.twit1)
        pb = calc_entailment_prob(r.twit1, r.twit0)
        res.append([p[0], p[1], pb[0], pb[1]])
    return res

In [6]:
x_train_raw, y_train = load_data('../data/train.data')
x_dev_raw, y_dev = load_data('../data/dev.data')
x_test_raw, y_test = load_data('../data/test.data')

Start to read '../data/train.data'
Total records: 11530
True samples: 3996
False samples: 7534
Start to read '../data/dev.data'
Total records: 4142
True samples: 1470
False samples: 2672
Start to read '../data/test.data'
Total records: 972
True samples: 175
False samples: 797


In [7]:
for r in x_train_raw[:10]:
    print(r)

RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='But my bro from the 757 EJ Manuel is the 1st QB gone')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Can believe EJ Manuel went as the 1st QB in the draft')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='EJ MANUEL IS THE 1ST QB what')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Manuel is the 1st QB to get drafted')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='My boy EJ Manuel being the 1st QB picked')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Not surprised EJ Manuel was 1st QB taken')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='WOW EJ MANUEL FSU 1ST QB TAKEN')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Wow EJ Manuel 1st QB taken in the draft')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='if EJ is the 1st QB off the board')
RawInput(twit0='So EJ Manuel 

## BERT features

In [8]:
def load_bert_features(fn: str):
    with open(fn, 'rt', encoding='utf-8') as f:
        res = []
        for l in f:
            fs = l.strip().split('\t')
            res.append([float(feature) for feature in fs])
    return res

In [9]:
print("Start featurizing...")
if not os.path.isfile('../data/bert.train.data'):
    x_train_bert_features = featurize(x_train_raw)
    x_dev_bert_features = featurize(x_dev_raw)
    x_test_bert_features = featurize(x_test_raw)
else:
    x_train_bert_features = load_bert_features('../data/bert.train.data')
    x_dev_bert_features = load_bert_features('../data/bert.dev.data')
    x_test_bert_features = load_bert_features('../data/bert.test.data')

print("Done!")

Start featurizing...


HBox(children=(IntProgress(value=0, max=11530), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4142), HTML(value='')))




HBox(children=(IntProgress(value=0, max=972), HTML(value='')))


Done!


In [10]:
def save_bert_features(x, filename):
    with open(filename, 'wt', encoding='utf-8') as f:
        lines = ['\t'.join([str(row[0]),str(row[1]),str(row[2]),str(row[3])]) for row in x]
        tsv_str = '\n'.join(lines)
        f.write(tsv_str)

In [11]:
if not os.path.isfile('../data/bert.train.data'):
    save_bert_features(x_train_bert_features, '../data/bert.train.data')
    save_bert_features(x_dev_bert_features, '../data/bert.dev.data')
    save_bert_features(x_test_bert_features, '../data/bert.test.data')

## LEN features

In [12]:
def featurize_len(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        res.append([len(r.twit0)/len(r.twit1), len(r.twit0)/100, len(r.twit1)/100])
    return res

In [13]:
print("Start featurizing...")
x_train_len_features = featurize_len(x_train_raw)
x_dev_len_features = featurize_len(x_dev_raw)
x_test_len_features = featurize_len(x_test_raw)
print("Done!")

Start featurizing...
Done!


In [14]:
def report(y_true, y_pred):
    y_true_cleaned, y_pred_cleaned = [], []
    for t, p in zip(y_true, y_pred):
        if t is not None:
            y_true_cleaned.append(t)
            y_pred_cleaned.append(p)
    print(classification_report(y_true_cleaned, y_pred_cleaned))

## bpemb

In [16]:
from bpemb import BPEmb
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize

emb = BPEmb(lang='en', dim = 300)
print(emb)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d300.w2v.bin.tar.gz


100%|██████████████████████████| 11189884/11189884 [00:04<00:00, 2486294.84B/s]


BPEmb(lang=en, vs=10000, dim=300)


In [17]:
STOP_WORDS = stopwords.words('english')

def tokenize_filter(text: str, trace:bool=False)->List[str]:
    if trace:
        print(text)
    tokens = word_tokenize(text)
    if trace:
        print(tokens)
        
    tokens = [t for t in tokens if t.lower() not in STOP_WORDS]
    #tokens = [t for t in tokens if t.isalpha()]
    if trace:
        print(tokens)
    return tokens

print(tokenize_filter('I like to move it move it'))

['like', 'move', 'move']


In [18]:
def calc_emb(text, delete_stopwords: bool=False):
    if delete_stopwords:
        text = ' '.join(tokenize_filter(text))
    res = np.zeros(emb.vectors.shape[1], dtype=np.float32)
    # tokens = word_tokenize(text)
    # for t in tokens:
    embs = emb.embed(text.casefold())
    for e in embs:
        res += e
    n = len(embs)
    if n:
        res /= n
    return res

def featurize_emb(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        emb0 = calc_emb(r.twit0, True) 
        emb1 = calc_emb(r.twit1, True)
        cos_symilarity = 1 - spatial.distance.cosine(emb0, emb1)
        res.append([cos_symilarity])
    return res

print(featurize_emb([RawInput('Twit1 experiment', 'Some text')]))
print(featurize_emb([RawInput('I like to move it move it', 'I like to move it')]))

[[0.06752275675535202]]
[[0.9628931283950806]]


In [19]:
print("Start featurizing...")
x_train_emb_features = featurize_emb(x_train_raw)
x_dev_emb_features = featurize_emb(x_dev_raw)
x_test_emb_features = featurize_emb(x_test_raw)
print("Done!")

Start featurizing...
Done!


## Numberbatch

In [20]:
from gensim.models import KeyedVectors

In [22]:
print('loading word vectors')
word_vectors = KeyedVectors.load_word2vec_format("d:/nlp/vectors/numberbatch-en-17.06.txt.gz", binary=False)
print('loading word vectors finished')

loading word vectors
loading word vectors finished


In [24]:
import math
def featurize_nb(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        sym = 1-word_vectors.wmdistance(r.twit0.lower(), r.twit1.lower()) 
        if sym == -math.inf:
            print("-inf for ", r)
            sym = 1
        res.append([sym])
    return res

print(featurize_nb([RawInput('Twit1 experiment', 'Some text')]))
print(featurize_nb([RawInput('I like to move it move it', 'I like to move it')]))
print(x_train_raw[0])
print(featurize_nb([x_train_raw[2]]))

[[0.541184628215287]]
[[0.9367074744120422]]
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='But my bro from the 757 EJ Manuel is the 1st QB gone')
[[0.7664519039066737]]


In [25]:
print("Start featurizing...")
x_train_nb_features = featurize_nb(x_train_raw)
x_dev_nb_features = featurize_nb(x_dev_raw)
x_test_nb_features = featurize_nb(x_test_raw)
print("Done!")

Start featurizing...
Done!


In [26]:
print(x_train_nb_features[:10])

[[0.7498249189151303], [0.7703056647189214], [0.7664519039066737], [0.8661652827015432], [0.7286935032912113], [0.7427941822826905], [0.7012285319810363], [0.8230706304511168], [0.775967100570345], [0.7417885924345241]]


## Glue features

In [27]:
#import copy

def glue_features(*f_lists):
    #f_first, f_others = f_lists
    #res = copy.deepcopy(f_first)
    res = []
    for rows in zip(*f_lists):
        row = []
        for columns in rows:
            row+=columns
        res.append(row)
    return res

tst1, tst2 = [[1],[2],[3],[4],[5]], [[6],[7],[8],[9],[10]]
print(glue_features(tst1, tst2))
tst3 = [[11],[12],[13],[14],[15]]
print(glue_features(tst1, tst2, tst3))

[[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]]
[[1, 6, 11], [2, 7, 12], [3, 8, 13], [4, 9, 14], [5, 10, 15]]


In [28]:
"""
x_train_features = glue_features(x_train_bert_features, x_train_len_features, x_train_emb_features, x_train_nb_features)
x_dev_features = glue_features(x_dev_bert_features, x_dev_len_features, x_dev_emb_features, x_dev_nb_features)
x_test_features = glue_features(x_test_bert_features, x_test_len_features, x_test_emb_features, x_test_nb_features)
"""
x_train_features = glue_features( x_train_len_features, x_train_emb_features, x_train_nb_features)
x_dev_features = glue_features( x_dev_len_features, x_dev_emb_features, x_dev_nb_features)
x_test_features = glue_features( x_test_len_features, x_test_emb_features, x_test_nb_features)


## Classifier

In [29]:
from sklearn.linear_model import LogisticRegression
print("Start learning classifier...")
class_weight = {True: 1.9, False:1}
#clf = RandomForestClassifier(n_estimators=2, random_state=1974, verbose=True, class_weight='balanced')
clf = LogisticRegression(random_state=1974, verbose=True, solver='saga'
                         , class_weight='balanced'
                         #, class_weight=class_weight
                        )
print("Done!")
clf.fit(x_train_features, y_train)
y_pred = clf.predict(x_test_features)
y_pred_prob = clf.predict_proba(x_test_features)
report(y_test, y_pred)

Start learning classifier...
Done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 19 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


              precision    recall  f1-score   support

       False       0.90      0.86      0.88       663
        True       0.54      0.65      0.59       175

    accuracy                           0.81       838
   macro avg       0.72      0.75      0.74       838
weighted avg       0.83      0.81      0.82       838



In [30]:
print(list(zip(y_pred[:10], y_pred_prob[:10])))

[(False, array([0.7076316, 0.2923684])), (False, array([0.63809099, 0.36190901])), (False, array([0.64122582, 0.35877418])), (False, array([0.54998022, 0.45001978])), (True, array([0.44441409, 0.55558591])), (False, array([0.63203127, 0.36796873])), (False, array([0.77118463, 0.22881537])), (False, array([0.61376673, 0.38623327])), (False, array([0.74028144, 0.25971856])), (False, array([0.67951965, 0.32048035]))]


Store results

In [31]:
def store_pred(fn: str, pred, pred_prob):
    with open(fn, 'wt', encoding='utf-8') as f:
        for row in zip(pred, pred_prob):
            b = 'true' if row[0] else 'false'
            p = row[1][1]
            f.write(f"{b}\t{p:.4f}\n")

In [33]:
store_pred('../systemoutputs/PIT2015_BASELINE_SS_SS.output', y_pred, y_pred_prob)

In [34]:
!python pit2015_eval_single.py ../data/test.label ../systemoutputs/PIT2015_BASELINE_SS_SS.output

838	BASELINE	SS_SS		F: 0.592	Prec: 0.543	Rec: 0.651		P-corr: 0.563	F1: 0.624	Prec: 0.756	Rec: 0.531


In [35]:
y_pred_dev = clf.predict(x_dev_features)
report(y_dev, y_pred_dev)

              precision    recall  f1-score   support

       False       0.76      0.82      0.79      2672
        True       0.61      0.52      0.56      1470

    accuracy                           0.71      4142
   macro avg       0.68      0.67      0.67      4142
weighted avg       0.70      0.71      0.71      4142



In [36]:
y_pred_tr = clf.predict(x_train_features)
report(y_train, y_pred_tr)

              precision    recall  f1-score   support

       False       0.84      0.74      0.78      7534
        True       0.60      0.74      0.66      3996

    accuracy                           0.74     11530
   macro avg       0.72      0.74      0.72     11530
weighted avg       0.76      0.74      0.74     11530

