In [3]:
import sys
sys.path.append('..')

In [7]:
from sklearn.metrics import classification_report

from baseline_logisticregression import readInData
from typing import NamedTuple, List
from bert_utils import calc_entailment_prob
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm

In [5]:
class RawInput(NamedTuple):
    twit0: str
    twit1: str

In [6]:
def load_data(fn: str)->(List[RawInput],List[bool]):
    print(f"Start to read '{fn}'")
    data, trends = readInData(fn)
    print("Total records:", len(data))
    print("True samples:", sum([1 for r in data if r[1]]))
    print("False samples:", sum([1 for r in data if not r[1]]))
    return [RawInput(r[2], r[2]) for r in data], [r[1] for r in data]

In [8]:
def featurize(x_raw: List[RawInput])->List[List[float]]:
    res = []
    for r in tqdm(x_raw):
        p = calc_entailment_prob(r.twit0, r.twit1)
        pb = calc_entailment_prob(r.twit1, r.twit0)
        res.append([p[0], p[1], pb[0], pb[1]])
    return res

In [9]:
x_train_raw, y_train = load_data('../data/train.data')
x_dev_raw, y_dev = load_data('../data/dev.data')
x_test_raw, y_test = load_data('../data/test.data')

Start to read '../data/train.data'
Total records: 11530
True samples: 3996
False samples: 7534
Start to read '../data/dev.data'
Total records: 4142
True samples: 1470
False samples: 2672
Start to read '../data/test.data'
Total records: 972
True samples: 175
False samples: 797


In [10]:
print("Start featurizing...")
x_train_features = featurize(x_train_raw)
x_dev_features = featurize(x_dev_raw)
x_test_features = featurize(x_test_raw)
print("Done!")

Start featurizing...


HBox(children=(IntProgress(value=0, max=11530), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4142), HTML(value='')))




HBox(children=(IntProgress(value=0, max=972), HTML(value='')))


Done!


In [19]:
def save_bert_features(x, filename):
    with open(filename, 'wt', encoding='utf-8') as f:
        lines = ['\t'.join([str(row[0]),str(row[1]),str(row[2]),str(row[3])]) for row in x]
        tsv_str = '\n'.join(lines)
        f.write(tsv_str)

In [20]:
save_bert_features(x_train_features, '../data/bert.train.data')

In [21]:
save_bert_features(x_dev_features, '../data/bert.dev.data')

In [22]:
save_bert_features(x_test_features, '../data/bert.test.data')

In [34]:
def report(y_true, y_pred):
    y_true_cleaned, y_pred_cleaned = [], []
    for t, p in zip(y_true, y_pred):
        if t is not None:
            y_true_cleaned.append(t)
            y_pred_cleaned.append(p)
    print(classification_report(y_true_cleaned, y_pred_cleaned))

In [45]:
from sklearn.linear_model import LogisticRegression
print("Start learning classifier...")
#clf = RandomForestClassifier(n_estimators=2, random_state=1974, verbose=True)
clf = LogisticRegression(random_state=1974, verbose=True, solver='saga')
print("Done!")
clf.fit(x_train_features, y_train)
y_pred = clf.predict(x_test_features)
print(report(y_test, y_pred))

Start learning classifier...
Done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 49 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       False       0.79      1.00      0.88       663
        True       0.00      0.00      0.00       175

   micro avg       0.79      0.79      0.79       838
   macro avg       0.40      0.50      0.44       838
weighted avg       0.63      0.79      0.70       838

None


In [46]:
y_pred_tr = clf.predict(x_train_features)
print(report(y_train, y_pred_tr))

  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

       False       0.65      1.00      0.79      7534
        True       0.00      0.00      0.00      3996

   micro avg       0.65      0.65      0.65     11530
   macro avg       0.33      0.50      0.40     11530
weighted avg       0.43      0.65      0.52     11530

None
