In [1]:
import sys
sys.path.append('..')

In [2]:
from sklearn.metrics import classification_report

from baseline_logisticregression import readInData
from typing import NamedTuple, List
from bert_utils import calc_entailment_prob
from sklearn.ensemble import RandomForestClassifier
from tqdm.auto import tqdm
import os

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [3]:
class RawInput(NamedTuple):
    twit0: str
    twit1: str

In [4]:
def load_data(fn: str)->(List[RawInput],List[bool]):
    print(f"Start to read '{fn}'")
    data, trends = readInData(fn)
    print("Total records:", len(data))
    print("True samples:", sum([1 for r in data if r[1]]))
    print("False samples:", sum([1 for r in data if not r[1]]))
    return [RawInput(r[2], r[3]) for r in data], [r[1] for r in data]

In [5]:
def featurize(x_raw: List[RawInput])->List[List[float]]:
    res = []
    for r in tqdm(x_raw):
        p = calc_entailment_prob(r.twit0, r.twit1)
        pb = calc_entailment_prob(r.twit1, r.twit0)
        res.append([p[0], p[1], pb[0], pb[1]])
    return res

In [6]:
x_train_raw, y_train = load_data('../data/train.data')
x_dev_raw, y_dev = load_data('../data/dev.data')
x_test_raw, y_test = load_data('../data/test.data')

Start to read '../data/train.data'
Total records: 11530
True samples: 3996
False samples: 7534
Start to read '../data/dev.data'
Total records: 4142
True samples: 1470
False samples: 2672
Start to read '../data/test.data'
Total records: 972
True samples: 175
False samples: 797


In [7]:
for r in x_train_raw[:10]:
    print(r)

RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='But my bro from the 757 EJ Manuel is the 1st QB gone')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Can believe EJ Manuel went as the 1st QB in the draft')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='EJ MANUEL IS THE 1ST QB what')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Manuel is the 1st QB to get drafted')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='My boy EJ Manuel being the 1st QB picked')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Not surprised EJ Manuel was 1st QB taken')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='WOW EJ MANUEL FSU 1ST QB TAKEN')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='Wow EJ Manuel 1st QB taken in the draft')
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='if EJ is the 1st QB off the board')
RawInput(twit0='So EJ Manuel 

## BERT features

In [8]:
def load_bert_features(fn: str):
    with open(fn, 'rt', encoding='utf-8') as f:
        res = []
        for l in f:
            fs = l.strip().split('\t')
            res.append([float(feature) for feature in fs])
    return res

In [9]:
print("Start featurizing...")
if not os.path.isfile('../data/bert.train.data'):
    x_train_bert_features = featurize(x_train_raw)
    x_dev_bert_features = featurize(x_dev_raw)
    x_test_bert_features = featurize(x_test_raw)
else:
    x_train_bert_features = load_bert_features('../data/bert.train.data')
    x_dev_bert_features = load_bert_features('../data/bert.dev.data')
    x_test_bert_features = load_bert_features('../data/bert.test.data')

print("Done!")

Start featurizing...
Done!


In [10]:
def save_bert_features(x, filename):
    with open(filename, 'wt', encoding='utf-8') as f:
        lines = ['\t'.join([str(row[0]),str(row[1]),str(row[2]),str(row[3])]) for row in x]
        tsv_str = '\n'.join(lines)
        f.write(tsv_str)

In [11]:
if not os.path.isfile('../data/bert.train.data'):
    save_bert_features(x_train_bert_features, '../data/bert.train.data')
    save_bert_features(x_dev_bert_features, '../data/bert.dev.data')
    save_bert_features(x_test_bert_features, '../data/bert.test.data')

## LEN features

In [12]:
def featurize_len(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        res.append([len(r.twit0)/len(r.twit1), len(r.twit0)/100, len(r.twit1)/100])
    return res

In [13]:
print("Start featurizing...")
x_train_len_features = featurize_len(x_train_raw)
x_dev_len_features = featurize_len(x_dev_raw)
x_test_len_features = featurize_len(x_test_raw)
print("Done!")

Start featurizing...
Done!


In [14]:
def report(y_true, y_pred):
    y_true_cleaned, y_pred_cleaned = [], []
    for t, p in zip(y_true, y_pred):
        if t is not None:
            y_true_cleaned.append(t)
            y_pred_cleaned.append(p)
    print(classification_report(y_true_cleaned, y_pred_cleaned))

## bpemb

In [15]:
from bpemb import BPEmb
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

emb = BPEmb(lang='en', dim = 300)
print(emb)

BPEmb(lang=en, vs=10000, dim=300)


In [16]:
def calc_emb(text):
    res = np.zeros(emb.vectors.shape[1], dtype=np.float32)
    # tokens = word_tokenize(text)
    # for t in tokens:
    embs = emb.embed(text.casefold())
    for e in embs:
        res += e
    n = len(embs)
    if n:
        res /= n
    return res

def featurize_emb(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        emb0 = calc_emb(r.twit0) 
        emb1 = calc_emb(r.twit1)
        cos_symilarity = 1 - spatial.distance.cosine(emb0, emb1)
        res.append([cos_symilarity])
    return res

print(featurize_emb([RawInput('Twit1 experiment', 'Some text')]))
print(featurize_emb([RawInput('I like to move it move it', 'I like to move it')]))

[[0.01242410484701395]]
[[0.9760012626647949]]


In [17]:
print("Start featurizing...")
x_train_emb_features = featurize_emb(x_train_raw)
x_dev_emb_features = featurize_emb(x_dev_raw)
x_test_emb_features = featurize_emb(x_test_raw)
print("Done!")

Start featurizing...
Done!


## Numberbatch

In [18]:
from gensim.models import KeyedVectors

In [19]:
print('loading word vectors')
word_vectors = KeyedVectors.load_word2vec_format("d:/nlp/vectors/numberbatch-en-17.06.txt.gz", binary=False)
print('loading word vectors finished')

loading word vectors
loading word vectors finished


In [32]:
import math
def featurize_nb(x_raw: List[RawInput]) -> List[List[float]]:
    res = []
    for r in x_raw:
        sym = 1-word_vectors.wmdistance(r.twit0, r.twit1) 
        if sym == -math.inf:
            print("-inf for ", r)
            sym = 1
        res.append([sym])
    return res

print(featurize_nb([RawInput('Twit1 experiment', 'Some text')]))
print(featurize_nb([RawInput('I like to move it move it', 'I like to move it')]))
print(x_train_raw[0])
print(featurize_nb([x_train_raw[2]]))

[[0.5166783913983792]]
[[0.9470648965858036]]
RawInput(twit0='EJ Manuel the 1st QB to go in this draft', twit1='But my bro from the 757 EJ Manuel is the 1st QB gone')
[[0.39836074996202275]]


In [33]:
print("Start featurizing...")
x_train_nb_features = featurize_nb(x_train_raw)
x_dev_nb_features = featurize_nb(x_dev_raw)
x_test_nb_features = featurize_nb(x_test_raw)
print("Done!")

Start featurizing...
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Also in South Africa actual money was spent on transportredoing airports etc')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='I guess kids in Africa dont have it')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='I think South Africa is finally getting the picture')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Is there an idealware for Africa')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Meet 1 of South Africa s Top Winemakers')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Miss Africa Utah Traditional outfit')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Oh I was right daddy owns a few houses in South Africa')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='Shout out to the prince of Africa AliDoee')
-inf for  RawInput(twit0='COME TO SOUTH AFRICA ASSHOLE', twit1='We are Africans 

-inf for  RawInput(twit0='Im so watching Cinderella right now', twit1='CINDERELLA IS ON OH MY CHILDHOOD RIGHT IN THE FEELS')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='And then along came Cinderella and Panda')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='Cinderella Peter Pan and the lion kingdont bother call me today')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='Cinderella is my most favorite movie ever')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='Cinderella is on one of my favorites')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='I love that Cinderella is on so much')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='Im 19 and watching Cinderella')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='Oh my god Cinderella is on')
-inf for  RawInput(twit0='CINDERELLA IS ON OMG OMG OMG', twit1='making my dad and grandfather watch Cinderella right now')
-inf for  Ra

-inf for  RawInput(twit0='TWITTER IS GOING IN ON DWIGHT HOWARD', twit1='How the NBA wanted the team to not have all stars but they got Dwight Howard')
-inf for  RawInput(twit0='I can careless if Dwight Howard leaves the Lakers', twit1='DWIGHT HOWARD IS NOT A MAX MONEY GUY')
-inf for  RawInput(twit0='Dwight Howard is to childish to be on the Lakers', twit1='DWIGHT HOWARD IS NOT A MAX MONEY GUY')
-inf for  RawInput(twit0='Family guy is entertaining me right now', twit1='I HATE THE STARWARS EPISODES OF FAMILY GUY')
-inf for  RawInput(twit0='The only thing i have to look forward to is another episode of Game of Thrones', twit1='MY NEIGHBORS ARE WATCHING GAME OF THRONES AND I CAN HEAR IT THROUGH MY WINDOW')
-inf for  RawInput(twit0='And not get to watch game of thrones', twit1='THEY SHOWED PENIS ON GAME OF THRONES')
-inf for  RawInput(twit0='Gerald Green nasty dunk on Josh Smith', twit1='OH MY GOODNESS GERALD GREEN JUST POSTERIZED JOSH SMITH')
-inf for  RawInput(twit0='The Germans arent fuc

-inf for  RawInput(twit0='ME PLEASE PLEASE FOLLOW ME JAI ILY', twit1='Will JaiBrooks1 follow me if i tag ILoveCheese')
-inf for  RawInput(twit0='ME PLEASE PLEASE FOLLOW ME JAI ILY', twit1='follow me please Jai cause my foot hurts like hell')
-inf for  RawInput(twit0='ME PLEASE PLEASE FOLLOW ME JAI ILY', twit1='jai Im gonna cry please follow me')
-inf for  RawInput(twit0='ME PLEASE PLEASE FOLLOW ME JAI ILY', twit1='not cool jai I do exists PLEASE JAI FOLLOW ME BE MY 25 please ilysm')
-inf for  RawInput(twit0='I love the Jamie Collins selection', twit1='JAMIE COLLINS IS THE TRUTH THO')
-inf for  RawInput(twit0='no joaning is necessary', twit1='OMG JOAN WATSON IS SO GREAT')
-inf for  RawInput(twit0='I freaking LOVE Joan Rivers', twit1='OMG JOAN WATSON IS SO GREAT')
-inf for  RawInput(twit0='Joan Rivers is gross looking', twit1='I LOVE JOAN BOSS LADY RIVERS')
-inf for  RawInput(twit0='Is Joey Crawford the only ref at this OKCvsHOU game', twit1='JOEY CRAWFORD CALLS A TECH ON HOUSTON')
-inf 

-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='Michael Clifford please follow me youd make me so happy please please')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='Michael clifford you little player you')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='WHY DOES MICHAEL CLIFFORD TWEET AND FOLLOW EVERYONE BUT ME WHY')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='i wish michael clifford would follow me cough')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='michael clifford why wont you follow me lil shit')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='michael clifford you havent noticed me im offended')
-inf for  RawInput(twit0='MICHAEL CLIFFORD WOULD YOU FOLLOW ME IF I WAS PRETTY', twit1='ur giving me a tummy ache michael clifford')
-inf for 

-inf for  RawInput(twit0='WHO CARES PETER PAN IS ON', twit1='Why have I never noticed that the Peter Pan movie rhymes the whole time')
-inf for  RawInput(twit0='WHO CARES PETER PAN IS ON', twit1='currently watching Peter Pan and I will watch the Lion King next')
-inf for  RawInput(twit0='Cinderella Peter Pan and The Lion King on ABCFamily today', twit1='PETER PAN IS ON I LOVE THIS MOVIE')
-inf for  RawInput(twit0='Peter Pan is on you guys', twit1='EVERYONE SHUT UP PETER PAN IS ON')
-inf for  RawInput(twit0='Peter Pan is on you guys', twit1='PETER PAN IS ON WHY DIDNT I KNOW ABOUT THIS')
-inf for  RawInput(twit0='They got all the presidents in one place', twit1='FUCK THE PRESIDENTS ON THE BILLS IT SHOULD BE OBAMA')
-inf for  RawInput(twit0='Reggie Jackson swear he the darkskin Russell Westbrook', twit1='I DONT LOOK LIKE REGGIE JACKSON')
-inf for  RawInput(twit0='why cant i be in russia with you', twit1='WERUSSIAN BELIEBERS HOPE YOU LIKE OUR COUNTRY')
-inf for  RawInput(twit0='ryan beatty

-inf for  RawInput(twit0='HIII SIMON PLEASE FOLLOW ME', twit1='simon dawwwwwwg follow me please x do it for one direction')
-inf for  RawInput(twit0='HIII SIMON PLEASE FOLLOW ME', twit1='simon please follow me it would mean the world to me xx')
-inf for  RawInput(twit0='HIII SIMON PLEASE FOLLOW ME', twit1='uncle simon can u please follow me and JamieSalud1D itd mean so much to us w')
-inf for  RawInput(twit0='simon please please please please please follow me', twit1='SIMON COWELL YOU BETTER FOLLOW ME')
-inf for  RawInput(twit0='uncle Simon can you follow me', twit1='SIMON DO YOU SEE ME')
-inf for  RawInput(twit0='uncle Simon can you follow me', twit1='SIMON FOLLOW ME I WATCH THE X FACTOR EVERY SEASON')
-inf for  RawInput(twit0='uncle Simon can you follow me', twit1='SIMON PLEASE FOLOW ME PLEASE')
-inf for  RawInput(twit0='would you please follow me Simon', twit1='WHY CANT U FOLLOW ME SIMON PLS')
-inf for  RawInput(twit0='would you please follow me Simon', twit1='YOURE STILL FOLLOWING 

-inf for  RawInput(twit0='please follow me please stella iloveyousomuch5', twit1='STELLA FOLLOW ME PLS PLS PLS PLS PLS')
-inf for  RawInput(twit0='please follow me please stella iloveyousomuch5', twit1='STELLA OH MY GOD CAN U JUST FOLLOW ME OR STELLAHUDGENS')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM', twit1='ASDFGHJKLAJAJDKXHSKWIXBELWUWPWIEOZKEURPVKD GURL YPU GT A STELLA FOLLOW IM SO HAPPY FOR YOU')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM', twit1='FOLLOW ME PLEASE ILYSM STELLA IF YOU FOLLOW ME ILL BE SO HAPPY')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM', twit1='I am using your trend and I still cant get you to follow me Stella')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM', twit1='PLEASE FOLLOW THESE CRAZY FANGIRLS STELLA AND TELENA')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM', twit1='PLEASE STELLA FOLLOW ME I LOVE YOU SO MICH PLEASE')
-inf for  RawInput(twit0='STELLA FOLLOW ME BAE PLEASE ILYSM',

-inf for  RawInput(twit0='NIGGA YOU PLAY FOR THE WIZARDS', twit1='Nigga on the wizards admitted to being gay')
-inf for  RawInput(twit0='NIGGA YOU PLAY FOR THE WIZARDS', twit1='No wonder the wizards sucked')
-inf for  RawInput(twit0='NIGGA YOU PLAY FOR THE WIZARDS', twit1='Safe to say the wizards need a center next season')
-inf for  RawInput(twit0='Congrats Bjoern Werner drafted to the Colts and Xavier Rhodes drafted to the Vikings', twit1='BRUH XAVIER RHODES TO THE VIKINGS')
-inf for  RawInput(twit0='Vikings got Shariff Floyd AND Xavier Rhodes', twit1='BIG CONGRATS TO THE XMAN XAVIER RHODES')
-inf for  RawInput(twit0='In the 2nd round the Eagles pick TE Zach Ertz', twit1='OUT OF EVERYONE THE EAGLES PICK ZACH ERTZ')
-inf for  RawInput(twit0='Loving the Eagles pick of Stanford TE Zach Ertz', twit1='GREAT PICK UP FOR THE EAGLES IN ZACH ERTZ AWW YEAH')
-inf for  RawInput(twit0='A Walk to Remember is the definition of true love', twit1='BUT GUYS ITS ON MY FAVE PART OF A WALK TO REMEMBER')

-inf for  RawInput(twit0='THIS IS A GOLDEN STATE OF MIND FOOL', twit1='Once again Golden State is fucking balling')
-inf for  RawInput(twit0='THIS IS A GOLDEN STATE OF MIND FOOL', twit1='This Denver golden state series is the best series in the playoffs')
-inf for  RawInput(twit0='THIS IS A GOLDEN STATE OF MIND FOOL', twit1='Was watching the golden state game for 5 minutes and already became a Steph Curry fan')
-inf for  RawInput(twit0='THIS IS A GOLDEN STATE OF MIND FOOL', twit1='What the hell golden state is choking')
-inf for  RawInput(twit0='THIS IS A GOLDEN STATE OF MIND FOOL', twit1='Why arent you a Golden State Fan')
-inf for  RawInput(twit0='Harding had a fantastic game', twit1='KEEP YOUR LEGS CLOSED HARDING')
-inf for  RawInput(twit0='HILLER GETS ALL THE STARS', twit1='Abdelkader with a flutter but Hiller strong on the post')
-inf for  RawInput(twit0='HILLER GETS ALL THE STARS', twit1='Everyone quick complain Hiller is on his knees the whole time')
-inf for  RawInput(twit0='HI

-inf for  RawInput(twit0='That was some hit on Lars Eller', twit1='LARS ELLER IS ON THE ICE JUST BLEEDING PROFUSELY')
-inf for  RawInput(twit0='MC hammer is at the game aha', twit1='MC HAMMER IN THE GOLDEN STATE CROWD')
-inf for  RawInput(twit0='Today was the perfect May Day', twit1='WHAT DOES MAY DAY MEAN TO YOU')
-inf for  RawInput(twit0='May Day Protests in Olympia WA', twit1='WHELP WHAT A HAPPY MAY DAY THAT WAS')
-inf for  RawInput(twit0='Finally I have my Pandora again', twit1='MY PANDORA FINALLY WORKS AGAIN YAYAY')
-inf for  RawInput(twit0='I got my pandora back thank god', twit1='YAY I CAN FINALLY LISTEN TO PANDORA AGAIN')
-inf for  RawInput(twit0='I can use pandora on my phone again', twit1='OMGGG I FINALLY GOT MY PANDORA BACK YAYYYY')
-inf for  RawInput(twit0='And agree on how ugly Rachel is', twit1='THANK YOU FOR LETTING RACHEL REACH HER DREAMS')
-inf for  RawInput(twit0='Lmao Reggie miller said shit on tv', twit1='LMAAAAAAAAAAAAAAAAAOOOO NIGGA REGGIE MILLER JUST CUSSED ON TV

In [34]:
print(x_train_nb_features)

[[0.7110856607953571], [0.7458067432091466], [0.39836074996202275], [0.8687102508684913], [0.6956479808986432], [0.7059054891241642], [-0.14754049217211374], [0.7994314111605042], [0.7651052492742733], [0.730442579268944], [0.7624346508382158], [0.5894637070329636], [0.8361046523089243], [0.7027955066467345], [0.626069718098228], [0.8190682992381588], [0.7859410618915534], [0.6751902194047593], [0.8262007050414312], [0.7427887255259038], [0.7023310491411701], [0.7911514269984273], [0.7569676285347886], [0.6853125008380311], [0.6513766598697432], [-0.13965120816928778], [0.7121640099389518], [0.7302972425086869], [0.7042367626515535], [0.7632802002758613], [0.6990860859503104], [0.8054158552722858], [0.6243110149090183], [0.6671657870285677], [-0.13579725182277724], [0.7523102885622348], [0.30147368899135074], [0.29602359473048656], [0.7178392840684924], [0.7146426070711415], [0.7727946502970867], [0.5110323653762188], [0.7975672233925768], [0.7322990424873925], [0.8073323248169264], [0




## Glue features

In [35]:
#import copy

def glue_features(*f_lists):
    #f_first, f_others = f_lists
    #res = copy.deepcopy(f_first)
    res = []
    for rows in zip(*f_lists):
        row = []
        for columns in rows:
            row+=columns
        res.append(row)
    return res

tst1, tst2 = [[1],[2],[3],[4],[5]], [[6],[7],[8],[9],[10]]
print(glue_features(tst1, tst2))
tst3 = [[11],[12],[13],[14],[15]]
print(glue_features(tst1, tst2, tst3))

[[1, 6], [2, 7], [3, 8], [4, 9], [5, 10]]
[[1, 6, 11], [2, 7, 12], [3, 8, 13], [4, 9, 14], [5, 10, 15]]


In [60]:
"""
x_train_features = glue_features(x_train_bert_features, x_train_len_features, x_train_emb_features, x_train_nb_features)
x_dev_features = glue_features(x_dev_bert_features, x_dev_len_features, x_dev_emb_features, x_dev_nb_features)
x_test_features = glue_features(x_test_bert_features, x_test_len_features, x_test_emb_features, x_test_nb_features)
"""
x_train_features = glue_features(x_train_len_features, x_train_emb_features)
x_dev_features = glue_features(x_dev_len_features, x_dev_emb_features)
x_test_features = glue_features(x_test_len_features, x_test_emb_features)


## Classifier

In [61]:
from sklearn.linear_model import LogisticRegression
print("Start learning classifier...")
class_weight = {True: 1.9, False:1}
#clf = RandomForestClassifier(n_estimators=2, random_state=1974, verbose=True, class_weight='balanced')
clf = LogisticRegression(random_state=1974, verbose=True, solver='saga'
                         , class_weight='balanced'
                         #, class_weight=class_weight
                        )
print("Done!")
clf.fit(x_train_features, y_train)
y_pred = clf.predict(x_test_features)
report(y_test, y_pred)

Start learning classifier...
Done!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 16 epochs took 0 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


              precision    recall  f1-score   support

       False       0.91      0.80      0.85       663
        True       0.48      0.71      0.57       175

   micro avg       0.78      0.78      0.78       838
   macro avg       0.70      0.76      0.71       838
weighted avg       0.82      0.78      0.79       838



Store results

In [44]:
def store_pred(fn: str, pred):
    with open(fn, 'wt', encoding='utf-8') as f:
        for row in pred:
            b = 'true' if row else 'false'
            fl = 0.0000 if row else '1.0000'
            f.write(f"{b}\t{fl}\n")

In [45]:
store_pred('../systemoutputs/PIT2015_BASELINE_SS_SS.output', y_pred)

In [46]:
!python pit2015_eval_single.py ../data/test.label ../systemoutputs/PIT2015_BASELINE_SS_SS.output

838	BASELINE	SS_SS		F: 0.567	Prec: 0.640	Rec: 0.509		P-corr: -0.420	F1: 0.346	Prec: 0.209	Rec: 1.000


In [41]:
y_pred_dev = clf.predict(x_dev_features)
report(y_dev, y_pred_dev)

              precision    recall  f1-score   support

       False       0.76      0.77      0.76      2672
        True       0.57      0.56      0.56      1470

   micro avg       0.69      0.69      0.69      4142
   macro avg       0.67      0.66      0.66      4142
weighted avg       0.69      0.69      0.69      4142



In [42]:
y_pred_tr = clf.predict(x_train_features)
report(y_train, y_pred_tr)

              precision    recall  f1-score   support

       False       0.84      0.72      0.78      7534
        True       0.59      0.75      0.66      3996

   micro avg       0.73      0.73      0.73     11530
   macro avg       0.72      0.74      0.72     11530
weighted avg       0.76      0.73      0.74     11530

