In [139]:
import multiprocessing as mp
from collections import defaultdict

import numpy as np
import pandas as pd
import spacy 
import gensim

from nltk.wsd import lesk
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.utils import resample
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the SNLI training split (JSON Lines: one record per line) and preview it.
train = pd.read_json('../../../../data/snli_1.0/snli_1.0_train.jsonl', lines=True)
train.head()

Unnamed: 0,annotator_labels,captionID,gold_label,pairID,sentence1,sentence1_binary_parse,sentence1_parse,sentence2,sentence2_binary_parse,sentence2_parse
0,[neutral],3416050480.jpg#4,neutral,3416050480.jpg#4r1n,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,A person is training his horse for a competition.,( ( A person ) ( ( is ( ( training ( his horse...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
1,[contradiction],3416050480.jpg#4,contradiction,3416050480.jpg#4r1c,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is at a diner, ordering an omelette.",( ( A person ) ( ( ( ( is ( at ( a diner ) ) )...,(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
2,[entailment],3416050480.jpg#4,entailment,3416050480.jpg#4r1e,A person on a horse jumps over a broken down a...,( ( ( A person ) ( on ( a horse ) ) ) ( ( jump...,(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN o...,"A person is outdoors, on a horse.","( ( A person ) ( ( ( ( is outdoors ) , ) ( on ...",(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) ...
3,[neutral],2267923837.jpg#2,neutral,2267923837.jpg#2r1n,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,They are smiling at their parents,( They ( are ( smiling ( at ( their parents ) ...,(ROOT (S (NP (PRP They)) (VP (VBP are) (VP (VB...
4,[entailment],2267923837.jpg#2,entailment,2267923837.jpg#2r1e,Children smiling and waving at camera,( Children ( ( ( smiling and ) waving ) ( at c...,(ROOT (NP (S (NP (NNP Children)) (VP (VBG smil...,There are children present,( There ( ( are children ) present ) ),(ROOT (S (NP (EX There)) (VP (VBP are) (NP (NN...


In [50]:
# Keep only the two sentences and the gold label, then drop every pair whose
# gold label is '-'.
# NOTE(review): '-' presumably marks pairs without an agreed label — confirm
# against the dataset documentation.
tr = train[['sentence1', 'sentence2', 'gold_label']]
tr = tr[tr['gold_label'] != '-']

# Same column selection and filtering for the dev and test splits.
dev = pd.read_json('../../../../data/snli_1.0/snli_1.0_dev.jsonl', lines=True)[['sentence1', 'sentence2', 'gold_label']]
dev = dev[dev['gold_label'] != '-']
test = pd.read_json('../../../../data/snli_1.0/snli_1.0_test.jsonl', lines=True)[['sentence1', 'sentence2', 'gold_label']]
test = test[test['gold_label'] != '-']

In [51]:
# Down-sample the training data to a class-balanced subset: n_samples pairs
# per label, drawn without replacement with a fixed seed.
n_samples = 20000
c = resample(tr[tr['gold_label'] == 'contradiction'], replace=False, n_samples=n_samples, random_state=1234)
e = resample(tr[tr['gold_label'] == 'entailment'], replace=False, n_samples=n_samples, random_state=1234)
n = resample(tr[tr['gold_label'] == 'neutral'], replace=False, n_samples=n_samples, random_state=1234)

# The three samples are stacked class by class (contradiction, entailment,
# neutral); the frame is not shuffled or re-indexed here.
tr = pd.concat([c, e, n])

In [52]:
# Shared spaCy pipeline used by spacy_parse; the 'ner' and 'textcat' components
# are disabled.
# NOTE(review): presumably disabled because only tags/lemmas are used later — confirm.
nlp = spacy.load('en', disable=['ner', 'textcat'])

def spacy_parse(df):
    """Parse both sentence columns of ``df`` with the module-level ``nlp`` pipeline.

    Two columns are written onto ``df`` itself (the frame is modified in
    place): ``t`` receives the parsed ``sentence1`` values and ``h`` the parsed
    ``sentence2`` values, aligned on ``df``'s existing index.

    Returns:
        The same ``df`` object that was passed in.
    """
    # (source text column, destination column for the parsed documents)
    column_pairs = (('sentence1', 't'), ('sentence2', 'h'))
    for text_column, parsed_column in column_pairs:
        parsed = list(nlp.pipe(df[text_column].values, batch_size=200, n_threads=16))
        # Re-attach using df's own index so rows line up even when the index
        # is not 0..n-1.
        df[parsed_column] = pd.Series(parsed, index=df.index)
    return df

In [45]:
# NLTK's English stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words('english'))

def remove_stop_words(df):
    """Filter the ``t`` and ``h`` columns down to content tokens.

    Each cell is replaced by a plain list containing only the tokens whose
    lemma is not in ``stop_words`` and whose coarse POS is not ``'PUNCT'``.
    ``df`` is modified in place and also returned.
    """
    def is_content(token):
        # Lemma test first, POS test second: the same order in which the two
        # filters are applied.
        return token.lemma_ not in stop_words and token.pos_ != 'PUNCT'

    for column in ('t', 'h'):
        df[column] = df[column].apply(
            lambda doc: [token for token in doc if is_content(token)]
        )
    return df

In [53]:
%%time 

# Parse every split; spacy_parse adds the 't' and 'h' columns.
tr = spacy_parse(tr)
dev = spacy_parse(dev)
test = spacy_parse(test)

CPU times: user 6min 14s, sys: 42.3 s, total: 6min 57s
Wall time: 4min 32s


In [46]:
%%time 

# Replace the 't' / 'h' columns of every split with stop-word- and
# punctuation-free token lists.
tr = remove_stop_words(tr)
dev = remove_stop_words(dev)
test = remove_stop_words(test)

CPU times: user 3.9 s, sys: 105 ms, total: 4 s
Wall time: 4 s


In [51]:
# Preview: 't' and 'h' now hold the filtered token lists.
tr.head()

Unnamed: 0,sentence1,sentence2,gold_label,t,h
307667,The street sweeping crew cleans the street wit...,A street crew cleans an empty street.,contradiction,"[street, sweeping, crew, cleans, street, crowd...","[street, crew, cleans, empty, street]"
209053,A surfer walking into the ocean,There is a surfer sitting on the ground.,contradiction,"[surfer, walking, ocean]","[surfer, sitting, ground]"
288898,There is a man lying in the sun next to the oc...,A man is sitting on a bench at the playground.,contradiction,"[man, lying, sun, next, ocean]","[man, sitting, bench, playground]"
268571,Two women waiting at the subway station.,A man sleeping in the subway station.,contradiction,"[Two, women, waiting, subway, station]","[man, sleeping, subway, station]"
378325,Workers wearing orange reflective vests walkin...,Workers are on a break and sitting down playin...,contradiction,"[Workers, wearing, orange, reflective, vests, ...","[Workers, break, sitting, playing, cards]"


In [55]:
# Information-content table loaded from 'ic-brown.dat'; it is passed to the
# lin / res / jcn similarity calls below.
ic = wordnet_ic.ic('ic-brown.dat')


def penn_to_wn(tag):
    """Translate a fine-grained POS tag into a one-letter WordNet POS code.

    Only the first character of ``tag`` matters:
    ``N`` -> ``'n'``, ``V`` -> ``'v'``, ``J`` -> ``'a'``, ``R`` -> ``'r'``.
    Every other tag, including the empty string, yields ``None``.
    """
    first_letter_to_pos = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    # tag[:1] is '' for an empty tag, which is simply absent from the table.
    return first_letter_to_pos.get(tag[:1])


def get_synset(token):
    """Return the first WordNet synset listed for ``token``, or ``None``.

    The lookup uses the token's surface text (``token.text``) together with
    the WordNet POS derived from ``token.tag_`` via ``penn_to_wn``. When the
    tag maps to ``None`` that ``None`` is passed straight through as the POS
    argument.
    NOTE(review): presumably WordNet then searches all parts of speech — confirm.

    Returns:
        The first synset of the lookup, or ``None`` when there is no synset or
        the lookup itself fails.
    """
    wn_tag = penn_to_wn(token.tag_)
    try:
        candidates = wn.synsets(token.text, wn_tag)
    except Exception:
        # Best-effort lookup: a failing query counts as "no synset".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long feature run can still be stopped.
        return None
    # An empty result is the normal "unknown word" case, handled explicitly.
    return candidates[0] if candidates else None
    
    
def sentence_similarity(sentence1, sentence2, sim_func):
    """Average best-match similarity from ``sentence1`` to ``sentence2``.

    Both sentences are iterables of tokens. Tokens whose lemma is in
    ``stop_words`` are dropped, the rest are mapped to synsets with
    ``get_synset``, and tokens without a synset are discarded. For every
    remaining synset of ``sentence1`` the highest ``sim_func`` score against
    the synsets of ``sentence2`` is taken, and those best scores are averaged.

    The measure is directional: swapping the sentences can change the result.

    Args:
        sentence1: tokens of the first sentence (each needs ``lemma_``).
        sentence2: tokens of the second sentence.
        sim_func: callable ``(synset1, synset2) -> number or None``.

    Returns:
        The mean of the per-word best scores, or ``0`` when no word of
        ``sentence1`` contributed a score.
    """
    def content_synsets(sentence):
        # Uses the notebook-level ``stop_words`` set; the name previously
        # referenced here (``stopWords``) is never defined in the notebook.
        content = [tok for tok in sentence if tok.lemma_ not in stop_words]
        synsets = [get_synset(tok) for tok in content]
        return [syn for syn in synsets if syn is not None]

    synsets1 = content_synsets(sentence1)
    synsets2 = content_synsets(sentence2)

    total, counted = 0.0, 0
    for s1 in synsets1:
        raw_scores = [sim_func(s1, s2) for s2 in synsets2]
        if not raw_scores:
            # Nothing to compare against: the word still counts, with score 0.
            best_score = 0
        else:
            # None means "not comparable"; max() cannot order None against
            # numbers, so filter it out before taking the maximum.
            usable = [s for s in raw_scores if s is not None]
            best_score = max(usable) if usable else None

        # A word whose every comparison was None is left out of the average.
        if best_score is not None:
            total += best_score
            counted += 1

    if counted == 0:
        return 0
    return total / counted


def path_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``path_similarity`` method.

    A falsy pairwise result (``None`` or zero) is counted as ``0``; the
    aggregation over words is delegated to ``sentence_similarity``.
    """
    def pair_score(x, y):
        return x.path_similarity(y) or 0

    return sentence_similarity(sentence1, sentence2, pair_score)


def lin_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``lin_similarity`` method.

    The pairwise call receives the notebook-level information-content table
    ``ic``. A pair whose comparison raises, or returns a falsy value, scores
    ``0``; aggregation over words is delegated to ``sentence_similarity``.
    """
    def closure(x, y):
        try:
            r = x.lin_similarity(y, ic)
        except Exception:
            # Best-effort: a pair that cannot be compared scores 0.
            # NOTE(review): presumably raised for synsets the measure does not
            # support (e.g. mismatched POS) — confirm.
            # Exception (not a bare except) keeps KeyboardInterrupt working
            # during the long feature computation.
            r = None
        return r if r else 0
    return sentence_similarity(sentence1, sentence2, closure)


def res_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``res_similarity`` method.

    The pairwise call receives the notebook-level information-content table
    ``ic``. A pair whose comparison raises, or returns a falsy value, scores
    ``0``; aggregation over words is delegated to ``sentence_similarity``.

    NOTE(review): the recorded feature preview contains a value around 3.3e299
    in this column, so individual pair scores can be extremely large —
    confirm the cause and consider clipping before scaling.
    """
    def closure(x, y):
        try:
            r = x.res_similarity(y, ic)
        except Exception:
            # Best-effort: a pair that cannot be compared scores 0.
            # Exception (not a bare except) keeps KeyboardInterrupt working
            # during the long feature computation.
            r = None
        return r if r else 0
    return sentence_similarity(sentence1, sentence2, closure)


def wup_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``wup_similarity`` method.

    A falsy pairwise result (``None`` or zero) is counted as ``0``; the
    aggregation over words is delegated to ``sentence_similarity``.
    """
    def pair_score(x, y):
        return x.wup_similarity(y) or 0

    return sentence_similarity(sentence1, sentence2, pair_score)


def jcn_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``jcn_similarity`` method.

    The pairwise call receives the notebook-level information-content table
    ``ic``. A pair whose comparison raises, or returns a falsy value, scores
    ``0``; aggregation over words is delegated to ``sentence_similarity``.

    NOTE(review): every row of the recorded feature preview has a value on the
    order of 1e299 in this column, so some pair scores are extremely large —
    confirm the cause and consider clipping before scaling.
    """
    def closure(x, y):
        try:
            r = x.jcn_similarity(y, ic)
        except Exception:
            # Best-effort: a pair that cannot be compared scores 0.
            # Exception (not a bare except) keeps KeyboardInterrupt working
            # during the long feature computation.
            r = None
        return r if r else 0

    return sentence_similarity(sentence1, sentence2, closure)


def lch_similarity(sentence1, sentence2):
    """Sentence-level score built on each synset's ``lch_similarity`` method.

    Synset pairs whose ``pos()`` values differ score ``0`` without calling the
    measure; a falsy pairwise result also counts as ``0``. Aggregation over
    words is delegated to ``sentence_similarity``.
    """
    def pair_score(x, y):
        # Guard first: the measure is only invoked for same-POS pairs.
        if x.pos() != y.pos():
            return 0
        return x.lch_similarity(y) or 0

    return sentence_similarity(sentence1, sentence2, pair_score)


def apply_all_functions(df):
    """Add the five sentence-similarity feature columns to ``df``.

    For each measure, every row's ``t`` and ``h`` values are passed to the
    measure and the result is stored in a column named after it. The columns
    are added in the order path, lin, res, wup, jcn, and a progress line is
    printed after each one. ``df`` is modified in place and also returned.
    """
    measures = (
        ('path_similarity', path_similarity),
        ('lin_similarity', lin_similarity),
        ('res_similarity', res_similarity),
        ('wup_similarity', wup_similarity),
        ('jcn_similarity', jcn_similarity),
    )
    for column, measure in measures:
        # The lambda is consumed immediately by apply(), so capturing the
        # loop variable here is safe.
        df[column] = df[['t', 'h']].apply(
            lambda row: measure(row['t'], row['h']), axis=1
        )
        print('{} computed'.format(column))
    return df

In [56]:
%%time

# Add the five similarity feature columns to every split. This is the slowest
# step so far (see the recorded timing below).
tr = apply_all_functions(tr)
dev = apply_all_functions(dev)
test = apply_all_functions(test)

path_similarity computed
lin_similarity computed
res_similarity computed
wup_similarity computed
jcn_similarity computed
path_similarity computed
lin_similarity computed
res_similarity computed
wup_similarity computed
jcn_similarity computed
path_similarity computed
lin_similarity computed
res_similarity computed
wup_similarity computed
jcn_similarity computed
CPU times: user 9min 20s, sys: 11.7 s, total: 9min 32s
Wall time: 9min 30s


In [62]:
# Preview the similarity features.
# NOTE(review): the recorded output shows res_similarity / jcn_similarity
# values on the order of 1e299 — check where these come from before using the
# columns as model inputs.
tr.head()

Unnamed: 0,sentence1,sentence2,gold_label,t,h,path_similarity,lin_similarity,res_similarity,wup_similarity,jcn_similarity
307667,The street sweeping crew cleans the street wit...,A street crew cleans an empty street.,contradiction,"[street, sweeping, crew, cleans, street, crowd...","[street, crew, cleans, empty, street]",0.586806,0.595422,5.39479,0.736772,4.9999999999999995e+299
209053,A surfer walking into the ocean,There is a surfer sitting on the ground.,contradiction,"[surfer, walking, ocean]","[surfer, sitting, ground]",0.472222,0.368524,3.333333e+299,0.614815,3.333333e+299
288898,There is a man lying in the sun next to the oc...,A man is sitting on a bench at the playground.,contradiction,"[man, lying, sun, next, ocean]","[man, sitting, bench, playground]",0.295,0.259588,1.710657,0.446845,1.9999999999999997e+299
268571,Two women waiting at the subway station.,A man sleeping in the subway station.,contradiction,"[Two, women, waiting, subway, station]","[man, sleeping, subway, station]",0.515385,0.571579,4.87681,0.632581,3.9999999999999995e+299
378325,Workers wearing orange reflective vests walkin...,Workers are on a break and sitting down playin...,contradiction,"[Workers, wearing, orange, reflective, vests, ...","[Workers, break, sitting, playing, cards]",0.212037,0.177669,0.985733,0.339191,1.111111e+299


In [7]:
# Encode the string labels as integers. The encoder is fitted on the training
# labels only and reused unchanged for dev and test.
le = LabelEncoder()
y_train = le.fit_transform(tr['gold_label'])
y_dev = le.transform(dev['gold_label'])
y_test = le.transform(test['gold_label'])

In [None]:
# Feature matrices made of the five sentence-similarity columns.
X_train = tr[['path_similarity', 'lin_similarity', 'res_similarity', 'wup_similarity', 'jcn_similarity']]
X_dev = dev[['path_similarity', 'lin_similarity', 'res_similarity', 'wup_similarity', 'jcn_similarity']]
X_test = test[['path_similarity', 'lin_similarity', 'res_similarity', 'wup_similarity', 'jcn_similarity']]

# Standardise using statistics estimated on the training split only.
# NOTE(review): the previewed res/jcn columns contain values around 1e299;
# confirm the scaler copes with that range (squaring such values overflows a
# float64) before trusting these two features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)

In [89]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train, scored on the dev split with a per-class report and macro F1.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.43      0.62      0.51      3278
          1       0.46      0.53      0.49      3329
          2       0.32      0.13      0.19      3235

avg / total       0.41      0.43      0.40      9842



0.39609149797552184

In [96]:
# NOTE(review): lightgbm is imported here rather than with the other imports at
# the top of the notebook.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) fitted on the
# X_train / y_train currently in memory and scored on the dev split.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.44      0.57      0.50      3278
          1       0.46      0.53      0.49      3329
          2       0.39      0.21      0.27      3235

avg / total       0.43      0.44      0.42      9842



  if diff:


0.42095178772069414

In [162]:
# Render every token list as one space-separated string of "lemma_POS" items,
# the input format used for the TF-IDF vectorisers in the next cell.
t_train = tr['t'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))
h_train = tr['h'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))

t_dev = dev['t'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))
h_dev = dev['h'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))

t_test = test['t'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))
h_test = test['h'].apply(lambda doc: ' '.join(['{}_{}'.format(x.lemma_, x.pos_) for x in doc]))

In [163]:
# Separate TF-IDF vocabularies for the 't' and 'h' sides, each fitted on the
# training split only and then applied to dev and test.
# NOTE(review): .toarray() turns every matrix into a dense array, so memory
# grows with rows x vocabulary size — confirm this fits, or keep them sparse.
vect_t = TfidfVectorizer()
t_train = vect_t.fit_transform(t_train).toarray()
t_dev = vect_t.transform(t_dev).toarray()
t_test = vect_t.transform(t_test).toarray()

vect_h = TfidfVectorizer()
h_train = vect_h.fit_transform(h_train).toarray()
h_dev = vect_h.transform(h_dev).toarray()
h_test = vect_h.transform(h_test).toarray()

# Final features: the 't' block followed by the 'h' block, side by side.
X_train = np.hstack((t_train, h_train))
X_dev = np.hstack((t_dev, h_dev))
X_test = np.hstack((t_test, h_test))

In [164]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train (in notebook order: the stacked TF-IDF features), scored on dev.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.72      0.44      0.54      3278
          1       0.59      0.66      0.63      3329
          2       0.56      0.71      0.63      3235

avg / total       0.62      0.60      0.60      9842



0.5979082320731927

In [165]:
# NOTE(review): repeated in-cell import; lightgbm is not in the top import cell.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) on the current
# X_train (in notebook order: the stacked TF-IDF features), scored on dev.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.73      0.63      0.67      3278
          1       0.68      0.74      0.71      3329
          2       0.63      0.67      0.65      3235

avg / total       0.68      0.68      0.68      9842



  if diff:


0.678847595595491

In [10]:
# Load word vectors stored in binary word2vec format (GoogleNews file).
# NOTE(review): absolute, user-specific path — the notebook cannot run on
# another machine without editing this line.
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/victor/Downloads/GoogleNews-vectors-negative300.bin', binary=True)

In [11]:
# Plain word -> vector lookup table for the embedding vectorizers.
# `model` was loaded as KeyedVectors, so its vocabulary and matrix are read
# directly. The `.wv` and `.syn0` accessors used before are deprecated aliases
# in gensim 3.x and are the likely source of the two warnings recorded under
# this cell.
w2v = dict(zip(model.index2word, model.vectors))

  if __name__ == '__main__':
  if __name__ == '__main__':


In [93]:
class MeanEmbeddingVectorizer(object):
    """Represent each token sequence by the plain mean of its word vectors."""

    def __init__(self, word2vec, dim):
        # word2vec: mapping from word to vector; dim: vector length, used for
        # the all-zero fallback row.
        self.word2vec = word2vec
        self.dim = dim

    def transform(self, X):
        """Return one averaged vector per sequence in ``X``.

        Words missing from ``word2vec`` are skipped. A sequence with no known
        word (including an empty one) becomes a zero vector of length ``dim``.
        """
        rows = []
        for words in X:
            known = [self.word2vec[w] for w in words if w in self.word2vec]
            if not known:
                known = [np.zeros(self.dim)]
            rows.append(np.mean(known, axis=0))
        return np.array(rows)
    
    
class TfidfEmbeddingVectorizer(object):
    """Represent each token sequence by the mean of its IDF-scaled word vectors."""

    def __init__(self, word2vec, dim):
        # word2vec: mapping from word to vector; dim: vector length.
        # word2weight stays None until fit() has been called.
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = dim

    def fit(self, X):
        """Learn one IDF weight per word from the token sequences in ``X``.

        ``X`` is expected to be already tokenised: the identity analyzer hands
        every sequence to the vectoriser unchanged. Words not seen during
        fitting later receive the largest IDF observed here.

        Returns ``self``.
        """
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)

        idf = tfidf.idf_
        max_idf = max(idf)
        weights = defaultdict(lambda: max_idf)
        for word, column in tfidf.vocabulary_.items():
            weights[word] = idf[column]
        self.word2weight = weights

        return self

    def transform(self, X):
        """Return one vector per sequence in ``X``.

        Each known word's vector is multiplied by its weight and the scaled
        vectors are averaged with a plain mean (divided by the number of
        known words, not by the sum of weights). A sequence with no known
        word becomes a zero vector of length ``dim``.
        """
        rows = []
        for words in X:
            scaled = [
                self.word2vec[w] * self.word2weight[w]
                for w in words
                if w in self.word2vec
            ]
            if not scaled:
                scaled = [np.zeros(self.dim)]
            rows.append(np.mean(scaled, axis=0))
        return np.array(rows)

In [92]:
# Mean-of-word-vectors features from the current `w2v` table, looked up by
# lower-cased token text; 't' and 'h' vectors are concatenated side by side.
# NOTE(review): `w2v` is rebound by several cells, so which embeddings are used
# here depends on cell execution order — confirm.
vect = MeanEmbeddingVectorizer(w2v, 300)
X_train_t = vect.transform(tr['t'].apply(lambda doc: [x.text.lower() for x in doc]).values)
X_train_h = vect.transform(tr['h'].apply(lambda doc: [x.text.lower() for x in doc]).values)

X_dev_t = vect.transform(dev['t'].apply(lambda doc: [x.text.lower() for x in doc]).values)
X_dev_h = vect.transform(dev['h'].apply(lambda doc: [x.text.lower() for x in doc]).values)

X_train = np.hstack((X_train_t, X_train_h))
X_dev = np.hstack((X_dev_t, X_dev_h))

In [56]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train (in notebook order: the mean-embedding features), scored on dev.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.57      0.58      0.58      3278
          1       0.59      0.63      0.61      3329
          2       0.60      0.56      0.58      3235

avg / total       0.59      0.59      0.59      9842



0.5877005052429903

In [57]:
# NOTE(review): repeated in-cell import; lightgbm is not in the top import cell.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) on the current
# X_train (in notebook order: the mean-embedding features), scored on dev.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.67      0.64      0.66      3278
          1       0.69      0.69      0.69      3329
          2       0.61      0.63      0.62      3235

avg / total       0.66      0.66      0.66      9842



  if diff:


0.65518661860427

In [62]:
# IDF-weighted embedding features: one vectorizer per side, both backed by the
# current `w2v` table.
vect_t = TfidfEmbeddingVectorizer(w2v, 300)
vect_h = TfidfEmbeddingVectorizer(w2v, 300)

# Each vectorizer learns its weights from the lower-cased training tokens of
# its own side only.
t_train = tr['t'].apply(lambda doc: [x.text.lower() for x in doc]).values
vect_t.fit(t_train)
h_train = tr['h'].apply(lambda doc: [x.text.lower() for x in doc]).values
vect_h.fit(h_train)

X_train_t = vect_t.transform(t_train)
X_train_h = vect_h.transform(h_train)

X_dev_t = vect_t.transform(dev['t'].apply(lambda doc: [x.text.lower() for x in doc]).values)
X_dev_h = vect_h.transform(dev['h'].apply(lambda doc: [x.text.lower() for x in doc]).values)

# 't' block followed by 'h' block, side by side.
X_train = np.hstack((X_train_t, X_train_h))
X_dev = np.hstack((X_dev_t, X_dev_h))

In [65]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train (in notebook order: the IDF-weighted embedding features), scored on dev.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.58      0.45      0.51      3278
          1       0.51      0.64      0.57      3329
          2       0.54      0.53      0.54      3235

avg / total       0.55      0.54      0.54      9842



0.5385851789427484

In [66]:
# NOTE(review): repeated in-cell import; lightgbm is not in the top import cell.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) on the current
# X_train (in notebook order: the IDF-weighted embedding features), scored on dev.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.65      0.65      0.65      3278
          1       0.67      0.67      0.67      3329
          2       0.61      0.62      0.61      3235

avg / total       0.65      0.65      0.65      9842



  if diff:


0.6449966066741241

In [132]:
# Train Word2Vec on the "lemma_POS" token sequences of the sampled training
# data, with the 't' sequences followed by the 'h' sequences as the corpus.
t = tr['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc])
h = tr['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]) 
sentences = t.tolist() + h.tolist()

# NOTE(review): no seed is passed and workers=8, so the learned vectors
# presumably differ between runs — confirm if reproducibility matters.
# This rebinds `model`, replacing whatever embeddings it held before.
model = gensim.models.Word2Vec(sentences, size=200, window=5, min_count=5, workers=8, iter=30)

In [133]:
# Word -> vector table from the model currently bound to `model` (in notebook
# order: the Word2Vec model trained in the previous cell, 200 dimensions).
# `.vectors` replaces the deprecated `.syn0` alias, the likely source of the
# warning recorded under this cell.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))
vect = MeanEmbeddingVectorizer(w2v, 200)

# Lookup keys are the same "lemma_POS" strings the model was trained on.
X_train_t = vect.transform(tr['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)
X_train_h = vect.transform(tr['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)

X_dev_t = vect.transform(dev['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)
X_dev_h = vect.transform(dev['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)

# 't' block followed by 'h' block, side by side.
X_train = np.hstack((X_train_t, X_train_h))
X_dev = np.hstack((X_dev_t, X_dev_h))

  if __name__ == '__main__':


In [134]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train (in notebook order: the corpus-trained Word2Vec features), scored on dev.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.61      0.40      0.48      3278
          1       0.50      0.77      0.60      3329
          2       0.61      0.48      0.54      3235

avg / total       0.57      0.55      0.54      9842



0.5410131091002927

In [135]:
# NOTE(review): repeated in-cell import; lightgbm is not in the top import cell.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) on the current
# X_train (in notebook order: the corpus-trained Word2Vec features), scored on dev.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.64      0.60      0.62      3278
          1       0.65      0.68      0.66      3329
          2       0.61      0.61      0.61      3235

avg / total       0.63      0.63      0.63      9842



  if diff:


0.6312161834768871

In [154]:
# Train FastText on the same "lemma_POS" token sequences as the Word2Vec cell,
# with the 't' sequences followed by the 'h' sequences as the corpus.
t = tr['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc])
h = tr['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]) 
sentences = t.tolist() + h.tolist()

# NOTE(review): no seed is passed, so the learned vectors presumably differ
# between runs — confirm if reproducibility matters.
# This rebinds `model` again.
model = gensim.models.fasttext.FastText(sentences, size=300, iter=20)

In [155]:
# Word -> vector table for the vocabulary of the current `model` (in notebook
# order: the FastText model trained in the previous cell). Only words in this
# table can be looked up by the vectorizers.
# `.vectors` replaces the deprecated `.syn0` alias, the likely source of the
# warning recorded under this cell.
w2v = dict(zip(model.wv.index2word, model.wv.vectors))

  if __name__ == '__main__':


In [156]:
# Mean-embedding features (300 dimensions) from the current `w2v` table,
# looked up by "lemma_POS" token strings.
vect = MeanEmbeddingVectorizer(w2v, 300)

X_train_t = vect.transform(tr['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)
X_train_h = vect.transform(tr['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)

X_dev_t = vect.transform(dev['t'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)
X_dev_h = vect.transform(dev['h'].apply(lambda doc: ['{}_{}'.format(x.lemma_, x.pos_) for x in doc]).values)

# 't' block followed by 'h' block, side by side.
X_train = np.hstack((X_train_t, X_train_h))
X_dev = np.hstack((X_dev_t, X_dev_h))

In [157]:
# Logistic regression trained with SGD (elastic-net penalty) on the current
# X_train (in notebook order: the FastText mean-embedding features), scored on dev.
# random_state is pinned (same seed as the resampling cell) so that re-running
# the cell reproduces the same model; without it the fit varies between runs.
clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=-1, learning_rate='optimal',
                    random_state=1234)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')



             precision    recall  f1-score   support

          0       0.52      0.48      0.50      3278
          1       0.58      0.41      0.48      3329
          2       0.49      0.68      0.57      3235

avg / total       0.53      0.52      0.52      9842



0.5167023914799794

In [158]:
# NOTE(review): repeated in-cell import; lightgbm is not in the top import cell.
import lightgbm
# Gradient-boosted trees (300 estimators, multiclass objective) on the current
# X_train (in notebook order: the FastText mean-embedding features), scored on dev.
clf = lightgbm.LGBMClassifier(boosting_type='gbdt', objective='multiclass', n_estimators=300, reg_lambda=0.1)
clf.fit(X_train, y_train)

y_dev_pred = clf.predict(X_dev)
print(classification_report(y_dev, y_dev_pred))
f1_score(y_dev, y_dev_pred, average='macro')

             precision    recall  f1-score   support

          0       0.61      0.58      0.60      3278
          1       0.63      0.65      0.64      3329
          2       0.60      0.61      0.61      3235

avg / total       0.62      0.62      0.62      9842



  if diff:


0.6152842755787167