In [None]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation
import keras_metrics

import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
from keras import backend as K

def f1(y_true, y_pred):
    """Batch-wise F1 score expressed with Keras backend ops.

    Precision and recall are derived from counts of entries that round to 1
    after being clipped into [0, 1]; F1 is their harmonic mean. The value is
    computed on the tensors passed in (i.e. per batch), and ``K.epsilon()`` is
    added to every denominator to avoid division by zero.
    """
    def _rounded_count(tensor):
        # Clip into [0, 1], round to 0/1 and count the ones.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)

    # Share of predicted positives that are correct.
    batch_precision = hits / (_rounded_count(y_pred) + K.epsilon())
    # Share of actual positives that were found.
    batch_recall = hits / (_rounded_count(y_true) + K.epsilon())

    harmonic_term = (batch_precision * batch_recall) / (batch_precision + batch_recall + K.epsilon())
    return 2 * harmonic_term

In [None]:
def precision(y_true, y_pred):
    """Batch-wise precision: correct positives over predicted positives.

    Both counts are taken after clipping to [0, 1] and rounding;
    ``K.epsilon()`` guards against a zero denominator.
    """
    def _rounded_count(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    flagged = _rounded_count(y_pred)
    return hits / (flagged + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall: correct positives over actual positives.

    Both counts are taken after clipping to [0, 1] and rounding;
    ``K.epsilon()`` guards against a zero denominator.
    """
    def _rounded_count(tensor):
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    relevant = _rounded_count(y_true)
    return hits / (relevant + K.epsilon())

In [None]:
# Path to the pre-trained word vectors (presumably ConceptNet Numberbatch,
# English, release 17.06 — inferred from the file name).
cn_file = './data/numberbatch-en-17.06.txt'

# Load the vectors in word2vec format; binary=False selects the text variant.
cn_vectors = KeyedVectors.load_word2vec_format(cn_file, binary=False)

In [None]:
def split_tags(string):
    """Convert a whitespace-separated tagged sentence into a list of tuples.

    Every token is split on "/" and its pieces form one tuple, e.g.
    ``"a/X b/Y"`` becomes ``[("a", "X"), ("b", "Y")]``.
    """
    tagged_tokens = []
    for token in string.split():
        pieces = token.split("/")
        tagged_tokens.append(tuple(pieces))
    return tagged_tokens

def readTrainData(filename):
    """Read labelled sentence pairs from a tab-separated train/dev file.

    A usable line has exactly seven tab-separated fields: trend id, trend
    name, original sentence, candidate sentence, judgement, tagged original
    sentence and tagged candidate sentence. Lines with any other number of
    fields are skipped.

    The judgement field is parsed as a Python literal whose first element is
    the number of "yes" votes. Pairs with at least 3 votes are labelled True,
    pairs with at most 1 vote are labelled False, and pairs in between are
    dropped.

    Args:
        filename: path of the data file.

    Returns:
        A list of ``(orig_tags, cand_tags, label)`` tuples, where both tag
        lists come from ``split_tags`` and ``label`` is a bool.

    Raises:
        ValueError, SyntaxError: if a judgement field is not a valid literal.
    """
    import ast  # local import keeps this cell independent of the import cell

    data = []
    # A context manager guarantees the file is closed. The encoding is stated
    # explicitly so results do not depend on the platform default
    # (assumes the data files are UTF-8 — TODO confirm).
    with open(filename, encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if len(fields) != 7:
                continue
            judge, origsenttag, candsenttag = fields[4], fields[5], fields[6]

            # SECURITY: this used to be eval(judge), which executes arbitrary
            # code found in the data file. literal_eval only accepts literals.
            nYes = ast.literal_eval(judge)[0]

            if nYes >= 3:
                amt_label = True
            elif nYes <= 1:
                amt_label = False
            else:
                # Middle-ground judgements are ignored for training.
                continue
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
    return data

def readTestData(filename):
    """Read sentence pairs with expert judgements from a tab-separated test file.

    A usable line has exactly seven tab-separated fields: trend id, trend
    name, original sentence, candidate sentence, judgement, tagged original
    sentence and tagged candidate sentence. Lines with any other number of
    fields are skipped.

    Only the first character of the judgement field is parsed as an integer
    score. Scores >= 4 give label True, scores <= 2 give label False and the
    remaining score (3) gives label None. Unlike ``readTrainData``, rows with
    a None label are kept, so the output has one entry per usable line.

    Args:
        filename: path of the data file.

    Returns:
        A list of ``(orig_tags, cand_tags, label)`` tuples, where both tag
        lists come from ``split_tags`` and ``label`` is True, False or None.

    Raises:
        ValueError: if the first character of a judgement is not a digit.
        IndexError: if a judgement field is empty.
    """
    data = []
    # A context manager guarantees the file is closed. The encoding is stated
    # explicitly so results do not depend on the platform default
    # (assumes the data files are UTF-8 — TODO confirm).
    with open(filename, encoding="utf-8") as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if len(fields) != 7:
                continue
            judge, origsenttag, candsenttag = fields[4], fields[5], fields[6]

            # Single-digit score: only judge[0] is read.
            nYes = int(judge[0])
            if nYes >= 4:
                expert_label = True
            elif nYes <= 2:
                expert_label = False
            else:
                # Debatable pair: keep the row but leave it unlabelled.
                expert_label = None
            data.append((split_tags(origsenttag), split_tags(candsenttag), expert_label))
    return data

In [None]:
# Load the three splits. dev/train go through readTrainData (middle-ground
# judgements dropped); test goes through readTestData (all usable rows kept).
dev_data = readTrainData("./data/data/dev.data")
train_data = readTrainData("./data/data/train.data")
test_data = readTestData("./data/data/test.data")

# Split sizes; the trailing comments are the values recorded by the author.
# Only the last expression of a cell is echoed, so the first two lines
# produce no visible output.
len(dev_data)  #4142
len(test_data)  #972
len(train_data)  #11530

In [514]:
def getTweetEmbedding(tweet):
    """Average the word vectors of a tagged tweet.

    Args:
        tweet: iterable of token tuples as produced by ``split_tags``; the
            first element of each tuple is the word, which is lower-cased
            before it is looked up in ``cn_vectors``.

    Returns:
        ``(embedding, exceptions)``: the element-wise mean of the vectors of
        all words found in ``cn_vectors``, and the number of words that were
        not found. When no word is found, ``embedding`` is an all-zero vector
        of length ``cn_vectors.vector_size``.
    """
    word_vectors = []
    exceptions = 0
    for word in tweet:
        try:
            word_vectors.append(cn_vectors.get_vector(word[0].lower()))
        except KeyError:
            # Out-of-vocabulary word: skip it, but count it. The original bare
            # `except:` hid every error and never updated the counter.
            exceptions += 1

    if not word_vectors:
        # np.mean over an empty array yields a scalar NaN, which would change
        # the feature width after np.hstack; fall back to a zero vector.
        # float32 matches gensim's default storage type — TODO confirm if the
        # vectors were loaded with a different datatype.
        return np.zeros(cn_vectors.vector_size, dtype=np.float32), exceptions

    return np.mean(np.array(word_vectors), axis=0), exceptions


def getLabel(label):
    """Translate a boolean label to 1 / 0; any other value gives None.

    Comparison uses ``==``, so values equal to True/False (such as 1 and 0)
    are translated as well.
    """
    for candidate, code in ((True, 1), (False, 0)):
        if label == candidate:
            return code
    return None
    

def getLabelsFeatures(data, is_train):
    """Build parallel label and feature lists from ``(orig, cand, label)`` rows.

    Args:
        data: rows whose items 0 and 1 are tagged sentences and whose item 2
            is the label accepted by ``getLabel``.
        is_train: when truthy, rows whose label maps to None are skipped;
            otherwise they are kept with a None label.

    Returns:
        ``(labels, features)``: two lists of equal length. Each feature is the
        original-sentence embedding followed by the candidate-sentence
        embedding, joined with ``np.hstack``.
    """
    labels = []
    features = []
    for row in data:
        numeric_label = getLabel(row[2])
        if is_train and numeric_label is None:
            continue
        labels.append(numeric_label)

        # Embed both sentences (original first) and place them side by side.
        pair_vectors = [getTweetEmbedding(row[position])[0] for position in (0, 1)]
        features.append(np.hstack(pair_vectors))
    return labels, features
    
    
# Turn every split into (labels, features). Rows without a usable label are
# dropped for dev/train (is_train=True) but kept for test (is_train=False),
# so test_labels may contain None.
dev_labels, dev_features = getLabelsFeatures(dev_data, is_train=True)
test_labels, test_features = getLabelsFeatures(test_data, is_train=False)
train_labels, train_features = getLabelsFeatures(train_data, is_train=True)


# Convert the Python lists to numpy arrays. These assignments rebind the names
# produced above, so this cell has to be re-run as a whole.
dev_features = np.array(dev_features)
dev_labels = np.array(dev_labels)

test_features = np.array(test_features)
test_labels = np.array(test_labels)

train_features = np.array(train_features)
train_labels = np.array(train_labels)

*** KERAS model

In [515]:
# Feed-forward binary classifier over the concatenated pair embedding.
model = Sequential()

# Take the input width from the data instead of hard-coding 600, so the cell
# keeps working when the embedding size changes.
model.add(Dense(256, activation='relu', input_dim=train_features.shape[1]))
model.add(Dropout(0.5))
# NOTE(review): the next two Dense layers are created without an activation,
# which makes them linear — confirm this is intended.
model.add(Dense(512))
model.add(Dropout(0.5))
model.add(Dense(256))
# Single sigmoid unit: the output is read downstream as a score in [0, 1].
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[precision, recall, f1])

# NOTE(review): fit() returns a training History object rather than a
# loss/metrics pair; the variable name is kept for compatibility.
loss_and_metrics = model.fit(train_features, train_labels, epochs=15, batch_size=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [516]:
# Score every test pair with the trained Keras model.
predictions = model.predict(test_features)
# Evaluation output recorded by the author for this run:
# 838	NN	01_NN		F: 0.187	Prec: 0.367	Rec: 0.126		P-corr: 0.243	F1: 0.474	Prec: 0.359	Rec: 0.697

In [518]:
# Print one "<verdict>\t<score>" line per test pair; scores of 0.5 and above
# count as a positive ("true") prediction.
for prediction in predictions:
    score = prediction[0]
    verdict = 'true' if score >= 0.5 else 'false'
    print(verdict + '\t' + "{0:.4f}".format(score))

false	0.1048
false	0.3941
false	0.3416
false	0.1153
false	0.1297
false	0.0537
false	0.0212
false	0.0019
false	0.2166
false	0.0395
false	0.0625
false	0.0555
false	0.2247
false	0.2813
false	0.1304
false	0.2944
false	0.3915
false	0.2937
false	0.0395
false	0.0229
false	0.2479
false	0.0362
false	0.0055
false	0.0382
false	0.2472
false	0.1595
false	0.1090
false	0.1711
false	0.2444
false	0.1628
false	0.4621
false	0.1802
false	0.2154
false	0.0358
false	0.4613
true	0.7293
true	0.5815
false	0.2997
false	0.0845
true	0.5767
false	0.0054
false	0.0451
false	0.0043
false	0.2309
false	0.0507
false	0.4869
false	0.0149
false	0.3491
false	0.2091
false	0.0024
true	0.5009
false	0.0151
false	0.2797
false	0.1103
true	0.7150
false	0.4747
false	0.0569
false	0.4042
false	0.1492
false	0.4496
false	0.0283
false	0.0044
false	0.0002
false	0.0067
false	0.0010
false	0.0046
false	0.0056
false	0.0017
false	0.0303
false	0.0187
false	0.0000
false	0.0243
false	0.0077
false	0.0145
false	0.0198
false	0.0128
false	0.0029
fals

*** Simple classifier

In [452]:
# scikit-learn baselines evaluated on the same pair features as the Keras model.
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# The commented-out lines below are earlier experiments, each followed by the
# scores the author recorded for it. To repeat one, uncomment a single `clf`
# line together with the cross_validate call at the bottom.

# clf = DecisionTreeClassifier(max_depth=10)
# 'test_f1_macro': array([0.56202534, 0.5483129 , 0.53597202])
# 838	NN	01_NN		F: 0.182	Prec: 0.203	Rec: 0.166		P-corr: -0.028	F1: 0.367	Prec: 0.240	Rec: 0.783

# clf = RandomForestClassifier(max_depth=10, n_estimators=15, max_features=3)
# 'test_f1_macro': array([0.48649772, 0.48084575, 0.52552964])

# clf = SVC(gamma='auto', random_state=42)
# 'test_f1_macro': array([0.39521712, 0.39518414, 0.39518414])
# 838	NN	01_NN		F: 0.151	Prec: 0.250	Rec: 0.109		P-corr: 0.201	F1: 0.415	Prec: 0.279	Rec: 0.806

# 3-fold cross-validation reporting macro recall, precision and F1:
# cross_validate(clf, train_features, train_labels, scoring=['recall_macro', 'precision_macro', 'f1_macro'], cv=3, verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.6s finished


{'fit_time': array([0.37244797, 0.23569798, 0.24670196]),
 'score_time': array([0.06008506, 0.04628825, 0.05297589]),
 'test_recall_macro': array([0.53647537, 0.54443422, 0.54877727]),
 'train_recall_macro': array([0.79515739, 0.82334564, 0.81762384]),
 'test_precision_macro': array([0.6564221 , 0.73152672, 0.62282497]),
 'train_precision_macro': array([0.89771725, 0.91086276, 0.90454305]),
 'test_f1_macro': array([0.48572014, 0.49196107, 0.51983695]),
 'train_f1_macro': array([0.82049802, 0.84815443, 0.8420174 ])}

In [468]:
# Fit the SVC baseline on the training features. probability=True is needed
# so that predict_proba is available in the next cell.
clf = SVC(gamma='auto', random_state=42, probability=True)
clf.fit(train_features, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [469]:
# Per-class probabilities from the SVC for every test pair.
# NOTE(review): this rebinds `predictions`, replacing the Keras scores stored
# under the same name by an earlier cell.
predictions = clf.predict_proba(test_features)

In [470]:
# Print one "<verdict>\t<score>" line per test pair, where the score is the
# probability in column 1 (presumably the positive class — labels are 0/1).
# The unused `label = np.argmax(prediction)` assignment was removed.
for prediction in predictions:
    positive_score = prediction[1]
    verdict = 'true' if positive_score > 0.5 else 'false'
    print(verdict + '\t' + "{0:.4f}".format(positive_score))

false	0.4227
false	0.4501
false	0.4406
false	0.2478
false	0.1347
false	0.3678
false	0.2213
false	0.1741
false	0.3960
false	0.3064
false	0.2803
false	0.2698
false	0.1146
false	0.1217
false	0.3726
false	0.3278
false	0.3781
false	0.3391
false	0.2054
false	0.2458
false	0.3384
false	0.3809
false	0.4179
false	0.3610
false	0.2467
false	0.2587
false	0.2469
false	0.4130
false	0.4522
false	0.3581
false	0.4549
false	0.4230
false	0.4189
false	0.3695
false	0.3845
false	0.3625
false	0.3668
false	0.2481
false	0.2310
false	0.3405
false	0.4246
false	0.1977
false	0.2661
false	0.3361
false	0.1950
false	0.4566
false	0.4731
true	0.5203
true	0.6701
true	0.5186
false	0.3024
false	0.2092
false	0.2911
false	0.1390
false	0.3700
false	0.4839
false	0.4046
false	0.1832
false	0.3609
false	0.3595
false	0.1820
false	0.2190
false	0.2481
false	0.2665
false	0.0636
false	0.3149
false	0.1100
false	0.2433
false	0.1796
false	0.4219
false	0.1627
false	0.1218
false	0.0694
false	0.2989
false	0.2168
false	0.2653
false	0.1100
fa