In [19]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation

import numpy as np
from scipy.spatial import distance

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [6]:
cn_file = './data/numberbatch-en-17.06.txt'

cn_vectors = KeyedVectors.load_word2vec_format(cn_file, binary=False)

In [7]:
def split_tags(string):
    return [tuple(i.split("/")) for i in string.split()]

def readTrainData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = eval(judge)[0]            
        if nYes >= 3:
            amt_label = True
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
        elif nYes <= 1:
            amt_label = False
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
    return data

def readTestData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = int(judge[0])
        if nYes >= 4:
            expert_label = True
        elif nYes <= 2:
            expert_label = False
        else:
            expert_label = None
        data.append((split_tags(origsenttag), split_tags(candsenttag), expert_label))
    return data

In [8]:
dev_data = readTrainData("./data/data/dev.data")
train_data = readTrainData("./data/data/train.data")
test_data = readTestData("./data/data/test.data")

len(dev_data)  #4142
len(test_data)  #972
len(train_data)  #11530

11530

In [32]:
def getTweetEmbedding(tweet):
    word_vectors = []
    exceptions = 0
    for word in tweet:
        try:
            word_vector = cn_vectors.get_vector(word[0].lower())
        except:
            continue
        word_vectors.append(word_vector)
        
        
    return np.mean(np.array(word_vectors), axis=0), exceptions


def getLabel(label):
    if label == True:
        return 1
    if label == False:
        return 0
    else:
        return None
    

def getLabelsFeatures(data, is_train):
    labels, features = [], []
    for row in data:
        lbl = getLabel(row[2])
        if lbl is None and is_train:
            continue
        labels.append(lbl)
        original_embedding, _ = getTweetEmbedding(row[0])
        cand_embedding, _ = getTweetEmbedding(row[1])
#         embedding = np.hstack([original_embedding, cand_embedding])
        embedding = np.subtract(original_embedding, cand_embedding)
#         embedding = distance.euclidean(original_embedding, cand_embedding)
#         embedding = np.mean( np.array([ original_embedding, cand_embedding ]), axis=0 )
        features.append(embedding)
    return labels, features
    
    
dev_labels, dev_features = getLabelsFeatures(dev_data, is_train=True)
test_labels, test_features = getLabelsFeatures(test_data, is_train=False)
train_labels, train_features = getLabelsFeatures(train_data, is_train=True)


dev_features = np.array(dev_features)
dev_labels = np.array(dev_labels)

test_features = np.array(test_features)
test_labels = np.array(test_labels)

train_features = np.array(train_features)
train_labels = np.array(train_labels)

In [33]:
train_features.shape

(11530, 300)

*** KERAS model

In [30]:
model = Sequential()

model.add(Dense(256, activation='relu', input_dim=300))
model.add(Dropout(0.5))
model.add(Dense(512))
model.add(Dropout(0.5))
model.add(Dense(256))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[precision, recall, f1])

In [31]:
loss_and_metrics = model.fit(train_features, train_labels, epochs=15, batch_size=10)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
 1270/11530 [==>...........................] - ETA: 3s - loss: 0.5466 - precision: 0.6546 - recall: 0.4803 - f1: 0.5223

KeyboardInterrupt: 

In [13]:
predictions = model.predict(test_features)
# stacked vectors
# 838	NN	01_NN		F: 0.187	Prec: 0.367	Rec: 0.126		P-corr: 0.243	F1: 0.474	Prec: 0.359	Rec: 0.697

# subtract vectors
# 838	NN	04_NN		F: 0.522	Prec: 0.497	Rec: 0.549		P-corr: 0.415	F1: 0.538	Prec: 0.527	Rec: 0.549

# distance vectors


In [14]:
for prediction in predictions:
    if prediction >= 0.5:
        print('true\t' + "{0:.4f}".format(prediction[0]))
    else:
        print('false\t' + "{0:.4f}".format(prediction[0]))

false	0.0446
false	0.0111
false	0.4017
false	0.0797
false	0.0141
false	0.4567
false	0.0068
false	0.0000
false	0.0001
false	0.0031
false	0.0104
false	0.0183
false	0.1344
false	0.0046
false	0.0438
true	0.8084
false	0.0288
false	0.0006
false	0.0024
false	0.0001
false	0.0333
false	0.0084
false	0.0003
false	0.0112
false	0.2359
false	0.0854
false	0.2655
true	0.8829
false	0.2466
false	0.4014
false	0.2175
true	0.6120
false	0.2984
false	0.1389
true	0.8156
true	0.9210
true	0.9074
false	0.0844
false	0.0188
true	0.6777
false	0.0001
false	0.0000
false	0.3866
false	0.1778
false	0.0081
true	0.9270
false	0.0138
true	0.5994
false	0.0611
false	0.0000
true	0.9514
false	0.0843
true	0.9042
false	0.0839
false	0.0824
false	0.0064
false	0.0000
true	0.8301
false	0.4640
true	0.9041
false	0.0765
false	0.4209
false	0.0002
false	0.0007
false	0.0002
false	0.0013
false	0.0932
false	0.0000
false	0.0049
false	0.0000
false	0.0000
false	0.0035
false	0.0000
false	0.0000
false	0.0017
false	0.0000
false	0.0163
false	0.0007

*** Simple classifier

In [36]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# clf = DecisionTreeClassifier(max_depth=10)
# 'test_f1_macro': array([0.56202534, 0.5483129 , 0.53597202])
# 838	NN	01_NN		F: 0.182	Prec: 0.203	Rec: 0.166		P-corr: -0.028	F1: 0.367	Prec: 0.240	Rec: 0.783

# clf = RandomForestClassifier(max_depth=10, n_estimators=15, max_features=3)
#''test_f1_macro': array([0.48649772, 0.48084575, 0.52552964])

clf = SVC(gamma='auto', random_state=42)
#  'test_f1_macro': array([0.39521712, 0.39518414, 0.39518414])
# 838	NN	01_NN		F: 0.151	Prec: 0.250	Rec: 0.109		P-corr: 0.201	F1: 0.415	Prec: 0.279	Rec: 0.806
# difference of vectors
# 838	NN	04_NN		F: 0.522	Prec: 0.497	Rec: 0.549		P-corr: 0.415	F1: 0.538	Prec: 0.527	Rec: 0.549

cross_validate(clf, train_features, train_labels, scoring=['recall_macro', 'precision_macro', 'f1_macro'], cv=3, verbose=True)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  4.8min finished


{'fit_time': array([19.88594723, 19.64271736, 20.15567207]),
 'score_time': array([25.87473679, 25.69427466, 25.12932611]),
 'test_recall_macro': array([0.5, 0.5, 0.5]),
 'train_recall_macro': array([0.5, 0.5, 0.5]),
 'test_precision_macro': array([0.32674298, 0.32669789, 0.32669789]),
 'train_precision_macro': array([0.32669789, 0.32672044, 0.32672044]),
 'test_f1_macro': array([0.39521712, 0.39518414, 0.39518414]),
 'train_f1_macro': array([0.39518414, 0.39520063, 0.39520063])}

In [37]:
clf = SVC(gamma='auto', random_state=42, probability=True)
clf.fit(train_features, train_labels)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=42, shrinking=True,
  tol=0.001, verbose=False)

In [38]:
predictions = clf.predict_proba(test_features)

In [39]:
for prediction in predictions:
    label = np.argmax(prediction)
    if prediction[1] > 0.5:
        print('true\t' + "{0:.4f}".format(prediction[1]))
    else:
        print('false\t' + "{0:.4f}".format(prediction[1]))

false	0.3839
false	0.3206
false	0.3670
false	0.3829
false	0.3564
false	0.2975
false	0.3389
false	0.2964
false	0.3392
false	0.3992
false	0.4210
false	0.3984
false	0.3327
false	0.3818
false	0.4774
false	0.4899
false	0.4253
false	0.3766
false	0.3488
false	0.4098
false	0.4347
false	0.3949
false	0.3190
false	0.3168
false	0.3405
false	0.3622
false	0.3540
false	0.4107
false	0.3379
false	0.3192
false	0.3452
false	0.4009
false	0.4811
false	0.3903
false	0.4586
false	0.4216
false	0.4711
false	0.4272
false	0.3463
false	0.4287
false	0.3576
false	0.3676
false	0.2653
false	0.2458
false	0.3488
false	0.4280
false	0.3694
false	0.3883
false	0.3984
false	0.3107
false	0.4230
false	0.3141
false	0.3231
false	0.4565
false	0.3889
false	0.4139
false	0.3586
false	0.3804
false	0.2906
false	0.3274
false	0.3438
false	0.4547
false	0.2918
false	0.2828
false	0.2472
false	0.2553
false	0.2625
false	0.2329
false	0.2713
false	0.2506
false	0.2557
false	0.2965
false	0.1844
false	0.2363
false	0.1581
false	0.2037
false	0.2347