In [132]:
from gensim.models import KeyedVectors
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, LSTM, TimeDistributed, Activation
import keras_metrics

import numpy as np

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [136]:
from keras import backend as K

def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [269]:
def precision(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def recall(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

In [17]:
cn_file = './data/numberbatch-en-17.06.txt'

cn_vectors = KeyedVectors.load_word2vec_format(cn_file, binary=False)

In [6]:
def split_tags(string):
    return [tuple(i.split("/")) for i in string.split()]

def readTrainData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = eval(judge)[0]            
        if nYes >= 3:
            amt_label = True
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
        elif nYes <= 1:
            amt_label = False
            data.append((split_tags(origsenttag), split_tags(candsenttag), amt_label))
    return data

def readTestData(filename):
    data = []
    for line in open(filename):
        line = line.strip()
        #read in training or dev data with labels
        if len(line.split('\t')) == 7:
            (trendid, trendname, origsent, candsent, judge, origsenttag, candsenttag) = \
            line.split('\t')
        else:
            continue
        # ignoring the training data that has middle label 
        nYes = int(judge[0])
        if nYes >= 4:
            expert_label = True
        elif nYes <= 2:
            expert_label = False
        else:
            expert_label = None
        data.append((split_tags(origsenttag), split_tags(candsenttag), expert_label))
    return data

In [16]:
dev_data = readTrainData("./data/data/dev.data")
train_data = readTrainData("./data/data/train.data")
test_data = readTestData("./data/data/test.data")

len(dev_data)  #4142
len(test_data)  #972
len(train_data)  #11530

11530

In [323]:
def getTweetEmbedding(tweet):
    word_vectors = []
    exceptions = 0
    for word in tweet:
        try:
            word_vector = cn_vectors.get_vector(word[0].lower())
        except:
            continue
        word_vectors.append(word_vector)
        
        
    return np.mean(np.array(word_vectors), axis=0), exceptions


def getLabel(label):
    if label == True:
        return 1
    if label == False:
        return 0
    else:
        return None
    

def getLabelsFeatures(data):
    labels, features = [], []
    for row in data:
        lbl = getLabel(row[2])
        if lbl is not None:
            labels.append(lbl)
            embedding, _ = getTweetEmbedding(row[1])
            features.append(embedding)
    return labels, features
    
    
dev_labels, dev_features = getLabelsFeatures(dev_data)
test_labels, test_features = getLabelsFeatures(test_data)
train_labels, train_features = getLabelsFeatures(train_data)


dev_features = np.array(dev_features)
dev_labels = np.array(dev_labels)

test_features = np.array(test_features)
test_labels = np.array(test_labels)

train_features = np.array(train_features)
train_labels = np.array(train_labels)

*** KERAS model

In [359]:
model = Sequential()

model.add(Dense(512, activation='relu', input_dim=300))
model.add(Dropout(0.3))
model.add(Dense(256))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[precision, recall, f1])

In [360]:
loss_and_metrics = model.fit(train_features, train_labels, epochs=10, batch_size=50)

print(model.metrics_names)
print(loss_and_metrics)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
['loss', 'precision', 'recall', 'f1']
<keras.callbacks.History object at 0x17414e898>


In [361]:
loss_and_metrics = model.evaluate(test_features, test_labels, batch_size=50)

print(model.metrics_names)
print(loss_and_metrics)

# ['loss', 'precision', 'recall', 'f1']
# [0.7009349179082952, 0.21241437910493632, 0.1699266110620237, 0.13016103533472254]

['loss', 'precision', 'recall', 'f1']
[0.7009349179082952, 0.21241437910493632, 0.1699266110620237, 0.13016103533472254]


In [297]:
predictions = model.predict(test_features)

In [None]:
for prediction in predictions:
    if prediction >= 0.5:
        print('true\t' + "{0:.4f}".format(prediction[0]))
    else:
        print('false\t' + "{0:.4f}".format(prediction[0]))

*** Simple classifier

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# clf = DecisionTreeClassifier(max_depth=10)
# 'test_f1_macro': array([0.54490802, 0.56093429, 0.56139427])

# clf = RandomForestClassifier(max_depth=10, n_estimators=10, max_features=5)
#'test_f1_macro': array([0.51949302, 0.51380182, 0.53575825])

# clf = SVC(gamma='auto', random_state=42)
# 'test_f1_macro': array([0.39521712, 0.39518414, 0.39518414]),

cross_validate(clf, train_features, train_labels, scoring=['recall_macro', 'precision_macro', 'f1_macro'], cv=3, verbose=True)

In [248]:
clf = DecisionTreeClassifier(max_depth=10)
clf.fit(train_features, train_labels)
clf.score(test_features, test_labels)

0.7386831275720165