# Multiclass classification: Anorexia related

In [8]:
# Location of each corpus split, keyed by split name.
path_corpus = dict(
    test="./path/to/test.csv",
    training="./path/to/train.csv",
    dev="./path/to/dev.csv",
)

In [9]:
def read_corpus(path, ty="main_label"):
    """Parse an annotated corpus file into parallel token, entity and label lists.

    The file is read as UTF-8 and split into posts on ``####END-POST####``.
    For each post, the ``####POST####`` marker line is removed, the label is
    the first line of the text preceding ``####TITLE####``, and the section
    following ``####TEXT####`` holds one ``token<TAB>entity`` pair per line.

    Args:
        path: Path of the corpus file.
        ty: Label scheme. ``"binary"`` collapses the label to
            ``int(label < 7)``; any other value keeps the integer label.

    Returns:
        A dict with three lists of equal length, one item per post:
        ``"entities"`` (list of entity tags), ``"texts"`` (list of tokens)
        and ``"labels"`` (int).

    Raises:
        ValueError: If the label line of a post is not numeric.
        IndexError: If a post has no ``####TEXT####`` section or a token
            line has no tab-separated entity column.
    """
    # Use a context manager so the file handle is released deterministically.
    with open(path, "rb") as handle:
        data = handle.read().decode("utf-8").strip()

    labels = []
    entities = []
    texts = []
    for instance in data.split("####END-POST####"):
        # The split leaves an empty tail after the last marker; segments that
        # contain only whitespace carry no post either, so skip both.
        if not instance.strip():
            continue
        instance = instance.replace("####POST####\n", "")

        header = instance.split("####TITLE####")[0].strip()
        # Labels may be written as floats (e.g. "3.0"), hence int(float(...)).
        label = int(float(header.split("\n")[0]))
        if ty == "binary":
            label = int(label < 7)

        body = instance.split("####TEXT####")[1].strip()
        text = []
        ent = []
        for line in body.split("\n"):
            if line == "":
                continue
            fields = line.split("\t")
            text.append(fields[0])
            ent.append(fields[1])

        entities.append(ent)
        texts.append(text)
        labels.append(label)
    return {"entities": entities, "texts": texts, "labels": labels}
# Parse the three splits, in this order, keeping the fine-grained label.
training, dev, test = (
    read_corpus(path_corpus[split_name], "main_label")
    for split_name in ("training", "dev", "test")
)

In [10]:
def cast_data(entities, text, dict_entities=None, word2index=None, modify=True):
    """Convert entity tags and tokens into integer indices.

    Args:
        entities: Iterable of sequences of entity tags.
        text: Iterable of sequences of tokens.
        dict_entities: Mapping from entity tag to index. When ``modify`` is
            true it is extended in place with unseen tags. Defaults to a new
            empty dict.
        word2index: Mapping from token to index. Tokens that are not in the
            mapping are encoded as ``0``. Defaults to an empty mapping.
        modify: If true, unseen entity tags receive the next free index
            (``len(dict_entities)``). If false, the dictionary is left
            untouched and unseen tags are encoded with the index stored
            under ``"__UNK__"``.

    Returns:
        A tuple ``(entities_c, text_c, dict_entities)``: the encoded entity
        sequences, the encoded token sequences, and the entity dictionary
        (the same object that was passed in, if any).

    Raises:
        KeyError: If ``modify`` is false, an unseen tag is encountered and
            ``dict_entities`` has no ``"__UNK__"`` entry.
    """
    dict_entities = {} if dict_entities is None else dict_entities
    word2index = {} if word2index is None else word2index

    entities_c = []
    for sequence in entities:
        encoded = []
        for tag in sequence:
            if tag in dict_entities:
                encoded.append(dict_entities[tag])
            elif modify:
                dict_entities[tag] = len(dict_entities)
                encoded.append(dict_entities[tag])
            else:
                # Read-only mode: do not insert the unseen tag, otherwise the
                # dictionary grows with held-out tags and len(dict_entities)
                # no longer reflects the number of distinct indices.
                encoded.append(dict_entities["__UNK__"])
        entities_c.append(encoded)

    text_c = [
        [word2index[token] if token in word2index else 0 for token in sequence]
        for sequence in text
    ]
    return entities_c, text_c, dict_entities

In [11]:
import pickle
import numpy as np

# Fixed sequence length used when padding the encoded posts.
max_seq_len = 500

# NOTE(review): pickle can execute arbitrary code on load; only use a trusted
# embedding file here.
with open("resources/embedding.pkl", "rb") as embedding_file:
    e, full_vocab = pickle.load(embedding_file)

# Prepend two extra rows to the embedding matrix: row 0 is all zeros and
# row 1 is drawn uniformly from [-0.25, 0.25).
# NOTE(review): the random row is drawn before np.random.seed is called later
# in the notebook, so it differs between runs.
e = np.vstack((np.zeros((1, e.shape[1])),
               np.random.uniform(-0.25, 0.25, e.shape[1]), e))
# NOTE(review): presumably the indices stored in full_vocab already account
# for the two prepended rows - confirm against how embedding.pkl was built.
full_vocab["__UNK__"] = 0
full_vocab["__PAD__"] = 1

In [12]:
# The entity dictionary is created from the training split and then passed,
# with modify=False, to the dev and test splits.
train_entities, train_p, d_ent = cast_data(
    training["entities"],
    training["texts"],
    dict_entities={"__UNK__": 0},
    word2index=full_vocab,
)
dev_entities, dev_p, d_ent = cast_data(
    dev["entities"],
    dev["texts"],
    dict_entities=d_ent,
    word2index=full_vocab,
    modify=False,
)
test_entities, test_p, d_ent = cast_data(
    test["entities"],
    test["texts"],
    dict_entities=d_ent,
    word2index=full_vocab,
    modify=False,
)

In [13]:
from keras.preprocessing.sequence import pad_sequences
# Reserve an index for entity padding. setdefault keeps the cell idempotent:
# re-running it must not move the PAD index to a new value.
d_ent.setdefault("__PAD__", len(d_ent))

# Pad every sequence at the end up to max_seq_len, using the PAD index of the
# corresponding vocabulary as the fill value.
train_entities = pad_sequences(train_entities, maxlen=max_seq_len, padding="post", value=d_ent["__PAD__"])
dev_entities = pad_sequences(dev_entities, maxlen=max_seq_len, padding="post", value=d_ent["__PAD__"])
test_entities = pad_sequences(test_entities, maxlen=max_seq_len, padding="post", value=d_ent["__PAD__"])

train_p = pad_sequences(train_p, maxlen=max_seq_len, padding="post", value=full_vocab["__PAD__"])
dev_p = pad_sequences(dev_p, maxlen=max_seq_len, padding="post", value=full_vocab["__PAD__"])
test_p = pad_sequences(test_p, maxlen=max_seq_len, padding="post", value=full_vocab["__PAD__"])

In [14]:
from keras.utils import to_categorical
import numpy as np
# One-hot encode the integer labels over 8 classes.
y_train = to_categorical(training["labels"], num_classes=8)
y_dev = to_categorical(dev["labels"], num_classes=8)
y_test = to_categorical(test["labels"], num_classes=8)

In [27]:
# Seed NumPy's global random generator before the model is built and trained.
# NOTE(review): this seeds NumPy only, and the np.random.uniform call that
# creates an embedding row in an earlier cell runs before this line; presumably
# the Keras backend needs its own seed for reproducible training - confirm.
np.random.seed(500)

from keras.models import Model
from keras import losses
from keras.layers import Dense, Embedding, Dropout
from keras.layers import LSTM, Bidirectional, Input
from keras.layers import Convolution1D, GlobalMaxPooling1D
from keras.layers import concatenate
from sklearn.metrics import f1_score
from keras.initializers import glorot_normal

# Model inputs per split, in the order expected by Model(inputs=...) below:
# entity indices first, word indices second.
X_train = [train_entities, train_p]
X_dev = [dev_entities, dev_p]
X_test = [test_entities, test_p]

# Word branch: padded word indices looked up in the pre-loaded embedding
# matrix `e`, which is kept frozen (trainable=False).
x_input = Input(shape=(max_seq_len,), dtype='float32', name='pattern_input')
embedding_layer = Embedding(input_dim=e.shape[0],
                            output_dim=e.shape[1],
                            weights=[e], trainable=False) 
word = embedding_layer(x_input)
# Entity branch: padded entity indices mapped to a trainable 50-dimensional
# embedding with one row per entry of d_ent.
ent_input = Input(shape=(max_seq_len,))
ent = Embedding(input_dim=len(d_ent), output_dim=50, trainable=True)(ent_input)

# Join both embeddings per position, then apply a 1D convolution
# (128 filters, kernel size 5, tanh) followed by global max pooling.
x = concatenate([word, ent])
x = Convolution1D(128, 5, padding="valid", strides=1, activation='tanh')(x)
x = GlobalMaxPooling1D()(x)

# Softmax output over 8 units, matching the 8-way one-hot labels.
x = Dense(8, activation='softmax')(x) 
model = Model(inputs=[ent_input, x_input], outputs=x)
model.summary()
model.compile(loss='categorical_crossentropy', optimizer="adadelta", metrics=['acc'])
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Training callbacks, both using their default monitored quantity:
# cut the learning rate by 10x after 2 epochs without improvement (floor 1e-4)
# and stop after 4 epochs without improvement.
callbacks = [
    ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.0001, verbose=1),
    EarlyStopping(patience=4, verbose=1),
]

# Fit on the training split and validate on the dev split after each epoch.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_dev, y_dev),
    epochs=20,
    batch_size=2,
    shuffle=True,
    callbacks=callbacks,
    verbose=1,
)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pattern_input (InputLayer)      (None, 500)          0                                            
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 500)          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 500, 200)     3455600     pattern_input[0][0]              
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 500, 50)      5000        input_6[0][0]                    
__________________________________________________________________________________________________
concatenat

In [28]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Predicted class = index of the highest softmax probability.
y_pred_test = model.predict(X_test)
y_test_classes_pred = np.argmax(y_pred_test, axis=-1)

# Evaluation on the training split is intentionally left out.

y_pred_dev = model.predict(X_dev)
y_dev_classes_pred = np.argmax(y_pred_dev, axis=-1)

# Test split: per-class metrics, then the confusion matrix.
print(classification_report(test["labels"], y_test_classes_pred))
print(confusion_matrix(test["labels"], y_test_classes_pred))

# Dev split: same two reports.
print(classification_report(dev["labels"], y_dev_classes_pred))
print(confusion_matrix(dev["labels"], y_dev_classes_pred))

              precision    recall  f1-score   support

           0       0.62      0.59      0.61        17
           1       0.50      0.07      0.12        56
           2       0.50      0.20      0.29         5
           3       0.17      0.04      0.06        26
           4       0.59      0.54      0.57        24
           5       0.65      0.43      0.51        47
           6       0.22      0.14      0.17        14
           7       0.86      0.98      0.92       645

   micro avg       0.82      0.82      0.82       834
   macro avg       0.51      0.37      0.41       834
weighted avg       0.77      0.82      0.78       834

[[ 10   1   1   0   0   4   1   0]
 [  2   4   0   3   2   2   3  40]
 [  1   0   1   0   1   1   0   1]
 [  0   1   0   1   0   0   0  24]
 [  2   0   0   0  13   1   0   8]
 [  1   0   0   0   2  20   0  24]
 [  0   2   0   0   1   0   2   9]
 [  0   0   0   2   3   3   3 634]]
              precision    recall  f1-score   support

           0 

In [29]:
# Persist the trained model and the entity dictionary needed to encode new data.
# NOTE(review): the model file is written by Keras' model.save, not by pickle,
# despite its ".pkl" extension - presumably it must be read back with the
# matching Keras loader; confirm before renaming.
model.save('final-models/mc-cnn-keys+embeddings.pkl')
# Context manager guarantees the dictionary file is flushed and closed.
with open("final-models/dict_mc.pkl", "wb") as dict_file:
    pickle.dump(d_ent, dict_file)