In [0]:
# Colab-specific setup: mount Google Drive so the dataset folders under
# /content/drive can be read by the cells below.
from google.colab import drive
import numpy as np
drive.mount('/content/drive')

# Request the TensorFlow 2.x runtime through the notebook magic. This is
# best-effort: any exception raised by the magic is swallowed and the import
# below uses whatever TensorFlow version is already available.
try:
    %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from gensim.test.utils import datapath
from gensim import utils
import os

def getDataAsString(path):
  """Read every ``.deft`` file in a directory into parallel text/label lists.

  Each non-blank line is one example. The label is the integer value of the
  second-to-last character of the stripped line (presumably the line ends
  with a quoted digit such as ``"1"`` -- TODO confirm against the dataset).
  The text is the line tokenised with ``gensim.utils.simple_preprocess`` and
  re-joined with single spaces.

  Args:
    path: Directory containing the ``.deft`` files. A trailing separator is
      accepted but no longer required.

  Returns:
    A ``(data, labels)`` tuple of equal-length lists: ``data`` holds the
    cleaned sentences (str) and ``labels`` the integer labels.

  Raises:
    ValueError: if the label character of a line is not a digit.
  """
  data = []
  labels = []

  # os.listdir returns entries in arbitrary order; sorting keeps the example
  # order (and therefore any later positional train/validation split)
  # reproducible across runs and machines.
  for filename in sorted(os.listdir(path)):
    if not filename.endswith(".deft"):
      continue

    # os.path.join works with or without a trailing slash on `path`.
    # The encoding is pinned so the result does not depend on the platform's
    # default locale.
    with open(os.path.join(path, filename), encoding="utf-8") as fp:
      for line in fp:
        l = line.strip()
        if not l:
          # Blank lines carry no label; indexing them used to raise IndexError.
          continue

        labels.append(int(l[len(l) - 2]))

        tokens = utils.simple_preprocess(l)
        if not tokens:
          # Keep data and labels aligned when preprocessing removes every
          # token: fall back to the same placeholder word as before.
          tokens = utils.simple_preprocess('hamada')
        data.append(" ".join(tokens))

  return data, labels


In [0]:
import tensorflow_datasets as tfds
# Load the cleaned sentences and integer labels for the train and test folders
# on the mounted Drive.
corpus, y_train = getDataAsString('drive/My Drive/deft_train/')
testData, y_test = getDataAsString('drive/My Drive/deft_test/')

print("Train Data: ", len(corpus))

# Build the subword vocabulary from the training sentences only; the test
# sentences are encoded later with this same encoder.
# NOTE(review): newer tensorflow_datasets releases appear to expose this class
# under `tfds.deprecated.text` -- confirm against the installed version.
encoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(corpus, target_vocab_size=10000)


Train Data:  18157


In [0]:
def zeroPad(data, encodedArray, max):
  """Right-pad (or truncate) encoded sequences into a dense 2-D array.

  Args:
    data: The original examples; only ``len(data)`` is used, to fix the
      number of rows.
    encodedArray: Sequence of integer-id sequences, one per example.
    max: Number of columns of the result. The name shadows the builtin but is
      kept so existing keyword callers continue to work.

  Returns:
    A float ``numpy.ndarray`` of shape ``(len(data), max)``. Row ``i`` holds
    the first ``max`` ids of ``encodedArray[i]`` followed by zeros.
  """
  docs = np.zeros((len(data), max))
  for i, encoded in enumerate(encodedArray):
    # Sequences longer than `max` (e.g. a test sentence longer than the
    # longest training sentence) are truncated; previously they raised a
    # broadcasting ValueError.
    clipped = encoded[:max]
    docs[i, :len(clipped)] = clipped
  return docs


def encodeDataAndPad():
  """Encode the global train/test sentences and pad them to a common width.

  Reads the module-level ``corpus``, ``testData``, ``y_train``, ``y_test`` and
  ``encoder``. The padded width is the length of the longest encoded training
  sentence (0 when ``corpus`` is empty); the test matrix uses that same width.
  Prints the shape of the padded training matrix.

  Returns:
    ``(docsTrain, trainLabels, docsTest, testLabels)`` where the ``docs*``
    values are the arrays produced by ``zeroPad`` and the label values are
    column vectors of shape ``(-1, 1)``.
  """
  # Training set: encode every sentence, then size the matrix by the longest.
  trainIds = [encoder.encode(sentence) for sentence in corpus]
  width = 0
  for ids in trainIds:
    if len(ids) > width:
      width = len(ids)

  paddedTrain = zeroPad(corpus, trainIds, width)
  print(paddedTrain.shape)

  # Test set: same encoder, same width as the training matrix.
  testIds = [encoder.encode(sentence) for sentence in testData]
  paddedTest = zeroPad(testData, testIds, width)

  # Labels become column vectors.
  labelsTrain = np.array(y_train).reshape((-1, 1))
  labelsTest = np.array(y_test).reshape((-1, 1))

  return paddedTrain, labelsTrain, paddedTest, labelsTest


# Note: this rebinds the globals y_train / y_test from the Python lists loaded
# earlier to the (-1, 1)-shaped NumPy arrays returned by encodeDataAndPad().
docsTrain, y_train, docsTest, y_test = encodeDataAndPad()

(18157, 113)


In [0]:
from keras import backend as K
def recall_m(y_true, y_pred):
    """Recall of the given tensors: true positives / actual positives.

    Both the product ``y_true * y_pred`` and ``y_true`` are clipped to
    [0, 1] and rounded before being summed. ``K.epsilon()`` is added to the
    denominator so the division is defined when there are no positives.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the given tensors: true positives / predicted positives.

    Both the product ``y_true * y_pred`` and ``y_pred`` are clipped to
    [0, 1] and rounded before being summed. ``K.epsilon()`` is added to the
    denominator so the division is defined when nothing is predicted positive.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator keeps the result defined when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    harmonic_half = (p * r) / (p + r + K.epsilon())
    return 2 * harmonic_half

In [0]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import LSTM, Flatten, Dense, Dropout, Masking, Embedding, BatchNormalization
from keras import regularizers
# Binary sentence classifier: subword embedding -> bidirectional LSTM ->
# two ReLU dense layers -> single sigmoid unit.
model = tf.keras.Sequential([
    # One 128-d vector per subword id of the encoder's vocabulary.
    # NOTE(review): the inputs are zero-padded, but mask_zero is not set here,
    # so padded positions are also fed to the LSTM -- confirm this is intended.
    tf.keras.layers.Embedding(encoder.vocab_size, 128),
    # 128 units per direction; the summary below reports a 256-wide output.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    # tf.keras.layers.Dropout(0.5),

    tf.keras.layers.Dense(64, activation='relu'),
    # tf.keras.layers.BatchNormalization(),

    tf.keras.layers.Dense(32, activation='relu'),

    # Single sigmoid output, paired with the binary cross-entropy loss below.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Adam with learning rate 1e-4; accuracy plus the custom F1 / precision /
# recall functions are tracked as metrics.
model.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-4), metrics=['acc',f1_m,precision_m, recall_m])


model.summary()


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 128)         1275520   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_27 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_28 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 33        
Total params: 1,557,249
Trainable params: 1,557,249
Non-trainable params: 0
_________________________________________________________________


In [0]:
# NOTE(review): this callback comes from the standalone `keras` package while
# the model is built with `tf.keras` -- confirm the two are compatible in the
# runtime being used.
from keras.callbacks import ModelCheckpoint
filepath="weights.best2.hdf5"
# Write a checkpoint only when the validation value of the custom f1_m metric
# improves (mode='max'), so the file always holds the best epoch seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_f1_m', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
# Train for 10 epochs with batches of 64; 10% of the training rows are held
# out as the validation set that drives the checkpoint above.
history  = model.fit(docsTrain, y_train, batch_size=64,  epochs=10, validation_split=0.1,callbacks=callbacks_list)



Train on 16341 samples, validate on 1816 samples
Epoch 1/10
Epoch 00001: val_f1_m improved from -inf to 0.10610, saving model to weights.best2.hdf5
Epoch 2/10
Epoch 00002: val_f1_m improved from 0.10610 to 0.56288, saving model to weights.best2.hdf5
Epoch 3/10
Epoch 00003: val_f1_m improved from 0.56288 to 0.57575, saving model to weights.best2.hdf5
Epoch 4/10
Epoch 00004: val_f1_m improved from 0.57575 to 0.61048, saving model to weights.best2.hdf5
Epoch 5/10
Epoch 00005: val_f1_m did not improve from 0.61048
Epoch 6/10
Epoch 00006: val_f1_m did not improve from 0.61048
Epoch 7/10
Epoch 00007: val_f1_m did not improve from 0.61048
Epoch 8/10
Epoch 00008: val_f1_m did not improve from 0.61048
Epoch 9/10
Epoch 00009: val_f1_m did not improve from 0.61048
Epoch 10/10
Epoch 00010: val_f1_m did not improve from 0.61048


In [0]:
from sklearn.metrics import f1_score,classification_report

# Restore the checkpoint written during training (best validation F1) before
# scoring the held-out test set.
model.load_weights("weights.best2.hdf5")

# evaluate() returns the loss followed by the compiled metrics in order:
# acc, f1_m, precision_m, recall_m. The F1 value is bound to `test_f1` rather
# than `f1_score` so the sklearn `f1_score` function imported above is not
# overwritten by a float.
loss, accuracy, test_f1, precision, recall = model.evaluate(docsTest, y_test, verbose=1)

print("Accuracy: ", accuracy)
print("f1: ", test_f1)

# Sequential.predict_classes() is deprecated and absent from later tf.keras
# releases. For a single sigmoid output the documented equivalent is to
# threshold the predicted probabilities at 0.5.
y_predicted = (model.predict(docsTest) > 0.5).astype("int32")

print(classification_report(y_test, y_predicted))


Accuracy:  0.7725674
f1:  0.61753833
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       573
           1       0.66      0.62      0.64       280

    accuracy                           0.77       853
   macro avg       0.74      0.73      0.74       853
weighted avg       0.77      0.77      0.77       853

