In [1]:
from tensorflow import keras
import pandas as pd
import numpy as np
import os
import dill as pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.metrics import FalseNegatives

In [2]:
def load_sentence_embeddings(model='DistilBERT', features_path='features/', filename='sentence_embeddings'):
    if model == 'DistilBERT':
        tst_df = pd.concat([pickle.load(open(os.path.join(features_path, file), 'rb')) for file in os.listdir(features_path) if filename in file])
        return tst_df

In [3]:
tst_df = load_sentence_embeddings()

In [4]:
tst_df.shape

(22332, 2)

In [5]:
tst_df.head()

Unnamed: 0,sentence_embeddings,label
0,"[-0.21086546778678894, -0.005486507900059223, ...",0
1,"[-0.3340323865413666, 0.1280461549758911, -0.6...",0
2,"[-0.4694300889968872, -0.14065003395080566, -0...",0
3,"[-0.19011789560317993, 0.16007745265960693, -0...",0
4,"[-0.5151359438896179, -0.09672432392835617, -0...",0


In [6]:
features = np.array(tst_df.sentence_embeddings.tolist())
labels = tst_df['label'].values

In [7]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


In [8]:
model = Sequential()
model.add(Dense(12, input_shape=(768, ), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_m])

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                9228      
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 1)                 9         
                                                                 
Total params: 9341 (36.49 KB)
Trainable params: 9341 (36.49 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
#model.fit(features, labels, epochs=150, batch_size=100)

In [11]:
model.fit(features, labels, epochs=150, batch_size=100, workers=32, use_multiprocessing=True)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

Epoch 85/150
Epoch 86/150
Epoch 87/150
Epoch 88/150
Epoch 89/150
Epoch 90/150
Epoch 91/150
Epoch 92/150
Epoch 93/150
Epoch 94/150
Epoch 95/150
Epoch 96/150
Epoch 97/150
Epoch 98/150
Epoch 99/150
Epoch 100/150
Epoch 101/150
Epoch 102/150
Epoch 103/150
Epoch 104/150
Epoch 105/150
Epoch 106/150
Epoch 107/150
Epoch 108/150
Epoch 109/150
Epoch 110/150
Epoch 111/150
Epoch 112/150
Epoch 113/150
Epoch 114/150
Epoch 115/150
Epoch 116/150
Epoch 117/150
Epoch 118/150
Epoch 119/150
Epoch 120/150
Epoch 121/150
Epoch 122/150
Epoch 123/150
Epoch 124/150
Epoch 125/150
Epoch 126/150
Epoch 127/150
Epoch 128/150
Epoch 129/150
Epoch 130/150
Epoch 131/150
Epoch 132/150
Epoch 133/150
Epoch 134/150
Epoch 135/150
Epoch 136/150
Epoch 137/150
Epoch 138/150
Epoch 139/150
Epoch 140/150
Epoch 141/150
Epoch 142/150
Epoch 143/150
Epoch 144/150
Epoch 145/150
Epoch 146/150
Epoch 147/150
Epoch 148/150
Epoch 149/150
Epoch 150/150


<keras.src.callbacks.History at 0x2276a0bbd30>

In [12]:
loss, f1_score = model.evaluate(features, labels, verbose=0)

In [15]:
loss, f1_score

(0.0040576765313744545, 0.33626389503479004)

In [16]:
predictions = (model.predict(features) > 0.5).astype(int)




In [17]:
for i in range(len(predictions)):
    if predictions[i] != labels[i]:
        print(i, predictions[i])

2359 [0]
5465 [0]
5491 [0]
5505 [0]
5559 [0]
5613 [0]
5653 [0]
5661 [0]
5813 [0]
5946 [0]
5993 [0]
6162 [0]
6184 [0]
12416 [0]
14890 [0]
14892 [0]
14893 [0]
15174 [0]
17990 [1]
19796 [1]
20878 [1]
21221 [1]
21709 [1]
21790 [1]
22023 [1]
22315 [1]


In [18]:
score = model.evaluate(test_features, test_labels, verbose=0)

NameError: name 'test_features' is not defined

In [20]:
model = Sequential()
model.add(Dense(12, input_shape=(768, ), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', FalseNegatives()])

In [13]:
accuracy

[0.9998656511306763, 3.0]

In [None]:

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])
