In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.utils import to_categorical
from keras.layers import Conv1D

from keras.layers import Dense, Flatten, Dropout, BatchNormalization, Conv2D, Conv1D, MaxPooling1D, LeakyReLU
import tensorflow
import numpy as np
import random
import json
import os
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

In [None]:
# Read the train/test link split from disk. Later cells index the result as
# split["train" | "test"]["1" | "0"] and draw random samples from those lists.
# NOTE(review): the filename says "spit" - presumably a typo baked into the
# file on disk; confirm before renaming anything.
with open("../../data/train_test_spit.json", "r") as split_file:
    split = json.load(split_file)

In [None]:
# Location of the embedding file, assembled with os.path.join so the path
# separator matches the host OS.
EMBEDDING_FILENAME = os.path.join("..", "..", "data", "wikilinks_train.emb")

# Parse the comma-delimited file into a NumPy array. construct_dataset later
# indexes this array with the two entries of every link pair.
embeddings = np.genfromtxt(fname=EMBEDDING_FILENAME, delimiter=",")

In [None]:
def construct_dataset(X, y, embeddings, links, class_):
    """Append one feature vector and one label per link to ``X`` and ``y``.

    For every ``(source, target)`` pair in ``links`` the two entries
    ``embeddings[source]`` and ``embeddings[target]`` are concatenated along
    axis 0 and appended to ``X``; ``class_`` is appended to ``y``.

    Both lists are modified in place and the very same objects are returned,
    so repeated calls with the same lists accumulate samples.

    Args:
        X: List collecting the feature vectors (mutated).
        y: List collecting the labels (mutated).
        embeddings: Indexable container of per-node vectors.
        links: Iterable of 2-item pairs used as indices into ``embeddings``.
        class_: Label stored for every pair in ``links``.

    Returns:
        The tuple ``(X, y)``.
    """
    for source, target in links:
        pair_features = np.concatenate(
            [embeddings[source], embeddings[target]], axis=0
        )
        X.append(pair_features)
        y.append(class_)
    return X, y

In [None]:
# Fraction of every link list that is kept for this experiment.
SAMPLE_FRACTION = 0.02
# Fixed seed so the subsample - and therefore every downstream metric - is
# the same after each Restart & Run All.
SAMPLE_SEED = 42
# Private generator: seeding it does not disturb the global `random` state.
_sampler = random.Random(SAMPLE_SEED)


def _subsample(links, fraction=SAMPLE_FRACTION):
    """Return ``int(len(links) * fraction)`` items drawn from ``links`` without replacement."""
    return _sampler.sample(links, int(len(links) * fraction))


# Positive ("1") and negative ("0") link pairs for training ...
tr_1 = _subsample(split["train"]["1"])
tr_0 = _subsample(split["train"]["0"])

# ... and for testing.
te_1 = _subsample(split["test"]["1"])
te_0 = _subsample(split["test"]["0"])

In [None]:
# Four independent accumulators: construct_dataset appends to the lists it is
# given, so the train and test sets must never share a list object.
X_train, y_train, X_test, y_test = [], [], [], []

# Training set: sampled positive pairs (label 1) followed by negatives (label 0).
X_train, y_train = construct_dataset(X_train, y_train, embeddings, tr_1, 1)
X_train, y_train = construct_dataset(X_train, y_train, embeddings, tr_0, 0)

# Test set: built into its own lists. Passing the training lists here would
# both leak the test pairs into training and make X_test/y_test aliases of
# X_train/y_train.
X_test, y_test = construct_dataset(X_test, y_test, embeddings, te_1, 1)
X_test, y_test = construct_dataset(X_test, y_test, embeddings, te_0, 0)

In [None]:
# Turn the accumulated Python lists into NumPy arrays (features first, then
# labels) so they can be reshaped and handed to the model.
X_train, X_test, y_train, y_test = (
    np.array(samples) for samples in (X_train, X_test, y_train, y_test)
)

In [None]:
def _with_channel_axis(features):
    """Return ``features`` reshaped to (shape[0], shape[1], 1), the layout fed to Conv1D."""
    rows, cols = features.shape[0], features.shape[1]
    return np.reshape(features, (rows, cols, 1))


# Add the trailing channel axis for the CNN. Note that the "valid" inputs are
# derived from X_test.
X_train_cnn = _with_channel_axis(X_train)
X_valid_cnn = _with_channel_axis(X_test)

# Show both shapes as a quick sanity check.
print(X_train_cnn.shape)
print(X_valid_cnn.shape)

In [None]:
# 1-D CNN binary classifier over the concatenated pair embedding:
# Conv1D -> Dropout -> Flatten -> Dense(32) -> Dropout -> sigmoid output.
model = Sequential()
# Only the first layer declares the input shape: (sequence length, 1 channel).
model.add(Conv1D(filters=128, kernel_size=5, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
model.add(Dropout(rate=0.25))
model.add(Flatten())
# No input_dim here: this layer's input width is the flattened Conv1D output,
# which Keras infers itself. X_train.shape[1] was not that width.
model.add(Dense(32, activation='relu'))
model.add(Dropout(rate=0.25))
# A single sigmoid unit yields one score in [0, 1] per sample.
model.add(Dense(1, activation='sigmoid'))

# Compile the model - binary cross-entropy pairs with the single sigmoid
# output and the 0/1 labels; optimise with Adam and track accuracy.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for a single epoch in mini-batches of 128 with the progress bar on.
model.fit(
    x=X_train_cnn,
    y=y_train,
    batch_size=128,
    epochs=1,
    verbose=1,
)

In [None]:
# Score the reshaped test features with the trained model (progress bar on).
y_valid_preds_hybrid = model.predict(x=X_valid_cnn, verbose=1)

In [None]:
def print_report(y_actual, y_pred, threshold=0.5):
    """Print AUC, accuracy, recall and precision for a binary classifier.

    AUC is computed from the values in ``y_pred`` as given, so passing raw
    scores/probabilities yields a ranking-based AUC. Accuracy, recall and
    precision need hard labels, so ``y_pred`` is cut at ``threshold`` first.
    Input that is already 0/1 is left unchanged by the default cut at 0.5,
    so existing callers that pre-threshold get the same report as before.

    Args:
        y_actual: Ground-truth binary labels, encoded as 0/1.
        y_pred: Predicted scores or 0/1 labels; shape ``(n,)`` or ``(n, 1)``.
        threshold: Scores ``>= threshold`` count as class 1, the rest as 0.

    Returns:
        None. The four metrics are printed, each with three decimals,
        followed by a line containing a single space.
    """
    # Flatten so a (n, 1) model output and a (n,) vector are handled alike.
    scores = np.asarray(y_pred).ravel()
    labels = (scores >= threshold).astype(int)

    auc = roc_auc_score(y_actual, scores)
    accuracy = accuracy_score(y_actual, labels)
    recall = recall_score(y_actual, labels)
    precision = precision_score(y_actual, labels)

    print('AUC:%.3f' % auc)
    print('accuracy:%.3f' % accuracy)
    print('recall:%.3f' % recall)
    print('precision:%.3f' % precision)
    print(' ')

In [None]:
# Convert the predicted scores to hard labels in place, cutting at 0.5.
# Both masks are computed from the untouched scores; entries matching neither
# comparison are left as they are.
_is_negative = y_valid_preds_hybrid < 0.5
_is_positive = y_valid_preds_hybrid >= 0.5
y_valid_preds_hybrid[_is_negative] = 0
y_valid_preds_hybrid[_is_positive] = 1

In [None]:
# Print a heading, then the metrics of the thresholded predictions against
# the test labels.
print('Valid')
print_report(y_actual=y_test, y_pred=y_valid_preds_hybrid)