In [2]:
import numpy as np
import os
import pickle
import networkx as nx
from sklearn.metrics import confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [5]:
def load_pickle(filename):
    """Read the pickle file at `filename` and return the object stored in it.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    produced by this notebook or another trusted source.
    """
    with open(filename, mode="rb") as handle:
        return pickle.load(handle)

In [6]:
def save_as_pickle(filename, data):
    """Pickle `data` into the file `filename`, replacing the file if it already exists."""
    with open(filename, mode="wb") as handle:
        pickle.Pickler(handle).dump(data)

In [7]:
class gcn(nn.Module):
    """Two graph-convolution layers followed by a linear classification head.

    Each graph-convolution layer computes ``relu(A_hat @ (X @ W + b))``; the
    result of the second layer is passed through ``fc1`` to produce one score
    per class for every row of the input.

    Parameters
    ----------
    X_size : int
        Number of input features (columns of the matrix given to ``forward``).
    A_hat : array-like, 2-D
        Fixed matrix that left-multiplies the features in both layers; it is
        converted to a float tensor and is not trained. The callers in this
        notebook pass a degree-normalised adjacency matrix.
    args : mapping
        Must provide ``"hidden_size_1"``, ``"hidden_size_2"`` and ``"num_classes"``.
    """

    def __init__(self, X_size, A_hat, args): # X_size = num features
        super(gcn, self).__init__()
        hidden_1 = args["hidden_size_1"]
        hidden_2 = args["hidden_size_2"]

        self.A_hat = torch.tensor(A_hat, requires_grad=False).float()

        # Glorot-style variances 2 / (fan_in + fan_out). Tensor.normal_ expects a
        # *standard deviation*, so the square root is taken before sampling
        # (previously the variance itself was passed as the std).
        var = 2. / (X_size + hidden_1)
        var2 = 2. / (hidden_1 + hidden_2)
        std, std2 = var ** 0.5, var2 ** 0.5

        # Parameters are created in the original order so state_dict keys and the
        # optimizer's parameter ordering stay the same.
        self.weight = nn.Parameter(torch.empty(X_size, hidden_1, dtype=torch.float32))
        self.weight.data.normal_(0, std)
        self.weight2 = nn.Parameter(torch.empty(hidden_1, hidden_2, dtype=torch.float32))
        self.weight2.data.normal_(0, std2)
        self.bias = nn.Parameter(torch.empty(hidden_1, dtype=torch.float32))
        self.bias.data.normal_(0, std)
        self.bias2 = nn.Parameter(torch.empty(hidden_2, dtype=torch.float32))
        self.bias2.data.normal_(0, std2)
        self.fc1 = nn.Linear(hidden_2, args["num_classes"])

    def forward(self, X): ### 2-layer GCN architecture
        """Return class scores of shape (rows of A_hat, num_classes) for features `X`."""
        # Layer 1: feature transform + bias, then propagation through A_hat.
        X = torch.mm(X, self.weight)
        X = (X + self.bias)
        X = F.relu(torch.mm(self.A_hat, X))
        # Layer 2: same pattern with the second set of weights.
        X = torch.mm(X, self.weight2)
        X = (X + self.bias2)
        X = F.relu(torch.mm(self.A_hat, X))
        return self.fc1(X)

In [8]:
def evaluate_model_results(columnName, args=None):
    """Reload the trained GCN and save its output for every node of one graph.

    Rebuilds the normalised adjacency matrix from ``text_graph_<columnName>.pkl``,
    restores the weights stored in ``net_model_state.pkl``, runs a single forward
    pass on identity features, and pickles the raw output tensor to
    ``pred_labels_<columnName>.pkl``. The confusion matrix is computed in a later
    cell from those saved predictions.

    Parameters
    ----------
    columnName : str
        Suffix selecting which graph pickle is read and which prediction file is written.
    args : dict, optional
        Model hyper-parameters (``hidden_size_1``, ``hidden_size_2``, ``num_classes``).
        When omitted they are loaded from ``args.pkl``.

    Returns
    -------
    None
    """
    if args is None:
        # A missing `args` used to fail inside gcn(); fall back to the saved config.
        args = load_pickle("args.pkl")

    ### Loads graph data
    G_path = "text_graph_%s.pkl" % columnName
    G = load_pickle(G_path)
    n_nodes = G.number_of_nodes()
    A = nx.to_numpy_array(G, weight="weight") + np.eye(n_nodes)
    # G.degree yields (node, degree) pairs. Test the degree itself so an isolated
    # node contributes 0 instead of raising ZeroDivisionError on 0 ** -0.5.
    inv_sqrt_degrees = [
        0 if degree == 0 else degree ** (-0.5)
        for _, degree in G.degree(weight=None)
    ]
    degrees = np.diag(inv_sqrt_degrees)
    A_hat = degrees @ A @ degrees
    X = np.eye(n_nodes) # Features are just identity matrix
    f = torch.from_numpy(X).float()
    print("A_hat, f and X generated for testing") #marker

    ### Loads best model ###
    net = gcn(X.shape[1], A_hat, args)
    net_model_state = load_pickle("net_model_state.pkl")
    net.load_state_dict(net_model_state)
    print("model revived for testing") #marker

    ### Inference
    net.eval()
    with torch.no_grad():
        pred_labels = net(f)
    save_as_pickle("pred_labels_%s.pkl" % columnName, pred_labels)
    print("pred_labels saved") #marker

    return

In [9]:
def load_datasets(columnName):
    """Load the text graph for `columnName` and build the GCN inputs from it.

    Reads ``text_graph_<columnName>.pkl``, adds self-loops to its weighted
    adjacency matrix and scales it on both sides by the inverse square root of
    each node's (unweighted) degree.

    Returns --->
    f: torch float tensor input of GCN (Identity matrix)
    X: numpy input of GCN (Identity matrix)
    A_hat: transformed adjacency matrix A
    """
    G_path = "text_graph_%s.pkl" % columnName
    print(G_path)
    G = load_pickle(G_path)
    n_nodes = G.number_of_nodes()

    A = nx.to_numpy_array(G, weight="weight") + np.eye(n_nodes)
    # G.degree yields (node, degree) pairs. Test the degree itself so an isolated
    # node contributes 0 instead of raising ZeroDivisionError on 0 ** -0.5.
    inv_sqrt_degrees = [
        0 if degree == 0 else degree ** (-0.5)
        for _, degree in G.degree(weight=None)
    ]
    degrees = np.diag(inv_sqrt_degrees)
    A_hat = degrees @ A @ degrees

    X = np.eye(n_nodes) # Features are just identity matrix
    print("A_hat, f and X generated") #marker

    # The train/test index and label pickles are no longer read here: their
    # contents were never returned, and slicing by `test_idxs` depended on a
    # notebook-level global that this function did not define.
    f = torch.from_numpy(X).float()
    return f, X, A_hat
    

In [10]:
def predict_labels(column):
    """Train a GCN on the graph built from `column`, save it, and write its predictions.

    Trains for ``args["num_epochs"]`` full-batch steps with Adam and a
    MultiStepLR schedule, pickles the final ``state_dict`` to
    ``net_model_state.pkl`` and then calls ``evaluate_model_results``.

    Relies on notebook-level globals defined in other cells: `args`
    (hyper-parameters), `selected` (row indices used for the loss) and
    `labels_selected` (their labels).

    Parameters
    ----------
    column : str
        Column name selecting which ``text_graph_<column>.pkl`` graph is used.

    Returns
    -------
    None
    """
    f, X, A_hat = load_datasets(column)
    net = gcn(X.shape[1], A_hat, args)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=args["lr"])
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1000,2000,3000,4000,5000,6000], gamma=0.77)

    # The targets are identical in every epoch, so build the tensor once.
    # Labels are shifted down by one before use (presumably they are stored
    # 1-based - TODO confirm against the label column).
    targets = torch.tensor(labels_selected).long() - 1

    print("Start training") #marker
    net.train()
    for e in range(args["num_epochs"]):
        print("epoch_no. %d" % e) #marker
        optimizer.zero_grad()
        output = net(f)
        loss = criterion(output[selected], targets)
        loss.backward()
        optimizer.step()
        scheduler.step()

    # Filename and payload are two separate arguments. The previous
    # `"..." % net.state_dict()` produced a single argument, so the call raised
    # TypeError and the model was never saved.
    save_as_pickle("net_model_state.pkl", net.state_dict())
    print("Evaluating model results") #marker
    evaluate_model_results(column, args=args)
    return

In [11]:
# Run configuration shared by the cells of this notebook; also written to
# args.pkl so it can be reloaded from disk.
args = dict(
    hidden_size_1=330,   # width of the first GCN layer
    hidden_size_2=130,   # width of the second GCN layer
    num_classes=66,      # output size of the final linear layer
    test_ratio=0.1,      # fraction of rows held out as test indices
    num_epochs=3300,     # number of training steps in predict_labels
    lr=0.011,            # Adam learning rate
    model_no=0,          # not read by any code visible in this notebook
)
save_as_pickle("args.pkl", args)

In [12]:
df_data = pd.read_csv("PubMedDataClean.csv", index_col=False)

# NOTE(review): class labels are taken from the "DocID" column - confirm that this
# column really holds the class id and not a document identifier.
dum = df_data["DocID"]

### random hold-out test sample (uniform draw without replacement, not stratified)
SPLIT_SEED = 0  # fixed so that Restart & Run All reproduces the same split
n_test = round(args["test_ratio"] * len(dum))
drawn = np.random.RandomState(SPLIT_SEED).choice(dum.index, size=n_test, replace=False)
# Sorted ascending so test_idxs lines up element-for-element with
# labels_not_selected, which is collected in ascending row order below.
test_idxs = sorted(int(i) for i in drawn)
save_as_pickle("test_idxs.pkl", test_idxs)

# select only certain labelled nodes for semi-supervised GCN
# (set membership keeps these filters linear instead of quadratic)
test_idx_set = set(test_idxs)
selected = [i for i in range(len(df_data)) if i not in test_idx_set]
save_as_pickle("selected.pkl", selected)

selected_set = set(selected)
labels_selected = [l for idx, l in enumerate(dum) if idx in selected_set]
labels_not_selected = [l for idx, l in enumerate(dum) if idx not in selected_set]
save_as_pickle("labels_selected.pkl", labels_selected)
save_as_pickle("labels_not_selected.pkl", labels_not_selected)

In [None]:
# Text columns for which a separate graph and model are trained. The list is bound
# to `columnList` because later cells index into that name, which was otherwise
# only present as leftover kernel state.
columnList = ["TITLE_CLEAN", "KEYWORDS_CLEAN", "ABSTRACT_CLEAN"]
for column in columnList:
    predict_labels(column = column)

In [None]:
# Combine the three per-column models by averaging their raw output scores
# (soft voting) and score the combined prediction on the held-out rows.
# Uses `test_idxs` and `labels_not_selected` from the split cell.
pred_labels_TITLE_CLEAN = load_pickle("pred_labels_TITLE_CLEAN.pkl")
pred_labels_KEYWORDS_CLEAN = load_pickle("pred_labels_KEYWORDS_CLEAN.pkl")
pred_labels_ABSTRACT_CLEAN = load_pickle("pred_labels_ABSTRACT_CLEAN.pkl")

# labels_not_selected is collected in ascending row order, so the predictions
# must be picked in ascending order too or labels and predictions are mispaired.
ordered_test_idxs = torch.as_tensor(sorted(int(i) for i in test_idxs), dtype=torch.long)

# Select the test rows *before* combining: each output has one row per node of its
# own graph, so the three tensors are not guaranteed to share a row count.
# pred_labels_majority therefore holds one row of averaged scores per test index.
pred_labels_majority = torch.stack([
    preds[ordered_test_idxs]
    for preds in (pred_labels_TITLE_CLEAN, pred_labels_KEYWORDS_CLEAN, pred_labels_ABSTRACT_CLEAN)
]).mean(dim=0)

c_m = confusion_matrix([(e-1) for e in labels_not_selected], pred_labels_majority.argmax(dim=1).tolist())
save_as_pickle("confusion_matrix.pkl", c_m)

In [18]:
# Sanity check: show the first column name. Requires `columnList` to already
# exist in the kernel when this cell runs.
print(columnList[0])

TITLE_CLEAN


In [None]:
# Train and evaluate on the first column only. Requires `columnList` and
# `predict_labels` to already exist in the kernel when this cell runs.
predict_labels(column = columnList[0])

text_graph_TITLE_CLEAN.pkl
