In [20]:
from load_dataset import load_dataset

In [21]:
import numpy as np
import dgl
from dgl.data import DGLDataset
import torch
from sklearn.model_selection import train_test_split

In [22]:
f, r = load_dataset('data/nx_network_data','politifact')

In [23]:
target_labels = np.concatenate([np.ones(len(f)), np.zeros(len(r))], axis=0)

In [24]:
len(target_labels)

628

In [25]:
f_arr = np.array(f,dtype=object)

In [26]:
r_arr = np.array(r,dtype=object)

In [27]:
dataset = np.concatenate((f_arr, r_arr), axis=0)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(dataset, target_labels, stratify=target_labels,
                                                        test_size=0.2, random_state=42)

In [29]:
class FakeNewsDataset(DGLDataset):
    def __init__(self,data,targets):
        self.data = data
        self.targets = targets
        super().__init__(name='fakenews')


    def process(self):
        #edges = pd.read_csv('./graph_edges.csv')
        #properties = pd.read_csv('./graph_properties.csv')
        self.graphs = []
        self.labels = []

        for n,target in zip(self.data,self.targets):
        
            g = dgl.from_networkx(n)
            g = dgl.add_self_loop(g)
            num_nodes = len(g.nodes())
            g.ndata['h']= torch.ones(num_nodes, 1)
            self.graphs.append(g)
            self.labels.append(target)
        # Convert the label list to tensor for saving.
        self.labels = torch.LongTensor(self.labels)

    def __getitem__(self, i):
        return self.graphs[i], self.labels[i]

    def __len__(self):
        return len(self.graphs)


In [30]:
X_train_dataset = FakeNewsDataset(X_train,y_train)

  self.labels = torch.LongTensor(self.labels)


In [31]:
graph, label = X_train_dataset[0]
print(graph, label)

Graph(num_nodes=28, num_edges=55,
      ndata_schemes={'h': Scheme(shape=(1,), dtype=torch.float32)}
      edata_schemes={}) tensor(1)


In [32]:
# """
# How Powerful are Graph Neural Networks
# https://arxiv.org/abs/1810.00826
# https://openreview.net/forum?id=ryGs6iA5Km
# Author's implementation: https://github.com/weihua916/powerful-gnns
# """


# import torch
# import torch.nn as nn
# import torch.nn.functional as F
# from dgl.nn.pytorch.conv import GINConv
# from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling


# class ApplyNodeFunc(nn.Module):
#     """Update the node feature hv with MLP, BN and ReLU."""
#     def __init__(self, mlp):
#         super(ApplyNodeFunc, self).__init__()
#         self.mlp = mlp
#         self.bn = nn.BatchNorm1d(self.mlp.output_dim)

#     def forward(self, h):
#         h = self.mlp(h)
#         h = self.bn(h)
#         h = F.relu(h)
#         return h


# class MLP(nn.Module):
#     """MLP with linear output"""
#     def __init__(self, num_layers, input_dim, hidden_dim, output_dim):
#         """MLP layers construction
#         Paramters
#         ---------
#         num_layers: int
#             The number of linear layers
#         input_dim: int
#             The dimensionality of input features
#         hidden_dim: int
#             The dimensionality of hidden units at ALL layers
#         output_dim: int
#             The number of classes for prediction
#         """
#         super(MLP, self).__init__()
#         self.linear_or_not = True  # default is linear model
#         self.num_layers = num_layers
#         self.output_dim = output_dim

#         if num_layers < 1:
#             raise ValueError("number of layers should be positive!")
#         elif num_layers == 1:
#             # Linear model
#             self.linear = nn.Linear(input_dim, output_dim)
#         else:
#             # Multi-layer model
#             self.linear_or_not = False
#             self.linears = torch.nn.ModuleList()
#             self.batch_norms = torch.nn.ModuleList()

#             self.linears.append(nn.Linear(input_dim, hidden_dim))
#             for layer in range(num_layers - 2):
#                 self.linears.append(nn.Linear(hidden_dim, hidden_dim))
#             self.linears.append(nn.Linear(hidden_dim, output_dim))

#             for layer in range(num_layers - 1):
#                 self.batch_norms.append(nn.BatchNorm1d((hidden_dim)))
#         print(f"self.linear_or_not{self.linear_or_not}")

#     def forward(self, x):
#         if self.linear_or_not:
#             # If linear model
#             return self.linear(x)
#         else:
#             # If MLP
#             h = x
#             for i in range(self.num_layers - 1):
#                 h = F.relu(self.batch_norms[i](self.linears[i](h)))
#                 print(f"i {i}")
#                 print(f"h.size() {h.size()}")
#                 print(f"h {h}")
#             return self.linears[-1](h)


# class GIN(nn.Module):
#     """GIN model"""
#     def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim,
#                  output_dim, final_dropout, learn_eps, graph_pooling_type,
#                  neighbor_pooling_type):
#         """model parameters setting
#         Paramters
#         ---------
#         num_layers: int
#             The number of linear layers in the neural network
#         num_mlp_layers: int
#             The number of linear layers in mlps
#         input_dim: int
#             The dimensionality of input features
#         hidden_dim: int
#             The dimensionality of hidden units at ALL layers
#         output_dim: int
#             The number of classes for prediction
#         final_dropout: float
#             dropout ratio on the final linear layer
#         learn_eps: boolean
#             If True, learn epsilon to distinguish center nodes from neighbors
#             If False, aggregate neighbors and center nodes altogether.
#         neighbor_pooling_type: str
#             how to aggregate neighbors (sum, mean, or max)
#         graph_pooling_type: str
#             how to aggregate entire nodes in a graph (sum, mean or max)
#         """
#         super(GIN, self).__init__()
#         self.num_layers = num_layers
#         self.learn_eps = learn_eps

#         # List of MLPs
#         self.ginlayers = torch.nn.ModuleList()
#         self.batch_norms = torch.nn.ModuleList()

#         for layer in range(self.num_layers - 1):
#             if layer == 0:
#                 mlp = MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim)
#             else:
#                 mlp = MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim)

#             self.ginlayers.append(
#                 GINConv(ApplyNodeFunc(mlp), neighbor_pooling_type, 0, self.learn_eps))
#             self.batch_norms.append(nn.BatchNorm1d(hidden_dim))

#         # Linear function for graph poolings of output of each layer
#         # which maps the output of different layers into a prediction score
#         self.linears_prediction = torch.nn.ModuleList()

#         for layer in range(num_layers):
#             if layer == 0:
#                 self.linears_prediction.append(
#                     nn.Linear(input_dim, output_dim))
#             else:
#                 self.linears_prediction.append(
#                     nn.Linear(hidden_dim, output_dim))

#         self.drop = nn.Dropout(final_dropout)

#         if graph_pooling_type == 'sum':
#             self.pool = SumPooling()
#         elif graph_pooling_type == 'mean':
#             self.pool = AvgPooling()
#         elif graph_pooling_type == 'max':
#             self.pool = MaxPooling()
#         else:
#             raise NotImplementedError

#     def forward(self, g):
#         # list of hidden representation at each layer (including input)
#         num_nodes = len(g.nodes())
#         #h = torch.tensor([1]).view(1, -1) ## blank feature
#         h = torch.ones(num_nodes, 1)
#         hidden_rep = [h]
#         #print(f"g, {g}")
#         #print(f"h.size {h.size()}")
#         #print(f"h {h}")
#         #print(f"num_layers, {self.num_layers}")
#         for i in range(self.num_layers - 1):
#             #print(f"i {i}")
#             #print(f"h.size {h.size()}")
#             #print('h begining', h)
#             h = self.ginlayers[i](g, h)
#             #print(f"h.size {h.size()}")
#             #print('h after gin', h)
#             #h = self.batch_norms[i](h)
#             #print('h after gin', h)
#             h = F.relu(h)
#             #print(f"h.size {h.size()}")
#             #print('h after relu', h)
#             hidden_rep.append(h)

#         #print(f"hidden_rep {hidden_rep}")
#         score_over_layer = 0

#         # perform pooling over all nodes in each graph in every layer
#         for i, h in enumerate(hidden_rep):
#             #print(f"h.size {h.size()}")
#             #print(f"i, h in hidden_rep {h}")
#             pooled_h = self.pool(g, h)
#             #print(f"h.size {h.size()}")
#             #print(f"pooled_h {pooled_h}")

#             #score_over_layer += self.drop(self.linears_prediction[i](pooled_h))
#             this_layer= self.linears_prediction[i](pooled_h)
#             #print(f"this_layer, {this_layer.size()}")
#             #print(f"this_layer, {this_layer}")
#             score_over_layer += self.linears_prediction[i](pooled_h)
#             #print(f"score_over_layer {score_over_layer.size()}")
#             #print(f"score_over_layer {score_over_layer}")

#         return score_over_layer

In [33]:
"""
How Powerful are Graph Neural Networks
https://arxiv.org/abs/1810.00826
https://openreview.net/forum?id=ryGs6iA5Km
Author's implementation: https://github.com/weihua916/powerful-gnns
"""


import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch.conv import GINConv
from dgl.nn.pytorch.glob import SumPooling, AvgPooling, MaxPooling


class ApplyNodeFunc(nn.Module):
    """Update the node feature hv with MLP, BN and ReLU."""
    def __init__(self, mlp):
        super(ApplyNodeFunc, self).__init__()
        self.mlp = mlp
        self.bn = nn.BatchNorm1d(self.mlp.output_dim)

    def forward(self, h):
        h = self.mlp(h)
        h = self.bn(h)
        h = F.relu(h)
        return h


class MLP(nn.Module):
    """MLP with linear output"""
    def __init__(self, num_layers, input_dim, hidden_dim, output_dim):
        """MLP layers construction
        Paramters
        ---------
        num_layers: int
            The number of linear layers
        input_dim: int
            The dimensionality of input features
        hidden_dim: int
            The dimensionality of hidden units at ALL layers
        output_dim: int
            The number of classes for prediction
        """
        super(MLP, self).__init__()
        self.linear_or_not = True  # default is linear model
        self.num_layers = num_layers
        self.output_dim = output_dim

        if num_layers < 1:
            raise ValueError("number of layers should be positive!")
        elif num_layers == 1:
            # Linear model
            self.linear = nn.Linear(input_dim, output_dim)
        else:
            # Multi-layer model
            self.linear_or_not = False
            self.linears = torch.nn.ModuleList()
            self.batch_norms = torch.nn.ModuleList()

            self.linears.append(nn.Linear(input_dim, hidden_dim))
            for layer in range(num_layers - 2):
                self.linears.append(nn.Linear(hidden_dim, hidden_dim))
            self.linears.append(nn.Linear(hidden_dim, output_dim))

            for layer in range(num_layers - 1):
                self.batch_norms.append(nn.BatchNorm1d((hidden_dim)))

    def forward(self, x):
        if self.linear_or_not:
            # If linear model
            return self.linear(x)
        else:
            # If MLP
            h = x
            for i in range(self.num_layers - 1):
                h = F.relu(self.batch_norms[i](self.linears[i](h)))
            return self.linears[-1](h)


class GIN(nn.Module):
    """GIN model"""
    def __init__(self, num_layers, num_mlp_layers, input_dim, hidden_dim,
                 output_dim, final_dropout, learn_eps, graph_pooling_type,
                 neighbor_pooling_type):
        """model parameters setting
        Paramters
        ---------
        num_layers: int
            The number of linear layers in the neural network
        num_mlp_layers: int
            The number of linear layers in mlps
        input_dim: int
            The dimensionality of input features
        hidden_dim: int
            The dimensionality of hidden units at ALL layers
        output_dim: int
            The number of classes for prediction
        final_dropout: float
            dropout ratio on the final linear layer
        learn_eps: boolean
            If True, learn epsilon to distinguish center nodes from neighbors
            If False, aggregate neighbors and center nodes altogether.
        neighbor_pooling_type: str
            how to aggregate neighbors (sum, mean, or max)
        graph_pooling_type: str
            how to aggregate entire nodes in a graph (sum, mean or max)
        """
        super(GIN, self).__init__()
        self.num_layers = num_layers
        self.learn_eps = learn_eps

        # List of MLPs
        self.ginlayers = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()

        for layer in range(self.num_layers - 1):
            if layer == 0:
                mlp = MLP(num_mlp_layers, input_dim, hidden_dim, hidden_dim)
            else:
                mlp = MLP(num_mlp_layers, hidden_dim, hidden_dim, hidden_dim)

            self.ginlayers.append(
                GINConv(ApplyNodeFunc(mlp), neighbor_pooling_type, 0, self.learn_eps))
            self.batch_norms.append(nn.BatchNorm1d(hidden_dim))

        # Linear function for graph poolings of output of each layer
        # which maps the output of different layers into a prediction score
        self.linears_prediction = torch.nn.ModuleList()

        for layer in range(num_layers):
            if layer == 0:
                self.linears_prediction.append(
                    nn.Linear(input_dim, output_dim))
            else:
                self.linears_prediction.append(
                    nn.Linear(hidden_dim, output_dim))

        self.drop = nn.Dropout(final_dropout)

        if graph_pooling_type == 'sum':
            self.pool = SumPooling()
        elif graph_pooling_type == 'mean':
            self.pool = AvgPooling()
        elif graph_pooling_type == 'max':
            self.pool = MaxPooling()
        else:
            raise NotImplementedError

    def forward(self, g, h):
        # list of hidden representation at each layer (including input)
        hidden_rep = [h]

        for i in range(self.num_layers - 1):
            h = self.ginlayers[i](g, h)
            h = self.batch_norms[i](h)
            h = F.relu(h)
            hidden_rep.append(h)

        score_over_layer = 0

        # perform pooling over all nodes in each graph in every layer
        for i, h in enumerate(hidden_rep):
            pooled_h = self.pool(g, h)
            score_over_layer += self.drop(self.linears_prediction[i](pooled_h))

        return score_over_layer

In [34]:
import torch.optim as optim
from dgl.dataloading import GraphDataLoader
data_loader = GraphDataLoader(X_train_dataset, batch_size=8, shuffle=True)

In [35]:
model = GIN(num_layers = 2, num_mlp_layers =1, input_dim =1 , hidden_dim =64,
                 output_dim = 2, final_dropout = 0.2, learn_eps= False, graph_pooling_type='max',
                 neighbor_pooling_type='sum')
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
model.train()

GIN(
  (ginlayers): ModuleList(
    (0): GINConv(
      (apply_func): ApplyNodeFunc(
        (mlp): MLP(
          (linear): Linear(in_features=1, out_features=64, bias=True)
        )
        (bn): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (batch_norms): ModuleList(
    (0): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (linears_prediction): ModuleList(
    (0): Linear(in_features=1, out_features=2, bias=True)
    (1): Linear(in_features=64, out_features=2, bias=True)
  )
  (drop): Dropout(p=0.2, inplace=False)
  (pool): MaxPooling()
)

In [36]:
epoch_losses = []
for epoch in range(100):
    epoch_loss = 0
    for iter, (bg, label) in enumerate(data_loader):
        prediction = model(bg, bg.ndata['h'])
        loss = loss_func(prediction, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.detach().item()
    epoch_loss /= (iter + 1)
    print('Epoch {}, loss {:.4f}'.format(epoch, epoch_loss))
    epoch_losses.append(epoch_loss)

Epoch 0, loss 3.0358
Epoch 1, loss 1.5846
Epoch 2, loss 1.5506
Epoch 3, loss 2.8701
Epoch 4, loss 2.8756
Epoch 5, loss 2.6211
Epoch 6, loss 2.3613
Epoch 7, loss 1.2559
Epoch 8, loss 1.2450
Epoch 9, loss 1.0349
Epoch 10, loss 1.4873
Epoch 11, loss 1.0463
Epoch 12, loss 1.9234
Epoch 13, loss 1.0191
Epoch 14, loss 1.0772
Epoch 15, loss 0.9955
Epoch 16, loss 0.8957
Epoch 17, loss 1.3114
Epoch 18, loss 0.9493
Epoch 19, loss 0.8798
Epoch 20, loss 0.9551
Epoch 21, loss 0.9870
Epoch 22, loss 1.1008
Epoch 23, loss 0.8671
Epoch 24, loss 0.7587
Epoch 25, loss 0.9818
Epoch 26, loss 0.7713
Epoch 27, loss 1.0568
Epoch 28, loss 0.7917
Epoch 29, loss 0.7717
Epoch 30, loss 1.0259
Epoch 31, loss 0.7173
Epoch 32, loss 0.8181
Epoch 33, loss 1.0393
Epoch 34, loss 1.4695
Epoch 35, loss 0.8646
Epoch 36, loss 0.8665
Epoch 37, loss 0.8079
Epoch 38, loss 0.9221
Epoch 39, loss 0.7838
Epoch 40, loss 0.7253
Epoch 41, loss 0.8450
Epoch 42, loss 0.8226
Epoch 43, loss 0.7371
Epoch 44, loss 0.7370
Epoch 45, loss 0.741

In [37]:
X_test_dataset = FakeNewsDataset(X_test,y_test)

  self.labels = torch.LongTensor(self.labels)


In [38]:
model.eval()
# Convert a list of tuples to two lists
test_X, test_Y = map(list, zip(*X_test_dataset))
test_bg = dgl.batch(test_X)
test_Y = torch.tensor(test_Y).float().view(-1, 1)
probs_Y = torch.softmax(model(test_bg,test_bg.ndata['h']), 1)
sampled_Y = torch.multinomial(probs_Y, 1)
argmax_Y = torch.max(probs_Y, 1)[1].view(-1, 1)
print('Accuracy of sampled predictions on the test set: {:.4f}%'.format(
    (test_Y == sampled_Y.float()).sum().item() / len(test_Y) * 100))
print('Accuracy of argmax predictions on the test set: {:4f}%'.format(
    (test_Y == argmax_Y.float()).sum().item() / len(test_Y) * 100))

Accuracy of sampled predictions on the test set: 45.2381%
Accuracy of argmax predictions on the test set: 55.555556%
