In [8]:
import snap
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import pandas as pd  # To read data
from math import log

import csv
import webbrowser

import torch
import torch.nn as nn
import torch.nn.functional as F
import networkx as nx
from dgl import DGLGraph
import dgl.function as fn
from functools import partial

In [None]:
from load_datasets import load_banned
from load_datasets import load_graph
from load_datasets import get_banned_ids, banned_sub_in_G, get_random_subgraph
import random

DATASET = 'title'
# Seed for Python's global `random` generator so the train/test split below is
# identical on every "Restart Kernel -> Run All".
SEED = 42

# lists of strings, which are the names of banned subreddits to train and test on
TESTING, TRAINING = [], []

if __name__ == '__main__':
  random.seed(SEED)
  # load graph and maps that are used to map ids (int) to subreddit names (str)
  G, id_to_subreddit, subreddit_to_id = load_graph(DATASET)
  # load banned subreddits [str]
  banned = load_banned()
  # get banned subreddits that appear in G
  banned = banned_sub_in_G(G, id_to_subreddit, banned)
  # get list of random sample of banned subreddits to train on (3/4 of them).
  # Sampling from a sorted copy makes the draw depend only on SEED, not on
  # whatever order the helper happened to return the names in.
  TRAINING = random.sample(sorted(banned), int(len(banned) * 3 / 4))
  # rest of subreddits are for testing. sorted() gives a stable order; the
  # iteration order of a plain set difference can change between runs.
  TESTING = sorted(set(banned) - set(TRAINING))
  # generate subgraph of G
  # NOTE(review): get_random_subgraph is presumably stochastic as well; it is
  # only reproducible if it draws from the global `random` generator - confirm.
  G_new = get_random_subgraph(G, get_banned_ids(TRAINING, subreddit_to_id))
  print('number of nodes in random subgraph: ', G_new.GetNodes())

In [5]:
class RGCNLayer(nn.Module):
    """One relational graph convolution layer that updates ``g.ndata['h']``.

    Every edge sends a message computed from its source node and the weight
    matrix of the edge's relation type, scaled by ``edges.data['norm']``.
    Messages are summed per destination node into ``'h'``; an optional bias
    and activation are then applied.

    Parameters
    ----------
    in_feat : int
        Input feature size (for an input layer: the number of node ids).
    out_feat : int
        Output feature size.
    num_rels : int
        Number of relation types; values of ``edges.data['rel_type']`` index
        into the per-relation weights.
    num_bases : int
        Number of weight bases. Values ``<= 0`` or ``> num_rels`` fall back to
        ``num_rels``, i.e. one independent weight matrix per relation.
    bias : bool or None
        When truthy, a learnable bias of size ``out_feat`` is added.
    activation : callable or None
        Applied to the aggregated node features when given.
    is_input_layer : bool
        When true, the matrix multiply is replaced by an embedding lookup
        keyed on ``edges.src['id']``.
    """

    def __init__(self, in_feat, out_feat, num_rels, num_bases=-1, bias=None,
                 activation=None, is_input_layer=False):
        super(RGCNLayer, self).__init__()
        self.in_feat = in_feat
        self.out_feat = out_feat
        self.num_rels = num_rels
        self.num_bases = num_bases
        self.activation = activation
        self.is_input_layer = is_input_layer

        # sanity check
        if self.num_bases <= 0 or self.num_bases > self.num_rels:
            self.num_bases = self.num_rels

        relu_gain = nn.init.calculate_gain('relu')

        # weight bases in equation (3)
        self.weight = nn.Parameter(torch.Tensor(self.num_bases, self.in_feat,
                                                self.out_feat))
        nn.init.xavier_uniform_(self.weight, gain=relu_gain)

        if self.num_bases < self.num_rels:
            # linear combination coefficients in equation (3)
            self.w_comp = nn.Parameter(torch.Tensor(self.num_rels, self.num_bases))
            nn.init.xavier_uniform_(self.w_comp, gain=relu_gain)

        # The bias is tested with `is not None` everywhere: the truth value of
        # a multi-element tensor is ambiguous and raises. It starts at zero
        # because Xavier initialisation is undefined for 1-D tensors.
        if bias:
            self.bias = nn.Parameter(torch.zeros(out_feat))
        else:
            self.bias = None

    def _relation_weights(self):
        """Return the (num_rels, in_feat, out_feat) weights, one matrix per relation."""
        if self.num_bases < self.num_rels:
            # Equation (3): W_r = sum_b w_comp[r, b] * V_b. Flattening each
            # basis to a row keeps the basis axis first, so the product mixes
            # whole basis matrices instead of reinterpreting the memory layout.
            bases = self.weight.view(self.num_bases, self.in_feat * self.out_feat)
            return torch.matmul(self.w_comp, bases).view(
                self.num_rels, self.in_feat, self.out_feat)
        return self.weight

    def forward(self, g):
        """Run one round of message passing on ``g``; the result is left in ``g.ndata['h']``."""
        weight = self._relation_weights()

        if self.is_input_layer:
            def message_func(edges):
                # for input layer, matrix multiply can be converted to be
                # an embedding lookup using source node id
                embed = weight.view(-1, self.out_feat)
                index = edges.data['rel_type'] * self.in_feat + edges.src['id']
                return {'msg': embed[index] * edges.data['norm']}
        else:
            def message_func(edges):
                w = weight[edges.data['rel_type']]
                # squeeze(1) drops only the axis added by unsqueeze(1); a bare
                # squeeze() would also drop the edge axis for a single edge and
                # the feature axis when out_feat == 1.
                msg = torch.bmm(edges.src['h'].unsqueeze(1), w).squeeze(1)
                msg = msg * edges.data['norm']
                return {'msg': msg}

        def apply_func(nodes):
            h = nodes.data['h']
            if self.bias is not None:
                h = h + self.bias
            if self.activation:
                h = self.activation(h)
            return {'h': h}

        g.update_all(message_func, fn.sum(msg='msg', out='h'), apply_func)

In [9]:
class Model(nn.Module):
    """A stack of RGCNLayer modules applied in sequence to one graph.

    The stack is: one input layer (num_nodes -> h_dim, ReLU),
    ``num_hidden_layers`` hidden layers (h_dim -> h_dim, ReLU) and one output
    layer (h_dim -> out_dim, softmax over dim 1). Node features are the
    integer ids ``0 .. num_nodes - 1``.
    """

    def __init__(self, num_nodes, h_dim, out_dim, num_rels,
                 num_bases=-1, num_hidden_layers=1):
        super(Model, self).__init__()
        self.num_nodes, self.h_dim, self.out_dim = num_nodes, h_dim, out_dim
        self.num_rels, self.num_bases = num_rels, num_bases
        self.num_hidden_layers = num_hidden_layers

        # Layers first, then the per-node input features.
        self.build_model()
        self.features = self.create_features()

    def build_input_layer(self):
        """Layer mapping node ids to h_dim features."""
        return RGCNLayer(in_feat=self.num_nodes, out_feat=self.h_dim,
                         num_rels=self.num_rels, num_bases=self.num_bases,
                         activation=F.relu, is_input_layer=True)

    def build_hidden_layer(self):
        """Layer mapping h_dim features to h_dim features."""
        return RGCNLayer(in_feat=self.h_dim, out_feat=self.h_dim,
                         num_rels=self.num_rels, num_bases=self.num_bases,
                         activation=F.relu)

    def build_output_layer(self):
        """Layer mapping h_dim features to out_dim softmax outputs."""
        return RGCNLayer(in_feat=self.h_dim, out_feat=self.out_dim,
                         num_rels=self.num_rels, num_bases=self.num_bases,
                         activation=partial(F.softmax, dim=1))

    def build_model(self):
        """Create ``self.layers`` in order: input, hidden..., output."""
        stack = [self.build_input_layer()]
        stack.extend(self.build_hidden_layer()
                     for _ in range(self.num_hidden_layers))
        stack.append(self.build_output_layer())
        self.layers = nn.ModuleList(stack)

    def create_features(self):
        """Return one integer id per node, ``0 .. num_nodes - 1``."""
        return torch.arange(self.num_nodes)

    def forward(self, g):
        """Run every layer on ``g`` and return (and remove) ``g.ndata['h']``."""
        node_ids = self.features
        if node_ids is not None:
            g.ndata['id'] = node_ids
        for rgcn in self.layers:
            rgcn(g)
        return g.ndata.pop('h')

In [14]:
# Two-column tables for the "title" dataset; judging by the file and column
# names these are the hyperlink edge list and the subreddit name/id lookup.
# Passing `names` makes pandas treat the first line of each file as data.
edges = pd.read_csv(
    'title-data/snap-redditHyperlinks-title.csv',
    names=["source", "destination"],
)
convertName2Id = pd.read_csv(
    'title-data/snap-subreddit-ids-title.csv',
    names=["name", "id"],
)

In [13]:
# Read the cleaned test-data CSV into a DataFrame and preview its first rows.
# This rebinds `banned`, which the split cell earlier bound to the value
# returned by banned_sub_in_G.
banned = pd.read_csv(filepath_or_buffer='gcntestdata-cleaned.csv')
banned.head(n=5)

Unnamed: 0,Name
0,Beatingwomen
1,Braincels
2,CreepShots
3,CringeAnarchy
4,DarkNetMarkets


In [15]:
def makeGraph(edges, N):
    """
    Build an undirected SNAP graph from a table of edges.

    :param - edges: table with 'source' and 'destination' columns whose values
                    convert to int node ids, one row per edge (e.g. a pandas
                    DataFrame)
    :param - N: number of nodes; nodes with ids 0..N-1 are always created

    An endpoint whose id is not among 0..N-1 is added as a node before its
    edge is inserted, so an undersized N no longer makes AddEdge fail on a
    missing node.

    return type: snap.PUNGraph
    """
    G1 = snap.TUNGraph.New()
    for i in range(N):
        G1.AddNode(i)
    # Walk the two columns directly: iterrows() builds a Series for every row,
    # which is needlessly slow for a large edge list.
    for src, dst in zip(edges['source'], edges['destination']):
        a = int(src)
        b = int(dst)
        for node_id in (a, b):
            if not G1.IsNode(node_id):
                G1.AddNode(node_id)
        G1.AddEdge(a, b)
    Graph = snap.ConvertGraph(snap.PUNGraph, G1)
    return Graph


In [16]:
# makeGraph's second argument is the number of nodes, not the number of edge
# rows. Size the graph from the largest id in `edges` so every endpoint exists
# as a node without creating one node per edge row.
# Assumes node ids are non-negative integers - TODO confirm against the CSV.
num_graph_nodes = (
    int(max(edges['source'].max(), edges['destination'].max())) + 1
    if len(edges) else 0
)
G = makeGraph(edges, num_graph_nodes)

In [None]:
# load graph data
# NOTE(review): numpy is already imported in the first cell; this re-import is
# redundant.
import numpy as np
# data = load_data(dataset='aifb')
# NOTE(review): with the load_data call above commented out, `data` is not
# defined in any visible cell, so the reads of data.labels, data.edge_type and
# data.edge_norm below raise NameError on a fresh kernel.
num_nodes = G_new.GetNodes()
# NOTE(review): this stores the number of edges of G_new under the name
# num_rels. RGCNLayer sizes its weight by num_rels/num_bases, so an edge count
# here looks unintended - confirm what the relation types should be.
num_rels = G_new.GetEdges()
num_classes = 2
labels = data.labels
# NOTE(review): TRAINING and TESTING are described above as lists of subreddit
# name strings, so train_idx / val_idx / test hold names rather than integer
# node indices - confirm before using them to index tensors.
train_idx = TRAINING
test =  TESTING
# split training and validation set
# The first fifth of TRAINING becomes the validation set; the remainder stays
# in the training set.
val_idx = TRAINING[:len(TRAINING) // 5]
train_idx = train_idx[len(TRAINING) // 5:]

# edge type and normalization factor
# unsqueeze(1) appends a trailing axis to the normalisation factors.
edge_type = torch.from_numpy(data.edge_type)
edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)

# Flatten the labels to one dimension.
labels = torch.from_numpy(labels).view(-1)