In [1]:
import gzip
import pickle
import networkx as nx
import pandas as pd
import numpy as np
import random
import os

In [2]:
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau

  from .autonotebook import tqdm as notebook_tqdm


## Data Loading

### Writing PyG Dataset

In [3]:
import torch

In [4]:
from torch_geometric.data import InMemoryDataset
from torch.utils.data import DataLoader

In [5]:
from torch_geometric.utils import from_networkx, to_networkx

In [6]:
def clustering_coefficient(G, node):
    ns = [n for n in G.neighbors(node)]
    if len(ns) <= 1:
        return 0
    
    numerator = 0
    denominator = len(ns) * (len(ns) - 1) / 2
    for i in range(0, len(ns)):
        for j in range(i+1, len(ns)):
            n1, n2 = ns[i], ns[j]
            numerator += G.has_edge(n1, n2)
    
    return numerator / denominator
                

In [7]:
def generate_feature_vector(G):
    x = torch.randn(G.number_of_nodes(), 4)
    ind = 0
    for node in G.nodes():
        x[ind][0] = 1 # uniform
        x[ind][1] = G.degree[node] # node degree as a scalar 
        x[ind][2] = clustering_coefficient(G, node) # triangle counting?
        x[ind][3] = ind # node ID features
        ind += 1
    return x

In [8]:
class LamanDataset(InMemoryDataset):
    def __init__(self, root, data_dir, transform=None, pre_transform=None, pre_filter=None):
        self.data_dir = data_dir
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    @property
    def processed_file_names(self):
        return ['data.pt']
        
    def process(self):
        total_laman_data = None
        with gzip.open(self.data_dir, 'r') as f:
            total_laman_data = pickle.load(f)
            
        data_list = []
        for ind, graph in enumerate(total_laman_data[0]):
            x = generate_feature_vector(graph)
            graph_as_data = from_networkx(graph)
            graph_as_data.x = x
            graph_as_data.label = 0
            data_list.append(graph_as_data)
            
        for ind, graph in enumerate(total_laman_data[1]):
            x = generate_feature_vector(graph)
            graph_as_data = from_networkx(graph)
            graph_as_data.x = x
            graph_as_data.label = 1
            data_list.append(graph_as_data)
            
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [9]:
DATA_PATH = "data/custom-generated.pkl.gz"

In [10]:
laman_data = LamanDataset("", DATA_PATH)

In [11]:
laman_data[0]

Data(edge_index=[2, 64], x=[15, 4], label=[1], num_nodes=15)

## Split into Train / Test

In [12]:
from torch.utils.data import random_split

proportions = [.7, .3]
lengths = [int(p * len(laman_data)) for p in proportions]
lengths[-1] = len(laman_data) - sum(lengths[:-1])

generator1 = torch.Generator().manual_seed(42)
train_data, test_data = random_split(laman_data, lengths, generator=generator1)

In [13]:
from torch_geometric.loader import DataLoader
train_loader = DataLoader(train_data, batch_size = 256, shuffle=True)
test_loader = DataLoader(test_data, batch_size = 256, shuffle=True)

In [14]:
print("Number of train batches: ", len(train_loader))
print("Number of test batches: ", len(test_loader))

Number of train batches:  27
Number of test batches:  12


In [15]:
for data in train_loader:
    print(data)
    break

DataBatch(edge_index=[2, 15082], x=[3840, 4], label=[256], num_nodes=3840, batch=[3840], ptr=[257])


## Model Architecture & Training

In [16]:
from gin.gin_k_layers import GIN

In [21]:
model = GIN(layers = 2, num_features=4, dim_h=5)
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

GIN(
  (conv_layers): ModuleList(
    (0): GINConv(nn=Sequential(
      (0): Linear(in_features=4, out_features=5, bias=True)
      (1): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=5, out_features=5, bias=True)
      (4): ReLU()
    ))
    (1): GINConv(nn=Sequential(
      (0): Linear(in_features=5, out_features=5, bias=True)
      (1): BatchNorm1d(5, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Linear(in_features=5, out_features=5, bias=True)
      (4): ReLU()
    ))
  )
  (lin1): Linear(in_features=30, out_features=30, bias=True)
  (lin2): Linear(in_features=30, out_features=1, bias=True)
)
Number of parameters:  1096


In [22]:
from torch_geometric.data import DataLoader
from torch.nn import BCELoss
import warnings
warnings.filterwarnings("ignore")

loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),
                                      lr=0.001)

# scheduler = ReduceLROnPlateau(optimizer, 'min', min_lr=1e-6, verbose=True, patience=10)

In [23]:
def train(model, data, features_to_use):
    ind = 0
    for batch in data:
        optimizer.zero_grad()
        pred, embedding = model(batch.x[:, features_to_use], batch.edge_index, batch.batch)
        pred = torch.squeeze(pred)
        loss = loss_fn(pred.float(), batch.label.float())
        loss.backward()
        optimizer.step()
        ind += 1

    return loss, model

In [24]:
def check_accuracy(model, loader, features_to_use):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for batch in loader:
            y = batch.label
            batch.label = 0
            pred, embedding = model(batch.x[:, features_to_use], batch.edge_index, batch.batch)
            pred = torch.squeeze(pred)
            predictions = (pred > 0.5).long() 
            num_correct += (predictions == y).sum() 
            num_samples += predictions.size(0)
            
    return float(num_correct)/float(num_samples)*100

In [None]:
print("Starting training...")
losses = []

bestModel, highestAcc = None, 0

for epoch in range(600):
    loss, model = train(model, train_loader, [0, 1, 2, 3])
    losses.append(loss)
    print(f"Epoch {epoch} | Train loss {loss}")
    train_acc, test_acc = check_accuracy(model, train_loader, [0, 1, 2, 3]), check_accuracy(model, test_loader, [0, 1, 2, 3])
    print(f"Train Accuracy {train_acc} | Test Accuracy {test_acc}")
    
    if test_acc > highestAcc:
        highestAcc = test_acc
        bestModel = model

Starting training...
Epoch 0 | Train loss 0.6849928498268127
Train Accuracy 55.693950177935946 | Test Accuracy 55.482531995849186
Epoch 1 | Train loss 0.6633871793746948
Train Accuracy 55.693950177935946 | Test Accuracy 55.482531995849186
Epoch 2 | Train loss 0.6894745826721191
Train Accuracy 55.63463819691577 | Test Accuracy 55.55171221030785
Epoch 3 | Train loss 0.6548170447349548
Train Accuracy 57.23606168446026 | Test Accuracy 56.62400553441715
Epoch 4 | Train loss 0.6661596894264221
Train Accuracy 56.86536180308423 | Test Accuracy 57.6271186440678
Epoch 5 | Train loss 0.676677942276001
Train Accuracy 57.725385527876625 | Test Accuracy 57.59252853683846
Epoch 6 | Train loss 0.6616142392158508
Train Accuracy 57.87366548042705 | Test Accuracy 57.6271186440678
Epoch 7 | Train loss 0.7219956517219543
Train Accuracy 58.377817319098455 | Test Accuracy 58.42269111034244
Epoch 8 | Train loss 0.6362115144729614
Train Accuracy 59.14887307236062 | Test Accuracy 59.25285368384642
Epoch 9 | Tra

Epoch 76 | Train loss 0.2874738574028015
Train Accuracy 87.5741399762752 | Test Accuracy 86.40608785887235
Epoch 77 | Train loss 0.31146079301834106
Train Accuracy 87.76690391459074 | Test Accuracy 87.23625043237634
Epoch 78 | Train loss 0.30428311228752136
Train Accuracy 87.79655990510084 | Test Accuracy 87.44379107575233
Epoch 79 | Train loss 0.22341381013393402
Train Accuracy 87.97449584816133 | Test Accuracy 86.99411968177101
Epoch 80 | Train loss 0.30001357197761536
Train Accuracy 87.95966785290629 | Test Accuracy 87.37461086129366
Epoch 81 | Train loss 0.3173699378967285
Train Accuracy 87.90035587188612 | Test Accuracy 87.40920096852301
Epoch 82 | Train loss 0.2565793991088867
Train Accuracy 88.1376037959668 | Test Accuracy 87.23625043237634
Epoch 83 | Train loss 0.24362696707248688
Train Accuracy 88.22657176749703 | Test Accuracy 87.47838118298166
Epoch 84 | Train loss 0.2315070778131485
Train Accuracy 88.5379596678529 | Test Accuracy 87.96264268419232
Epoch 85 | Train loss 0.28

Train Accuracy 91.51838671411625 | Test Accuracy 90.66067104808025
Epoch 152 | Train loss 0.20587629079818726
Train Accuracy 91.65183867141162 | Test Accuracy 90.52231061916291
Epoch 153 | Train loss 0.28129324316978455
Train Accuracy 91.57769869513642 | Test Accuracy 90.38395019024559
Epoch 154 | Train loss 0.22043509781360626
Train Accuracy 91.31079478054566 | Test Accuracy 90.2801798685576
Epoch 155 | Train loss 0.1489722579717636
Train Accuracy 91.1773428232503 | Test Accuracy 90.38395019024559
Epoch 156 | Train loss 0.1918436884880066
Train Accuracy 91.50355871886121 | Test Accuracy 90.48772051193359
Epoch 157 | Train loss 0.1510874629020691
Train Accuracy 91.34045077105574 | Test Accuracy 90.41854029747492
Epoch 158 | Train loss 0.24050162732601166
Train Accuracy 91.59252669039147 | Test Accuracy 90.62608094085091
Epoch 159 | Train loss 0.14495502412319183
Train Accuracy 91.32562277580071 | Test Accuracy 90.72985126253892
Epoch 160 | Train loss 0.15313830971717834
Train Accuracy 

Train Accuracy 91.80011862396204 | Test Accuracy 90.55690072639226
Epoch 227 | Train loss 0.16418956220149994
Train Accuracy 91.63701067615658 | Test Accuracy 90.93739190591491
Epoch 228 | Train loss 0.22120310366153717
Train Accuracy 92.12633451957295 | Test Accuracy 90.97198201314424
Epoch 229 | Train loss 0.20863653719425201
Train Accuracy 91.77046263345196 | Test Accuracy 90.52231061916291
Epoch 230 | Train loss 0.16627417504787445
Train Accuracy 91.94839857651246 | Test Accuracy 90.76444136976825
Epoch 231 | Train loss 0.19830092787742615
Train Accuracy 91.99288256227757 | Test Accuracy 90.55690072639226
Epoch 232 | Train loss 0.1661868691444397
Train Accuracy 91.45907473309609 | Test Accuracy 90.97198201314424
Epoch 233 | Train loss 0.1576758474111557
Train Accuracy 91.85943060498221 | Test Accuracy 90.55690072639226
Epoch 234 | Train loss 0.16058875620365143
Train Accuracy 91.63701067615658 | Test Accuracy 90.79903147699758
Epoch 235 | Train loss 0.17483459413051605
Train Accura

In [None]:
# generate a twenty node graph

In [None]:
# generate fully connected 11-node graph

In [None]:
# generate an 20-node graph
n = 30

In [None]:
g = nx.Graph()

In [None]:
for i in range(0, 30):
    j = (np.random.randint(0, 30))
    while j == i:
        j = (np.random.randint(0, 30))
        
    print(j == i)
    g.add_edge(i, j)

In [None]:
all_possible_edges = set()
for i in range(0, 30):
    for j in range(i+1, 30):
        if not g.has_edge(i, j):
            all_possible_edges.add((i,j))

In [None]:
print(all_possible_edges)

In [None]:
import random 
non_edges_sample_order = random.sample(all_possible_edges, 57 - 31)

In [None]:
for (u,v) in non_edges_sample_order:
    g.add_edge(u, v)

In [None]:
print(g.number_of_edges())

In [None]:
model.eval()
with torch.no_grad():
    graph_as_data = from_networkx(g)
    graph_as_data.x = generate_feature_vector(g)
    validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)
    for batch in validation_set:
        pred = model(batch.x[:, [0]], batch.edge_index, batch.batch)
        print(pred)


In [None]:
check_accuracy(model, test_loader, [0])

In [None]:
check_accuracy(model, train_loader, [0, 1, 2, 3])

In [None]:
# sanity check

In [None]:
# test on best model

In [None]:
# square
import networkx as nx
square = nx.Graph()
square.add_edge(0, 1)
square.add_edge(1, 3)
square.add_edge(0, 2)
square.add_edge(2, 3)

In [None]:
# square with cross bar (rigid)
import networkx as nx
square_bar = nx.Graph()
square_bar.add_edge(0, 1)
square_bar.add_edge(1, 3)
square_bar.add_edge(0, 2)
square_bar.add_edge(2, 3)
square_bar.add_edge(0, 3)
square_bar.add_edge(1, 2)

In [None]:
# triangle
import networkx as nx
triangle = nx.Graph()
triangle.add_edge(0, 1)
triangle.add_edge(0, 2)
triangle.add_edge(1, 2)

In [None]:
# pentagon
import networkx as nx
pentagon = nx.Graph()
pentagon.add_edge(0, 1)
pentagon.add_edge(1, 3)
pentagon.add_edge(3, 4)
pentagon.add_edge(4, 2)
pentagon.add_edge(2, 0)

In [None]:
toy_problems = [square, square_bar, triangle, pentagon]
labels = [1, 0, 0, 1]

for index, toy_problem in enumerate(toy_problems):
    model.eval()
    with torch.no_grad():
        graph_as_data = from_networkx(toy_problem)
        graph_as_data.x = generate_feature_vector(toy_problem)
#         graph_as_data.label = labels[index]
        validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)
        for batch in validation_set:
            pred = model(batch.x[:, [0, 1, 2, 3]], batch.edge_index, batch.batch)
            print(pred)
    

In [None]:
graph_as_data

In [None]:
validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)

In [None]:
for batch in validation_set:
    pred = bestModel(batch.x, batch.edge_index, batch.batch)
    print(pred[0])
    print(pred[1])

In [None]:
class LamanTestDataset(InMemoryDataset):
    def __init__(self, root, data_dir, transform=None, pre_transform=None, pre_filter=None):
        self.data_dir = data_dir
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    @property
    def processed_file_names(self):
        return ['data_test.pt']
        
    def process(self):
        # processing code here
        total_laman_data = None
        with gzip.open(self.data_dir, 'r') as f:
            total_laman_data = pickle.load(f)
            
        data_list = []
        ind = 0
        # convert from graph to Data object
        for graph in total_laman_data[0]:
#             print(ind)
            ind += 1
            num_nodes = nx.number_of_nodes(graph)
#             x = torch.randn(num_nodes, 1)
            x = generate_feature_vector(graph)
            graph_as_data = from_networkx(graph)
            graph_as_data.x = x
            graph_as_data.label = 0
            data_list.append(graph_as_data)
            
        ind = 0
        for graph in total_laman_data[1]:
#             print(ind)
            ind += 1
            num_nodes = nx.number_of_nodes(graph)
#             x = torch.randn(num_nodes, 64)
            x = generate_feature_vector(graph)
            graph_as_data = from_networkx(graph)
            graph_as_data.x = x
            graph_as_data.label = 1
            data_list.append(graph_as_data)
            
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# add functionality to support a test dataset
TEST_DATA_PATH = "../data-2d/data/test-dataset-30loc-5std.pkl.gz"
laman_test_set = LamanTestDataset("", TEST_DATA_PATH)

In [None]:
from torch_geometric.loader import DataLoader
laman_test_loader = DataLoader(laman_test_set, batch_size = 2, shuffle=True)

In [None]:
random_test_acc = check_accuracy(bestModel, laman_test_loader)
print(f"Accuracy {random_test_acc}")

In [None]:
# generate statistics on the data

In [None]:
# test the clustering coefficient

In [None]:
clustering_coefficient(square, 0)

In [None]:
clustering_coefficient(triangle, 0)

In [None]:
clustering_coefficient(square_bar, 0)

In [None]:
# generate graph correlating clustering coefficient to rigidity

In [None]:
for item in train_data:
    item = to_networkx(item)
    print(type(item))
    
    break

In [None]:
torch_geometric.utils.convert.to_networkx()

# Scratch Work: Sahil

In [None]:
total_laman_data = None
with gzip.open(DATA_PATH, 'r') as f:
    total_laman_data = pickle.load(f)

In [None]:
sample_graph = total_laman_data[0][0]

In [None]:
print(type(sample_graph))

In [None]:
to_data = from_networkx(sample_graph)
from_data = to_networkx(to_data, to_undirected = True)

In [None]:
print(type(from_data))

In [None]:
def compute_min_clustering_coefficient(G):
    min_coefficient = 1
    for node in G.nodes():
        min_coefficient = min(min_coefficient, clustering_coefficient(G, node))
        
    return min_coefficient

In [None]:
for index, sample_graph in enumerate(train_data):
    label = sample_graph.label
    networkx_sample_graph = to_networkx(sample_graph, to_undirected = True)
    print(label, " ", index, " ", compute_min_clustering_coefficient(networkx_sample_graph))
    
    if index == 10:
        break

In [None]:
print(compute_min_clustering_coefficient(from_data))

In [None]:
compute_min_clustering_coefficient(triangle)

In [None]:
# what if instead of training a gnn – you just trained on 

In [None]:
# train a binary classifier on just the degrees of the nodes



In [None]:
# train a binary classifier on just the triangle feature

In [None]:
# train a binary classifier on the triangle features and the degree

In [None]:
# train a network with just degree of the node 

In [None]:
model_just_degree = GIN(num_features=1)
print(model_just_degree)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

In [None]:
print("Starting training...")
losses = []

bestModel, highestAcc = None, 0

for epoch in range(1000):
    loss, h = train(train_loader, [0])
    losses.append(loss)
    print(f"Epoch {epoch} | Train loss {loss}")
    train_acc, test_acc = check_accuracy(model_just_degree, train_loader), check_accuracy(model_just_degree, test_loader)
    print(f"Train Accuracy {train_acc} | Test Accuracy {test_acc}")
#     scheduler.step(test_acc)
    
    if test_acc > highestAcc:
        highestAcc = test_acc
        bestModel = model

In [None]:
generate_feature_vector(sample_graph)

In [None]:
sample_graph.x[:, [0, 1]]