In [None]:
import copy
import gzip
import os
import pickle
import random

import networkx as nx
import numpy as np
import pandas as pd
from pebble import lattice

In [2]:
from torch.optim import SGD
from torch.optim.lr_scheduler import ReduceLROnPlateau

  from .autonotebook import tqdm as notebook_tqdm


## Data Loading

### Writing PyG Dataset

In [3]:
import torch

In [4]:
from torch_geometric.data import InMemoryDataset
from torch.utils.data import DataLoader

In [5]:
from torch_geometric.utils import from_networkx, to_networkx

In [6]:
def clustering_coefficient(G, node):
    """Return the local clustering coefficient of ``node`` in graph ``G``.

    The coefficient is the fraction of unordered neighbour pairs of ``node``
    that are themselves joined by an edge.

    Args:
        G: graph object exposing ``neighbors(node)`` and ``has_edge(u, v)``.
        node: the node whose neighbourhood is inspected.

    Returns:
        ``0`` when the node has fewer than two neighbours, otherwise a float
        ``linked_pairs / possible_pairs``.
    """
    neighbours = list(G.neighbors(node))
    degree = len(neighbours)
    # With fewer than two neighbours there is no pair to test.
    if degree < 2:
        return 0

    possible_pairs = degree * (degree - 1) / 2
    linked_pairs = 0
    # Visit each unordered pair once: pair every neighbour with those after it.
    for position, first in enumerate(neighbours):
        for second in neighbours[position + 1:]:
            linked_pairs += G.has_edge(first, second)

    return linked_pairs / possible_pairs
                

In [7]:
def generate_feature_vector(G):
    """Build the per-node feature matrix for graph ``G``.

    Columns, in order:
        0: constant 1 (uniform feature)
        1: node degree
        2: local clustering coefficient of the node
        3: position of the node in ``G.nodes()`` iteration order (node ID feature)

    Args:
        G: graph exposing ``number_of_nodes()``, ``nodes()`` and ``degree``.

    Returns:
        Tensor with one row per node and 4 columns.
    """
    # Every entry is overwritten below; randn only supplies the buffer.
    features = torch.randn(G.number_of_nodes(), 4)
    for row, node in enumerate(G.nodes()):
        features[row, 0] = 1
        features[row, 1] = G.degree[node]
        features[row, 2] = clustering_coefficient(G, node)
        features[row, 3] = row
    return features

In [8]:
class LamanDataset(InMemoryDataset):
    """In-memory PyG dataset built from a gzipped pickle of networkx graphs.

    The pickle is indexed as ``[0]`` and ``[1]``; graphs under index 0 get
    ``label = 0`` and graphs under index 1 get ``label = 1``. Elsewhere in this
    notebook label 0 is treated as rigid and label 1 as flexible.
    """

    def __init__(self, root, data_dir, transform=None, pre_transform=None, pre_filter=None):
        """Store the raw-data path, initialise the base class and load the processed file.

        Args:
            root: dataset root directory handed to ``InMemoryDataset``.
            data_dir: path of the gzipped pickle read by ``process``.
            transform, pre_transform, pre_filter: forwarded to the base class.
        """
        # Assigned before the base constructor runs, since process() reads it
        # (presumably the base class calls process() when data.pt is missing).
        self.data_dir = data_dir
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        """Name of the single processed file produced by ``process``."""
        return ['data.pt']

    def process(self):
        """Convert every pickled graph to a ``Data`` object and save the collated result."""
        # NOTE: pickle.load runs arbitrary code; only use trusted data files.
        with gzip.open(self.data_dir, 'r') as f:
            total_laman_data = pickle.load(f)

        data_list = []
        # The group index doubles as the class label.
        for label in (0, 1):
            for graph in total_laman_data[label]:
                x = generate_feature_vector(graph)
                graph_as_data = from_networkx(graph)
                graph_as_data.x = x
                graph_as_data.label = label
                data_list.append(graph_as_data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [9]:
DATA_PATH = "data/custom-generated.pkl.gz"

In [10]:
laman_data = LamanDataset("", DATA_PATH)

In [11]:
laman_data[0]

Data(edge_index=[2, 58], x=[15, 4], label=[1], num_nodes=15)

## Split into Train / Test

In [12]:
from torch.utils.data import random_split

# 70% / 30% train/test split, expressed as fractions of the full dataset.
proportions = [.7, .3]
lengths = [int(p * len(laman_data)) for p in proportions]
# int() truncates, so the last split absorbs the remainder and the lengths
# always sum to len(laman_data), as random_split requires.
lengths[-1] = len(laman_data) - sum(lengths[:-1])

# A dedicated, seeded generator keeps the split reproducible across runs.
generator1 = torch.Generator().manual_seed(42)
train_data, test_data = random_split(laman_data, lengths, generator=generator1)

In [13]:
from torch_geometric.loader import DataLoader
train_loader = DataLoader(train_data, batch_size = 256, shuffle=True)
test_loader = DataLoader(test_data, batch_size = 256, shuffle=True)

In [14]:
print("Number of train batches: ", len(train_loader))
print("Number of test batches: ", len(test_loader))

Number of train batches:  244
Number of test batches:  105


In [15]:
for data in train_loader:
    print(data)
    break

DataBatch(edge_index=[2, 15466], x=[3840, 4], label=[256], num_nodes=3840, batch=[3840], ptr=[257])


## Model Architecture & Training

In [16]:
from basic_gcn.gcn import GCN

In [17]:
model = GCN(num_features=4)
print(model)
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

GCN(
  (initial_conv): GCNConv(4, 10)
  (out): Linear(in_features=20, out_features=20, bias=True)
  (out2): Linear(in_features=20, out_features=1, bias=True)
)
Number of parameters:  491


In [18]:
# Use the loader module's DataLoader: `torch_geometric.data.DataLoader` is the
# deprecated alias, and importing it here would rebind `DataLoader` away from
# the `torch_geometric.loader` class imported earlier in the notebook.
from torch_geometric.loader import DataLoader
from torch.nn import BCELoss
import warnings
# NOTE(review): this silences *all* warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Binary cross-entropy on probabilities: the model output is compared against
# 0.5 elsewhere in the notebook, so it is expected to lie in [0, 1].
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),
                                      lr=0.001)

# scheduler = ReduceLROnPlateau(optimizer, 'min', min_lr=1e-6, verbose=True, patience=10)

In [19]:
def train(data, features_to_use):
    """Run one optimisation pass over ``data`` with the notebook-level model.

    Uses the module-level ``model``, ``optimizer`` and ``loss_fn``.

    Args:
        data: iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``label``.
        features_to_use: column indices of ``batch.x`` fed to the model.

    Returns:
        ``(loss, None)`` where ``loss`` is the detached loss of the last batch
        processed (not an epoch average), or ``None`` if ``data`` yielded no
        batch.
    """
    model.train()
    # Stays None for an empty loader instead of raising UnboundLocalError.
    loss = None
    for batch in data:
        optimizer.zero_grad()
        pred, _ = model(batch.x[:, features_to_use], batch.edge_index, batch.batch)
        # Flatten to one value per graph. Unlike squeeze(), this stays 1-D for
        # a single-graph batch, so it still matches the 1-D label tensor.
        pred = pred.reshape(-1)
        loss = loss_fn(pred.float(), batch.label.float())
        loss.backward()
        optimizer.step()

    # Detach so callers that keep a loss history do not retain autograd state.
    return (loss.detach() if loss is not None else None), None

In [20]:
def check_accuracy(model, loader, features_to_use):
    """Return the classification accuracy (in percent) of ``model`` on ``loader``.

    Args:
        model: callable returning ``(pred, embedding)`` for
            ``(x, edge_index, batch)``; ``pred`` holds one probability per graph.
        loader: iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``label``.
        features_to_use: column indices of ``batch.x`` fed to the model.

    Returns:
        Percentage (0-100, float) of graphs whose thresholded prediction
        (``pred > 0.5``) equals the label.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for batch in loader:
            pred, _ = model(batch.x[:, features_to_use], batch.edge_index, batch.batch)
            # reshape(-1) rather than squeeze(): squeeze() turns a single-graph
            # batch into a 0-d tensor, on which .size(0) raises IndexError.
            pred = pred.reshape(-1)
            y = batch.label.reshape(-1)
            predictions = (pred > 0.5).long()
            num_correct += int((predictions == y).sum())
            num_samples += y.numel()

    return float(num_correct)/float(num_samples)*100

In [21]:
print("Starting training...")
losses = []

# bestModel keeps a frozen snapshot of the weights with the highest test accuracy.
# NOTE(review): selecting on test accuracy leaks the test set into model
# selection; a separate validation split would be cleaner.
bestModel, highestAcc = None, 0

for epoch in range(300):
    loss, _ = train(train_loader, [0, 1, 2, 3])
    losses.append(loss)
    print(f"Epoch {epoch} | Train loss {loss}")
    train_acc, test_acc = check_accuracy(model, train_loader, [0, 1, 2, 3]), check_accuracy(model, test_loader, [0, 1, 2, 3])
    print(f"Train Accuracy {train_acc} | Test Accuracy {test_acc}")
    
    if test_acc > highestAcc:
        highestAcc = test_acc
        # Deep copy: `model` keeps training in place, so a plain reference
        # would always point at the latest weights rather than the best ones.
        bestModel = copy.deepcopy(model)

Starting training...
Epoch 0 | Train loss 0.5980435013771057
Train Accuracy 72.9021091095583 | Test Accuracy 73.00127140827163
Epoch 1 | Train loss 0.4411201775074005
Train Accuracy 82.3578434515033 | Test Accuracy 82.13671378356145
Epoch 2 | Train loss 0.4325883388519287
Train Accuracy 83.96852362330918 | Test Accuracy 83.9241642360332
Epoch 3 | Train loss 0.34170225262641907
Train Accuracy 84.29867299185845 | Test Accuracy 84.2158402512901
Epoch 4 | Train loss 0.3407120108604431
Train Accuracy 84.66408103083532 | Test Accuracy 84.36915713110463
Epoch 5 | Train loss 0.29925134778022766
Train Accuracy 84.73940637220335 | Test Accuracy 84.74684017650138
Epoch 6 | Train loss 0.3032626211643219
Train Accuracy 84.94935572793128 | Test Accuracy 84.76927679305962
Epoch 7 | Train loss 0.32959598302841187
Train Accuracy 85.05673440605167 | Test Accuracy 84.90389649240895
Epoch 8 | Train loss 0.34215569496154785
Train Accuracy 85.15449708314635 | Test Accuracy 85.03851619175829
Epoch 9 | Train 

Epoch 76 | Train loss 0.31625425815582275
Train Accuracy 88.98647349189051 | Test Accuracy 88.82656495400494
Epoch 77 | Train loss 0.25126975774765015
Train Accuracy 88.71882813000833 | Test Accuracy 88.41522698377085
Epoch 78 | Train loss 0.16792063415050507
Train Accuracy 88.64670812231553 | Test Accuracy 88.29556502879366
Epoch 79 | Train loss 0.20350387692451477
Train Accuracy 89.03936149753189 | Test Accuracy 88.86769875102834
Epoch 80 | Train loss 0.27415233850479126
Train Accuracy 88.72203346368357 | Test Accuracy 88.40774811158477
Epoch 81 | Train loss 0.2759786546230316
Train Accuracy 88.87108147958203 | Test Accuracy 88.57602273577145
Epoch 82 | Train loss 0.27774858474731445
Train Accuracy 89.04256683120713 | Test Accuracy 88.87517762321441
Epoch 83 | Train loss 0.26944464445114136
Train Accuracy 89.04737483171998 | Test Accuracy 88.77421284870242
Epoch 84 | Train loss 0.24679717421531677
Train Accuracy 88.64350278864029 | Test Accuracy 88.35165657018922
Epoch 85 | Train los

Train Accuracy 88.1482787358164 | Test Accuracy 87.82065664497793
Epoch 152 | Train loss 0.31760892271995544
Train Accuracy 89.10827617154946 | Test Accuracy 88.94622690898213
Epoch 153 | Train loss 0.29565343260765076
Train Accuracy 88.98807615872812 | Test Accuracy 88.8826564954005
Epoch 154 | Train loss 0.3015795648097992
Train Accuracy 88.93038015257389 | Test Accuracy 88.7929100291676
Epoch 155 | Train loss 0.2243097424507141
Train Accuracy 89.02493749599333 | Test Accuracy 88.87143818712137
Epoch 156 | Train loss 0.28650742769241333
Train Accuracy 89.26213218796076 | Test Accuracy 88.98736070600553
Epoch 157 | Train loss 0.25676649808883667
Train Accuracy 89.09866017052374 | Test Accuracy 88.92752972851694
Epoch 158 | Train loss 0.24843788146972656
Train Accuracy 89.2300788512084 | Test Accuracy 88.9537057811682
Epoch 159 | Train loss 0.270255982875824
Train Accuracy 89.07622283479711 | Test Accuracy 88.9537057811682
Epoch 160 | Train loss 0.3020096719264984
Train Accuracy 89.097

Epoch 227 | Train loss 0.32043540477752686
Train Accuracy 89.34386819667928 | Test Accuracy 89.17059307456435
Epoch 228 | Train loss 0.27003175020217896
Train Accuracy 89.40316686967114 | Test Accuracy 89.234163488146
Epoch 229 | Train loss 0.23010732233524323
Train Accuracy 89.3647028655683 | Test Accuracy 89.20050856330866
Epoch 230 | Train loss 0.2536255717277527
Train Accuracy 89.02654016283094 | Test Accuracy 88.84152269837709
Epoch 231 | Train loss 0.20968526601791382
Train Accuracy 88.89031348163344 | Test Accuracy 88.72560017949294
Epoch 232 | Train loss 0.24137452244758606
Train Accuracy 89.48650554522726 | Test Accuracy 89.2154663076808
Epoch 233 | Train loss 0.29955580830574036
Train Accuracy 89.33585486249117 | Test Accuracy 89.16311420237828
Epoch 234 | Train loss 0.30466845631599426
Train Accuracy 89.38553753445734 | Test Accuracy 89.19302969112259
Epoch 235 | Train loss 0.2652409076690674
Train Accuracy 89.1307135072761 | Test Accuracy 88.99857901428464
Epoch 236 | Train

In [22]:
# sanity check

In [23]:
# test on best model

In [24]:
# square: the 4-cycle 0-1-3-2-0, no diagonal
import networkx as nx
square = nx.Graph()
square.add_edges_from([(0, 1), (1, 3), (0, 2), (2, 3)])

In [25]:
# square with cross bar (rigid): the 4-cycle 0-1-3-2-0 plus the diagonal 0-3
import networkx as nx
square_bar = nx.Graph()
square_bar.add_edges_from([(0, 1), (1, 3), (0, 2), (2, 3), (0, 3)])

In [26]:
# triangle: complete graph on nodes 0, 1, 2
import networkx as nx
triangle = nx.Graph()
triangle.add_edges_from([(0, 1), (0, 2), (1, 2)])

In [27]:
# pentagon: the 5-cycle 0-1-3-4-2-0
import networkx as nx
pentagon = nx.Graph()
pentagon.add_edges_from([(0, 1), (1, 3), (3, 4), (4, 2), (2, 0)])

In [28]:
# rigid example on 6 nodes with 9 = 2*6 - 3 edges.
# NOTE: this graph does contain triangles (0-1-2 and 3-4-5); it is the two
# triangles joined by the edges 0-4, 1-5 and 2-3, so it is not triangle-free.
rigid = nx.Graph()
rigid.add_edges_from([
    (0, 1),
    (0, 2),
    (0, 4),
    (1, 2),
    (1, 5),
    (2, 3),
    (3, 4),
    (3, 5),
    (4, 5),
])

In [29]:
model.eval()
bestModel.eval()

GCN(
  (initial_conv): GCNConv(4, 10)
  (out): Linear(in_features=20, out_features=20, bias=True)
  (out2): Linear(in_features=20, out_features=1, bias=True)
)

In [34]:
# Hand-built sanity-check graphs, each run through the trained model.
toy_problems = [square, square_bar, triangle, pentagon, rigid]
# Expected label per toy problem, in the same order (0 is used for the rigid
# examples, 1 for the flexible ones). Kept for reference only: the labels are
# not attached to the Data objects or compared below.
labels = [1, 0, 0, 1, 0]

for index, toy_problem in enumerate(toy_problems):
    graph_as_data = from_networkx(toy_problem)
    graph_as_data.x = generate_feature_vector(toy_problem)
    # Wrap the single graph in a loader so it carries the `batch` vector the
    # model takes as its third argument.
    validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)
    for batch in validation_set:
        model.eval()
        with torch.no_grad():
            # The model returns a pair; the first entry is the value that is
            # thresholded at 0.5 elsewhere in the notebook.
            pred = model(batch.x[:, [0, 1, 2, 3]], batch.edge_index, batch.batch)
            print(pred)
    

(tensor([[0.8983]]), tensor([[2.1784]]))
(tensor([[0.0891]]), tensor([[-2.3246]]))
(tensor([[0.4025]]), tensor([[-0.3951]]))
(tensor([[0.9303]]), tensor([[2.5917]]))
(tensor([[0.0054]]), tensor([[-5.2084]]))


In [37]:
# generate the bad examples

In [42]:
test_to_generate_bad_data = DataLoader(test_data, batch_size = 1, shuffle=True)

In [74]:
train_to_generate_bad_data = DataLoader(train_data, batch_size = 1, shuffle=True)

In [43]:
print(len(test_to_generate_bad_data))

26742


In [76]:
check_accuracy(model, train_to_generate_bad_data, [0, 1, 2, 3])

89.29418552471311

In [79]:
incorrectly_predicted_flexible_graphs = []

In [80]:
incorrectly_predicted_rigid_graphs = []

In [110]:
print(len(incorrectly_predicted_flexible_graphs))

5360


In [111]:
print(len(incorrectly_predicted_rigid_graphs))

4275


In [109]:
check_accuracy(model, test_to_generate_bad_data, [0, 1, 2, 3])

88.95744521726124

In [112]:
# Persist the misclassified graphs as a gzipped pickle of a 2-tuple:
# (rigid graphs predicted wrongly, flexible graphs predicted wrongly).
output_file = "incorrectly-predicted.pkl.gz"
with gzip.open(output_file, 'wb') as f:
    pickle.dump((incorrectly_predicted_rigid_graphs, incorrectly_predicted_flexible_graphs), f, pickle.HIGHEST_PROTOCOL)

In [108]:
print(len(train_to_generate_bad_data))

62396


In [100]:
def check_accuracy(model, loader, features_to_use):
    """Return accuracy (in percent) and collect every misclassified graph.

    Side effect: each misclassified graph is converted to an undirected
    networkx graph and appended to the module-level list
    ``incorrectly_predicted_rigid_graphs`` (true label 0) or
    ``incorrectly_predicted_flexible_graphs`` (true label 1).

    NOTE: this redefinition shadows the earlier ``check_accuracy`` in the
    notebook; once this cell has run, every call collects graphs.

    Args:
        model: callable returning ``(pred, embedding)`` for
            ``(x, edge_index, batch)``; ``pred`` holds one probability per graph.
        loader: iterable of batches exposing ``x``, ``edge_index``, ``batch``
            and ``label``. Batches of more than one graph must also provide
            ``to_data_list()`` so individual graphs can be recovered.
        features_to_use: column indices of ``batch.x`` fed to the model.

    Returns:
        Percentage (0-100, float) of correctly classified graphs.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for batch in loader:
            pred, _ = model(batch.x[:, features_to_use], batch.edge_index, batch.batch)
            # One entry per graph, also for single-graph batches.
            pred = pred.reshape(-1)
            y = batch.label.reshape(-1)
            predictions = (pred > 0.5).long()

            mistakes = (predictions != y).nonzero().reshape(-1).tolist()
            num_correct += y.numel() - len(mistakes)
            # Count graphs, not batches, so the result is right for any batch size.
            num_samples += y.numel()

            if mistakes:
                # A single-graph batch is itself the graph to export; larger
                # batches are split back into their individual graphs first.
                graphs = [batch] if y.numel() == 1 else batch.to_data_list()
                for i in mistakes:
                    graph = to_networkx(graphs[i], to_undirected = True)
                    if y[i] == 0:
                        incorrectly_predicted_rigid_graphs.append(graph)
                    elif y[i] == 1:
                        incorrectly_predicted_flexible_graphs.append(graph)

    return float(num_correct)/float(num_samples)*100

In [None]:
print(model.training)
model.train()
print(model.training)

In [None]:
# Evaluate the trained model on Erdős–Rényi graphs: for each edge probability
# p, sample 1000 graphs, label each one from the pebble-game bond count, and
# count how often the model disagrees with that label.
rigid_data, not_rigid_data = [], []
stats = {}          # p -> number of sampled graphs labelled rigid
stats_considered = {}
prev_graphs = []

stats_wrong = {}    # p -> number of sampled graphs the model got wrong

num_nodes = 30
# Inference only: make sure the model is not left in training mode by an
# earlier cell.
model.eval()
for p in np.arange(0.01, 0.3, 0.01):
    stats[p] = 0
    stats_wrong[p] = 0
    for num_graphs in range(1000):
        G = nx.erdos_renyi_graph(num_nodes, p)        
        l = lattice()
        num_edges = 0

        # Count the bonds the lattice accepts (presumably the independent
        # edges of the pebble game; confirm against the pebble package).
        for (u, v) in G.edges():
            if l.add_bond(u, v):
                num_edges += 1

        # Label 0 = rigid, 1 = flexible, matching the training data. The flag
        # is not stored in a variable named `rigid`, because that would
        # overwrite the toy graph `rigid` defined earlier in the notebook.
        label = 1
        if num_edges >= (num_nodes * 2) - 3: # rigid 
            stats[p] += 1
            label = 0

        graph_as_data = from_networkx(G)
        graph_as_data.x = generate_feature_vector(G)
        validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)
        for batch in validation_set:
            with torch.no_grad():
                # Feed all four feature columns: `model` was built with
                # num_features=4, so a single column does not match its input size.
                pred = model(batch.x[:, [0, 1, 2, 3]], batch.edge_index, batch.batch)
            pred_label = 1
            if (pred[0][0][0] < 0.5):
                pred_label = 0
                
            if pred_label != label:
                print(pred[0][0][0] , " ", label)
                stats_wrong[p] += 1
                print("wrong: , with number of edges: " , G.number_of_edges(), " ", num_edges)
                
    print(stats[p])

In [None]:
print(stats)

In [None]:
print(stats_wrong)

In [None]:
import matplotlib.pyplot as plt

plt.scatter(list(stats_wrong.keys()), stats_wrong.values(), color='g')
plt.show()

In [None]:
(sum(stats_wrong.values())) / (len(stats_wrong) * 1000)

In [None]:
len(stats_wrong) * 10000

In [None]:
graph_as_data

In [None]:
# Sample Erdős–Rényi graphs over a range of edge probabilities and sort them
# into rigid / not-rigid lists using the pebble-game bond count.
rigid_data, not_rigid_data = [], []
stats = {}          # p -> number of sampled graphs classified as rigid
prev_graphs = []    # every sampled graph, in generation order

num_nodes = 30
for p in np.arange(0.01, 0.3, 0.01):
    stats[p] = 0
    for num_graphs in range(1000):
        G = nx.erdos_renyi_graph(num_nodes, p)
        l = lattice()
        num_edges = 0

        # Count the bonds the lattice accepts (presumably the independent
        # edges of the pebble game; confirm against the pebble package).
        for (u, v) in G.edges():
            if l.add_bond(u, v):
                num_edges += 1

        # The unused `rigid = False` flag was dropped: it was never read and
        # overwrote the toy graph `rigid` defined earlier in the notebook.
        if num_edges >= (num_nodes * 2) - 3: # rigid 
            rigid_data.append(G)
            stats[p] += 1
        else:
            not_rigid_data.append(G)

        prev_graphs.append(G)
        
    print(stats[p])

In [None]:
# For each edge count, generate 10 graphs with generate_rigid_nodes_edges and
# count how many the model scores above 0.5. Such graphs are counted as wrong,
# so the generated graphs are presumably all rigid (label 0) — confirm against
# data_gen.
# NOTE(review): generate_rigid_nodes_edges is imported from data_gen in a
# later cell; on a fresh kernel that import must run before this cell.
rigid_data_1_wrong, not_rigid_data_1_wrong = [], []
stats_wrong = {}        # num_edges -> number of graphs scored above 0.5
stats_wrong_cum = {}
prev_graphs = []

num_nodes = 30
# 57 == 2 * 30 - 3, the rigidity edge threshold used elsewhere in the notebook.
for num_edges in range(57, 200):
    model.eval()
    stats_wrong[num_edges] = 0
    for num_graphs in range(10):
        G = generate_rigid_nodes_edges(num_nodes, num_edges)
        
        graph_as_data = from_networkx(G)
        graph_as_data.x = generate_feature_vector(G)
        # Single-graph loader so the batch carries the `batch` vector the
        # model takes as its third argument.
        validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)
        for batch in validation_set:
            with torch.no_grad():
                pred = model(batch.x[:, [0, 1, 2, 3]], batch.edge_index, batch.batch)
                # pred[0] is the first element of the model's output pair.
                if (pred[0][0][0] > 0.5):
                    stats_wrong[num_edges] += 1
                    print("WRONG")
                

print(stats_wrong)

In [None]:
validation_set = DataLoader([graph_as_data], batch_size = 1, shuffle=True)

In [None]:
for batch in validation_set:
    pred = bestModel(batch.x, batch.edge_index, batch.batch)
    print(pred[0])
    print(pred[1])

In [None]:
class LamanTestDataset(InMemoryDataset):
    """In-memory PyG dataset for a held-out test file of networkx graphs.

    Same layout as the training dataset: the gzipped pickle is indexed as
    ``[0]`` and ``[1]``, and each graph is labelled with the index of the group
    it came from. The processed file is named ``data_test.pt``.
    """

    def __init__(self, root, data_dir, transform=None, pre_transform=None, pre_filter=None):
        """Store the raw-data path, initialise the base class and load the processed file.

        Args:
            root: dataset root directory handed to ``InMemoryDataset``.
            data_dir: path of the gzipped pickle read by ``process``.
            transform, pre_transform, pre_filter: forwarded to the base class.
        """
        # Assigned before the base constructor runs, since process() reads it
        # (presumably the base class calls process() when the file is missing).
        self.data_dir = data_dir
        super().__init__(root, transform, pre_transform, pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        """Name of the single processed file produced by ``process``."""
        return ['data_test.pt']

    def process(self):
        """Convert every pickled graph to a ``Data`` object and save the collated result."""
        # NOTE: pickle.load runs arbitrary code; only use trusted data files.
        with gzip.open(self.data_dir, 'r') as f:
            total_laman_data = pickle.load(f)

        data_list = []
        # The group index doubles as the class label.
        for label in (0, 1):
            for graph in total_laman_data[label]:
                x = generate_feature_vector(graph)
                graph_as_data = from_networkx(graph)
                graph_as_data.x = x
                graph_as_data.label = label
                data_list.append(graph_as_data)

        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [None]:
# add functionality to support a test dataset
TEST_DATA_PATH = "../data-2d/data/test-dataset-30loc-5std.pkl.gz"
laman_test_set = LamanTestDataset("", TEST_DATA_PATH)

In [None]:
from torch_geometric.loader import DataLoader


In [None]:
from torch_geometric.loader import DataLoader
laman_test_loader = DataLoader(laman_test_set, batch_size = 2, shuffle=True)

In [None]:
random_test_acc = check_accuracy(model, test_loader, [0, 1, 2, 3])
print(f"Accuracy {random_test_acc}")

In [None]:
# generate statistics on the data

In [None]:
# test the clustering coefficient

In [None]:
clustering_coefficient(square, 0)

In [None]:
clustering_coefficient(triangle, 0)

In [None]:
clustering_coefficient(square_bar, 0)

In [None]:
# generate graph correlating clustering coefficient to rigidity

In [None]:
for item in train_data:
    item = to_networkx(item)
    print(type(item))
    
    break

In [None]:
torch_geometric.utils.convert.to_networkx()

# Scratch Work: Sahil

In [None]:
total_laman_data = None
with gzip.open(DATA_PATH, 'r') as f:
    total_laman_data = pickle.load(f)

In [None]:
sample_graph = total_laman_data[0][0]

In [None]:
print(type(sample_graph))

In [None]:
to_data = from_networkx(sample_graph)
from_data = to_networkx(to_data, to_undirected = True)

In [None]:
print(type(from_data))

In [None]:
def compute_min_clustering_coefficient(G):
    """Return the smallest local clustering coefficient over all nodes of ``G``.

    The search starts from 1, so a graph without nodes yields 1.

    Args:
        G: graph exposing ``nodes()`` plus whatever ``clustering_coefficient``
            needs.

    Returns:
        The minimum of 1 and every node's clustering coefficient.
    """
    # Seed the candidates with 1 so the result is never larger than that.
    candidates = [1]
    candidates.extend(clustering_coefficient(G, node) for node in G.nodes())
    return min(candidates)

In [None]:
for index, sample_graph in enumerate(train_data):
    label = sample_graph.label
    networkx_sample_graph = to_networkx(sample_graph, to_undirected = True)
    print(label, " ", index, " ", compute_min_clustering_coefficient(networkx_sample_graph))
    
    if index == 10:
        break

In [None]:
print(compute_min_clustering_coefficient(from_data))

In [None]:
compute_min_clustering_coefficient(triangle)

In [None]:
# NOTE(review): `GIN` is not imported in the visible part of this notebook; it
# must be brought into scope before this cell can run.
model_just_degree = GIN(num_features=1)
print(model_just_degree)
# Report the size of the model created here, not of the earlier `model`.
print("Number of parameters: ", sum(p.numel() for p in model_just_degree.parameters()))

In [None]:
print("Starting training...")
losses = []

# bestModel keeps a frozen snapshot of the weights with the highest test accuracy.
bestModel, highestAcc = None, 0

# NOTE(review): train() optimises the notebook-level `model` through the global
# optimizer, while accuracy below is measured on `model_just_degree`; as
# written, `model_just_degree` is never updated. Also confirm that two feature
# columns ([0, 1]) match a model constructed with num_features=1.
for epoch in range(1000):
    loss, h = train(train_loader, [0, 1])
    losses.append(loss)
    print(f"Epoch {epoch} | Train loss {loss}")
    # check_accuracy requires the feature columns as its third argument.
    train_acc, test_acc = check_accuracy(model_just_degree, train_loader, [0, 1]), check_accuracy(model_just_degree, test_loader, [0, 1])
    print(f"Train Accuracy {train_acc} | Test Accuracy {test_acc}")
#     scheduler.step(test_acc)
    
    if test_acc > highestAcc:
        highestAcc = test_acc
        # Snapshot the model whose accuracy was just measured; a plain
        # reference would not preserve the weights at this epoch.
        bestModel = copy.deepcopy(model_just_degree)

In [None]:
generate_feature_vector(sample_graph)

In [None]:
sample_graph.x[:, [0, 1]]

In [None]:
import importlib

In [None]:
from data_gen import generate_rigid_nodes_edges

In [None]:
import sys
sys.path.insert(0, '/Users/sahiljain/Documents/Fall 2022/Independent Work/reversible-inductive-construction/code/genric/laman')

In [None]:
clustering_coefficient(triangle, 0)

In [35]:
# save the torch model

In [36]:
torch.save(model.state_dict(), f"gcn-model-filtered-data.pt")