In [39]:
# local python tests
from pprint import pprint
from igraph import *
import igraph.test
import pickle
import os
import numpy as np
from scipy.sparse import csr_matrix
import collections
import copy
from random import randint





"""

    %%%%%%%%%%%%%%%%%%%%%%%%%%% gcn tests %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    x  feature vectors
    tx feature vectors for test (unlabeled)
    allx feature vectors for all

    y  label vectors       numpy.ndarray
    ty test label vectors  numpy.ndarray
    ally  test label of all

    graph : adjacency matrix (dict of lists)
    test.index: node id list for tests?


    steps
    1) inspect ind.citeseer.x/tx/y/ty/allx/ally/test.index

    2) build those files from code

    3) call learning algorithm
        cd src/notebooks
        python gcn_tests_input.py&&  cp ind.* ../gcn/gcn/gcn/data
        cd src/gcn/gcn/gcn
        python train.py --dataset t1


    4) look at embedding result

"""


def pickleToDisk(myobj, filepath):
    pickle.dump(myobj, open(filepath, "wb"))


def printGraphObject(filepath):


    myobject = os.path.basename(filepath)
    print("\n\n*************** "+myobject+" ********************")
    y = pickle.load(open(filepath,'rb'))
    print(type(y))
    if type(y) ==  type([]) or type(y) == type(np.ndarray([])):
        
        print(len(y))
        pprint(y[:min(3,len(y)-1)])
    elif type(y) == type({}) or type(y)==type(collections.defaultdict()):
        pprint(y.keys()[:min(3,len(y)-1)])
        pprint(y[y.keys()[0]])
        pprint(y[y.keys()[1]])
        pprint(y[y.keys()[20]])
        pprint(y[y.keys()[3000]])

    else:
        
        pprint(y.shape)
        pprint(y[0,:min(3,y.shape[1]-1)].toarray())
        pprint(y[0,0].shape)
        pprint(type(y[0,0]))
        pprint(y[0,0])

def readListFile(filepath):
    myobject = os.path.basename(filepath)
    print("\n\n*************** "+myobject+" ********************")
    

    testInstances =[]
    with open(filepath, 'rb') as f:
        for line in f.readlines():
            testInstances.append(int(line))

    pprint(testInstances[:min(3,len(testInstances)-1)])


def inspectData():

    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.y')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.ty')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.x')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.tx')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.graph')
    readListFile('../gcn/gcn/gcn/data/ind.citeseer.test.index')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.allx')
    printGraphObject('../gcn/gcn/gcn/data/ind.citeseer.ally')


def discretize(elements):
    """ create a binary array for each of the 10 ranges of values between 0 and 1"""
    orig = np.array(elements)
    results = []
    for i in range(10):
        print(i) 
        if i<9:
            range1 = np.bitwise_and(orig >= 0.1*i ,orig <= 0.1*(i+1))
        else:
            range1 = orig >= 0.1*i
        print(range1)
        results.append(range1) 
        
    result = np.transpose(np.asarray(results))
    print(result.shape)
    return result


def generateData(suffix, n_node=1000, n_test=200, n_train=300 ):
    
    

    #igraph.test.run_tests()

    # generation
    #g = Graph.Tree(127, 2)
    #g = Graph.GRG(100, 0.2)
    #g = Graph.Erdos_Renyi(100, 0.2)
    #g = Graph.Watts_Strogatz(1, 100, 4, 0.5 )
    g = Graph.Barabasi(n_node)
    summary(g)

    # graph metrics
    # pprint(g.degree([2,3,6,99]))
    # pprint(g.edge_betweenness())
    # pprint(g.pagerank())
    #pprint(g.get_adjacency())
    #pprint(dir(g))
    


    # test.index
    testList = [] 
    with open(suffix+'.test.index','wb') as f:
        while len(testList) < n_test:
            j = randint(0, n_node-1)
            #verify it is not alerady there
            try:
                pos = testList.index(j)
            except:
                testList.append(j)
            f.write(str.encode(str(j)+"\n"))


    # adjacency dict (graph)
    i = 0
    A = g.get_adjlist()
    graphAdj = {}
    for node in A:
        graphAdj[i] = node
        i+=1
    pickleToDisk(graphAdj, suffix+".graph")

    
    xFeatures = np.ones(n_node)
    # allx is for labeled and unlabeled training samples
    allx  = copy.deepcopy([xFeatures[i] for i in range(n_node) if i not in testList])
    # x is for labeled training samples only
    x = copy.deepcopy(allx[:-500])
    #tx = copy.deepcopy(xFeatures[range(n_node-n_test, n_node)])
    tx = [xFeatures[i] for i in testList]
    

    
    allx = csr_matrix(allx)
    allx = np.transpose(allx)
    pickleToDisk(allx, suffix+".allx")
    x = csr_matrix(x)
    x = np.transpose(x)
    pickleToDisk(x, suffix+".x")
    tx = csr_matrix(tx)
    tx = np.transpose(tx)
    pickleToDisk(tx, suffix+".tx")

    print(x.shape)
    print(tx.shape)
    print(allx.shape)
    


    # labels (ally,y , ty)
    prs = discretize(g.pagerank())

    

    # labels of x, and empty labels of unlabeled training samples
    prs_all = copy.deepcopy([prs[i] for i in range(n_node) if i not in testList])
    # labels of the labeled training samples
    prs_train = copy.deepcopy(prs_all[:-500])
    prs_test = copy.deepcopy([prs[i] for i in testList])
    #testList.extend(validationList)
    #finalList = testList
    #prs_train = [ prs[i] for i in range(prs.shape[0]) if i not in finalList ]

    ally = np.array(prs_all)
    y = np.array(prs_train)  
    ty = np.array(prs_test)


    pickleToDisk(ally, suffix+".ally")
    pickleToDisk(y, suffix+".y")
    pickleToDisk(ty, suffix+".ty")
    
    print(y.shape)
    print(ty.shape)
    print(ally.shape)
    
    

    # run training on those files

In [40]:
generateData(suffix="ind.t3",n_node=4000, n_train=700, n_test=300)
!mv ind.* ./data/

IGRAPH U--- 4000 3999 -- 
(3200, 1)
(300, 1)
(3700, 1)
0
[ True  True  True ...  True  True  True]
1
[False False False ... False False False]
2
[False False False ... False False False]
3
[False False False ... False False False]
4
[False False False ... False False False]
5
[False False False ... False False False]
6
[False False False ... False False False]
7
[False False False ... False False False]
8
[False False False ... False False False]
9
[False False False ... False False False]
(4000, 10)
(3200, 10)
(300, 10)
(3700, 10)


In [113]:
!~/.pyenv/shims/python  train.py --dataset t3


len(y): 3200
labels shape:
(4000, 10)
len(idx_train) 3200
len(idx_val) 500
len(idx_test) 310
features after row normalize and tuple
(array([[   0,    0],
       [   1,    0],
       [   2,    0],
       ...,
       [3997,    0],
       [3998,    0],
       [3999,    0]], dtype=int32),
 array([1., 1., 1., ..., 1., 1., 1.]),
 (4000, 1))
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

2019-01-22 10:59:28.203330: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Epoch: 0001 train_loss= 2.40027 train_acc= 0.03344 val_loss= 2.33939 val_acc= 0.00000 time= 0.04083
Epoch: 0002 train_loss= 2.36241 train_acc= 0.04688 val_loss= 2.31095 val_acc= 0.00000 time= 0.01175
Epoch: 0003 train_loss= 2.32397 train_acc= 0.03469 val_loss= 2.28803 val_acc= 0.00000 time= 0.012

In [122]:
!~/.pyenv/shims/python  train.py --dataset cora


len(y): 140
labels shape:
(2708, 7)
len(idx_train) 140
len(idx_val) 500
len(idx_test) 1000
features after row normalize and tuple
(array([[   0, 1274],
       [   0, 1247],
       [   0, 1194],
       ...,
       [2707,  329],
       [2707,  186],
       [2707,   19]], dtype=int32),
 array([0.11111111, 0.11111111, 0.11111111, ..., 0.07692308, 0.07692308,
       0.07692308], dtype=float32),
 (2708, 1433))
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

2019-01-22 11:04:21.995226: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Epoch: 0001 train_loss= 1.95369 train_acc= 0.17857 val_loss= 1.94959 val_acc= 0.19600 time= 0.05467
Epoch: 0002 train_loss= 1.94921 train_acc= 0.22857 val_loss= 1.94556 val_acc= 0.25000 time= 0.01941
Epoch: 0003 train_loss= 1.

Epoch: 0073 train_loss= 1.13352 train_acc= 0.66429 val_loss= 1.14714 val_acc= 0.66600 time= 0.01889
Epoch: 0074 train_loss= 1.01581 train_acc= 0.62857 val_loss= 1.14444 val_acc= 0.65800 time= 0.02054
Epoch: 0075 train_loss= 0.99083 train_acc= 0.62143 val_loss= 1.14228 val_acc= 0.66200 time= 0.02001
Epoch: 0076 train_loss= 1.05695 train_acc= 0.60000 val_loss= 1.14428 val_acc= 0.66000 time= 0.01973
Epoch: 0077 train_loss= 1.06611 train_acc= 0.62857 val_loss= 1.14551 val_acc= 0.65200 time= 0.02008
Epoch: 0078 train_loss= 1.04987 train_acc= 0.62143 val_loss= 1.14293 val_acc= 0.65200 time= 0.02045
Epoch: 0079 train_loss= 0.93315 train_acc= 0.70000 val_loss= 1.13907 val_acc= 0.65200 time= 0.01960
Epoch: 0080 train_loss= 0.92226 train_acc= 0.75000 val_loss= 1.13974 val_acc= 0.64400 time= 0.01865
Epoch: 0081 train_loss= 1.02945 train_acc= 0.61429 val_loss= 1.14003 val_acc= 0.64400 time= 0.01806
Epoch: 0082 train_loss= 0.93219 train_acc= 0.70000 val_loss= 1.14038 val_acc= 0.64400 time= 0.01873


In [186]:


def plot2DEmbedding_oldd(filepath, labelspath):
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    import matplotlib.lines as mlines
    %matplotlib inline

    # read embedding labels file
    with open(labelspath,"r") as f:
        size = len(f.readlines())
        labels = np.zeros((size,1))

    # read embedding file
    with open(filepath,"r") as f:
        size = len(f.readlines())

        
    # read embedding labels file
    embeddings = {}
    with open(labelspath,"r") as f:
        i=0
        for label in f.readlines():
            #print(line)
            vals = label.split()
            label_int = 0
            for j in range(len(vals)):
                label_int += float(vals[j])*j
            labels[i]=int(label_int)
            i=i+1
            
        
        for j in set(labels.tolist()):
            embeddings[j]=np.zeros((size,2))
            
        with open(filepath,"r") as f2:
            i = 0
            for line in f2.readlines():
                #print(line)
                d1,d2 = line.split()
                embedding = embeddings[labels[j]]
                np.append(embedding, [floag(d1), float(d2)], axis=0)
                i=i+1
        
        
        print(labels)
        pprint(embeddings)

        # separate embedding by label
        
        
        color_labels = [int(k % 23) for k in labels]
        for col in color_labels:
            plt.scatter(embeddings[col][:,0], embedding[col][:,1], c=col)
        plt.suptitle('Embedding')
        plt.xlabel('dim 1')
        plt.ylabel('dim 2')
        plt.legend()
        plt.show()
        
    
def plot2DEmbedding(filepath, labelspath):
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    import matplotlib.lines as mlines
    %matplotlib inline

    # read embedding labels file
    with open(labelspath,"r") as f:
        size = len(f.readlines())
        labels = np.zeros((size,1))
    # read embedding labels file
    with open(labelspath,"r") as f:
        i = 0
        for line in f.readlines():
            #print(line)
            vals = line.split()
            label_int = 0
            for j in range(len(vals)):
                label_int += float(vals[j])*j
            labels[i]=int(label_int)
            i=i+1
        
    print(labels)
    
    # read embedding file
    embedding = np.zeros((1,1))
    with open(filepath,"r") as f:
        size = len(f.readlines())
        embedding = np.zeros((size,2))


    with open(filepath,"r") as f:
        i = 0
        for line in f.readlines():
            #print(line)
            d1,d2 = line.split()
            embedding[i,0] = float(d1)
            embedding[i,1] = float(d2)
            i=i+1

        #print(embedding.shape)
        #print(embedding[:,0])
        #print(embedding[:,1]) 
        
        # separate embedding by label
        
        
        color_labels = [int(k % 23) for k in labels]
        plt.scatter(embedding[:,0], embedding[:,1], c=color_labels)
        plt.suptitle('Embedding')
        plt.xlabel('dim 1')
        plt.ylabel('dim 2')
        
        myhandles = []
        for thecolor in set(color_labels):   
            print(thecolor)
            myhandles.append( mpatches.Patch( color=str(thecolor), label=thecolor))

        plt.legend(handles=myhandles)
        plt.show()
        
plot2DEmbedding_oldd("embedding","embedding_labels")

TypeError: unhashable type: 'list'

In [250]:

def plot2DEmbedding(filepath, labelspath):
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches
    import matplotlib.lines as mlines
    import matplotlib.cm as cm
    %matplotlib inline

    # read embedding labels file
    with open(labelspath,"r") as f:
        size = len(f.readlines())
        labels = np.zeros((size,1))

    # read embedding file
    with open(filepath,"r") as f:
        size = len(f.readlines())

        
    # read embedding labels file
    embeddings = {}
    with open(labelspath,"r") as f:
        i=0
        for label in f.readlines():
            #print(line)
            vals = label.split()
            label_int = 0
            for j in range(len(vals)):
                label_int += float(vals[j])*j
            labels[i]=int(label_int)
            i=i+1
            
        labels_list = [ lab[0] for lab in labels.tolist()]
        for j in set(labels_list):
            embeddings[j]=np.empty((0,2))
            
        with open(filepath,"r") as f2:
            i = 0
            for line in f2.readlines():
                #print(line)
                d1,d2 = line.split()
                key = int(labels_list[i])
                embedding = embeddings[key]
                
                embeddings[key] = np.append(embedding, [[float(d1), float(d2)]], axis=0)
                #embedding[i,0]=float(d1)
                #embedding[i,1]=float(d2)
                i=i+1
        
        # separate embedding by label
        
        color_labels = [int(k % 23) for k in set(labels_list) ]
        colors = cm.rainbow(np.linspace(0, 1, len(color_labels)))
        for colref,col in zip(colors, color_labels) :
            print(col)
            #print([[col]*embeddings[col].shape[0]])
            print(embeddings[col].shape[0])
            plt.scatter(embeddings[col][:,0], embeddings[col][:,1], c=[col]*embeddings[col].shape[0], label=str(colabel))
        #col=0
        #plt.scatter(embeddings[col][:,0], embeddings[col][:,1], c=str(col), label=str(col))
        plt.suptitle('Embedding')
        plt.xlabel('dim 1')
        plt.ylabel('dim 2')
        plt.legend()
        plt.show()
        
   
        
plot2DEmbedding("embedding","embedding_labels")

[0.5 0.  1.  1. ]


TypeError: unhashable type: 'numpy.ndarray'