In [None]:
import time, copy, os, pickle, random
from scipy.io import loadmat
import pickle
import networkx as nx
import numpy as np
import pandas as pd
import base
import sys
import math
%matplotlib inline
#import ipyparallel as ipp
import dill

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold

from collections import OrderedDict 

import matplotlib.pyplot as plt
from IPython.core.display import display, SVG, HTML
from tqdm import tqdm, tqdm_notebook
pd.options.display.max_columns = 150

In [None]:
def return_grid_search_random_forest():
    """Build the notebook's baseline random-forest classifier.

    Despite the name, no grid search happens here: the function only
    constructs an unfitted ``RandomForestClassifier`` with a fixed set of
    hyper-parameters (100 trees, ``sqrt`` feature sub-sampling, unlimited
    depth, ``balanced_subsample`` class weights, 6 parallel jobs).

    Returns:
        The configured, unfitted ``RandomForestClassifier`` instance.
    """
    forest_params = dict(
        n_estimators=100,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_features='sqrt',
        max_leaf_nodes=None,
        bootstrap=True,
        oob_score=False,
        n_jobs=6,
        verbose=0,
        warm_start=False,
        max_depth=None,
        class_weight="balanced_subsample",
    )
    return RandomForestClassifier(**forest_params)

In [None]:
def return_connected_components(n=5):
    """Load the reference graphlets stored under the ``graphs`` directory.

    Every sub-directory of ``graphs`` (relative to the current working
    directory) whose name is a three-character prefix followed by an integer
    is considered; it is used when that integer is ``<= n``. Each ``.txt``
    file inside is read with ``np.loadtxt`` and turned into a graph via
    ``nx.from_numpy_array``.
    (Presumably the integer is the graphlet's node count - TODO confirm.)

    Entries that do not follow the naming scheme (for example
    ``.ipynb_checkpoints`` or ``.DS_Store``) and entries that are not
    directories are skipped instead of aborting the scan.

    Args:
        n: Largest directory suffix to include.

    Returns:
        dict mapping ``'g_<suffix>_<index>'`` to a networkx graph, where
        ``<index>`` is the 1-based position of the file in the directory
        listing (counted over all entries, not only ``.txt`` files).

    NOTE(review): ``os.listdir`` returns entries in arbitrary order, so
    ``<index>`` depends on the filesystem. The order is deliberately left
    untouched here because other cells refer to the resulting keys by name.
    """
    CONNECTED_COMPONENTS_PATH = 'graphs'

    dirs = []
    for entry in os.listdir(CONNECTED_COMPONENTS_PATH):
        try:
            suffix_value = int(entry[3:])
        except ValueError:
            # Not a '<3-char prefix><integer>' name - ignore stray entries.
            continue
        if not os.path.isdir(os.path.join(CONNECTED_COMPONENTS_PATH, entry)):
            continue
        if n >= suffix_value:
            dirs.append(entry)

    connected_graphs_components = {}
    for path in dirs:
        folder = os.path.join(CONNECTED_COMPONENTS_PATH, path)
        for count, file in enumerate(os.listdir(folder)):
            file_path = os.path.join(folder, file)
            if file_path.endswith('.txt'):
                connected_graph = np.loadtxt(file_path)
                graph_key = 'g_' + path[3:] + '_' + str(count + 1)
                connected_graphs_components[graph_key] = nx.from_numpy_array(np.array(connected_graph))
    return connected_graphs_components

In [None]:
def return_dataset(file_name):
    """Fetch a dataset through ``base.fetch_dataset`` and convert it to graphs.

    For each record in the fetched object's ``data`` attribute, the first
    element is iterated as an edge list; the first two entries of every edge
    become the endpoints of an undirected edge in a new ``nx.Graph``.

    Args:
        file_name: Dataset identifier forwarded to ``base.fetch_dataset``.

    Returns:
        Tuple ``(graph_list, data_y)``: the list of graphs and the fetched
        object's ``target`` attribute, returned unchanged.
    """
    dataset = base.fetch_dataset(file_name)

    def _to_graph(record):
        # Only the two endpoints are kept; any further per-edge entries are dropped.
        graph = nx.Graph()
        graph.add_edges_from([(edge[0], edge[1]) for edge in record[0]])
        return graph

    graph_list = [_to_graph(record) for record in dataset.data]
    return graph_list, dataset.target

In [None]:
def return_compressed_graph_whole(g1, value, primeN):
    """Contract every region of ``g1`` covered by the given node sets.

    The induced subgraphs of all node sets in ``value`` are merged into one
    auxiliary graph. Each connected component of that auxiliary graph is then
    contracted into a single node in a copy of ``g1``.

    Args:
        g1: networkx graph. Every node that gets merged away must carry a
            ``'layer'`` attribute, which is read here.
        value: Iterable of node collections; each one is passed to
            ``g1.subgraph``.
        primeN: Starting factor for the ``'layer'`` value written to each
            surviving (contracted) node.

    Returns:
        Tuple ``(temp_graph, avDegrees)``:
        * ``temp_graph`` - copy of ``g1`` with each component contracted onto
          its first node. That node's ``'layer'`` is set to ``primeN`` times
          the ``'layer'`` of every node merged into it (the surviving node's
          own previous layer is not part of the product).
        * ``avDegrees`` - one entry per component: the mean node degree inside
          the merged auxiliary graph.

    Uses only accessors available in both networkx 2.x and 3.x: the
    ``Graph.node`` attribute, ``Graph.selfloop_edges()`` method and
    ``nx.connected_component_subgraphs`` used previously were removed in
    networkx 2.4.
    """
    # Union of all matched induced subgraphs (structure only, no attributes).
    gNew = nx.Graph()
    for v in value:
        gX = g1.subgraph(v)
        gNew.add_nodes_from(gX.nodes())
        gNew.add_edges_from(gX.edges())

    # Materialise the self-loop list first: removing edges while iterating
    # over the graph's own edge generator is not safe.
    gNew.remove_edges_from(list(nx.selfloop_edges(gNew)))

    avDegrees = []
    connected_shapes = []
    for component in nx.connected_components(gNew):
        gX = gNew.subgraph(component)
        degrees = [degree for _, degree in gX.degree()]
        avDegrees.append(sum(degrees) / len(degrees))
        connected_shapes.append(list(gX.nodes()))

    # Graph.copy() gives the result its own attribute dictionaries, so writing
    # 'layer' below can never leak into the caller's graph (copy.copy shared them).
    temp_graph = g1.copy()

    # Contract each component onto its first node.
    for shape in connected_shapes:
        anchor = shape[0]
        p = primeN
        for node in shape[1:]:
            temp_graph = nx.contracted_nodes(temp_graph, anchor, node, self_loops=False)
            p = p * g1.nodes[node]['layer']
        temp_graph.nodes[anchor]['layer'] = p

    return temp_graph, avDegrees

In [None]:
#connected_graphs_components  = return_connected_components(n=6)       
def return_counts(g_, connected_graphs_components):
    """Count the distinct node sets of ``g_`` matching each reference graphlet.

    For every graphlet, all subgraph isomorphisms into ``g_`` are enumerated
    once. Mappings that cover the same node set (in any order) are collapsed
    by sorting each match's nodes and keeping unique rows.

    Args:
        g_: Graph to search in.
        connected_graphs_components: Mapping ``name -> graphlet``; iteration
            order defines the order of the returned histogram.

    Returns:
        Tuple ``(counts, count_dict)``:
        * ``counts`` - list with the number of distinct matched node sets per
          graphlet, in mapping order.
        * ``count_dict`` - ``OrderedDict`` ``name -> matches``, where
          ``matches`` is a 2-D NumPy array with one sorted node set per row,
          or an empty list when the graphlet does not occur.
    """
    count_dict = OrderedDict()
    for name, graph in connected_graphs_components.items():
        GM = nx.isomorphism.MultiGraphMatcher(g_, graph)
        # The enumeration is by far the most expensive step - run it once and
        # reuse the result instead of restarting the matcher for the node lists.
        sub_nodes = list(GM.subgraph_isomorphisms_iter())
        if len(sub_nodes) > 0:
            matched_nodes = [list(mapping.keys()) for mapping in sub_nodes]
            sub_nodes = np.unique(np.sort(matched_nodes, axis=1), axis=0)
        count_dict[name] = sub_nodes

    counts = [len(matches) for matches in count_dict.values()]
    return counts, count_dict

In [None]:
def print_graphs(connected_graphs_components):
    """Draw every graph of the mapping in one figure, five subplots per row.

    Each subplot is titled with the graph's key and drawn with ``nx.draw``.

    Args:
        connected_graphs_components: Mapping ``key -> graph`` to visualise.

    Returns:
        None.
    """
    columns = 5
    rows = math.ceil(len(connected_graphs_components) / columns)

    plt.figure(figsize=(20, rows * columns))
    for position, (label, graphlet) in enumerate(connected_graphs_components.items(), start=1):
        plt.subplot(rows, columns, position)
        plt.title(label)
        nx.draw(graphlet)
    return

In [None]:
def _count_layered_matches(g2, count_dict2, num_sep_counts, layer_value):
    """Count, per graphlet, the matches in ``g2`` that touch marked nodes.

    A node is "marked" when its ``'layer'`` attribute equals ``layer_value``.

    Args:
        g2: Graph whose nodes all carry a ``'layer'`` attribute.
        count_dict2: Mapping ``graphlet name -> iterable of node sets``.
        num_sep_counts: Largest number of marked nodes per match that gets
            its own bucket.
        layer_value: ``'layer'`` value identifying marked nodes.

    Returns:
        Tuple ``(totals, flattened)``:
        * ``totals`` - per graphlet, the number of matches containing at
          least one marked node.
        * ``flattened`` - concatenation of ``num_sep_counts`` per-graphlet
          lists; list ``i`` (1-based) holds the number of matches containing
          exactly ``i`` marked nodes.

    Raises:
        IndexError: if a match contains more than ``num_sep_counts`` marked nodes.
    """
    totals = []
    by_level = [[] for _ in range(num_sep_counts)]
    for matches in count_dict2.values():
        total = 0
        level_counts = [0] * num_sep_counts
        for node_set in matches:
            num_layered = sum(1 for node in node_set
                              if g2.nodes[node]['layer'] == layer_value)
            if num_layered > 0:
                total += 1
                level_counts[num_layered - 1] += 1
        totals.append(total)
        for level, bucket in enumerate(by_level):
            bucket.append(level_counts[level])
    flattened = [count for bucket in by_level for count in bucket]
    return totals, flattened


def return_graph_stuff(graph, connected_graphs_components, label, indexKey):
    """Compute graphlet histograms for one graph and its compressed variants.

    First the graphlet histogram of ``graph`` itself is computed. Then, for
    every graphlet that occurs in ``graph``, all of its matches are contracted
    (``return_compressed_graph_whole`` with factor 2) and the graphlet
    histogram of the resulting compressed graph is computed as well.

    Args:
        graph: Input graph; its nodes must carry a ``'layer'`` attribute.
        connected_graphs_components: Mapping ``name -> reference graphlet``.
        label: Class label, stored unchanged under ``'class'``.
        indexKey: Identifier of the graph, stored unchanged under ``'key'``.

    Returns:
        ``OrderedDict`` with keys ``'nodes'``, ``'edges'``, ``'class'``,
        ``'key'``, ``'HoG'`` (histogram of ``graph``) and ``'compressedGs'``.
        The latter maps each graphlet name to an ``OrderedDict`` with:
        * ``'nodes'`` / ``'edges'`` - one-element lists with the size of the
          compressed graph (0 when the graphlet does not occur),
        * ``'degreeList'`` - mean degrees of the contracted regions,
        * ``'HoG'`` - graphlet histogram of the compressed graph,
        * ``'HoGN'`` - per graphlet, matches touching at least one node whose
          ``'layer'`` equals 2,
        * ``'HoGN2'`` - the same matches split by how many such nodes they
          contain (1..6), flattened bucket by bucket.
        When the graphlet does not occur, the three histograms are all-zero
        lists of the corresponding length.
    """
    # One bucket per possible number of marked nodes in a match.
    # Presumably 6 is the largest graphlet size in use - TODO confirm.
    numSepCounts = 6
    # Factor handed to the contraction step; also the 'layer' value used to
    # recognise contracted nodes afterwards.
    compressed_layer = 2

    data_hist1, count_dict1 = return_counts(graph, connected_graphs_components)
    num_graphlets = len(count_dict1)

    pbar2 = tqdm_notebook(range(num_graphlets), desc="Computing Sub-Histogram")

    compressedGs = OrderedDict()
    for key, value in count_dict1.items():
        # Defaults describe "graphlet not present in this graph".
        newnodes = 0
        newedges = 0
        avDegrees = []
        data_histT = [0] * num_graphlets
        data_histT2 = [0] * num_graphlets
        data_histT3 = [0] * num_graphlets * numSepCounts

        if len(value) > 0:
            g2, avDegrees = return_compressed_graph_whole(graph, value, compressed_layer)
            newnodes = len(g2.nodes())
            newedges = len(g2.edges())

            data_histT, count_dict2 = return_counts(g2, connected_graphs_components)
            data_histT2, data_histT3 = _count_layered_matches(
                g2, count_dict2, numSepCounts, compressed_layer)

        currDict = OrderedDict()
        currDict['nodes'] = [newnodes]
        currDict['edges'] = [newedges]
        currDict['degreeList'] = avDegrees
        currDict['HoG'] = data_histT
        currDict['HoGN'] = data_histT2
        currDict['HoGN2'] = data_histT3
        compressedGs[key] = currDict

        pbar2.update(1)

    pbar2.close()

    retDict = OrderedDict()
    retDict['nodes'] = len(graph.nodes())
    retDict['edges'] = len(graph.edges())
    retDict['class'] = label
    retDict['key'] = indexKey
    retDict['HoG'] = data_hist1
    retDict['compressedGs'] = compressedGs

    return retDict


    

In [None]:
# we have, in results, a list of graphs with various information
#graph_item = results[0]

#for graph_item in results:
#    graph_item['compressedGs'][]
def ret_minmax(results, key):
    """Return the global min and max of one graphlet's ``degreeList`` values.

    Args:
        results: Iterable of per-graph dictionaries; each must provide
            ``item['compressedGs'][key]['degreeList']``.
        key: Graphlet name to look up inside ``'compressedGs'``.

    Returns:
        ``{'min': ..., 'max': ...}`` taken over all non-empty degree lists,
        or an empty dict when every list is empty (or ``results`` is empty).
    """
    # Start from +/- infinity so the first real value always wins; a fixed
    # sentinel would silently cap the result for data beyond it.
    globMin = float('inf')
    globMax = float('-inf')

    some = False
    for graph_item in results:
        degrees = graph_item['compressedGs'][key]['degreeList']
        if len(degrees) == 0:
            continue
        some = True

        possMin = np.min(degrees)
        possMax = np.max(degrees)
        if possMax > globMax:
            globMax = possMax
        if possMin < globMin:
            globMin = possMin

    glob = {}
    if some:
        glob['min'] = globMin
        glob['max'] = globMax
    return glob



In [None]:
## from joblib import Parallel, delayed
from joblib import Parallel, delayed
import multiprocessing


In [None]:
# Load every stored reference graphlet whose directory suffix is at most 6,
# then keep only the hand-picked subset below, preserving load order.
curr = return_connected_components(n=6)

only_keep = [
    'g_3_1', 'g_4_2', 'g_5_21', 'g_6_24',
    'g_4_1', 'g_5_1', 'g_6_1',
    'g_3_2', 'g_4_4', 'g_5_6', 'g_6_55',
    'g_4_6', 'g_5_14', 'g_6_16',
]

connected_graphs_components = OrderedDict(
    (graph_key, graphlet)
    for graph_key, graphlet in curr.items()
    if graph_key in only_keep
)

In [None]:
# Draw the selected graphlets (one titled subplot each) to check the selection visually.
print_graphs(connected_graphs_components)

In [None]:
name = "PTC_MR"
graph_list, data_y = return_dataset(name)

# Every original node starts on layer 1; the contraction step later writes a
# different layer value onto merged nodes so they can be told apart.
for g in graph_list:
    nx.set_node_attributes(g, 1, 'layer')

# Leave two cores free, but always request at least one worker: joblib rejects
# n_jobs=0 and gives negative values a different meaning.
num_cores = multiprocessing.cpu_count()
n_workers = max(1, num_cores - 2)

results = Parallel(n_jobs=n_workers, verbose=50)(
    delayed(return_graph_stuff)(graph_list[key], connected_graphs_components, data_y[key], key)
    for key in range(len(graph_list)))

# Cache the per-graph results; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open(name + "_G.p", "wb") as results_file:
    pickle.dump(results, results_file)

In [None]:
name = "PTC_MR"

# NOTE: unpickling can execute arbitrary code - only load cache files that
# were written by this notebook.
with open(name + "_G.p", "rb") as results_file:
    results = pickle.load(results_file)

# Global degree range per graphlet (only graphlets that occur at least once),
# widened to whole numbers so all graphs share the same histogram bins.
glob = {}
for k in connected_graphs_components.keys():
    retD = ret_minmax(results, k)
    if (len(retD) > 0):
        glob[k] = retD
for k, v in glob.items():
    v['max'] = math.ceil(v['max'])
    v['min'] = math.floor(v['min'])

# Histogram bins per unit of average degree.
binD = 10

# Four feature-vector variants per graph. All share, per graphlet, the
# degree histogram and the compressed graph's node/edge counts; they differ
# in which graphlet histograms follow:
#   1: HoG
#   2: (HoG - HoGN), HoGN
#   3: (HoG - HoGN), HoGN2
#   4: HoG, (HoG - HoGN), HoGN, HoGN2
dataX_1 = []
dataX_2 = []
dataX_3 = []
dataX_4 = []

for graph_item in results:
    common_prefix = list(graph_item['HoG']) + [graph_item['nodes'], graph_item['edges']]
    data_item = list(common_prefix)
    data_item2 = list(common_prefix)
    data_item3 = list(common_prefix)
    data_item4 = list(common_prefix)

    for key, value in graph_item['compressedGs'].items():
        if key not in glob:
            continue

        globMin = glob[key]['min']
        globMax = glob[key]['max']

        # Number of bins for this graphlet's degree histogram (at least one).
        binW = binD * (globMax - globMin)
        if (binW == 0):
            binW = 1

        avDegrees = value['degreeList']
        if (len(avDegrees) > 0):
            # The removed `normed` keyword (and the redundant weights/density
            # defaults) are no longer passed; the counts are unchanged.
            hist, bin_edges = np.histogram(avDegrees, bins=binW, range=(globMin, globMax))
            degreeHist = hist.tolist()
        else:
            degreeHist = [0] * binW

        shared_part = degreeHist + value['nodes'] + value['edges']

        # Matches of the compressed graph that touch no marked node.
        thisHoGD = [x - y for x, y in zip(value['HoG'], value['HoGN'])]
        if (not all(item >= 0 for item in thisHoGD)):
            print("error")

        data_item += shared_part + value['HoG']
        data_item2 += shared_part + thisHoGD + value['HoGN']
        data_item3 += shared_part + thisHoGD + value['HoGN2']
        data_item4 += shared_part + value['HoG'] + thisHoGD + value['HoGN'] + value['HoGN2']

    dataX_1.append(data_item)
    dataX_2.append(data_item2)
    dataX_3.append(data_item3)
    dataX_4.append(data_item4)


dataX_1_ = np.array(dataX_1)
dataX_2_ = np.array(dataX_2)
dataX_3_ = np.array(dataX_3)
dataX_4_ = np.array(dataX_4)

# Drop feature columns that are zero for every graph.
data_X_1_ = dataX_1_[:, ~np.all(dataX_1_ == 0, axis=0)]
data_X_2_ = dataX_2_[:, ~np.all(dataX_2_ == 0, axis=0)]
data_X_3_ = dataX_3_[:, ~np.all(dataX_3_ == 0, axis=0)]
data_X_4_ = dataX_4_[:, ~np.all(dataX_4_ == 0, axis=0)]

os.makedirs(name, exist_ok=True)

# os.path.join keeps the files inside the dataset directory on every
# platform; a hard-coded backslash only acts as a separator on Windows.
np.savetxt(os.path.join(name, 'data_X_1_.txt'), data_X_1_)
np.savetxt(os.path.join(name, 'data_X_2_.txt'), data_X_2_)
np.savetxt(os.path.join(name, 'data_X_3_.txt'), data_X_3_)
np.savetxt(os.path.join(name, 'data_X_4_.txt'), data_X_4_)

In [None]:
# Wrap feature variant 3 (the unfiltered array, i.e. before all-zero columns
# are dropped) in a DataFrame and display it.
data1 = pd.DataFrame(dataX_3_)
data1

# k-fold 

In [None]:
data_X = data1
# Labels come from return_dataset(); `myDataY` was never defined in this
# notebook. np.array() keeps this cell safe to re-run.
data_y = np.array(data_y)
max_arr = []
# Further seeds tried previously: 567, 890, 5678, 78, 6, 1122, 101, 11111, 42
for seed in [345]:
    estimator = RandomForestClassifier(criterion='gini', max_depth=None, min_samples_leaf=1,
                                       min_weight_fraction_leaf=0.0, max_leaf_nodes=None,
                                       bootstrap=True, oob_score=False, n_jobs=6, verbose=0,
                                       warm_start=False, class_weight=None,
                                       random_state=seed)  # seeded so a run can be reproduced

    param_grid = {'n_estimators': [50, 100, 500], 'max_features': ['sqrt'], 'min_samples_split': [2, 3, 4, 5]}

    # random_state only takes effect together with shuffle=True (and recent
    # scikit-learn raises when it is set without shuffling).
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    # fit_params / iid / error_score='raise-deprecating' no longer exist in
    # scikit-learn; error_score='raise' keeps the "fail loudly" behaviour.
    grid_rf = GridSearchCV(estimator, param_grid, scoring=None, n_jobs=6,
                           refit=True, cv=kf, verbose=3, pre_dispatch='n_jobs',
                           error_score='raise')
    grid_rf.fit(data_X, data_y)

    # cv_results_ replaces the removed grid_scores_ attribute; keep the best
    # mean cross-validated score of this seed.
    max_arr.append(np.max(grid_rf.cv_results_['mean_test_score']))
    print('Result computed for', name, 'results are:', np.mean(max_arr))

In [None]:
# Show the raw cross-validation results of the most recently fitted grid search.
grid_rf.cv_results_