In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms.tree.mst import count

from os import listdir
from os.path import isfile, join

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import seaborn as sns

import time
import warnings
import random
from typing_extensions import Counter

warnings.filterwarnings(action = 'ignore')
start_time = time.time()

In [2]:
from google.colab import drive
drive.mount('/content/drive')
data_set = []
tdDirectory="/content/drive/MyDrive/Colab Notebooks/datasets/male/td/"
asdDirectory="/content/drive/MyDrive/Colab Notebooks/datasets/male/asd/"
matSize = 116

tdFiles = [f for f in listdir(tdDirectory) if isfile(join(tdDirectory, f))]
asdFiles = [f for f in listdir(asdDirectory) if isfile(join(asdDirectory, f))]


print(tdDirectory)
for files in tdFiles:
  np.loadtxt(tdDirectory + files)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/datasets/male/td/


##Graph representaion

In [3]:
# implementation of an undirected graph using Adjacency Lists
class Vertex:
    def __init__(self, name):
        self.name = name
        self.neighbors = list()

    def add_neighbor(self, v):
        if v not in self.neighbors:
            self.neighbors.append(v)

class Graph:
    def __init__(self):
        self.vertices = {}

    def add_vertex(self, vertex):
        if isinstance(vertex, Vertex) and vertex.name not in self.vertices:
            self.vertices[vertex.name] = vertex
            return True
        else:
            return False

    def add_edge(self, u, v):
        if u in self.vertices and v in self.vertices:
            if v not in self.vertices[u].neighbors: # removing multiple edges between two nodes
                self.vertices[u].add_neighbor(v)
                return True
        else:
            return False

    def sort_neighbors(self):
        for x in self.vertices:
            self.vertices[x].neighbors.sort()

    def print_graph(self):
        for key in self.vertices.keys():
            print(str(key) + ": " + str(self.vertices[key].neighbors))

##k-core algorithm

In [4]:
# https://pdfs.semanticscholar.org/fce2/bc7618628c0481d4cf6a43f5f368e7f8b9c0.pdf
# VZ algorithm implementation of k-core -- O(m)

def k_core_implementation(G):
    n = len(G.vertices)

    # calculating the max degree in the graph and also degree of each node
    deg = [0]*n
    md = 0 # maximum degree
    for i in range(n):
        deg[i] = len(G.vertices[i].neighbors)
        md = max(deg[i], md)
    md += 1 # incrementing maximum degree by 1 to include the max index in the bucket

    # calculating the size of each bucket
    bucket = [0]*md
    for i in range(n):
        bucket[deg[i]] += 1

    # partial sum of bucket size, denotes the starting index of each bucket
    cursum = 0
    for i in range(md):
        val = bucket[i]
        bucket[i] = cursum
        cursum += val

    # sorting using bucket sort and keeping the position in pos array
    valBucket = [0]*md
    vert = [0]*n # sorted array
    pos = [0]*n # pos of a node in sorted bucket
    for i in range(n):
        pos[i] = bucket[deg[i]] + valBucket[deg[i]] # starting position of bucket + val found so far
        vert[pos[i]] = i
        valBucket[deg[i]] += 1
    # calculating the core of each node by eliminating lower degree nodes
    for i in range(n):
        u = vert[i]
        for v in G.vertices[u].neighbors:
            if(deg[v] > deg[u]):
                curDeg = deg[v]
                posV = pos[v]
                firstValOfCurBucketInd = bucket[curDeg]
                firstValOfCurBucket = vert[firstValOfCurBucketInd]
                if firstValOfCurBucket != v: # swap the values inside bucket
                    pos[v] = firstValOfCurBucketInd
                    pos[firstValOfCurBucket] = posV
                    vert[firstValOfCurBucketInd] = v
                    vert[posV] = firstValOfCurBucket
                bucket[curDeg] += 1
                deg[v] -= 1
    return deg

##Summary matrix for td files

In [5]:
numOfTrainTDFiles = int(len(tdFiles) * 0.80)
numOfTestTDFiles = len(tdFiles) - numOfTrainTDFiles

random.shuffle(tdFiles)
trainTDFiles = tdFiles[:numOfTrainTDFiles]
testTDFiles = tdFiles[numOfTrainTDFiles:]

print("[TD Files] Total: ", len(tdFiles))
print("[TD Files] Train:", len(trainTDFiles))
print("[TD Files] Test", len(testTDFiles))

[TD Files] Total:  418
[TD Files] Train: 334
[TD Files] Test 84


In [6]:

finalMatrix = np.zeros((matSize, matSize), dtype=int)
matrix = np.zeros((matSize, matSize), dtype=int)

for file in trainTDFiles:
    cur_matrix = np.loadtxt(tdDirectory + file, dtype=int)
    finalMatrix = finalMatrix + cur_matrix


finalMatrix = finalMatrix / len(trainTDFiles)
# print(finalMatrix)

threshold = 0.75
# lowerbound = 45
# upperbound = 75
# increment = 5
# for x in range(lowerbound, upperbound+increment, increment):
#   threshold = x / 100
tempMatrix = finalMatrix >= threshold
tempMatrix = tempMatrix.astype(np.int)
tdFinalMatrix = tempMatrix

print(tdFinalMatrix)

[[1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 0 1 1]]


##Populate graph for the td summary matrix

In [7]:
tdSummaryGraph = Graph()
n = tdFinalMatrix.shape[0]
for x in range(0, n):
  tdSummaryGraph.add_vertex(Vertex(x))
x = 0;
for line in tdFinalMatrix:
  y = 0;
  for val in line:
      if val == 1:
          tdSummaryGraph.add_edge(x, y)
      y = y + 1
  x = x + 1

# print("adjacency list, representing the neighbors of each node:")
# tdSummaryGraph.print_graph()

##k-core decompositon on td summary graph

In [8]:
tdKCore = k_core_implementation(tdSummaryGraph)
# print(kCore)
maxKCore = max(tdKCore)
print("Max core:", maxKCore)

tdSummaryKCoreNodes = []
x = 0
for val in tdKCore:
  if val >= maxKCore:
    tdSummaryKCoreNodes.append(x)
  x = x + 1
print(len(tdSummaryKCoreNodes), tdSummaryKCoreNodes)

tempGraph = nx.Graph()
for x in tdSummaryKCoreNodes:
    for y in tdSummaryGraph.vertices[x].neighbors: # getting the neighbour from original graph
        if y in tdSummaryKCoreNodes and y > x:
            tempGraph.add_edge(x, y)
tdSummaryKCoreEdgeList = tempGraph.edges

Max core: 8
11 [16, 17, 28, 29, 62, 63, 72, 78, 79, 80, 81]


##Summary matrix for asd files

In [9]:
numOfTrainASDFiles = int(len(asdFiles) * 0.80)
numOfTestASDFiles = len(asdFiles) - numOfTrainASDFiles

random.shuffle(asdFiles)
trainASDFiles = asdFiles[:numOfTrainASDFiles]
testASDFiles = asdFiles[numOfTrainASDFiles:]

print("[ASD Files] Total: ", len(asdFiles))
print("[ASD Files] Train:", len(trainASDFiles))
print("[ASD Files] Test", len(testASDFiles))

[ASD Files] Total:  420
[ASD Files] Train: 336
[ASD Files] Test 84


In [10]:
finalMatrix = np.zeros((matSize, matSize), dtype=int)
matrix = np.zeros((matSize, matSize), dtype=int)

for file in trainASDFiles:
    cur_matrix = np.loadtxt(asdDirectory + file, dtype=int)
    finalMatrix = finalMatrix + cur_matrix


finalMatrix = finalMatrix / len(trainASDFiles)
# print(finalMatrix)

threshold = 0.75
# lowerbound = 45
# upperbound = 75
# increment = 5
# for x in range(lowerbound, upperbound+increment, increment):
#   threshold = x / 100
tempMatrix = finalMatrix >= threshold
tempMatrix = tempMatrix.astype(np.int)
asdFinalMatrix = tempMatrix

print(asdFinalMatrix)

[[1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 1]
 [0 0 0 ... 0 1 1]]


##Populate graph for the asd summary matrix

In [11]:
asdSummaryGraph = Graph()
n = asdFinalMatrix.shape[0]
for x in range(0, n):
  asdSummaryGraph.add_vertex(Vertex(x))
x = 0;
for line in asdFinalMatrix:
  y = 0;
  for val in line:
      if val == 1:
          asdSummaryGraph.add_edge(x, y)
      y = y + 1
  x = x + 1

# print("adjacency list, representing the neighbors of each node:")
# asdSummaryGraph.print_graph()

##k-core decompositon on asd summary graph

In [12]:
asdKCore = k_core_implementation(asdSummaryGraph)
# print(kCore)
maxKCore = max(asdKCore)
print("Max core:", maxKCore)

asdSummaryKCoreNodes = []
x = 0
for val in asdKCore:
  if val >= maxKCore:
    asdSummaryKCoreNodes.append(x)
  x = x + 1
print(len(asdSummaryKCoreNodes), asdSummaryKCoreNodes)

tempGraph = nx.Graph()
for x in asdSummaryKCoreNodes:
    for y in asdSummaryGraph.vertices[x].neighbors: # getting the neighbour from original graph
        if y in asdSummaryKCoreNodes and y > x:
            tempGraph.add_edge(x, y)
asdSummaryKCoreEdgeList = tempGraph.edges

Max core: 7
9 [16, 17, 28, 29, 72, 78, 79, 80, 81]


##For td files calculate jaccard similarity with td and asd summary graph

In [13]:
goodTDfile = 0
for file in testTDFiles:
    A = np.loadtxt(tdDirectory + file, dtype=int)
    # now calculate the k-core nodes for the current file
    G = Graph()
    n = A.shape[0]
    for x in range(0, n):
        G.add_vertex(Vertex(x))
    x = 0;
    for line in A:
      y = 0;
      for val in line:
            if val == 1:
                G.add_edge(x, y)
            y = y + 1
      x = x + 1

    kCore = k_core_implementation(G)
    maxKCore = max(kCore)

    kCoreNodes = []
    x = 0
    for val in kCore:
      if val >= maxKCore:
        kCoreNodes.append(x)
      x = x + 1

    tempGraph = nx.Graph()
    for x in kCoreNodes:
        for y in G.vertices[x].neighbors: # getting the neighbour from original graph
            if y in kCoreNodes and y > x:
                tempGraph.add_edge(x, y)
    edgeList = tempGraph.edges

    # now add the td jaccard similarity
    intersectArr = np.intersect1d(kCoreNodes, tdSummaryKCoreNodes, assume_unique = False)
    lenA = len(intersectArr)
    unionArr = np.union1d(kCoreNodes, tdSummaryKCoreNodes)
    lenB = len(unionArr)
    tdJaccard = lenA/lenB

    # now add the asd jaccard similarity
    intersectArr = np.intersect1d(kCoreNodes, asdSummaryKCoreNodes, assume_unique = False)
    lenA = len(intersectArr)
    unionArr = np.union1d(kCoreNodes, asdSummaryKCoreNodes)
    lenB = len(unionArr)
    asdJaccard = lenA/lenB

    if tdJaccard > asdJaccard:
      goodTDfile = goodTDfile + 1

GoodTDFilePercentage = (goodTDfile/len(testTDFiles)) *100
print(GoodTDFilePercentage)

BadTDFilePercentage = 100 - GoodTDFilePercentage
print(BadTDFilePercentage)





76.19047619047619
23.80952380952381


##For asd files calculate jaccard similarity with td and asd summary graph

In [14]:
goodASDfile = 0
for file in testASDFiles:
    A = np.loadtxt(asdDirectory + file, dtype=int)

    # now calculate the k-core nodes for the current file
    G = Graph()
    n = A.shape[0]
    for x in range(0, n):
        G.add_vertex(Vertex(x))
    x = 0;
    for line in A:
      y = 0;
      for val in line:
            if val == 1:
                G.add_edge(x, y)
            y = y + 1
      x = x + 1

    kCore = k_core_implementation(G)
    maxKCore = max(kCore)

    kCoreNodes = []
    x = 0
    for val in kCore:
      if val >= maxKCore:
        kCoreNodes.append(x)
      x = x + 1

    tempGraph = nx.Graph()
    for x in kCoreNodes:
        for y in G.vertices[x].neighbors: # getting the neighbour from original graph
            if y in kCoreNodes and y > x:
                tempGraph.add_edge(x, y)
    edgeList = tempGraph.edges

    # now add the td jaccard similarity
    intersectArr = np.intersect1d(kCoreNodes, tdSummaryKCoreNodes, assume_unique = False)
    lenA = len(intersectArr)
    unionArr = np.union1d(kCoreNodes, tdSummaryKCoreNodes)
    lenB = len(unionArr)
    tdJaccard = lenA/lenB

    # now add the asd jaccard similarity
    intersectArr = np.intersect1d(kCoreNodes, asdSummaryKCoreNodes, assume_unique = False)
    lenA = len(intersectArr)
    unionArr = np.union1d(kCoreNodes, asdSummaryKCoreNodes)
    lenB = len(unionArr)
    asdJaccard = lenA/lenB

    if tdJaccard < asdJaccard:
      goodASDfile = goodASDfile + 1


GoodASDFilePercentage = (goodASDfile/len(testASDFiles)) *100
print(GoodASDFilePercentage)

BadASDFilePercentage = 100 - GoodASDFilePercentage
print(BadASDFilePercentage)



13.095238095238097
86.9047619047619
