In [1]:
# Import libraries
import pandas as pd
import numpy as np
import time
import igraph
# import networkx as nx

In [2]:
print(igraph.__version__)

0.7.1


In [3]:
# Load the network edgelist file and create the graph 
start_time = time.time()

print("Reading network data and creating graph...")

g = igraph.Graph.Read_Ncol('../data/network.tsv', directed=False)

end_time = time.time()

print("Network graph created. Process took {:.04f} seconds".format(end_time - start_time))

# Check that the graph was created correctly
print("We expect there to be 6626753 vertices. The graph has {} vertices.".format(g.vcount()))
print("We expect there to be 30915267 edges. The graph has {} edges".format(g.ecount()))

Reading network data and creating graph...
Network graph created. Process took 199.2736 seconds
We expect there to be 6626753 vertices. The graph has 6626753 vertices.
We expect there to be 30915267 edges. The graph has 30915267 edges


In [4]:
# Taken and modified from stack overflow: https://stackoverflow.com/questions/34917550/write-
# a-graph-into-a-file-in-an-adjacency-list-form-mentioning-all-neighbors-of

def adj_list_to_file(G, file_name):
    f = open(file_name, "w")
    for n in G.vs:
        f.write(str(n) + ',')
        for neighbor in G.neighbors(n):
            f.write(str(neighbor) + ' ')
        f.write('\n')

In [5]:
# Write save save adjacency list 
adj_list_to_file(g, '../adjacency_list_2.txt')

In [2]:
# # Read data as chunks
# print("Reading network data as chunks...")
# start_time = time.time()

# g_data_chunks = pd.read_csv("./all/network.tsv",
#                             delimiter='\t',
#                             usecols=[0,1],
#                             names=['n', 'v'],
#                             dtype={'n': np.int32, 'v': np.int32},
#                             header=None,
#                             chunksize=100000)

# # Combine
# # g_data_chunks pd.concat(g_data_chunks)

# end_time = time.time()
# print("Chunks created. Process took {:.04f} seconds".format(end_time - start_time))

Reading network data as chunks...
Chunks created. Process took 0.0036 seconds


In [7]:
# # Check number of rows
# g_data_full.shape[0]

30915267

In [3]:
# g_data_chunks = [x for x in g_data_chunks]

In [2]:
# print("Creating network graph...")
# start_time = time.time() 

# with open("./all/network.tsv", 'rb') as f:
#     grph = nx.read_edgelist(path=f, delimiter='\t', encoding='utf8')

# end_time = time.time()
# print("Network graph created. Process took {:.04f} seconds".format(end_time - start_time))

Creating network graph...
Network graph created. Process took 225.2254 seconds


In [3]:
print("Creating network graph...")

start_time = time.time() 

# Initalize undirected simple graph
grph = nx.Graph()

# Populate the graph by adding edges from chunked dataframes
for chunk in g_data_chunks: 
  grph.add_edges_from([tuple(x) for x in chunk.values])

end_time = time.time()

print("Network graph created. Process took {:.04f} seconds".format(end_time - start_time))

Creating network graph...
Network graph created. Process took 220.6446 seconds


In [4]:
# Check that graph is of correct size
print("Number of edges: {}".format(grph.number_of_edges())) # There should be 30915267
print("Number of nodes: {}".format(grph.number_of_nodes())) # There should be 6626753

Number of edges: 30915267
Number of nodes: 6626753


In [9]:
# Get all edges for node 4009630
# The result will be the node, followed by nodes its connected to
grph.edges(4009630)

EdgeDataView([(4009630, 3942361), (4009630, 4251483), (4009630, 2072811), (4009630, 3086356), (4009630, 5077327), (4009630, 846118), (4009630, 3776891), (4009630, 4312881), (4009630, 4950631), (4009630, 2544100), (4009630, 4507852), (4009630, 3970312), (4009630, 5650628), (4009630, 5161251), (4009630, 370884), (4009630, 2686972), (4009630, 1177053), (4009630, 3696168)])

In [18]:
# Taken and modified from stack overflow: https://stackoverflow.com/questions/34917550/write-a-graph-into-a-file-in-an-adjacency-list-form-mentioning-all-neighbors-of
def adj_list_to_file(G, file_name):
    f = open(file_name, "w")
    for n in G.nodes():
        f.write(str(n) + ',')
        for neighbor in G.neighbors(n):
            f.write(str(neighbor) + ' ')
        f.write('\n')

In [19]:
# Write save save adjacency list 
adj_list_to_file(grph, './adjacency_list.txt')

# Load text into pandas df 

In [28]:
# Open file and create dictionary
adj_list = {}

with open('./adjacency_list.txt', 'r') as f:
    # For each line in the file, create a dictionary that has a key = node and value = edges
    c = 0 
    for line in f:
        adj_list[line.split(',')[0]] = line.split(',')[1].rstrip().split(' ')

In [129]:
# Specify constant L for filtering by nodes Common Neighbors
def filter_by_lemma1(adj_list, L):
    '''
    If the number of neighbors of a node is not greater than L, 
    remove the node pairs that contain that node since these
    pairs will not have more than L common neighbors. 
    
    :params adj_list: adjacency list of network
    :type adj_list: dict
    :params L: threshold for common neighbors 
    :type L: int
    :return: adjacency list with nodes that satisfy the threshold
    :rtype: dict
    '''
    
    adj_new = {}
    
    for k, v in adj_list.items():
        if len(v) > L:
            adj_new[k] = v
            
    return adj_new

In [130]:
def filter_by_lemma2(adj_list):
    '''
    In the remaining network after filtering by lemma 1, 
    if a node appears at most in L node adjacencies, 
    this node will not have more than L common neighbors. 
    
    :params adj_list: adjacency list of network
    :type adj_list: dict
    :return: inverted adjacency list of network 
    :rtype: dict
    '''
    
    adj_inv = {}
    
    for k, v in adj_list.items():
        for i in v:
            if i in adj_inv:
                adj_inv[i].append(k)
            else:
                adj_inv[i] = [k]
            
    return adj_inv

In [131]:
TEST = {
    0: [1, 2, 4, 5, 7],
    1: [0, 2, 4, 7],
    2: [0, 1, 3, 5, 6],
    3: [2, 4, 6, 7],
    4: [0, 1, 3, 6],
    5: [0, 2],
    6: [2, 3, 4],
    7: [0, 1, 3]
}

In [132]:
TEST_filter1 = filter_by_lemma1(TEST, 3)
TEST_filter1

{0: [1, 2, 4, 5, 7],
 1: [0, 2, 4, 7],
 2: [0, 1, 3, 5, 6],
 3: [2, 4, 6, 7],
 4: [0, 1, 3, 6]}

In [133]:
TEST_filter2 = filter_by_lemma2(TEST_filter1)
TEST_filter2

{1: [0, 2, 4],
 2: [0, 1, 3],
 4: [0, 1, 3],
 5: [0, 2],
 7: [0, 1, 3],
 0: [1, 2, 4],
 3: [2, 4],
 6: [2, 3, 4]}

In [141]:
def generate_accompanied_groups(adj_list):
    '''
    Generate all accompanied groups in address and size representation. 
    
    For example, 4 is a node at adjacency list 0 '[1, 2, 4]', and the 
    ranking of 4 in adjacency list 0 is equal to the size of the 
    accompanied group to 4, which is two.  
    
    The output for this node would look like the following: [4, (0, 2)]
    
    :params adj_list: lemma2 filtered adjacency list of network
    :type adj_list: dict
    :params L: threshold for common neighbors 
    :type L: int
    :return: accompanied groups in (adj adress, size) representation. 
    :rtype: dict
    '''
    
    acc_group = {}
    
    # Find accompanied groups of nodes by address and size 
    for k, v in adj_list.items():
        for i in range(1, len(v)):
            if v[i] in acc_group:
                acc_group[v[i]].append((k, i))
            else:
                acc_group[v[i]] = [(k, i)]

    return acc_group

In [142]:
TEST_accompanied_group = generate_accompanied_groups(TEST_filter2)
TEST_accompanied_group

{2: [(1, 1), (5, 1), (0, 1)],
 4: [(1, 2), (0, 2), (3, 1), (6, 2)],
 1: [(2, 1), (4, 1), (7, 1)],
 3: [(2, 2), (4, 2), (7, 2), (6, 1)]}

In [146]:
def filter_accompanied_groups(acc_group, L):
    '''
    Filter accompanied groups by threshold L.
    
    :params acc_group: accompanied groups 
    :type acc_group: dict
    :params L: threshold for common neighbors 
    :type L: int
    :return: accompanied groups greater than L
    :rtype: dict
    '''
    
    f_acc_group = {}
    
    # Filter by L
    for k, v in acc_group.items():
        if len(v) > L:
            f_acc_group[k] = v
    
    return f_acc_group

In [147]:
TEST_Filtered_AC = filter_accompanied_groups(TEST_accompanied_group, 3)
TEST_Filtered_AC

{4: [(1, 2), (0, 2), (3, 1), (6, 2)], 3: [(2, 2), (4, 2), (7, 2), (6, 1)]}

In [169]:
def generate_node_pairs(acc_group, adj_list, L):
    '''
    Accept filtered accompanied groups and generate node pairs and
    corresponding common neighbor values.
    
    :params acc_group: accompanied groups
    :type acc_group: dict
    :params adj_list: adjacency list filtered by lemma1 and lemma2 
    :type adj_list: dict
    :return: node pairs and CN values
    :rtype: dict
    '''

    node_pairs = {}
    
    for k, v in acc_group.items():
        for i in v:
            # Read adjaceny list up to size 
            for j in adj_list[i[0]][:i[1]]:
                node_pairs[(k, j)] = node_pairs.get((k, j), 0) + 1 
    
    filtered_node_pairs = []
    
    for k, v in node_pairs.items():
        if v > L:
            filtered_node_pairs.append([k, v])
    
    return filtered_node_pairs

In [170]:
results = generate_node_pairs(TEST_Filtered_AC, TEST_filter2, 3)
results

[[(4, 2), 4]]

In [118]:
import unittest 

class TestFilter(unittest.TestCase):
    
    # Run before every single test
    def setUp(self):
        self.adj_list_A = {0: [1, 2, 3, 4, 5],
                           1: [0, 2, 4],
                           2: [0, 1, 4, 5],
                           3: [0],
                           4: [0, 1, 2],
                           5: [0, 2]}
        
        self.adj_list_B = {0: [1, 2, 3, 4, 5],
                           2: [0, 1, 4, 5]}
        
        self.adj_list_C = {0: [2],
                           1: [0, 2],
                           2: [0],
                           3: [0],
                           4: [0, 2],
                           5: [0, 2]}
    
    def test_filter_lemma1(self):
        print("Test filter lemma 1...")
        self.assertEqual(filter_by_lemma1(self.adj_list_A, 3), self.adj_list_B)
    
    def test_filter_lemma2(self):
        print("Test filter lemma 2...")
        self.assertEqual(filter_by_lemma2(self.adj_listB), self.adj_list_C)
    
    def test_both_lemma_filters(self):
        print("Test both lemma filters...")
        self.assertEqual(filter_by_lemma2(filter_by_lemma1(self.adj_listA)), self.adj_list_C)

In [120]:
if __name__ == '__main__':
    unittest.main()


E
ERROR: /Users/koki/Library/Jupyter/runtime/kernel-9817c219-911b-46f9-8dcf-f6cd17d89a34 (unittest.loader._FailedTest)
----------------------------------------------------------------------
AttributeError: module '__main__' has no attribute '/Users/koki/Library/Jupyter/runtime/kernel-9817c219-911b-46f9-8dcf-f6cd17d89a34'

----------------------------------------------------------------------
Ran 1 test in 0.004s

FAILED (errors=1)


SystemExit: True

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
def link_prediction(g):
    # Get all node connectivity relationships to exclude them later for similarity calculation 
    if 

networkx uses a dict structure which takes up a lot of memory. 
https://graph-tool.skewed.de/performance
igraph is implemented in C. 
