# 0. Import libraries

In [146]:
import networkx as nx
import pandas as pd
import statistics
import collections
import timeit

# 1. Reading data

### Reading reduced edges file

In [2]:
reduced = pd.read_csv('wiki-topcats-reduced.txt', names=['source', 'destination'], sep='\t')

In [3]:
reduced.head()

Unnamed: 0,source,destination
0,52,401135
1,52,1069112
2,52,1163551
3,62,12162
4,62,167659


In [4]:
# Number of edges
print('Number of edges:', len(reduced)) 

Number of edges: 2645247


In [5]:
# Number of nodes: every article id that appears as either endpoint of an edge.
# The set is built straight from the two columns, which avoids materialising
# two intermediate lists plus a concatenated copy of them before deduplication.
set_nodes = set(reduced['source']).union(reduced['destination'])
print('Number of nodes:', len(set_nodes))

Number of nodes: 461193


### Reading page names file

In [6]:
#The ID is the same as the index of row so we are only saving the names ordered
def read_names_list(file_path, encoding='utf-8'):
    """Read the page-names file and return the names in file order.

    Every line is split on whitespace; the first token (the id column) is
    dropped and the remaining tokens are re-joined with single spaces, so
    ``names_list[i]`` is the name found on line ``i`` of the file.
    Presumably the id on each line equals its row index (that is what the
    notebook relies on) -- this function does not verify it.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Passed explicitly so the result
        does not depend on the platform's default encoding.
        NOTE(review): 'utf-8' is assumed for this dataset -- confirm.

    Returns
    -------
    list of str
        One entry per line of the file. A blank line, or a line holding only
        an id, yields an empty string so positions stay aligned with rows.
    """
    names_list = []
    with open(file_path, encoding=encoding) as file:
        for line in file:
            # split() with no argument already ignores leading/trailing
            # whitespace, including the newline; [1:] skips the id token.
            names_list.append(" ".join(line.split()[1:]))
    return names_list
        
names_list = read_names_list('wiki-topcats-page-names.txt')

In [7]:
names_list[0]

'Chiasmal syndrome'

In [8]:
len(names_list)

1791489

### Reading categories file

In [9]:
def read_categories_list(file_path, min_size=3500, encoding='utf-8'):
    """Read the categories file and keep only the large categories.

    For every line, the text after the first ``':'`` is split at the first
    ``'; '``: the left part is used as the category name and the right part
    is parsed as whitespace-separated integer article ids.

    Parameters
    ----------
    file_path : str
        Path of the categories text file.
    min_size : int, optional
        A category is kept only if it lists strictly more than ``min_size``
        articles. Defaults to 3500, the threshold used by this notebook.
    encoding : str, optional
        Text encoding used to decode the file, so the result does not depend
        on the platform default. NOTE(review): 'utf-8' is assumed -- confirm.

    Returns
    -------
    dict
        Maps category name -> list of article ids (ints) in file order.

    Raises
    ------
    ValueError
        If a token in the id part of a line is not an integer.
    """
    categories = {}
    with open(file_path, encoding=encoding) as file:
        for line in file:
            # Parse the line once: drop everything up to the first ':' and
            # then separate the name from the id list at the first '; '.
            row_cat, _, ids_text = line.rstrip().partition(':')[2].partition('; ')
            row_artic = list(map(int, ids_text.split()))
            if len(row_artic) > min_size:
                categories[row_cat] = row_artic
    return categories

categories = read_categories_list('wiki-topcats-categories.txt')

In [10]:
# Print categories keys and length of the contained list
length_categories = {key: len(value) for key, value in categories.items()}
print(length_categories)
del length_categories

{'English_footballers': 9237, 'The_Football_League_players': 9467, 'Association_football_forwards': 6959, 'Association_football_goalkeepers': 3997, 'Association_football_midfielders': 8270, 'Association_football_defenders': 6668, 'Living_people': 418223, 'Year_of_birth_unknown': 3760, 'Harvard_University_alumni': 6154, 'Major_League_Baseball_pitchers': 6580, 'Members_of_the_United_Kingdom_Parliament_for_English_constituencies': 6546, 'Indian_films': 5913, 'Year_of_death_missing': 7851, 'English_cricketers': 3813, 'Year_of_birth_missing_(living_people)': 34721, 'Rivers_of_Romania': 7729, 'Main_Belt_asteroids': 13704, 'Asteroids_named_for_people': 5701, 'English-language_albums': 4853, 'English_television_actors': 3501, 'British_films': 4551, 'English-language_films': 22699, 'American_films': 15302, 'Fellows_of_the_Royal_Society': 3697, 'People_from_New_York_City': 4888, 'American_Jews': 3542, 'American_television_actors': 11661, 'American_film_actors': 13938, 'Debut_albums': 8401, 'Blac

In [11]:
print("Number of categories:", len(categories.keys()))

Number of categories: 35


Now we have to intersect the dictionary of categories with the set of nodes in reduced file:

In [12]:
# Intersect categories and set_nodes of reduced
def intersect_categories_reduced(categories, set_nodes, min_size=3500):
    """Restrict every category to the given nodes and drop the small ones.

    Parameters
    ----------
    categories : dict
        Maps category name -> iterable of article ids.
    set_nodes : set or iterable
        Node ids to keep. Any iterable is accepted; it is converted to a set
        once if it is not one already.
    min_size : int, optional
        A category is kept only if strictly more than ``min_size`` of its
        articles are in ``set_nodes``. Defaults to 3500, the threshold used
        by this notebook.

    Returns
    -------
    dict
        Maps category name -> set of the category's ids found in
        ``set_nodes``, for the categories that pass the size threshold.
    """
    if not isinstance(set_nodes, (set, frozenset)):
        set_nodes = set(set_nodes)

    categories_reduced = {}
    for key, values in categories.items():
        kept_nodes = set_nodes.intersection(values)
        if len(kept_nodes) > min_size:
            categories_reduced[key] = kept_nodes
    return categories_reduced

categories_reduced = intersect_categories_reduced(categories, set_nodes)

In [13]:
length_categories_reduced = {key: len(value) for key, value in categories_reduced.items()}
print(length_categories_reduced)
del length_categories_reduced

{'English_footballers': 7538, 'The_Football_League_players': 7814, 'Association_football_forwards': 5097, 'Association_football_goalkeepers': 3737, 'Association_football_midfielders': 5827, 'Association_football_defenders': 4588, 'Living_people': 348300, 'Harvard_University_alumni': 5549, 'Major_League_Baseball_pitchers': 5192, 'Members_of_the_United_Kingdom_Parliament_for_English_constituencies': 6491, 'Indian_films': 5568, 'Year_of_death_missing': 4122, 'Year_of_birth_missing_(living_people)': 28498, 'Rivers_of_Romania': 7729, 'Main_Belt_asteroids': 11660, 'Asteroids_named_for_people': 4895, 'English-language_albums': 4760, 'British_films': 4422, 'English-language_films': 22463, 'American_films': 15159, 'People_from_New_York_City': 4614, 'American_television_actors': 11531, 'American_film_actors': 13865, 'Debut_albums': 7561, 'Black-and-white_films': 10759, 'Year_of_birth_missing': 4346, 'Place_of_birth_missing_(living_people)': 5532, 'American_military_personnel_of_World_War_II': 37

In [14]:
print("Number of categories_reduced:", len(categories_reduced.keys()))

Number of categories_reduced: 29


# [RQ1] Build the graph

Build the graph G=(V, E), where V is the set of articles and E the hyperlinks among them. For this section we are using networkx to build the graph.

We have a file with the source and destination node ids, so it seems to be a directed graph. Just to be sure, we are going to build two graphs: a directed one and an undirected one.

In [15]:
# Create a directed graph: each row of `reduced` is one (source, destination) pair
Gdir = nx.DiGraph() # Directed graph
#Gund = nx.Graph() # Undirected graph (disabled)

# Insert every node id collected in set_nodes into the directed graph
Gdir.add_nodes_from(set_nodes)
#Gund.add_nodes_from(set_nodes)

# Add one directed edge per row of the edge table.
# NOTE(review): iterrows() is slow on a table with millions of rows; a bulk
# add_edges_from over the two columns would likely be faster. The loop is kept
# as is because a later cell deletes the loop variables `index` and `row` by name.
for index, row in reduced.iterrows():
    Gdir.add_edge(row['source'], row['destination'])
    #Gund.add_edge(row['source'], row['destination'])

In [16]:
print("Edges in file wiki-topcats-reduced.txt: ", len(reduced))
print("Edges in graph directed: ", Gdir.number_of_edges(), )
#print("Edges in graph undirected: ", Gund.number_of_edges())

Edges in file wiki-topcats-reduced.txt:  2645247
Edges in graph directed:  2645247


We can observe that it is a directed graph: the directed graph has more edges than the undirected one, so some links exist in only one direction. (For the graph to be undirected, every source/destination pair would have to appear in both directions.)

In [17]:
# Directed graph
print("Directed Graph:")
print(nx.info(Gdir),'\n')
# Undirected graph
#print("Undirected Graph:")
#print(nx.info(Gund))

Directed Graph:
Name: 
Type: DiGraph
Number of nodes: 461193
Number of edges: 2645247
Average in degree:   5.7357
Average out degree:   5.7357 



In [18]:
nx.density(Gdir)

1.2436602635647606e-05

### Graph results:

- It is a **DIRECTED graph** because the edges given are source-destination
- The number of nodes: **461193**
- The number of edges: **2645247**
- The average node degree: **Average in degree:   5.7357 | Average out degree:   5.7357** Each node has on average 5.7 incoming edges and 5.7 outgoing edges
- Is the graph dense?: **density = 1.24e-05** The density is 0 for a graph without edges and 1 for a complete graph. So it is not dense.

### Clean active variables to save memory

In [19]:
%whos

Variable                       Type         Data/Info
-----------------------------------------------------
Gdir                           DiGraph      
categories                     dict         n=35
categories_reduced             dict         n=29
collections                    module       <module 'collections' fro<...>ollections\\__init__.py'>
index                          int          2645246
intersect_categories_reduced   function     <function intersect_categ<...>ed at 0x0000019092EFC2F0>
names_list                     list         n=1791489
nx                             module       <module 'networkx' from '<...>\\networkx\\__init__.py'>
pd                             module       <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>
read_categories_list           function     <function read_categories<...>st at 0x0000019092ECFAE8>
read_names_list                function     <function read_names_list at 0x0000019080004598>
reduced                        DataFrame          

In [20]:
#set_nodes, reduced, names_list, categories, categories_reduced, Gdir, Gund, index, row
del set_nodes, reduced, names_list, categories, index, row # Remove unused variables

In [21]:
%whos

Variable                       Type        Data/Info
----------------------------------------------------
Gdir                           DiGraph     
categories_reduced             dict        n=29
collections                    module      <module 'collections' fro<...>ollections\\__init__.py'>
intersect_categories_reduced   function    <function intersect_categ<...>ed at 0x0000019092EFC2F0>
nx                             module      <module 'networkx' from '<...>\\networkx\\__init__.py'>
pd                             module      <module 'pandas' from 'C:<...>es\\pandas\\__init__.py'>
read_categories_list           function    <function read_categories<...>st at 0x0000019092ECFAE8>
read_names_list                function    <function read_names_list at 0x0000019080004598>
statistics                     module      <module 'statistics' from<...>da3\\lib\\statistics.py'>


# [RQ2] Block Ranking

Obtain a block-ranking, where the blocks are represented by the categories. In particular, we want: block_ranking = [C0, C1, C2...]

Each category  corresponds to a list of nodes.
The order of the remaining categories is given by:

**distance(C0, Ci) = median(ShortestPath(C0, Ci))**

# TEST

## Test 1 (networkX functions)

In [264]:
# USING shortest_path_length
def distance_time(source_category='English_footballers',
                  target_category='The_Football_League_players',
                  n_sources=5):
    """Time the pairwise networkx approach on a sample of source nodes.

    For the first ``n_sources`` nodes of the source category (taken in the
    iteration order of the category's node collection) and every node of the
    target category, ask networkx for the shortest path length in ``Gdir``
    and collect the lengths of the pairs that are connected.

    Parameters
    ----------
    source_category, target_category : str, optional
        Keys of ``categories_reduced``. Defaults are the pair used for the
        timing experiment.
    n_sources : int, optional
        Number of source nodes sampled from the source category.

    Returns
    -------
    The median of the collected shortest-path lengths. The number of
    collected lengths and the median are also printed.

    Raises
    ------
    statistics.StatisticsError
        If no sampled pair is connected (median of an empty list).
    """
    sources = list(categories_reduced.get(source_category))[:n_sources]
    targets = list(categories_reduced.get(target_category))

    CxCy = []
    for s_value in sources:
        for t_value in targets:
            try:
                CxCy.append(nx.shortest_path_length(Gdir, source=s_value, target=t_value))
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                # Unreachable pairs simply do not contribute to the median;
                # any other error is a real problem and is allowed to surface.
                continue

    # Compute the median once and reuse it for the report and the return value.
    median_length = statistics.median(CxCy)
    print("#Results:", len(CxCy), "Median:", median_length)
    return median_length

In [265]:
timeit.timeit(distance_time, number = 1)
# The run recorded below took ~5 s for only 5 source nodes against every node of the target category ---> too slow to scale to whole categories

#Results: 24483 Median: 6


4.964930928342255

## Test 2  

In [152]:
dic_adj = dict(Gdir.adjacency())

In [248]:
def shortest_path_bfs(dic_adj_graph, source, destination):
    """Return the number of edges on a shortest path, found by BFS.

    Parameters
    ----------
    dic_adj_graph : mapping
        Maps node -> iterable of successor nodes (for example a dict of
        dicts). A node without an entry is treated as having no successors.
    source, destination : hashable
        Start and end nodes.

    Returns
    -------
    int or None
        The length in edges of a shortest path from ``source`` to
        ``destination`` (0 when they are equal), or ``None`` when
        ``destination`` cannot be reached.
    """
    if source == destination:
        return 0

    visited = {source}
    queue = collections.deque([(source, 0)])

    # Repeat until queue is empty
    while queue:
        node, depth = queue.popleft()
        for neighbour in dic_adj_graph.get(node, ()):
            if neighbour in visited:
                continue
            # Nodes are discovered in non-decreasing depth order, so the first
            # time the destination is discovered its depth is already minimal;
            # there is no need to wait until it is dequeued.
            if neighbour == destination:
                return depth + 1
            visited.add(neighbour)
            queue.append((neighbour, depth + 1))

    # Every reachable node was explored without meeting the destination.
    return None

In [249]:
print(shortest_path_bfs(dic_adj, 52, 1174302))

5


In [250]:
def has_path_bfs(dic_adj_graph, source, destination_list):
    """Return the destinations that can be reached from ``source``.

    Despite the name, the result is not a boolean: a full breadth-first
    traversal from ``source`` is performed and the set of traversed nodes
    (``source`` included) is intersected with ``destination_list``.

    Parameters
    ----------
    dic_adj_graph : mapping
        Maps node -> iterable of successor nodes. Every traversed node must
        have an entry.
    source : hashable
        Node the traversal starts from.
    destination_list : iterable
        Candidate destination nodes.

    Returns
    -------
    set
        The members of ``destination_list`` reachable from ``source``.
    """
    reached = {source}
    frontier = collections.deque((source,))

    # Breadth-first sweep over everything reachable from the source
    while len(frontier) > 0:
        current = frontier.popleft()
        for successor in dic_adj_graph[current]:
            if successor in reached:
                continue
            reached.add(successor)
            frontier.append(successor)

    return reached.intersection(destination_list)

In [251]:
target_category = 'The_Football_League_players'
CI = list(categories_reduced.get(target_category)) #C1
print(len(has_path_bfs(dic_adj, 52, CI)))

4896


In [252]:
# THIS FUNCTION IS ONLY FOR TIMING
def distance_time_mine(source_category='English_footballers',
                       target_category='The_Football_League_players',
                       n_sources=5, n_targets=30):
    """Time the two-step BFS approach on a sample of sources and targets.

    For each sampled source node, ``has_path_bfs`` first selects the sampled
    targets that are reachable, then ``shortest_path_bfs`` is run once per
    reachable target. Samples are the first ``n_sources`` / ``n_targets``
    nodes in the iteration order of each category's node collection.

    Parameters
    ----------
    source_category, target_category : str, optional
        Keys of ``categories_reduced``. Defaults are the pair used for the
        timing experiment.
    n_sources, n_targets : int, optional
        Sample sizes taken from the source and target categories.

    Returns
    -------
    The median of the collected shortest-path lengths. The number of
    collected lengths and the median are also printed.

    Raises
    ------
    statistics.StatisticsError
        If no sampled pair is connected (median of an empty list).
    """
    sources = list(categories_reduced.get(source_category))[:n_sources]
    # The target sample does not depend on the source node, so it is built
    # once here instead of once per iteration of the loop below.
    targets = set(list(categories_reduced.get(target_category))[:n_targets])

    CxCy = []
    for s_value in sources:
        for t_value in has_path_bfs(dic_adj, s_value, targets):
            CxCy.append(shortest_path_bfs(dic_adj, s_value, t_value))

    # Compute the median once and reuse it for the report and the return value.
    median_length = statistics.median(CxCy)
    print("#Results:", len(CxCy), "Median:", median_length)
    return median_length

In [253]:
timeit.timeit(distance_time_mine, number = 1)

#Results: 53 Median: 6


17.955541841476588

## Test 3

In [214]:
#try2
def shortest_path_bfs_list(dic_adj_graph, source, destination_list):
    """Return the BFS distances from ``source`` to every reachable destination.

    A single breadth-first traversal from ``source`` serves all destinations
    at once.

    Parameters
    ----------
    dic_adj_graph : mapping
        Maps node -> iterable of successor nodes. A node without an entry is
        treated as having no successors.
    source : hashable
        Node the traversal starts from.
    destination_list : iterable
        Destination nodes. Converted to a set once (unless it already is
        one) so that each membership test is a hash lookup.

    Returns
    -------
    list of int
        One shortest-path length (in edges) per distinct destination that is
        reachable from ``source``, in the order the destinations are met,
        i.e. by non-decreasing distance. Unreachable destinations contribute
        nothing.
    """
    if isinstance(destination_list, (set, frozenset)):
        targets = destination_list
    else:
        targets = set(destination_list)

    jumps_list = []
    if not targets:
        return jumps_list

    visited = {source}
    queue = collections.deque([(source, 0)])

    # Repeat until queue is empty
    while queue:
        node, depth = queue.popleft()
        # A dequeued destination is recorded with its depth; the search goes
        # on because further destinations may still be found.
        if node in targets:
            jumps_list.append(depth)
            # Each node is dequeued at most once, so once as many distances
            # as distinct targets are recorded nothing is left to find.
            if len(jumps_list) == len(targets):
                break
        # Add to visited and append to queue
        for neighbour in dic_adj_graph.get(node, ()):
            if neighbour not in visited:
                visited.add(neighbour)
                queue.append((neighbour, depth + 1))
    return jumps_list

In [262]:
# THIS FUNCTION IS ONLY FOR TIMING
def distance_time_mine2(source_category='English_footballers',
                        target_category='The_Football_League_players',
                        n_sources=5):
    """Time the single-traversal BFS approach on a sample of source nodes.

    For each of the first ``n_sources`` nodes of the source category (in the
    iteration order of the category's node collection), one call to
    ``shortest_path_bfs_list`` returns the distances to every reachable node
    of the target category.

    Parameters
    ----------
    source_category, target_category : str, optional
        Keys of ``categories_reduced``. Defaults are the pair used for the
        timing experiment.
    n_sources : int, optional
        Number of source nodes sampled from the source category.

    Returns
    -------
    The median of the collected shortest-path lengths. The number of
    collected lengths and the median are also printed.

    Raises
    ------
    statistics.StatisticsError
        If no target is reachable from any sampled source (median of an
        empty list).
    """
    sources = list(categories_reduced.get(source_category))[:n_sources]
    # The target set is the same for every source, so it is built once here
    # instead of once per iteration of the loop below.
    targets = set(categories_reduced.get(target_category))

    CxCy = []
    for s_value in sources:
        CxCy.extend(shortest_path_bfs_list(dic_adj, s_value, targets))

    # Compute the median once and reuse it for the report and the return value.
    median_length = statistics.median(CxCy)
    print("#Results:", len(CxCy), "Median:", median_length)
    return median_length

In [263]:
timeit.timeit(distance_time_mine2, number = 1)

#Results: 24483 Median: 6


4.598324591724122

## START BLOCK RANKING

In [128]:
def distance(C0, CI, n_sources=5, n_targets=20):
    """Estimate the distance between two categories on a small sample.

    The distance is the median shortest-path length in ``Gdir`` over pairs
    (source node of ``C0``, target node of ``CI``). Only a sample of pairs is
    evaluated: the first ``n_sources`` nodes of ``C0`` against the first
    ``n_targets`` nodes of ``CI``, taken in the iteration order of each
    category's node collection. The result is therefore an estimate, not the
    median over all pairs.

    Parameters
    ----------
    C0, CI : str
        Keys of ``categories_reduced`` naming the source and target category.
    n_sources, n_targets : int, optional
        Sample sizes; the defaults (5 and 20) are the values this notebook
        used for its ranking.

    Returns
    -------
    The median of the shortest-path lengths of the connected sampled pairs.

    Raises
    ------
    statistics.StatisticsError
        If none of the sampled pairs is connected (median of an empty list).
    """
    sources = list(categories_reduced.get(C0))[:n_sources]
    targets = list(categories_reduced.get(CI))[:n_targets]

    CxCy = []
    for s_value in sources:
        for t_value in targets:
            try:
                CxCy.append(nx.shortest_path_length(Gdir, source=s_value, target=t_value))
            except (nx.NetworkXNoPath, nx.NodeNotFound):
                # Unreachable pairs simply do not contribute to the median;
                # any other error is a real problem and is allowed to surface.
                continue

    return statistics.median(CxCy)

In [129]:
# Reference category: every other category is ranked by its distance from it.
C0 = 'English_footballers'

# Entries are (distance, category) tuples; the reference block gets distance 0.
block_ranking = [(0, C0)]
for cat in categories_reduced:
    if cat == C0:
        continue
    block_ranking.append((distance(C0, cat), cat))

In [130]:
block_ranking.sort()

In [131]:
for i in block_ranking:
    print(int(i[0]), i[1])

0 English_footballers
5 Association_football_forwards
6 Association_football_defenders
6 Association_football_goalkeepers
6 Association_football_midfielders
6 British_films
6 The_Football_League_players
7 American_film_actors
7 American_films
7 American_military_personnel_of_World_War_II
7 American_television_actors
7 Black-and-white_films
7 Debut_albums
7 English-language_albums
7 English-language_films
7 Major_League_Baseball_pitchers
7 People_from_New_York_City
7 Place_of_birth_missing_(living_people)
8 Harvard_University_alumni
8 Indian_films
8 Living_people
8 Members_of_the_United_Kingdom_Parliament_for_English_constituencies
8 Rivers_of_Romania
8 Windows_games
9 Year_of_birth_missing_(living_people)
10 Asteroids_named_for_people
10 Year_of_death_missing
11 Year_of_birth_missing
14 Main_Belt_asteroids
