# 0. Import libraries

In [30]:
import networkx as nx
import pandas as pd

# 1. Reading data

### Reading reduced edges file

In [31]:
# Column labels for the two-column edge list (no header row is read from the file).
columns = ['source', 'destination']

# Load the tab-separated edge list into a DataFrame, one row per hyperlink.
reduced = pd.read_csv(
    'wiki-topcats-reduced.txt',
    sep='\t',
    names=columns,
)

In [32]:
# Preview the first five edges.
reduced.head()

Unnamed: 0,source,destination
0,52,401135
1,52,1069112
2,52,1163551
3,62,12162
4,62,167659


### Reading page names file

In [33]:
# Each line is "<id> <page name>". The ID is assumed to equal the row index,
# so only the names are stored, in file order (names_list[i] -> name of page i).
# The encoding is given explicitly so that non-ASCII page titles are decoded
# the same way on every platform instead of depending on the locale default.
with open('wiki-topcats-page-names.txt', encoding='utf-8') as file:
    # split() drops the surrounding whitespace/newline; the first token is the
    # ID and the remaining tokens are re-joined into the page name.
    names_list = [" ".join(line.split()[1:]) for line in file]

In [34]:
# Sanity check: show the first parsed page name.
names_list[0]

'Chiasmal syndrome'

In [35]:
# Number of page names read (one entry per line of the names file).
len(names_list)

1791489

### Reading categories file

In [36]:
# Only categories with strictly more than this many articles are kept.
MIN_CATEGORY_ARTICLES = 3500

# Maps category name -> list of article IDs belonging to that category.
# NOTE(review): the IDs are kept as strings (as read from the file), whereas
# `reduced` holds integer IDs -- convert before matching against graph nodes.
categories = {}
with open('wiki-topcats-categories.txt', encoding='utf-8') as file:
    for line in file:
        # Everything after the first ':' is "<category>; <id> <id> ...".
        # Parse the line once instead of re-partitioning it for each field.
        payload = line.rstrip().partition(':')[2]
        row_cat, _sep, article_ids = payload.partition('; ')
        row_artic = article_ids.split()
        if len(row_artic) > MIN_CATEGORY_ARTICLES:
            categories[row_cat] = row_artic

In [37]:
# Names of the categories that passed the size filter.
categories.keys()

dict_keys(['English_footballers', 'The_Football_League_players', 'Association_football_forwards', 'Association_football_goalkeepers', 'Association_football_midfielders', 'Association_football_defenders', 'Living_people', 'Year_of_birth_unknown', 'Harvard_University_alumni', 'Major_League_Baseball_pitchers', 'Members_of_the_United_Kingdom_Parliament_for_English_constituencies', 'Indian_films', 'Year_of_death_missing', 'English_cricketers', 'Year_of_birth_missing_(living_people)', 'Rivers_of_Romania', 'Main_Belt_asteroids', 'Asteroids_named_for_people', 'English-language_albums', 'English_television_actors', 'British_films', 'English-language_films', 'American_films', 'Fellows_of_the_Royal_Society', 'People_from_New_York_City', 'American_Jews', 'American_television_actors', 'American_film_actors', 'Debut_albums', 'Black-and-white_films', 'Year_of_birth_missing', 'Place_of_birth_missing_(living_people)', 'Article_Feedback_Pilot', 'American_military_personnel_of_World_War_II', 'Windows_gam

In [38]:
# Number of articles in one of the retained categories. Direct indexing is
# used so that a missing category raises a KeyError naming it, rather than
# the opaque "object of type 'NoneType' has no len()" from len(None).
len(categories['English_footballers'])

9237

In [39]:
#G_Cat=nx.Graph(categories)

# [RQ1] Build the graph

Build the graph G=(V, E), where V is the set of articles and E the hyperlinks among them. For this section we are using networkx to build the graph.

We have a file with the source and destination node IDs, so it seems to be a directed graph. Just to be sure, we are going to build two graphs: a directed one and an undirected one.

In [84]:
# Create one directed and one undirected graph over the same article IDs
Gdir = nx.DiGraph() # Directed graph
Gund = nx.Graph() # Undirected graph

# Insert nodes into both graphs (all sources first, then all destinations),
# which fixes the node insertion order independently of the edge order.
for graph in (Gdir, Gund):
    graph.add_nodes_from(reduced['source'])
    graph.add_nodes_from(reduced['destination'])

# Add all edges in bulk. Building plain (source, destination) tuples once and
# passing them to add_edges_from avoids DataFrame.iterrows(), which creates a
# Series per row and is very slow on an edge list with millions of rows.
# tolist() also yields plain Python ints rather than numpy scalars.
edge_pairs = list(zip(reduced['source'].tolist(), reduced['destination'].tolist()))
Gdir.add_edges_from(edge_pairs)
Gund.add_edges_from(edge_pairs)

In [85]:
# Compare the number of rows in the edge file with the number of edges that
# ended up in each graph.
edge_counts = [
    ("Edges in file wiki-topcats-reduced.txt: ", len(reduced)),
    ("Edges in graph directed: ", Gdir.number_of_edges()),
    ("Edges in graph normal: ", Gund.number_of_edges()),
]
for label, count in edge_counts:
    print(label, count)

Edges in file wiki-topcats-reduced.txt:  2645247
Edges in graph directed:  2645247
Edges in graph normal:  2174451


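Comparing the edge counts confirms that the graph is directed. The undirected graph merges each reciprocated pair of links (A→B and B→A) into a single edge, so it has fewer edges (2174451) than the directed graph (2645247). If every link were reciprocated, the undirected graph would have only about half as many edges as the directed one; since it has far more than half, most links go in one direction only.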

In [91]:
def graph_summary(G):
    """Return a short multi-line text summary of graph ``G``.

    Replaces ``nx.info``, which was deprecated and then removed in
    NetworkX 3.0, while keeping the same layout: name, type, number of
    nodes, number of edges and the average degree(s).

    Parameters
    ----------
    G : networkx graph
        Any object exposing ``name``, ``number_of_nodes()``,
        ``number_of_edges()`` and ``is_directed()``.

    Returns
    -------
    str
        The summary, one field per line, without a trailing newline.
    """
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    lines = [
        "Name: {}".format(G.name),
        "Type: {}".format(type(G).__name__),
        "Number of nodes: {}".format(n_nodes),
        "Number of edges: {}".format(n_edges),
    ]
    # Average degrees are undefined for an empty graph, so skip them then.
    if n_nodes > 0:
        if G.is_directed():
            # Every directed edge adds one in-degree and one out-degree,
            # so both averages equal edges / nodes.
            avg = n_edges / n_nodes
            lines.append("Average in degree: {:8.4f}".format(avg))
            lines.append("Average out degree: {:8.4f}".format(avg))
        else:
            # Every undirected edge contributes 2 to the total degree.
            lines.append("Average degree: {:8.4f}".format(2 * n_edges / n_nodes))
    return "\n".join(lines)


# Directed graph
print("Directed Graph:")
print(graph_summary(Gdir),'\n')
# Undirected graph
print("Undirected Graph:")
print(graph_summary(Gund))

Directed Graph:
Name: 
Type: DiGraph
Number of nodes: 461193
Number of edges: 2645247
Average in degree:   5.7357
Average out degree:   5.7357 

Undirected Graph:
Name: 
Type: Graph
Number of nodes: 461193
Number of edges: 2174451
Average degree:   9.4297


In [94]:
# Density of the directed graph: 0 means no edges, 1 means a complete graph.
nx.density(Gdir)

1.2436602635647606e-05

### Graph results:

- It is a **DIRECTED graph** because the edges given are source-destination
- The number of nodes: **461193**
- The number of edges: **2645247**
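- The average node degree: **Average in degree:   5.7357 | Average out degree:   5.7357** Each node has on average about 5.7 incoming links and 5.7 outgoing links (the two averages coincide because both equal the number of edges divided by the number of nodes)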
- Is the graph dense?: **density = 1.24e-05** The density is 0 for a graph without edges and 1 for a complete graph. So it is not dense.

# [RQ2] _