In [26]:
import re
import os
import json
import numpy as np
from tqdm import tqdm
import networkx as nx
from collections import defaultdict
from itertools import combinations
from glob import glob

In [23]:
# get paths of all downloaded character files
# (sorted, because glob returns matches in arbitrary, OS-dependent order)
path_list = sorted(glob('./data/wow_chars/*.txt'))


def path_to_name(path):
    """Return the character name encoded in a downloaded file's path.

    The directory part and the file extension are dropped, and underscores
    in the file name are turned into spaces, e.g.
    './data/wow_chars/Jaina_Proudmoore.txt' -> 'Jaina Proudmoore'.
    """
    # os.path understands the platform's path separator(s); splitting on
    # '\\' only removed the directory on Windows and left the whole path
    # (including 'wow_chars') in the name everywhere else
    file_name = os.path.basename(path)
    # strip only the trailing extension, not every '.txt' inside the name
    stem = os.path.splitext(file_name)[0]
    return stem.replace('_', ' ')


# get names of all downloaded characters (same order as path_list)
name_list = [path_to_name(path) for path in path_list]

In [24]:
# Regular expressions that extract link targets from the raw page text.
# Every pattern has exactly one capture group: the name of the linked page.

# wiki links: [[Target]], [[Target|label]] or [[Target#section]] -> 'Target'
wiki_link = r'\[\[(.*?)(?:[\|#].*?)?\]\]'

# disabled: template-style links of the form {{Term|BotW|<name>|link}}
# template_link = r'\{\{(?:Term|Plural)\|(?:BotW|Series)\|([\w ()]*?)\|link(?:\|[\w= ]*?)?\}\}'

patterns = [wiki_link]

In [35]:
# Build the directed link graph: an edge A -> B means that the page of
# character A contains a link whose target is the name of character B.
# Note: nodes are only created through edges, so a character that neither
# links to nor is linked from another downloaded character is not in G.
G = nx.DiGraph()

# set of known names, so the membership test in the loop is O(1)
known_names = set(name_list)

for name, path in zip(name_list, path_list):
    # read text from downloaded character file
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()

    # find all distinct links on page; the matches of all patterns are
    # flattened into one set (np.unique on the nested per-pattern lists
    # breaks as soon as two patterns return a different number of matches)
    links = {
        link
        for pattern in patterns
        for link in re.findall(pattern, txt)
    }

    # add edges to downloaded characters only; sorted for a reproducible
    # edge insertion order
    for link in sorted(links & known_names):
        G.add_edge(name, link)

# create a new graph from the largest (weakly connected) component in G;
# default=set() keeps this working when G is empty (e.g. no files found)
largest_component = max(nx.weakly_connected_components(G), key=len, default=set())
Gcc = G.subgraph(largest_component).copy()

# save graphs (create the output directory on first run)
os.makedirs('./saved_graphs', exist_ok=True)
nx.write_gexf(G, './saved_graphs/G_wow.gexf')
nx.write_gexf(Gcc, './saved_graphs/Gcc_wow.gexf')

# print quick info
print('\t\tG\tGcc')
print(f"# of nodes:\t{G.number_of_nodes()}\t{Gcc.number_of_nodes()}")
print(f"# of links:\t{G.number_of_edges()}\t{Gcc.number_of_edges()}")