All Imports Go here

In [1]:
import json
import os
import pickle
import re
import urllib.error
import urllib.request
from urllib.parse import quote

import networkx as nx
import numpy as np
import pandas as pd


## Part 1: Download all Marvel & DC Characters' WikiText
### 1. Load Character List Data

In [2]:
# Load the character lists. The files are opened in `with` blocks so the
# handles are closed as soon as unpickling finishes.
# NOTE(review): pickle can execute arbitrary code while loading; only use
# pickle files that come from a trusted source.
with open("data/marvel.pkl", "rb") as marvel_file:
    marvel = pickle.load(marvel_file)
with open("data/dc.pkl", "rb") as dc_file:
    dc = pickle.load(dc_file)

# Keep only the rows that have a value in every column.
marvel = marvel.dropna()
dc = dc.dropna()

### 2. Create WikiAPI URL From Character WikiLink

In [3]:
def get_api_url_for_plaintext(wikiLink):
    """Build the Wikipedia API URL that returns a page's plain-text intro.

    Runs of whitespace in ``wikiLink`` are replaced by a single underscore and
    the result is percent-encoded before being used as the ``titles`` value.

    Example result:
    https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exintro=1&explaintext=1&redirects=1&titles=Abomination_%28character%29&format=json
    """
    encoded_title = quote(re.sub(r"\s+", '_', wikiLink))

    query_parts = [
        "action=query",
        "prop=extracts&exintro=1&explaintext=1&redirects=1",
        "titles=" + encoded_title,
        "format=json",
    ]
    return "https://en.wikipedia.org/w/api.php?" + "&".join(query_parts)

# Get API For WikiText
def get_api_url(wikiLink):
    """Build the Wikipedia API URL that returns a page's raw wikitext.

    Runs of whitespace in ``wikiLink`` are replaced by a single underscore and
    the result is percent-encoded before being used as the ``titles`` value.

    Example result:
    https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&redirects=1&rvslots=main&titles=Abomination_%28character%29&format=json
    """
    encoded_title = quote(re.sub(r"\s+", '_', wikiLink))

    query_parts = [
        "action=query",
        "prop=revisions&rvprop=content&redirects=1&rvslots=main",
        "titles=" + encoded_title,
        "format=json",
    ]
    return "https://en.wikipedia.org/w/api.php?" + "&".join(query_parts)


### 3. Get WikiText From Wikilink

In [4]:
# Get Plaintext of short Description
def get_wiki_plaintext(wikiLink):
    """Fetch the plain-text intro extract of a Wikipedia page.

    Parameters
    ----------
    wikiLink : str
        Page title; it is turned into a request URL by
        ``get_api_url_for_plaintext``.

    Returns
    -------
    str or float
        The ``extract`` field of the first page in the response, or ``np.nan``
        when the request fails, the status is not 200, or the response has no
        extract.
    """
    query = get_api_url_for_plaintext(wikiLink)

    try:
        # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx answers
        # instead of returning them, so the failure case must be caught here
        # to honour the "return NaN on failure" contract. The `with` block
        # closes the connection even if reading fails.
        with urllib.request.urlopen(query) as response:
            if response.getcode() != 200:
                return np.nan

            data = response.read()
            encoding = response.info().get_content_charset('utf-8')
    except urllib.error.URLError:
        return np.nan

    jsonData = json.loads(data.decode(encoding))

    # A response without 'query'/'pages' is treated like a missing extract.
    pages = jsonData.get('query', {}).get('pages', {})
    page = next(iter(pages.values()), {})
    return page['extract'] if "extract" in page else np.nan

# marvel["WikiLink"].head(5).apply(get_wiki_plaintext)


# Get wikitext of fullpage which contains link
def get_wiki_text(wikiLink):
    """Fetch the raw wikitext of a Wikipedia page.

    Parameters
    ----------
    wikiLink : str
        Page title; it is turned into a request URL by ``get_api_url``.

    Returns
    -------
    str or float
        The content stored at ``revisions[0]["slots"]["main"]["*"]`` of the
        first page in the response, or ``np.nan`` when the request fails, the
        status is not 200, or that content is absent.
    """
    query = get_api_url(wikiLink)

    try:
        # urlopen raises HTTPError (a URLError subclass) for 4xx/5xx answers
        # instead of returning them; catching it keeps one bad page from
        # aborting a whole bulk download. The `with` block closes the
        # connection even if reading fails.
        with urllib.request.urlopen(query) as response:
            if response.getcode() != 200:
                return np.nan

            data = response.read()
            encoding = response.info().get_content_charset('utf-8')
    except urllib.error.URLError:
        return np.nan

    jsonData = json.loads(data.decode(encoding))

    try:
        page = next(iter(jsonData['query']['pages'].values()))
        return page["revisions"][0]["slots"]["main"]["*"]
    except (KeyError, IndexError, StopIteration):
        # Missing page, empty page map, or empty revision list.
        return np.nan



# dc["WikiLink"].iloc[100:].apply(get_wiki_text)



### 4. Save Text To File

In [5]:
def create_directory(directory_ext):
    """Create ``directory_ext`` under the current working directory.

    Missing parent directories are created as well, and an already existing
    directory is not an error. Other failures (for example missing
    permissions, or a regular file occupying the path) raise ``OSError``
    instead of being silently ignored.

    Parameters
    ----------
    directory_ext : str
        Directory path, joined onto ``os.getcwd()``.

    Returns
    -------
    None
    """
    path = os.path.join(os.getcwd(), directory_ext)
    os.makedirs(path, exist_ok=True)

# Make sure the output folders exist; the parent comes first.
for output_directory in ("data", "data/Marvel", "data/DC"):
    create_directory(output_directory)

def get_file_name_from_wikilink(wikilink):
    """Turn a wiki link into a ``.txt`` file name.

    Every character that is neither a word character nor whitespace is
    removed, then ``.txt`` is appended.
    """
    stem = re.sub(r"[^\w\s]", '', wikilink)
    return stem + ".txt"

def save_file(text, wikilink, directory_ext):
    """Write ``text`` to a UTF-8 file inside ``directory_ext``.

    Parameters
    ----------
    text : str
        Content to write.
    wikilink : str
        Wiki link the file name is derived from
        (see ``get_file_name_from_wikilink``).
    directory_ext : str
        Directory the file is written into.
    """
    name = get_file_name_from_wikilink(wikilink)
    path = os.path.join(directory_ext, name)

    # The context manager closes the file even if the write raises.
    with open(path, "w", encoding="utf-8") as file:
        file.write(text)
    

def save_wikitext(wikiLink, directory_ext):
    """Download a page's wikitext and store it in ``directory_ext``.

    Returns the downloaded text, or ``np.nan`` when ``get_wiki_text`` did not
    produce a non-empty string (nothing is written in that case).
    """
    page_text = get_wiki_text(wikiLink)

    # Guard clause: anything but a non-empty string counts as a failed download.
    if not isinstance(page_text, str) or page_text == "":
        return np.nan

    save_file(page_text, wikiLink, directory_ext)
    return page_text

# save_wikitext(dc["WikiLink"][0],"data")


### 5. Save Marvel and DC characters' wiki pages to file

In [6]:
def download_marvel_pages(wikiLink):
    """Download one page into ``data/Marvel`` and return what ``save_wikitext`` returns."""
    return save_wikitext(wikiLink, directory_ext="data/Marvel")
def download_dc_pages(wikiLink):
    """Download one page into ``data/DC`` and return what ``save_wikitext`` returns."""
    return save_wikitext(wikiLink, directory_ext="data/DC")
    
# download_marvel_pages("Achebe (comics)")
# download_dc_pages("Abin Sur")

### 6. Download wikitext, generate DataFrame and save as pickle file

```py
marvel["wikitext"] = marvel["WikiLink"].apply(download_marvel_pages)
dc["wikitext"] = dc["WikiLink"].apply(download_dc_pages)

marvel.dropna().to_pickle("./data/Marvel_Characters_Wikitext.pkl")
dc.dropna().to_pickle("./data/DC_Characters_Wikitext.pkl")
```

In [7]:
# One entry per universe: (character frame, page downloader, pickle target).
universe_jobs = (
    (marvel, download_marvel_pages, "./data/Marvel_Characters_Wikitext.pkl"),
    (dc, download_dc_pages, "./data/DC_Characters_Wikitext.pkl"),
)

# Pass 1: fetch the wikitext for every character (one download call per row).
for characters, download_page, pickle_path in universe_jobs:
    characters["wikitext"] = characters["WikiLink"].apply(download_page)

# Pass 2: persist only the rows without missing values, i.e. drop the
# characters whose download returned NaN.
for characters, download_page, pickle_path in universe_jobs:
    characters.dropna().to_pickle(pickle_path)

## Part 2: Create Graph of Marvel and DC universe using downloaded wikitext

We will create a DiGraph in which every node is a Marvel or DC character, and every edge is a link from one character's wikitext to another character's page.

### 1. Load The Downloaded data:
 We can use the downloaded text file for this purpose using the below code:
 
```py
def get_directory_list(directory_ext):
    """Return the names of all entries in ``directory_ext``.

    ``directory_ext`` is resolved against the current working directory.
    """
    target_directory = os.sep.join([os.getcwd(), directory_ext])
    return os.listdir(target_directory)


def read_all_files(directory_ext):
    """Read every file in ``directory_ext`` into a dictionary.

    Parameters
    ----------
    directory_ext : str
        Directory (relative to the current working directory) to read from.

    Returns
    -------
    dict
        Maps each file's path (``directory_ext + os.sep + file name``) to its
        content, with every line stripped of surrounding whitespace and the
        lines concatenated without a separator. Entries that cannot be read
        are skipped.
    """
    text_dict = {}

    for name in get_directory_list(directory_ext):
        file_path = directory_ext + os.sep + name

        try:
            # save_file writes the pages as UTF-8, so read them back the same
            # way instead of relying on the platform default encoding.
            with open(file_path, 'r', encoding="utf-8") as file:
                text = "".join(line.strip() for line in file)
        except (OSError, UnicodeDecodeError):
            # Best effort: skip sub-directories and unreadable files.
            continue

        # Key by the path string; the list of names is unhashable and cannot
        # be a dict key.
        text_dict[file_path] = text
    return text_dict

# NOTE: despite the `_list` suffix, read_all_files returns a dict (its `text_dict`).
marvel_list = read_all_files("data/Marvel")
dc_list = read_all_files("data/DC")
```
  But this will be slow. I prefer to use the corresponding pickle files which we saved in the previous step.
 

In [8]:
# read all downloaded text data
Marvel_Characters_Wikitext = pd.read_pickle("./data/Marvel_Characters_Wikitext.pkl")
DC_Characters_Wikitext = pd.read_pickle("./data/DC_Characters_Wikitext.pkl")
Marvel_Characters_Wikitext.head(5)

Unnamed: 0.1,Unnamed: 0,CharacterName,WikiLink,wikitext
0,0,Abomination,Abomination (character),{{For|the biblical term|Abomination (Bible)}}\...
1,1,Absorbing Man,Absorbing Man,{{Short description|Marvel Comics fictional ch...
5,5,Achebe,Achebe (comics),{{Short description|Fictional supervillain app...
13,13,Agent,Agent (comics),{{Short description|Fictional character in Mar...
14,14,Agent X,Agent X (Marvel Comics),{{short description|Fictional comic book chara...


### 2. Create A DiGraph Containing All Character as Node

In [9]:
# Directed graph shared by the cells below; the title is stored as the
# graph-level "name" attribute.
G = nx.DiGraph()
G.graph["name"] = "Marvel & DC charecter wikilink Graph"

def add_node(character_name,universe_name):
    """Add ``character_name`` to the global graph ``G`` with a ``universe`` attribute."""
    node_attributes = {"universe": universe_name}
    G.add_node(character_name, **node_attributes)
def add_marvel_charecter(character_name):
    """Add ``character_name`` to the graph, tagged with universe "Marvel"."""
    add_node(character_name, universe_name="Marvel")
def add_dc_charecter(character_name):
    """Add ``character_name`` to the graph, tagged with universe "DC"."""
    add_node(character_name, universe_name="DC")

# Register every character as a node: Marvel first, then DC, so a name
# present in both frames ends up tagged "DC".
for marvel_name in Marvel_Characters_Wikitext["CharacterName"]:
    add_marvel_charecter(marvel_name)
for dc_name in DC_Characters_Wikitext["CharacterName"]:
    add_dc_charecter(dc_name)

### 2. Find all the outgoing links of a particular character's wikitext page

In [10]:

def get_links(text):
    """Extract the distinct ``[[...]]`` link parts from a wikitext string.

    Each ``[[...]]`` occurrence is split on ``|`` and every part is kept, so
    both the link target and its display label are returned.

    Parameters
    ----------
    text : str
        Raw wikitext.

    Returns
    -------
    list of str
        The unique parts in sorted order. Sorting makes the result, and
        therefore the order in which edges are later added, reproducible
        across runs; plain set iteration order is not.
    """
    links = set()

    for inner in re.findall(r"\[\[(.*?)\]\]", text):
        # NOTE(review): the display label after "|" is kept as a candidate
        # link too; confirm this is intended, since a label could match an
        # unrelated character's WikiLink.
        links.update(inner.split("|"))
    return sorted(links)

# get_links(Marvel_Characters_Wikitext["wikitext"][0])

# WikiLink -> CharacterName lookups, one per universe. When a WikiLink occurs
# more than once, the last row wins.
marvel_link_charecter_dict = dict(
    zip(Marvel_Characters_Wikitext["WikiLink"], Marvel_Characters_Wikitext["CharacterName"])
)
dc_link_charecter_dict = dict(
    zip(DC_Characters_Wikitext['WikiLink'], DC_Characters_Wikitext['CharacterName'])
)


### 3. Add all the edges to our graph

In [11]:
def add_link_to_graph(name,link):
    """Add an edge from ``name`` to the character that ``link`` points to.

    The Marvel lookup is consulted first, then the DC lookup; a link found in
    neither adds nothing.
    """
    for link_lookup in (marvel_link_charecter_dict, dc_link_charecter_dict):
        if link in link_lookup:
            G.add_edge(name, link_lookup[link])
            return

# Walk every character page (Marvel first, then DC) and add an edge for each
# link that resolves to a known character.
for characters_frame in (Marvel_Characters_Wikitext, DC_Characters_Wikitext):
    pages = zip(characters_frame["CharacterName"], characters_frame["wikitext"])
    for name, text in pages:
        for link in get_links(text):
            add_link_to_graph(name, link)
    

### 4. Remove all the nodes which don't have any edges


In [12]:
# Collect the nodes with neither incoming nor outgoing edges first, then
# drop them in one call so the graph is not modified while it is scanned.
isolated_nodes = [
    node
    for node in G.nodes()
    if not (G.in_degree(node) != 0 or G.out_degree(node) != 0)
]
G.remove_nodes_from(isolated_nodes)


## Part 3: Save the graph as an edgelist.
The code below saves the graph as `comic_characters_universe.edgelist` in our data directory.

In [13]:
# NOTE(review): character names can contain spaces (e.g. "Absorbing Man");
# presumably the default edgelist delimiter is a space too, so verify the file
# can be parsed back unambiguously.
edgelist_path = "data/comic_characters_universe.edgelist"
nx.write_edgelist(G, edgelist_path)