# Search Engine des alias d'un personnage de Game of Thrones

In [48]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import seaborn as sns
import re
from tqdm import tqdm
import jupyter_black
import os
# Turn on jupyter_black cell formatting for this session.
# NOTE(review): lab=False presumably targets the classic notebook front end
# rather than JupyterLab - confirm against the jupyter_black documentation.
jupyter_black.load(lab=False)
# Apply seaborn's default plotting theme to subsequent figures.
sns.set()

# Directory where the study documents are stored.
SAVE_PATH = "/home/samy/csv_pickle_parquet/"

# Sub-directory of SAVE_PATH that holds the saved ".html" character pages.
HTML_PATH = f"{SAVE_PATH}html_got/"

In [7]:
# Wiki index page that lists the characters.
AWOIAF_LIST_URL = "https://awoiaf.westeros.org/index.php/List_of_characters"

# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.118 Safari/537.36"
}

# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the list.
r = requests.get(AWOIAF_LIST_URL, headers=HEADERS, timeout=30)
r.raise_for_status()

# NOTE(review): no parser is named, so BeautifulSoup uses the best one that is
# installed; the parsed tree may differ between environments.
soup = BeautifulSoup(r.text)
# The slice keeps 26 <ul> lists - presumably the A-Z sections of the page;
# confirm against the live page layout.
items = soup.find_all("ul")[9:35]

# Collect the first <a> of every <li>. Items without an anchor are skipped
# because later cells call .get() on each element of li_list.
li_list = []
for ul in items:
    for elem in ul.find_all("li"):
        anchor = elem.find("a")
        if anchor is not None:
            li_list.append(anchor)

### Création d'un dictionnaire avec tous les liens de téléchargement

In [8]:
# Map each anchor's title to the absolute URL of the page it links to.
characters_url_dict = {}
for item in li_list:
    title = item.get("title")
    if not title:
        # Anchors with a missing or empty title are skipped.
        continue
    href = item.get("href")
    characters_url_dict[title] = "https://awoiaf.westeros.org" + href

### Téléchargement des données et transformation en csv

In [None]:
# Download every character page and store (character, raw HTML) pairs as CSV.
# Rows are gathered in a plain list and the DataFrame is built once at the end,
# which avoids enlarging the frame row by row. A single Session reuses the
# connection to the wiki across all requests.
fetched_rows = []
with requests.Session() as session:
    session.headers.update(HEADERS)
    for chara, url in tqdm(characters_url_dict.items()):
        fetched_html = session.get(url).text
        fetched_rows.append([chara, fetched_html])

raw_data_wiki = pd.DataFrame(fetched_rows, columns=["character", "raw_data"])

raw_data_wiki.to_csv(SAVE_PATH + "awoiaf_raw_html_v2.csv", sep=",", index=False)

### Obtention de la page html pour un personnage en particulier

In [50]:
import csv
import codecs
import sys

# CSV file written by the download step, and the encoding used to read it back.
FILENAME = SAVE_PATH + "awoiaf_raw_html_v2.csv"
ENCODING = "utf-8"

# Raise the csv module's per-field size limit to the largest value allowed,
# since each row stores a whole HTML page in a single field.
csv.field_size_limit(sys.maxsize)

# Two ways to read the data back: load it with pandas, or stream the file.


def get_html_streaming(
    character,
):
    """Return the raw HTML stored for ``character`` by streaming the CSV file.

    The file is read row by row and the scan stops at the first match, so the
    whole dataset is never loaded at once. The original author notes that this
    approach is worthwhile when the data volume is very large (measured
    worst-case run time: 3.76s).

    Args:
        character: Value compared against the first column of each row.

    Returns:
        The second column of the first row whose first column equals
        ``character``.

    Raises:
        Exception: If no row matches ``character``.
    """
    # newline="" leaves line-ending handling to the csv module, which is needed
    # for quoted fields that contain line breaks (the HTML pages do).
    with open(FILENAME, "r", encoding=ENCODING, newline="") as fp:
        for row in csv.reader(fp):
            # Blank or truncated rows have no HTML column; skip them instead of
            # failing with IndexError.
            if len(row) >= 2 and row[0] == character:
                return row[1]
    raise Exception("Character not found")


def get_html_pandas(
    character,
):
    """Return the raw HTML stored for ``character`` by loading the CSV with pandas.

    The original author describes this as the naive solution, yet the most
    efficient one for the present dataset (measured average run time: 1.72s).

    Args:
        character: Value compared against the ``character`` column.

    Returns:
        The ``raw_data`` value of the first matching row.

    Raises:
        Exception: If no row matches ``character``.
    """
    frame = pd.read_csv(FILENAME)
    matches = frame.loc[frame["character"] == character, "raw_data"].tolist()
    if not matches:
        raise Exception("Character not found")
    return matches[0]
        


### Obtention des alias d'un personnage

In [99]:
%%time
# Book title as it appears in the wiki -> name of the matching text file.
BOOKS_TITLE = {
    "A Game of Thrones": "GOT1.txt",
    "A Clash of Kings": "GOT2.txt",
    "A Storm of Swords": "GOT3.txt",
    "A Feast for Crows": "GOT4.txt",
    "A Dance with Dragons": "GOT5.txt",
}

# href values of every anchor in li_list that carries a non-empty title.
CHAR_URLS = []
for item in li_list:
    title = item.get("title")
    if not title:
        continue
    href = item.get("href")
    CHAR_URLS.append(href)


def get_aliases(soup):
    """Return the entries of the infobox "Aliases" row.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        One string per ``<li>`` found in the first ``<td>`` that follows the
        "Aliases" header cell, with every digit and every square bracket
        removed. An empty list when the page has no infobox, no "Aliases"
        header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        return []
    # `string=` is the current name of the deprecated `text=` filter.
    aliases_html_th = infobox.find("th", string="Aliases")
    if aliases_html_th is None:
        return []
    aliases_html_td = aliases_html_th.find_next("td")
    if aliases_html_td is None:
        return []

    aliases_list = []
    for elmt in aliases_html_td.find_all("li"):
        # Drop digits, then the brackets left over from markers such as "[12]".
        name = "".join(char for char in elmt.text if not char.isdigit())
        aliases_list.append(name.replace("[", "").replace("]", ""))
    return aliases_list


def get_alias(soup):
    """Return the content of the infobox "Alias" row as a one-element list.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A list holding the text of the first ``<td>`` that follows the "Alias"
        header cell, with every digit and every square bracket removed and
        surrounding whitespace stripped. An empty list when the page has no
        infobox, no "Alias" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        return []
    # `string=` is the current name of the deprecated `text=` filter.
    alias_html_th = infobox.find("th", string="Alias")
    if alias_html_th is None:
        return []
    alias_html_td = alias_html_th.find_next("td")
    if alias_html_td is None:
        return []

    # Drop digits, then the brackets left over from markers such as "[12]".
    alias = "".join(char for char in alias_html_td.text if not char.isdigit())
    alias = alias.replace("[", "").replace("]", "").strip()
    return [alias]


def get_title_name(soup):
    """Return the text of the first ``<h1>`` element of the parsed page."""
    heading = soup.find("h1")
    return heading.text


def get_infobox_name(soup):
    """Return the name shown in the page's infobox.

    When the infobox contains a nested table, the name is the text of that
    table's second ``<td>`` once its first ``<span>`` (if any) has been removed
    from the parsed tree. Otherwise the name is the text of the first ``<th>``
    found after the start of the infobox.

    Args:
        soup: Parsed page (BeautifulSoup object). It is modified in place when
            a ``<span>`` is removed.

    Returns:
        The name as a string, or ``None`` when the page has no infobox.
    """
    infobox = soup.find("table", class_="infobox")
    if not infobox:
        return None

    nested_table = infobox.find("table")
    if not nested_table:
        return infobox.find_next("th").text

    name_cell = nested_table.find_all("td")[1]
    inner_span = name_cell.find("span")
    if inner_span:
        # Remove the span so that its text is not part of the returned name.
        inner_span.decompose()
    return name_cell.text


def get_text_length(soup):
    """Return the number of characters of paragraph text on the page.

    The text of every ``<p>`` element is concatenated; newline characters and
    bracketed numeric markers such as ``[12]`` are removed before counting.
    """
    paragraphs = soup.find_all("p")
    joined = "".join([paragraph.text for paragraph in paragraphs])
    without_newlines = joined.replace("\n", "")
    return len(re.sub(r"\[\d+\]", "", without_newlines))


def get_books(soup):
    """Return the entries of the infobox "Books" row.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The text of each ``<li>`` found in the first ``<td>`` that follows the
        "Books" header cell. An empty list when the page has no infobox, no
        "Books" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        return []
    # `string=` is the current name of the deprecated `text=` filter.
    books_html_th = infobox.find("th", string="Books")
    if books_html_th is None:
        return []
    books_html_td = books_html_th.find_next("td")
    if books_html_td is None:
        return []
    return [elmt.text for elmt in books_html_td.find_all("li")]


def get_book(soup):
    """Return the content of the infobox "Book" row as a one-element list.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        A list holding the text of the first ``<td>`` that follows the "Book"
        header cell, with every digit and every square bracket removed and
        surrounding whitespace stripped. An empty list when the page has no
        infobox, no "Book" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        return []
    # `string=` is the current name of the deprecated `text=` filter.
    book_html_th = infobox.find("th", string="Book")
    if book_html_th is None:
        return []
    book_html_td = book_html_th.find_next("td")
    if book_html_td is None:
        return []

    # Drop digits, then the brackets left over from markers such as "[12]".
    book = "".join(char for char in book_html_td.text if not char.isdigit())
    book = book.replace("[", "").replace("]", "").strip()
    return [book]

def separate_book_appearance(input_book_list, book_files=None):
    """Split infobox book entries into book file names and appearance notes.

    Each entry is either ``"Title"`` or ``"Title (note)"``. The title is the
    text before the first ``"("``; the note is the text between the first and
    second ``"("`` with every ``")"`` removed. Titles and notes are stripped of
    surrounding whitespace, so an entry such as ``"A Clash of Kings "`` still
    matches its title. Entries without a parenthesis get the note
    ``"Undefined"``. When a title occurs several times, the last note wins.

    Args:
        input_book_list: Iterable of entry strings.
        book_files: Mapping from book title to file name. Defaults to the
            module-level ``BOOKS_TITLE``.

    Returns:
        A tuple ``(files, notes)`` of two parallel lists covering only the
        titles present in ``book_files``, in order of first occurrence.
    """
    if book_files is None:
        book_files = BOOKS_TITLE

    appearance_by_title = {}
    for book in input_book_list:
        if "(" in book:
            parts = book.split("(")
            appearance_by_title[parts[0].strip()] = parts[1].replace(")", "").strip()
        else:
            appearance_by_title[book.strip()] = "Undefined"

    output_book_list, output_appearance_list = [], []
    for title, appearance in appearance_by_title.items():
        if title in book_files:
            output_book_list.append(book_files[title])
            output_appearance_list.append(appearance)

    return output_book_list, output_appearance_list


def get_page_rank(soup):
    """Count the links on the page that point to a known character URL.

    Args:
        soup: Parsed page (BeautifulSoup object).

    Returns:
        The number of ``<a>`` elements whose ``href`` is in ``CHAR_URLS``.
        Repeated links are counted each time they occur.
    """
    # Build the lookup set once per call so that each link is checked in
    # constant time instead of scanning the whole CHAR_URLS list.
    known_urls = set(CHAR_URLS)
    return sum(1 for link in soup.find_all("a") if link.get("href") in known_urls)


def get_full_page(soup):
    """Return the text of the first ``div`` whose id is ``mw-content-text``."""
    content_blocks = soup.select("div#mw-content-text")
    return content_blocks[0].text

def clean(text):
    """Return ``text`` without punctuation and without non-ASCII characters.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every character that is neither a word character nor
        whitespace removed, then with every non-ASCII character dropped.
    """
    # A character class needs square brackets: the previous pattern used curly
    # braces, which never matched, so punctuation was left in place.
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    # Encoding to ASCII with errors="ignore" drops whatever cannot be encoded.
    return without_punctuation.encode("ascii", "ignore").decode()

def get_total_information(html):
    """Parse one character page and gather its extracted fields in a dict.

    Args:
        html: HTML source of the page.

    Returns:
        A dict with the keys ``name_title``, ``name_infobox``, ``aliases``,
        ``page_rank``, ``text_length``, ``books``, ``nature_of_appearance`` and
        ``html``. ``page_rank`` is always NaN because ``get_page_rank`` is not
        called here. When the page has no ``<h1>`` heading, every extracted
        field is NaN and only ``html`` is kept.
    """
    # NOTE(review): no parser is named, so BeautifulSoup uses the best one that
    # is installed; the parsed tree may differ between environments.
    soup = BeautifulSoup(html)

    # A BeautifulSoup object is truthy even for an empty document, so testing
    # `soup` itself could never reach this fallback. The <h1> heading is what
    # the extraction below needs first, so its absence marks an unusable page.
    if soup.find("h1") is None:
        return {
            "name_title": np.nan,
            "name_infobox": np.nan,
            "aliases": np.nan,
            "page_rank": np.nan,
            "text_length": np.nan,
            "books": np.nan,
            "nature_of_appearance": np.nan,
            "html": html,
        }

    book_list = get_books(soup) + get_book(soup)
    book_number, appearances = separate_book_appearance(book_list)

    return {
        "name_title": get_title_name(soup),
        "name_infobox": get_infobox_name(soup),
        "aliases": get_aliases(soup) + get_alias(soup),
        "page_rank": np.nan,
        "text_length": get_text_length(soup),
        "books": book_number,
        "nature_of_appearance": appearances,
        "html": html,
    }

CPU times: user 2.56 ms, sys: 0 ns, total: 2.56 ms
Wall time: 2.57 ms


###### Transfert de toutes les infos vers un DataFrame

In [105]:
# Empty frame with one column per key of the dict built by
# get_total_information; rows are added to it afterwards.
data = pd.DataFrame(
    columns=[
        "name_title",
        "name_infobox",
        "aliases",
        "page_rank",
        "text_length",
        "books",
        "nature_of_appearance",
        "html",
    ]
)

In [106]:
# Parse every ".html" file found under HTML_PATH and append one row per page
# to `data`.
for dirpath, dirnames, filenames in os.walk(HTML_PATH):
    for filename in tqdm(filenames):
        if not filename.endswith(".html"):
            continue
        filepath = os.path.join(dirpath, filename)
        # An explicit encoding keeps the result independent of the platform
        # default. NOTE(review): assumes the pages were saved as UTF-8 - confirm
        # against the code that wrote them.
        with open(filepath, "r", encoding="utf-8") as fp:
            html = fp.read()
        # The file is already closed here; parsing does not need it open.
        data.loc[len(data), :] = get_total_information(html)
                


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3669/3669 [01:21<00:00, 45.19it/s]


In [109]:
# Write the collected character information to CSV, without the row index.
data.to_csv(
    SAVE_PATH + "got_characters_information.csv",
    index=False,
)