# Search Engine des alias d'un personnage de Game of Thrones

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np
import seaborn as sns
import re
from tqdm import tqdm
import jupyter_black

# Notebook-wide setup: load the jupyter_black extension (lab=False is passed
# as in the original; presumably this targets the classic notebook UI —
# TODO confirm) and apply seaborn's default theme.
jupyter_black.load(lab=False)
sns.set()

# Directory where the study documents are stored.
SAVE_PATH = "/home/samy/csv_pickle_parquet/"

In [None]:
# Wiki page that lists every character.
AWOIAF_LIST_URL = "https://awoiaf.westeros.org/index.php/List_of_characters"

# Browser-like User-Agent sent with every request to the wiki.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.118 Safari/537.36"
}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error
# page further down.
r = requests.get(AWOIAF_LIST_URL, headers=HEADERS, timeout=30)
r.raise_for_status()

# NOTE(review): no parser is named, so the result depends on which parser
# BeautifulSoup finds installed — consider pinning one.
soup = BeautifulSoup(r.text)

# Keep 26 of the page's <ul> elements; presumably the per-letter character
# lists. The slice is tied to the page layout — TODO confirm it still holds.
items = soup.find_all("ul")[9:35]

# Collect the first link of every list item. Items without any <a> are
# skipped: appending None would make the later `item.get(...)` calls crash.
li_list = []
for ul in items:
    for elem in ul.find_all("li"):
        anchor = elem.find("a")
        if anchor is not None:
            li_list.append(anchor)

### Création d'un dictionnaire avec tous les liens de téléchargement

In [None]:
# Map each link title (the character name) to the absolute URL of its page.
# Links without a title are ignored; a later duplicate title overrides an
# earlier one.
characters_url_dict = {
    item.get("title"): "https://awoiaf.westeros.org" + item.get("href")
    for item in li_list
    if item.get("title")
}

In [None]:
# Show the character name -> URL mapping as the cell output.
characters_url_dict

### Téléchargement des données et transformation en csv

In [None]:
# Download the raw HTML of every character page and store it as a two-column
# CSV (character name, page HTML).
#
# Rows are accumulated in a plain list and the DataFrame is built once at the
# end: growing a DataFrame one `.loc` row at a time gets slower with every
# added row.
rows = []

# One session reuses the underlying connection across all the requests; the
# timeout keeps a single stalled download from blocking the whole crawl.
with requests.Session() as session:
    for chara, url in tqdm(characters_url_dict.items()):
        fetched_html = session.get(url, headers=HEADERS, timeout=30).text
        rows.append((chara, fetched_html))

raw_data_wiki = pd.DataFrame(rows, columns=["character", "raw_data"])

raw_data_wiki.to_csv(SAVE_PATH + "awoiaf_raw_html_v2.csv", sep=",", index=False)

### Obtention de la page html pour un personnage en particulier

In [None]:
import csv
import codecs
import sys

# Raise the csv module's per-field size limit as far as it goes, so that the
# large HTML fields can be read.
csv.field_size_limit(sys.maxsize)

ENCODING = "utf-8"
# NOTE(review): the download cell writes "awoiaf_raw_html_v2.csv" while this
# reads "awoiaf_raw_html.csv" — confirm which file is the intended input.
FILENAME = SAVE_PATH + "awoiaf_raw_html.csv"

# Two ways to read the data back: load everything through pandas, or stream
# the file row by row.


def get_html_streaming(character):
    """Return the stored HTML of ``character`` by streaming the CSV file.

    The file is read row by row and the scan stops at the first match, so the
    whole dataset is never held in memory. This is the interesting option
    when the data volume is very large (author-measured worst-case run time:
    3.76 s).

    Args:
        character: Character name, compared with the first CSV column.

    Returns:
        The second column of the first row whose first column equals
        ``character``.

    Raises:
        Exception: If no row matches ``character``.
    """
    # The csv module asks for files opened with newline="" so that line
    # breaks embedded in quoted fields are handed over untouched; the
    # built-in open() also replaces the legacy codecs.open().
    with open(FILENAME, "r", encoding=ENCODING, newline="") as fp:
        for row in tqdm(csv.reader(fp)):
            # Blank lines are parsed as empty rows; skip anything that does
            # not have both columns instead of failing on the index.
            if len(row) >= 2 and row[0] == character:
                return row[1]
    raise Exception("Character not found")


def get_html_pandas(character):
    """Return the stored HTML of ``character`` by loading the CSV with pandas.

    Naive approach, but the most efficient one in the present case
    (author-measured average run time: 1.72 s). The whole file is read on
    every call.

    Args:
        character: Character name, compared with the "character" column.

    Returns:
        The "raw_data" value of the first matching row.

    Raises:
        Exception: If no row matches ``character``.
    """
    frame = pd.read_csv(FILENAME)
    pages = frame.loc[frame["character"] == character, "raw_data"].tolist()
    if not pages:
        raise Exception("Character not found")
    return pages[0]


### Obtention des alias d'un personnage

In [None]:
%%time
# Book title as written on the wiki -> name of the matching text file.
BOOKS_TITLE = {
    "A Game of Thrones": "GOT1.txt",
    "A Clash of Kings": "GOT2.txt",
    "A Storm of Swords": "GOT3.txt",
    "A Feast for Crows": "GOT4.txt",
    "A Dance with Dragons": "GOT5.txt",
}

# href of every character link that carries a title, in list order.
CHAR_URLS = [item.get("href") for item in li_list if item.get("title")]


def get_aliases(soup):
    """Return the aliases listed in the "Aliases" row of the infobox.

    Args:
        soup: Parsed character page.

    Returns:
        One cleaned string per ``<li>`` of the cell that follows the
        "Aliases" header, in page order. An empty list when the page has no
        infobox table, no "Aliases" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        # Pages without an infobox simply have no aliases to report.
        return []

    # `string=` is the current spelling of the deprecated `text=` filter.
    aliases_html_th = infobox.find("th", string="Aliases")
    if aliases_html_th is None:
        return []

    aliases_html_td = aliases_html_th.find_next("td")
    if aliases_html_td is None:
        return []

    # Drop only the "[12]"-style reference markers (same pattern as
    # get_text_length), so digits that belong to the alias itself survive.
    return [
        re.sub(r"\[\d+\]", "", elmt.text).strip()
        for elmt in aliases_html_td.find_all("li")
    ]


def get_alias(soup):
    """Return the single alias found in the "Alias" row of the infobox.

    Args:
        soup: Parsed character page.

    Returns:
        A one-element list holding the cleaned text of the cell that follows
        the "Alias" header. An empty list when the page has no infobox table,
        no "Alias" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        # Pages without an infobox simply have no alias to report.
        return []

    # `string=` is the current spelling of the deprecated `text=` filter.
    alias_html_th = infobox.find("th", string="Alias")
    if alias_html_th is None:
        return []

    alias_html_td = alias_html_th.find_next("td")
    if alias_html_td is None:
        return []

    # Drop only the "[12]"-style reference markers (same pattern as
    # get_text_length), so digits that belong to the alias itself survive.
    return [re.sub(r"\[\d+\]", "", alias_html_td.text).strip()]


def get_title_name(soup):
    """Return the text of the first ``<h1>`` element of the page.

    Args:
        soup: Parsed character page.
    """
    heading = soup.find("h1")
    return heading.text


def get_infobox_name(soup):
    """Return the character name displayed in the infobox.

    When the infobox contains a nested table, the name is the text of that
    table's second ``<td>``, read after removing the first ``<span>`` found
    inside the cell. Otherwise the text of the first ``<th>`` found after the
    start of the infobox is used.

    Note:
        The ``<span>`` is removed from ``soup`` itself, so the parsed page is
        modified by this call.

    Args:
        soup: Parsed character page that contains an infobox table.
    """
    infobox = soup.find("table", class_="infobox")
    nested_table = infobox.find("table")
    if not nested_table:
        return infobox.find_next("th").text

    name_cell = nested_table.find_all("td")[1]
    extra_span = name_cell.find("span")
    if extra_span:
        extra_span.decompose()
    return name_cell.text


def get_text_length(soup):
    """Return the number of characters of body text on the page.

    The text of every ``<p>`` element is concatenated, line breaks are
    removed, and "[12]"-style reference markers are stripped before counting.

    Args:
        soup: Parsed character page.
    """
    paragraphs = soup.find_all("p")
    joined = "".join(paragraph.text for paragraph in paragraphs).replace("\n", "")
    without_refs = re.sub(r"\[\d+\]", "", joined)
    return len(without_refs)


def get_books(soup):
    """Return the raw entries of the "Books" row of the infobox.

    Args:
        soup: Parsed character page.

    Returns:
        The unmodified text of each ``<li>`` of the cell that follows the
        "Books" header, in page order. An empty list when the page has no
        infobox table, no "Books" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        # Pages without an infobox simply have no books to report.
        return []

    # `string=` is the current spelling of the deprecated `text=` filter.
    books_html_th = infobox.find("th", string="Books")
    if books_html_th is None:
        return []

    books_html_td = books_html_th.find_next("td")
    if books_html_td is None:
        return []

    return [elmt.text for elmt in books_html_td.find_all("li")]


def get_book(soup):
    """Return the single entry found in the "Book" row of the infobox.

    Args:
        soup: Parsed character page.

    Returns:
        A one-element list holding the cleaned text of the cell that follows
        the "Book" header. An empty list when the page has no infobox table,
        no "Book" header, or no cell after that header.
    """
    infobox = soup.find("table", class_="infobox")
    if infobox is None:
        # Pages without an infobox simply have no book to report.
        return []

    # `string=` is the current spelling of the deprecated `text=` filter.
    book_html_th = infobox.find("th", string="Book")
    if book_html_th is None:
        return []

    book_html_td = book_html_th.find_next("td")
    if book_html_td is None:
        return []

    # Drop only the "[12]"-style reference markers (same pattern as
    # get_text_length) instead of every digit and bracket of the text.
    return [re.sub(r"\[\d+\]", "", book_html_td.text).strip()]


def association_name_number(book_list, app_list, titles=None):
    """Translate book titles into file names and blank out unknown books.

    Args:
        book_list: Book titles; surrounding whitespace is ignored.
        app_list: Appearance labels, parallel to ``book_list``. Modified in
            place: the entry of every title missing from ``titles`` is
            replaced with ``""`` so the caller can filter it out afterwards.
        titles: Mapping from book title to file name. Defaults to the
            module-level ``BOOKS_TITLE``.

    Returns:
        The file names of the recognised titles, in input order.
    """
    if titles is None:
        titles = BOOKS_TITLE

    new_book_list = []
    for position, raw_title in enumerate(book_list):
        book_clean = raw_title.strip()
        if book_clean in titles:
            new_book_list.append(titles[book_clean])
        else:
            app_list[position] = ""
    return new_book_list


def split_name_appearance(book_app_list):
    """Split "Title (appearance)" entries into file names and appearances.

    Args:
        book_app_list: Raw infobox entries such as
            ``"A Game of Thrones (POV)"``.

    Returns:
        A ``(book_list, new_app_list)`` tuple. ``book_list`` holds the file
        names returned by ``association_name_number`` for the recognised
        titles; ``new_app_list`` holds the non-empty appearance labels left
        after that call blanked the ones of unrecognised titles.
    """
    book_name_list = []
    app_list = []
    for element in book_app_list:
        parts = element.split("(")
        if len(parts) < 2:
            # No "(appearance)" part: ignore the entry rather than fail on
            # the missing second piece.
            continue
        book_name_list.append(parts[0])
        app_list.append(parts[1].replace(")", ""))

    # association_name_number blanks, in place, the appearance of every
    # title it does not recognise; those blanks are dropped just below.
    book_list = association_name_number(book_name_list, app_list)
    new_app_list = [elmt for elmt in app_list if elmt]
    return book_list, new_app_list


def get_page_rank(soup):
    """Count the links of the page that point to a character page.

    Every ``<a>`` element whose ``href`` is one of ``CHAR_URLS`` adds one to
    the count, so a target linked several times is counted several times.

    Args:
        soup: Parsed character page.

    Returns:
        The number of matching links.
    """
    # Build the lookup set once: testing each link against the CHAR_URLS
    # list would rescan the whole list for every link on the page.
    character_hrefs = set(CHAR_URLS)
    return sum(
        1 for link in soup.find_all("a") if link.get("href") in character_hrefs
    )


def get_total_information(character):
    """Gather everything the search engine knows about one character.

    The stored page is fetched with ``get_html_streaming``, parsed, and the
    individual extractors are run on it.

    Args:
        character: Character name as stored in the CSV file.

    Returns:
        A dict with the keys ``name_title``, ``name_infobox``, ``aliases``,
        ``page_rank``, ``text_length``, ``books`` and
        ``nature_of_appearance``.
    """
    # NOTE(review): no parser is named, so the result depends on which
    # parser BeautifulSoup finds installed — consider pinning one.
    soup = BeautifulSoup(get_html_streaming(character))

    raw_book_entries = get_books(soup) + get_book(soup)
    books, appearances = split_name_appearance(raw_book_entries)

    # The extractors run in this exact order because get_infobox_name
    # modifies the parsed page.
    info = {}
    info["name_title"] = get_title_name(soup)
    info["name_infobox"] = get_infobox_name(soup)
    info["aliases"] = get_aliases(soup) + get_alias(soup)
    info["page_rank"] = get_page_rank(soup)
    info["text_length"] = get_text_length(soup)
    info["books"] = books
    info["nature_of_appearance"] = appearances
    return info

###### Cas possibles en résultat du search engine

In [40]:
%%time
# Example: Eddard Stark
get_total_information("Eddard Stark")

1007it [00:01, 811.15it/s]


CPU times: user 1.31 s, sys: 80.1 ms, total: 1.39 s
Wall time: 1.37 s


{'name_title': 'Eddard Stark',
 'name_infobox': 'Eddard Stark',
 'aliases': ['Ned', 'The quiet wolf', 'The Ned'],
 'page_rank': 410,
 'text_length': 24278,
 'books': ['GOT1.txt', 'GOT2.txt', 'GOT3.txt', 'GOT4.txt', 'GOT5.txt'],
 'nature_of_appearance': ['POV',
  'mentioned',
  'mentioned',
  'mentioned',
  'mentioned']}

In [41]:
%%time
# Example: Hodor
get_total_information("Hodor")

1616it [00:01, 885.63it/s] 


CPU times: user 1.82 s, sys: 90.5 ms, total: 1.91 s
Wall time: 1.88 s


{'name_title': 'Hodor',
 'name_infobox': 'Walder',
 'aliases': ['Hodor'],
 'page_rank': 75,
 'text_length': 12982,
 'books': ['GOT1.txt', 'GOT2.txt', 'GOT3.txt', 'GOT4.txt', 'GOT5.txt'],
 'nature_of_appearance': ['appears',
  'appears',
  'appears',
  'mentioned',
  'appears']}