# Scrape My Little Pony characters

#### Python tools

In [1]:
# Load the lab_black IPython extension for this kernel session.
# NOTE(review): presumably used to auto-format code cells with Black — confirm it is installed in the target environment.
%load_ext lab_black

In [2]:
import pandas as pd
import re
import colorsys
import requests
from bs4 import BeautifulSoup
import time
import collections as co
import colormath
from colormath.color_objects import LabColor, HSLColor
from colormath.color_conversions import convert_color

In [3]:
# Raise pandas' display limits so wide/long frames and long cell values are
# rendered in full instead of being truncated.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 3000)
pd.set_option("display.max_colwidth", None)

---

## Get data

#### Define the URL we'll scrape

In [4]:
# Address of the wiki's full pony list.
# NOTE(review): `url` is not referenced again in the visible cells — the page is
# parsed from a saved local HTML file instead; presumably this records where
# that snapshot came from.
url = "https://mlp.fandom.com/wiki/List_of_ponies/full"

---

In [5]:
# Parse the saved copy of the wiki page.
# The file is opened in binary mode on purpose: text mode without an explicit
# `encoding=` decodes with the platform's default codec, which can corrupt the
# page's non-ASCII characters. Given raw bytes, Beautiful Soup works out the
# document's encoding itself.
with open(
    "data/raw/List of ponies_full _ My Little Pony Friendship is Magic Wiki _ Fandom.html",
    "rb",
) as fin:
    soup = BeautifulSoup(fin, "lxml")

In [6]:
# Take the first <table> carrying the "wikitable" class; its rows are read below.
table = soup.find("table", attrs={"class": "wikitable"})

# `find` returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if table is None:
    raise ValueError('no <table class="wikitable"> found in the parsed page')

In [7]:
# One list of cell texts per table row; header (<th>) and data (<td>) cells
# are both collected, in document order.
rows = [
    [cell.text for cell in table_row.find_all(["th", "td"])]
    for table_row in table.find_all("tr")
]

In [8]:
# Load the scraped rows into a frame; columns are positional (0, 1, 2, ...)
# until they are given names.
src = pd.DataFrame(rows)

In [9]:
# Names for the scraped columns, in table order. "bio" and "drop" are only
# placeholders: both columns are removed again when `df` is built.
column_names = [
    "name",
    "kind",
    "group",
    "coat_color",
    "mane_color",
    "eye_color",
    "first_appearance",
    "bio",
    "drop",
]
src.columns = column_names

In [10]:
# Build the working frame: discard the row labelled 0 (presumably the table's
# header row), remove the two unused columns, and keep only the first row seen
# for each pony name.
df = (
    src.drop(index=0)
    .drop(columns=["drop", "bio"])
    .copy()
    .drop_duplicates(subset="name")
)

In [11]:
# Normalise the two categorical text columns to lower case.
for column in ("kind", "group"):
    df[column] = df[column].str.lower()

#### Flag first-season appearances

In [12]:
# True when the first-appearance code begins with the character "1".
first_character = df["first_appearance"].str.slice(stop=1)
df["first_season"] = first_character.eq("1")

#### Clean up colors

In [13]:
# Strip the stray "¤" marker from every color column.
# `regex=False` makes this a plain literal replacement on every pandas version
# (the default for `regex` changed between releases, and older releases warn
# about single-character patterns).
for color_column in ("coat_color", "mane_color", "eye_color"):
    df[color_column] = df[color_column].str.replace("¤", "", regex=False)

In [14]:
# Keep a row when at least one of `kind` / `group` is not the empty string.
kind_present = df["kind"] != ""
group_present = df["group"] != ""
df = df[kind_present | group_present]

#### How many ponies?

In [15]:
# Number of rows left after de-duplication and filtering.
len(df)

2146

In [16]:
# Preview the first ten rows flagged as first-season; `first_season` is
# already boolean, so it is used directly as the row mask.
df[df["first_season"]].head(10)

Unnamed: 0,name,kind,group,coat_color,mane_color,eye_color,first_appearance,first_season
1,Applejack,earth,mare,"hsl(036, 100%, 71%)","hsl(058, 90%, 84%)","hsl(113, 50%, 57%)",1010730,True
2,Pinkie Pie,earth,mare,"hsl(335, 88%, 84%)","hsl(336, 92%, 62%)","hsl(198, 94%, 72%)",1010659,True
5,Fluttershy,pegasus,mare,"hsl(055, 94%, 82%)","hsl(331, 92%, 85%)","hsl(177, 76%, 43%)",1011351,True
7,Rainbow Dash,pegasus,mare,"hsl(195, 100%, 81%)","hsl(359, 99%, 67%)","hsl(326, 74%, 43%)",1010939,True
10,Rarity,unicorn,mare,"hsl(240, 8%, 95%)","hsl(258, 35%, 48%)","hsl(210, 45%, 54%)",1011202,True
11,Twilight Sparkle,alicorn,mare,"hsl(283, 65%, 81%)","hsl(226, 38%, 38%)","hsl(272, 36%, 42%)",1010131,True
12,Apple Bloom,earth,filly,"hsl(063, 94%, 81%)","hsl(343, 95%, 61%)","hsl(040, 88%, 66%)",1010839,True
13,Scootaloo,pegasus,filly,"hsl(035, 100%, 70%)","hsl(326, 57%, 63%)","hsl(276, 34%, 53%)",1012023,True
14,Sweetie Belle,unicorn,filly,"hsl(320, 9%, 94%)","hsl(282, 29%, 66%)","hsl(104, 40%, 69%)",1012023,True
15,Aloe,earth,mare,"hsl(329, 95%, 85%)","hsl(197, 97%, 74%)","hsl(211, 55%, 43%)",1200033,True


In [17]:
# Count ponies per kind (non-null names), most common kind first.
kinds = (
    df.groupby("kind")["name"]
    .count()
    .reset_index(name="count")
    .sort_values("count", ascending=False)
)

In [18]:
# Show the per-kind counts.
kinds

Unnamed: 0,kind,count
2,earth,1275
5,unicorn,454
4,pegasus,381
3,kirin,17
0,,11
1,alicorn,8


#### Convert HSL colors to hex

In [19]:
# Renumber the rows 0..n-1: the de-duplication and filtering above left gaps
# in the index, and the old labels are discarded rather than kept as a column.
df = df.reset_index(drop=True)

In [52]:
# Convert each pony's coat color from "hsl(H, S%, L%)" text to a "#rrggbb" hex string.
#
# The components are extracted row by row. Scanning the whole column as one
# string would lose the link between a match and its row: a cell with no
# hsl(...) value, or with more than one, would shift every later hex value
# onto the wrong pony.
coat_regex = r"hsl\(\s*(\d+),\s*(\d+)%,\s*(\d+)%\s*\)"

# One row per pony, columns = (hue, saturation, lightness) as strings.
# Rows without a match are NaN; only the first hsl(...) in a cell is used.
coat_hsl = df["coat_color"].str.extract(coat_regex)

# colorsys expects (hue, lightness, saturation), each scaled to 0..1.
coat_hexs = [
    None
    if pd.isna(hue)
    else "#%02x%02x%02x"
    % tuple(
        round(channel * 255)
        for channel in colorsys.hls_to_rgb(
            int(hue) / 360, int(lightness) / 100, int(saturation) / 100
        )
    )
    for hue, saturation, lightness in coat_hsl.itertuples(index=False, name=None)
]

# Reuse df's own index so the values land positionally, whatever the index is.
df["coat_hex"] = pd.Series(coat_hexs, index=df.index, dtype="object")

In [53]:
# Convert each pony's mane color from "hsl(H, S%, L%)" text to a "#rrggbb" hex string.
#
# The components are extracted row by row. Scanning the whole column as one
# string would lose the link between a match and its row: a cell with no
# hsl(...) value, or with more than one, would shift every later hex value
# onto the wrong pony.
mane_regex = r"hsl\(\s*(\d+),\s*(\d+)%,\s*(\d+)%\s*\)"

# One row per pony, columns = (hue, saturation, lightness) as strings.
# Rows without a match are NaN; only the first hsl(...) in a cell is used.
mane_hsl = df["mane_color"].str.extract(mane_regex)

# colorsys expects (hue, lightness, saturation), each scaled to 0..1.
mane_hexs = [
    None
    if pd.isna(hue)
    else "#%02x%02x%02x"
    % tuple(
        round(channel * 255)
        for channel in colorsys.hls_to_rgb(
            int(hue) / 360, int(lightness) / 100, int(saturation) / 100
        )
    )
    for hue, saturation, lightness in mane_hsl.itertuples(index=False, name=None)
]

# Reuse df's own index so the values land positionally, whatever the index is.
df["mane_hex"] = pd.Series(mane_hexs, index=df.index, dtype="object")

In [54]:
# Convert each pony's eye color from "hsl(H, S%, L%)" text to a "#rrggbb" hex string.
#
# The components are extracted row by row. Scanning the whole column as one
# string would lose the link between a match and its row: a cell with no
# hsl(...) value, or with more than one, would shift every later hex value
# onto the wrong pony.
eye_regex = r"hsl\(\s*(\d+),\s*(\d+)%,\s*(\d+)%\s*\)"

# One row per pony, columns = (hue, saturation, lightness) as strings.
# Rows without a match are NaN; only the first hsl(...) in a cell is used.
eye_hsl = df["eye_color"].str.extract(eye_regex)

# colorsys expects (hue, lightness, saturation), each scaled to 0..1.
eye_hexs = [
    None
    if pd.isna(hue)
    else "#%02x%02x%02x"
    % tuple(
        round(channel * 255)
        for channel in colorsys.hls_to_rgb(
            int(hue) / 360, int(lightness) / 100, int(saturation) / 100
        )
    )
    for hue, saturation, lightness in eye_hsl.itertuples(index=False, name=None)
]

# Reuse df's own index so the values land positionally, whatever the index is.
df["eye_hex"] = pd.Series(eye_hexs, index=df.index, dtype="object")

In [55]:
import webcolors

In [56]:
# Spot-check the first rows, including the derived *_hex columns.
df.head()

Unnamed: 0,name,kind,group,coat_color,mane_color,eye_color,first_appearance,first_season,coat_hex,mane_hex,eye_hex
0,Applejack,earth,mare,"hsl(036, 100%, 71%)","hsl(058, 90%, 84%)","hsl(113, 50%, 57%)",1010730,True,#ffc46b,#fbf8b1,#67c85b
1,Pinkie Pie,earth,mare,"hsl(335, 88%, 84%)","hsl(336, 92%, 62%)","hsl(198, 94%, 72%)",1010659,True,#fab2d0,#f7458c,#74d2fb
2,Sunny Starscout,earth,mare,"hsl(25, 86%, 66%)","hsl(328, 75%, 44%)","hsl(159, 95%, 37%)",ANG0127,False,#f39c5e,#c41c76,#05b879
3,Hitch Trailblazer,earth,stallion,"hsl(38, 69%, 68%)","hsl(191, 53%, 46%)","hsl(38, 97%, 39%)",ANG0130,False,#e6bc75,#379db3,#c47d03
4,Fluttershy,pegasus,mare,"hsl(055, 94%, 82%)","hsl(331, 92%, 85%)","hsl(177, 76%, 43%)",1011351,True,#fcf5a6,#fcb6d8,#1ac1b9


#### Get the column legend from the list of ponies

In [24]:
# Fetch the Earth ponies page over the network and keep the first table
# pandas finds on it.
legend_url = "https://mlp.fandom.com/wiki/List_of_ponies/Earth_ponies"
codes = pd.read_html(legend_url)[0]

In [25]:
# Show the table that was just fetched.
codes

Unnamed: 0,Legend,Legend.1
0,K,"kind: Earth, Pegasus, unicorn, Alicorn"
1,G,"group: colt, filly, mare, stallion"
2,C,coat color
3,M,mane color
4,E,eye color
5,F,"first appearance or mention outside the opening sequence: season, episode, minute, second"


---

## Export

In [26]:
# Write the cleaned pony table to disk without the row index.
df.to_csv(path_or_buf="data/processed/ponies_full.csv", index=False)

In [27]:
# Write the per-kind counts to disk without the row index.
kinds.to_csv(path_or_buf="data/processed/kinds.csv", index=False)