# Scraping exercise: How [tall](https://www.celebheights.com/) are celebrities? 

#### Import Python tools

In [1]:
%load_ext lab_black

In [27]:
import pandas as pd
import geopandas as gpd
import altair as alt
import altair_stiles as altstiles
import requests
from bs4 import BeautifulSoup
import gender_guesser.detector as gender

In [28]:
# Let notebook previews show up to 100 columns and 100 rows.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Turn off Altair's dataset row limit; kept as the final expression so the
# cell still echoes the transformer registry.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [29]:
# Current local date formatted as MM/DD/YYYY.
today = pd.Timestamp.today().strftime("%m/%d/%Y")

---

## Get data

#### Make an uppercase list of letters

In [30]:
# Uppercase letters A-Z, one list entry per letter; each one selects an index
# page in the scraping loop.
alphabet = list("abcdefghijklmnopqrstuvwxyz".upper())

#### Loop through the URLs, harvest each name, URL and height, and place them in lists

In [None]:
# Scrape every A-Z index page into two parallel lists: one entry per profile
# link (name, url, letter) and one entry per height container.
names_dicts = []
height_dicts = []

for a in alphabet:
    url = f"https://www.celebheights.com/s/all{a}.html"

    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # Don't parse an error page as if it were a listing; report it and
        # carry on with the remaining letters.
        print(f"Skipping {url}: HTTP {response.status_code}")
        continue

    soup = BeautifulSoup(response.text, "html.parser")
    divs = soup.find_all("div", class_="sAZ2 v11")
    links = soup.find_all("a", class_="sAZlink")

    for link in links:
        names_dicts.append(
            {"name": link.text.strip(), "url": link["href"], "letter": a}
        )

    for div in divs:
        # Keep only the markup after the last closing anchor tag, then drop
        # the leftover closing-div and line-break tags.
        height_text = (
            str(div).split("</a>")[-1].replace("</div>", "").replace("<br/>", "")
        )
        height_dicts.append({"height": height_text})

#### Convert the lists into dataframes and remove stray empty rows

In [None]:
# Build one frame per scraped list.
name = pd.DataFrame(names_dicts)
height = pd.DataFrame(height_dicts)

# Discard entries whose text is only a bare opening div tag, then renumber
# the remaining rows from zero.
is_stray = height["height"] == '<div class="sAZ2 v11">'
height = height[~is_stray].reset_index(drop=True)

#### Merge on index

In [None]:
# Pair each name row with the height row at the same index position.
# NOTE(review): this assumes both lists were scraped in the same order with
# matching lengths -- confirm against the page markup.
src = name.merge(height, how="inner", left_index=True, right_index=True)

In [None]:
# Preview the first rows of the merged name/height frame.
src.head()

#### Clean up the heights

In [None]:
# Strip the closing parenthesis, then cut the text at the opening one: the
# part before it becomes the display text, the part after it the cm text.
height_parts = src["height"].str.replace(")", "", regex=False)
height_parts = height_parts.str.split("(", expand=True)
src[["height_display", "height_cm"]] = height_parts

In [None]:
# Remove the "cm" unit suffix, turn a literal "None" into "0", and convert
# the remaining text to floats.
cm_text = src["height_cm"].str.replace("cm", "", regex=False)
cm_text = cm_text.str.replace("None", "0", regex=False)
src["height_cm"] = cm_text.astype(float)

In [None]:
# Convert centimetres to inches (2.54 cm per inch), rounded to two decimals.
src["height_inches"] = src["height_cm"].div(2.54).round(2)

#### Remove a handful of rows with null values

In [None]:
# Keep only rows that have a numeric height; copy so later column additions
# do not touch `src`.
df = src.loc[src["height_cm"].notna()].copy()

---

## Guess the gender

#### Instantiate the [`gender-guesser`](https://pypi.org/project/gender-guesser/) detector

In [None]:
# Detector instance whose get_gender() is called by guess_col_gender below.
d = gender.Detector()

#### Function to add a guessed-gender column for a given name column

In [None]:
def guess_col_gender(col, suff="_gender", df=df, d=d):
    """Add a guessed-gender column derived from the first word of each name.

    Args:
        col: Name of the column in ``df`` holding the names (strings).
        suff: Suffix appended to ``col`` to name the new column.
        df: DataFrame to annotate. It is modified in place.
        d: Object providing ``get_gender(first_name)``.

    Returns:
        The same ``df`` object, now carrying the ``f"{col}{suff}"`` column.

    Note:
        The ``df`` and ``d`` defaults are bound when this cell runs, so
        re-run the cell (or pass them explicitly) after rebinding either.
    """
    guessed = []
    for full_name in df[col].tolist():
        # Everything before the first space is treated as the first name.
        first_name = full_name.partition(" ")[0]
        guessed.append(d.get_gender(first_name))

    df[f"{col}{suff}"] = guessed
    return df

#### Guess for each celebrity name

In [None]:
# Adds "name_gender" to `df` in place; `df_g` refers to that same frame.
df_g = guess_col_gender("name", suff="_gender", df=df, d=d)

In [None]:
# Preview the first rows, now including the guessed-gender column.
df_g.head()

#### Gender counts in the dataframe

In [None]:
# Number of rows for each guessed-gender label, as a two-column frame.
counts = df_g.groupby("name_gender").size().reset_index(name="count")

In [None]:
# Each label's percentage of all rows, rounded to two decimals.
total_names = counts["count"].sum()
counts["share"] = counts["count"].div(total_names).mul(100).round(2)

In [None]:
# Display the per-label counts and shares.
counts

#### Fix known errors

In [None]:
# Manual override for a name whose guessed label is known to be wrong.
is_tiny_ron = df["name"].eq("Tiny Ron")
df.loc[is_tiny_ron, "name_gender"] = "male"

---

## Height distribution?

In [None]:
# Keep only rows labelled exactly "male" or "female".
binary_gender_df = df[df["name_gender"].isin(["male", "female"])]

# One histogram of height (cm) per gender, drawn as separate facets.
height_hist = alt.Chart(binary_gender_df).mark_bar(opacity=1, binSpacing=1).encode(
    alt.X("height_cm:Q", bin=alt.Bin(maxbins=50), title="cm"),
    alt.Y("count()", stack=None, title="count"),
    alt.Color("name_gender:N", title="", legend=None),
    facet=alt.Facet("name_gender", title=""),
)

# Left as the final expression so the notebook renders the chart.
height_hist.configure_legend(orient="top").properties(width=300, height=200)

#### Descriptives

In [None]:
# Rows tied for the minimum and for the maximum height.
heights_cm = df["height_cm"]
shortest = df.loc[heights_cm.eq(heights_cm.min())]
tallest = df.loc[heights_cm.eq(heights_cm.max())]

In [None]:
# Minimum, maximum and median height in cm for each guessed-gender label.
height_stats = {"height_cm": ["min", "max", "median"]}
df.groupby("name_gender").agg(height_stats).reset_index()

#### Ranks

In [None]:
# Rank heights within each gender label. Ranking is ascending (pandas
# default), and tied heights all receive the highest rank of their tie group.
heights_by_gender = df.groupby("name_gender")["height_cm"]
df["rank_for_gender"] = heights_by_gender.rank(method="max")

---

## Export

In [None]:
from pathlib import Path

# Write the cleaned table to CSV without the index column. The target
# directory is created first so a fresh checkout does not fail on export.
output_path = Path("data/processed/celebrity_heights.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)