# Scraping exercise: How [tall](https://www.celebheights.com/) are celebrities? 

#### Import Python tools

In [1]:
# Load the `lab_black` IPython extension (Black auto-formatting for notebook cells).
%load_ext lab_black

In [2]:
from pathlib import Path

import pandas as pd
import geopandas as gpd
import altair as alt
import requests
from bs4 import BeautifulSoup
import gender_guesser.detector as gender

In [3]:
# Render up to 100 columns and 100 rows when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

# Lift Altair's row limit so the whole dataset can be charted.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
# Today's date (local time) formatted as MM/DD/YYYY.
run_timestamp = pd.Timestamp("today")
today = run_timestamp.strftime("%m/%d/%Y")

---

## Get data

#### Make an uppercase list of letters

In [5]:
# Uppercase letters "A" through "Z", generated from their code points.
alphabet = [chr(code_point) for code_point in range(ord("A"), ord("Z") + 1)]

#### Loop through the URLs, harvest the name, URL and height, and place them in lists

In [6]:
# Seconds to wait for each page before giving up, so a stalled connection
# cannot hang the notebook indefinitely.
REQUEST_TIMEOUT = 30

names_dicts = []
height_dicts = []

for a in alphabet:
    url = f"https://www.celebheights.com/s/all{a}.html"
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error rather than silently parsing an error page,
    # which would leave this letter missing from the dataset.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    divs = soup.find_all("div", class_="sAZ2 v11")
    links = soup.find_all("a", class_="sAZlink")

    # One record per celebrity link: display name, profile URL, index letter.
    for link in links:
        names_dict = {"name": link.text.strip(), "url": link["href"], "letter": a}
        names_dicts.append(names_dict)

    # The height is whatever remains of each div after the last closing </a>,
    # once the trailing </div> and <br/> markup is removed. Names and heights
    # are collected in page order because they are paired by position later.
    for div in divs:
        height_markup = str(div).split("</a>")[-1]
        height_dict = {
            "height": height_markup.replace("</div>", "").replace("<br/>", "")
        }
        height_dicts.append(height_dict)

#### Convert the lists into dataframes and remove stray empty rows

In [7]:
name, height = pd.DataFrame(names_dicts), pd.DataFrame(height_dicts)

# Rows whose text is only the opening div tag are the stray empty rows;
# drop them and renumber the index from zero.
is_stray_row = height["height"].eq('<div class="sAZ2 v11">')
height = height.loc[~is_stray_row].reset_index(drop=True)

#### Merge on index

In [8]:
# Pair each name row with the height row that shares its index label.
src = name.merge(height, left_index=True, right_index=True)

In [9]:
# Preview the first five merged rows.
src.head(n=5)

Unnamed: 0,name,url,letter,height
0,Aaliyah,https://www.celebheights.com/s/-Aaliyah-1123.html,A,5ft 5 (165cm)
1,Quinton Aaron,https://www.celebheights.com/s/Quinton-Aaron-4...,A,6ft 7 ½ (202cm)
2,Bonnie Aarons,https://www.celebheights.com/s/Bonnie-Aarons-5...,A,5ft 7 ½ (171cm)
3,Hiam Abbass,https://www.celebheights.com/s/Hiam-Abbass-716...,A,5ft 5 (165cm)
4,Amanda Abbington,https://www.celebheights.com/s/Amanda-Abbingto...,A,5ft 4 ½ (164cm)


#### Clean up the heights

In [10]:
# Drop the closing parenthesis, then split on the opening one: the left part is
# the display height, the right part is the centimetre figure (still text).
height_parts = src["height"].str.replace(")", "", regex=False).str.split("(", expand=True)
src[["height_display", "height_cm"]] = height_parts

In [11]:
# Strip the "cm" unit (literal match, not a regex) and convert to float.
# Missing values are deliberately left as NaN rather than rewritten to 0:
# a fabricated 0 cm height would survive a null filter and distort the
# minimum and the rankings.
src["height_cm"] = (
    src["height_cm"].str.replace("cm", "", regex=False).str.strip().astype(float)
)

In [12]:
# Convert centimetres to inches, rounded to two decimals.
CM_PER_INCH = 2.54
src["height_inches"] = src["height_cm"].div(CM_PER_INCH).round(2)

#### Remove a handful of rows with null values

In [13]:
# Keep only rows with a centimetre height; copy so later column assignments
# do not write into a view of `src`.
has_height = src["height_cm"].notna()
df = src.loc[has_height].copy()

---

## Guess the gender

#### Instantiate the [`gender-guesser`](https://pypi.org/project/gender-guesser/) detector

In [14]:
# Detector instance whose `get_gender(first_name)` is used below to label names.
d = gender.Detector()

#### Function to add a guessed-gender column for a given name column

In [15]:
def guess_col_gender(col, suff="_gender", df=df, d=d):
    """Add a guessed-gender column based on the first word of ``df[col]``.

    Parameters
    ----------
    col : str
        Name of the column holding full names.
    suff : str
        Suffix appended to ``col`` to name the new column.
    df : pandas.DataFrame
        Frame to annotate. It is modified in place. The default is bound to
        the global ``df`` as it exists when this function is defined.
    d : object
        Detector exposing ``get_gender(first_name)``. The default is bound to
        the global ``d`` as it exists when this function is defined.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``f"{col}{suff}"`` column.
    """

    def first_token(value):
        # Split on any whitespace so leading or repeated spaces do not yield an
        # empty first name; non-string values (e.g. NaN) fall back to ""
        # instead of raising AttributeError.
        tokens = value.split() if isinstance(value, str) else []
        return tokens[0] if tokens else ""

    first_names = [first_token(value) for value in df[col].tolist()]
    df[f"{col}{suff}"] = [d.get_gender(first_name) for first_name in first_names]
    return df

#### Guess for each celebrity name

In [16]:
# Label every celebrity by first name; the result lands in `name_gender`.
df_g = guess_col_gender(col="name", suff="_gender", df=df, d=d)

In [17]:
# Preview the first five rows with the guessed gender attached.
df_g.head(n=5)

Unnamed: 0,name,url,letter,height,height_display,height_cm,height_inches,name_gender
0,Aaliyah,https://www.celebheights.com/s/-Aaliyah-1123.html,A,5ft 5 (165cm),5ft 5,165.0,64.96,female
1,Quinton Aaron,https://www.celebheights.com/s/Quinton-Aaron-4...,A,6ft 7 ½ (202cm),6ft 7 ½,202.0,79.53,male
2,Bonnie Aarons,https://www.celebheights.com/s/Bonnie-Aarons-5...,A,5ft 7 ½ (171cm),5ft 7 ½,171.0,67.32,mostly_female
3,Hiam Abbass,https://www.celebheights.com/s/Hiam-Abbass-716...,A,5ft 5 (165cm),5ft 5,165.0,64.96,female
4,Amanda Abbington,https://www.celebheights.com/s/Amanda-Abbingto...,A,5ft 4 ½ (164cm),5ft 4 ½,164.0,64.57,female


#### Gender counts in the dataframe

In [18]:
# Number of rows per guessed-gender label, as a two-column frame.
rows_per_gender = df_g.groupby("name_gender").size()
counts = rows_per_gender.reset_index(name="count")

In [19]:
# Each label's percentage of all rows, rounded to two decimals.
total_rows = counts["count"].sum()
counts["share"] = counts["count"].div(total_rows).mul(100).round(2)

In [20]:
# Display the per-label counts and percentage shares.
counts

Unnamed: 0,name_gender,count,share
0,andy,182,1.33
1,female,5461,39.8
2,male,5382,39.23
3,mostly_female,580,4.23
4,mostly_male,570,4.15
5,unknown,1545,11.26


#### Fix known errors

In [21]:
# Manual override for a name the detector gets wrong.
is_tiny_ron = df["name"].eq("Tiny Ron")
df.loc[is_tiny_ron, "name_gender"] = "male"

---

## Height distribution?

In [22]:
# Chart only the rows labelled unambiguously "male" or "female".
male_or_female = df[df["name_gender"].isin(["male", "female"])]

# Histogram of heights in cm, one facet per gender.
height_histogram = (
    alt.Chart(male_or_female)
    .mark_bar(opacity=1, binSpacing=1)
    .encode(
        alt.X("height_cm:Q", bin=alt.Bin(maxbins=50), title="cm"),
        alt.Y("count()", stack=None, title="count"),
        alt.Color("name_gender:N", title="", legend=None),
        facet=alt.Facet("name_gender", title=""),
    )
    .configure_legend(orient="top")
    .properties(width=300, height=200)
)
height_histogram

#### Descriptives

In [23]:
# Rows at the extremes of the height range (ties are all kept).
min_height_cm = df["height_cm"].min()
max_height_cm = df["height_cm"].max()
shortest = df[df["height_cm"].eq(min_height_cm)]
tallest = df[df["height_cm"].eq(max_height_cm)]

In [24]:
# Minimum, maximum and median height (cm) for each guessed-gender label.
height_stats = df.groupby(["name_gender"]).agg({"height_cm": ["min", "max", "median"]})
height_stats.reset_index()

Unnamed: 0_level_0,name_gender,height_cm,height_cm,height_cm
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,median
0,andy,127.0,236.0,169.0
1,female,130.0,205.0,166.0
2,male,107.0,272.0,180.0
3,mostly_female,122.0,232.0,168.0
4,mostly_male,81.0,229.0,178.0
5,unknown,117.0,251.0,170.0


#### Ranks

In [25]:
# Rank heights within each gender label; tied heights share the highest rank
# of their group. Ranking is ascending, so rank 1 is the shortest.
heights_by_gender = df.groupby("name_gender")["height_cm"]
df["rank_for_gender"] = heights_by_gender.rank(method="max")

---

## Export

In [26]:
# Write the processed table to disk. The output directory is created first so
# the export also works on a fresh checkout where it does not exist yet.
output_path = Path("data/processed/celebrity_heights.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)