# Collecting basic Survivor stats from [here](https://www.truedorktimes.com/survivor/boxscores/s32.htm)

#### Import Python tools

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd

---

## Get data

#### Loop over [season pages](https://www.truedorktimes.com/survivor/boxscores/s31.htm) and pack statistics tables into a list

In [63]:
dfs = []

for r in range(1, 43):
    src_df = (
        pd.read_html(f"https://www.truedorktimes.com/survivor/boxscores/s{r}.htm")[0]
        .droplevel(0, axis=1)
        # not all the season tables have consistent column names so adjusting for that next
        .rename(columns={"ChW.1": "ChW%", "Unnamed: 0_level_1": "Contestant"})
        .assign(season=r)
    )
    dfs.append(src_df)

#### Concatenate the dataframes into one large one

In [64]:
src_df = pd.concat(dfs)

#### What do we have? 

In [66]:
src_df.head()

Unnamed: 0,Contestant,SurvSc,SurvAv,ChW,ChA,ChW%,SO,VFB,VAP,TotV,TCA,TC%,wTCR,JVF,TotJ,JV%,season
0,Kelly,1.34,12.26,5.87,16.1,0.36,2,6,0,73,11,0.55,3.82,3,7,0.43,1
1,Hatch,1.59,7.84,1.87,16.1,0.12,0,10,6,73,11,0.9,2.55,4,7,0.57,1
2,Gretchen,1.12,3.85,1.23,3.07,0.4,0,3,4,31,4,0.72,2.63,-,-,-,1
3,Rudy,1.01,3.74,1.62,15.1,0.11,3,10,8,73,11,0.9,2.12,-,-,-,1
4,Sue,0.95,3.67,0.87,15.1,0.06,0,9,5,70,10,0.89,2.8,-,-,-,1


#### Can we select just one season?

In [67]:
src_df.query("season == 33")

Unnamed: 0,Contestant,SurvSc,SurvAv,ChW,ChA,ChW%,SO,VFB,VAP,TotV,TCA,TC%,wTCR,JVF,TotJ,JV%,season
0,Adam,1.88,9.86,1.76,13.22,0.13,2,9,6,100,12,0.75,2.1,10,10,1.00,33
1,Ken,1.22,8.75,4.75,13.22,0.36,0,12,2,117,14,0.86,4.0,0,10,0.00,33
2,David,1.09,4.87,3.15,13.22,0.24,1,12,10,117,14,0.85,1.71,-,-,-,33
3,Jay,0.65,4.68,4.04,11.22,0.36,1,3,9,92,10,0.29,0.65,-,-,-,33
4,Hannah,0.83,3.44,1.11,13.22,0.08,3,9,5,101,12,0.75,2.33,0,10,0.00,33
5,Will,0.66,2.99,1.59,9.22,0.17,4,4,6,79,8,0.49,1.4,-,-,-,33
6,Michaela,1.06,2.79,1.04,1.52,0.68,0,1,4,16,2,0.38,1.75,-,-,-,33
7,Zeke,0.75,2.76,1.62,8.22,0.2,1,4,10,70,7,0.55,1.14,-,-,-,33
8,Sunday,0.57,2.68,1.26,10.22,0.12,2,5,5,103,11,0.45,1.41,-,-,-,33
9,Chris,0.79,2.61,1.15,4.89,0.24,0,4,7,68,7,0.56,1.45,-,-,-,33


#### Clean up these column names from the [glossary](https://www.truedorktimes.com/survivor/boxscores/glossary.htm)

In [68]:
df = src_df.rename(
    columns={
        "Contestant": "contestant",
        "SurvSc": "score",
        "SurvAv": "average",
        "ChW": "challenge_wins",
        "ChA": "challenge_appearances",
        "ChW%": "challenge_win_pct",
        "SO": "sat_out",
        "VFB": "votes_for_bootee",
        "VAP": "votes_against_player",
        "TotV": "total_votes_cast_at_tribals",
        "TCA": "tribal_appearances",
        "TC%": "tribal_council_percent",
        "wTCR": "tribal_council_ratio",
        "JVF": "jury_votes",
        "TotJ": "jury_total_members",
        "JV%": "jury_vote_pct",
    }
)

#### Look up one player

In [90]:
df.query("contestant == 'Michele'")

Unnamed: 0,contestant,score,average,challenge_wins,challenge_appearances,challenge_win_pct,sat_out,votes_for_bootee,votes_against_player,total_votes_cast_at_tribals,tribal_appearances,tribal_council_percent,tribal_council_ratio,jury_votes,jury_total_members,jury_vote_pct,season,finalist
0,Michele,1.76,12.6,5.2,13.4,0.39,1,4,2,45,6,0.66,3.11,5,7,0.71,32,True
1,Michele,0.73,4.53,2.78,12.07,0.23,1,7,4,109,14,0.5,1.75,0,16,0.0,40,True


In [92]:
df.sort_values("score", ascending=False).head(50)

Unnamed: 0,contestant,score,average,challenge_wins,challenge_appearances,challenge_win_pct,sat_out,votes_for_bootee,votes_against_player,total_votes_cast_at_tribals,tribal_appearances,tribal_council_percent,tribal_council_ratio,jury_votes,jury_total_members,jury_vote_pct,season,finalist
0,J.T.,2.26,17.11,5.38,12.24,0.44,1.0,9,0,68,11,0.82,5.73,7,7,1.0,18,True
0,Boston Rob,2.23,12.56,4.64,11.02,0.42,0.0,12,6,99,13,0.92,2.58,8,9,0.89,22,True
0,Tom,2.2,17.81,6.66,13.6,0.49,4.0,6,0,43,7,0.86,6.0,6,7,0.86,10,True
0,Mike,2.11,17.76,7.54,13.8,0.55,1.0,9,0,86,11,0.82,5.73,6,8,0.75,30,True
0,Kim,2.11,14.35,6.08,13.81,0.44,0.0,9,3,80,10,0.9,3.6,7,9,0.78,24,True
0,Cochran,2.09,15.7,5.03,11.9,0.42,1.0,8,0,99,12,0.67,4.67,8,8,1.0,26,True
0,Brian,2.08,16.61,6.18,12.13,0.51,,9,0,54,9,1.0,7.0,4,7,0.57,5,True
0,Tony,2.06,10.73,2.28,12.94,0.18,2.0,10,5,70,10,0.99,3.11,8,9,0.89,28,True
0,Jenna,2.04,13.14,4.72,12.93,0.37,2.0,9,3,73,11,0.81,3.27,6,7,0.86,6,True
0,Earl,2.0,12.16,1.18,10.73,0.11,0.0,8,1,63,9,0.89,4.98,9,9,1.0,14,True


#### Who were the finalists across all seasons

In [72]:
df["finalist"] = df["jury_votes"] != "-"

In [91]:
df.query("season == 40")

Unnamed: 0,contestant,score,average,challenge_wins,challenge_appearances,challenge_win_pct,sat_out,votes_for_bootee,votes_against_player,total_votes_cast_at_tribals,tribal_appearances,tribal_council_percent,tribal_council_ratio,jury_votes,jury_total_members,jury_vote_pct,season,finalist
0,Tony,1.79,13.73,4.56,12.07,0.38,1,8,0,97,12,0.67,4.67,12,16,0.75,40,True
1,Michele,0.73,4.53,2.78,12.07,0.23,1,7,4,109,14,0.5,1.75,0,16,0.00,40,True
2,Denise,0.79,4.39,2.48,10.07,0.25,1,6,4,100,11,0.54,1.91,-,-,-,40,False
3,Natalie,0.86,4.35,2.0,5.1,0.39,0,1,7,21,3,0.22,0.85,4,16,0.25,40,True
4,Sandra,0.95,4.04,0.31,1.04,0.3,4,2,1,24,3,0.65,3.73,-,-,-,40,False
5,Ben,0.9,3.93,1.54,11.07,0.14,0,10,5,105,13,0.77,2.39,-,-,-,40,False
6,Sarah,0.64,3.8,1.29,12.07,0.11,0,7,2,97,13,0.54,2.51,-,-,-,40,False
7,Yul,0.91,3.66,0.66,3.54,0.19,0,3,3,28,4,0.72,3.0,-,-,-,40,False
8,Nick,0.88,3.36,1.66,10.07,0.16,0,8,8,90,11,0.72,1.7,-,-,-,40,False
9,Sophie,0.79,3.22,1.62,7.07,0.23,0,4,6,65,7,0.56,1.6,-,-,-,40,False


#### Isolate those

In [73]:
finalists = df.query("finalist == True").copy()

In [74]:
finalists["jury_vote_pct"] = finalists["jury_vote_pct"].astype(float)

#### Figure out the winners from each season's finalist by getting max jury percentage

In [75]:
idx = (
    finalists.groupby(["finalist", "season"])["jury_vote_pct"].transform(max)
    == finalists["jury_vote_pct"]
)

#### Winners dataframe

In [76]:
winners = finalists[idx].copy()

In [81]:
winners.head()

Unnamed: 0,contestant,score,average,challenge_wins,challenge_appearances,challenge_win_pct,sat_out,votes_for_bootee,votes_against_player,total_votes_cast_at_tribals,tribal_appearances,tribal_council_percent,tribal_council_ratio,jury_votes,jury_total_members,jury_vote_pct,season,finalist
1,Hatch,1.59,7.84,1.87,16.1,0.12,0.0,10,6,73,11,0.9,2.55,4,7,0.57,1,True
1,Tina,1.62,11.02,0.59,13.09,0.05,0.0,11,0,73,11,1.0,7.0,4,7,0.57,2,True
0,Ethan,1.93,14.15,2.87,13.29,0.22,0.0,11,0,74,11,1.0,7.0,5,7,0.71,3,True
0,Vecepia,1.5,9.38,2.45,13.48,0.18,2.0,9,2,81,12,0.75,3.5,4,7,0.57,4,True
0,Brian,2.08,16.61,6.18,12.13,0.51,,9,0,54,9,1.0,7.0,4,7,0.57,5,True


---

## Exports

#### Full list of players

In [78]:
df.to_json("data/processed/survivor_all_players_stats.json", orient="records", indent=4)

#### Just the winners

In [79]:
winners.to_json(
    "data/processed/survivor_all_winners_stats.json", orient="records", indent=4
)