In [27]:
import pandas as pd
import wikipedia
from bs4 import BeautifulSoup
import urllib.request
import numpy as np
import re

In [28]:
def get_lead_data(wik, page, head):
    cols = ['season', 'dates', 'bachelor', 'winner', 'runner-up', 'proposal', 'together', 'notes']
    bachelors = pd.DataFrame(columns=cols)
    url = wik + page.replace(' ', '_')
    req = urllib.request.urlopen(url)
    article = req.read().decode()
    data = []
    
    with open('ISO_3166-1_alpha-2.html', 'w', encoding='utf-8') as fo:
        fo.write(article)
        
    html = open('ISO_3166-1_alpha-2.html', encoding='utf-8').read()
    soup = BeautifulSoup(html, 'html.parser')
    tables = soup.find_all('table')
    
    for t in tables:
        ths = t.find_all('th')
        headings = [th.text.strip() for th in ths]
        
        try:
            if headings[:8] == head:
                with open('iso_3166-1_alpha-2_codes.txt', 'w', encoding='utf-8') as fo:
                    for tr in t.find_all('tr'):
                        tds = tr.find_all('td')
                        if not tds:
                            continue
                        data.append([td.text.strip() for td in tds[:8]])                      
                ret_df = pd.DataFrame(data, columns = cols)
                print(ret_df)
                bachelors = bachelors.append(ret_df, sort=True)
                            
        except:
            ret_df = pd.DataFrame(columns = cols)

    bachelors = bachelors[bachelors['season'].apply(lambda x: x.isnumeric())]
    
    return bachelors

In [29]:
def get_contestant_data(wik, page, head):
    cols = ['name', 'age', 'hometown', 'occupation', 'result'] 
    contestants = pd.DataFrame(columns = cols + ['season'])
    
    for p in range(len(page)):
        url = wik + page[p].replace(' ', '_')
        req = urllib.request.urlopen(url)
        article = req.read().decode()
        data = []

        with open('ISO_3166-1_alpha-2.html', 'w', encoding='utf-8') as fo:
            fo.write(article)

        html = open('ISO_3166-1_alpha-2.html', encoding='utf-8').read()
        soup = BeautifulSoup(html, 'html.parser')
        tables = soup.find_all('table')

        for t in tables:
            ths = t.find_all('th')
            headings = [th.text.strip() for th in ths]

            try:
                if headings[:3] == head:
                    with open('iso_3166-1_alpha-2_codes.txt', 'w', encoding='utf-8') as fo:
                        for tr in t.find_all('tr'):
                            tds = tr.find_all('td')
                            if not tds:
                                continue
                            data.append([td.text.strip() for td in tds[:5]])
                    ret_df = pd.DataFrame(data, columns = cols)
                    ret_df['season'] = page[p].split(' ')[3].split(')')[0]
                    contestants = contestants.append(ret_df, sort=True)
                            
            except:
                ret_df = pd.DataFrame(columns = cols + ['season'])
                contestants = contestants.append(ret_df, sort=True)
                
    return contestants

In [30]:
def clean_age(page):
    keys = ['year-old', 'year old', 'years-old', 'years old']
    cols = ['season','age']
    ages = pd.DataFrame(columns=cols)
    data = []

    for p in page:
        content = wikipedia.WikipediaPage(title = p).summary
        kw = [k for k in keys if k in content]
        season = p.split(' ')[3].split(')')[0]

        if len(kw) != 0:
            if '-' in kw[0]:
                age = content[content.index(kw[0])-3:content.index(kw[0])+4].split('-')[0]
                data.append([season, age])
                ret_df = pd.DataFrame(data, columns=cols)
            else:
                age = content[content.index(kw[0])-3:content.index(kw[0])+4].split(' ')[0]
                data.append([season, age])
                ret_df = pd.DataFrame(data, columns=cols)
        else:
            age = None
            data.append([season, age])
            ret_df = pd.DataFrame(data, columns=cols)
            
    return ages

In [31]:
def remove_brackets(d):
    return d.apply(lambda x: x.str.replace(r"\[.*\]",""))

In [32]:
#create dataframes            
wiki = 'https://en.wikipedia.org/wiki/'
bach_home = 'The Bachelor (American TV series)'
bach_seasons = ['The Bachelor (season {})'.format(s) for s in list(range(1,25))]
ette_home = 'The Bachelorette'
ette_seasons = ['The Bachelorette (season {})'.format(s) for s in list(range(1,16))]
bach_cols = ['#', 'Original run', 'Bachelor', 'Winner', 'Runner(s)-up', 'Proposal', 'Still together', 'Relationship notes']
ette_cols = ['#', 'Original run', 'Bachelorette', 'Winner', 'Runner-up', 'Proposal', 'Still together', 'Relationship notes']
contestant_cols = ['Name', 'Age', 'Hometown']

In [38]:
bach_seasons

['The Bachelor (season 1)',
 'The Bachelor (season 2)',
 'The Bachelor (season 3)',
 'The Bachelor (season 4)',
 'The Bachelor (season 5)',
 'The Bachelor (season 6)',
 'The Bachelor (season 7)',
 'The Bachelor (season 8)',
 'The Bachelor (season 9)',
 'The Bachelor (season 10)',
 'The Bachelor (season 11)',
 'The Bachelor (season 12)',
 'The Bachelor (season 13)',
 'The Bachelor (season 14)',
 'The Bachelor (season 15)',
 'The Bachelor (season 16)',
 'The Bachelor (season 17)',
 'The Bachelor (season 18)',
 'The Bachelor (season 19)',
 'The Bachelor (season 20)',
 'The Bachelor (season 21)',
 'The Bachelor (season 22)',
 'The Bachelor (season 23)',
 'The Bachelor (season 24)']

In [33]:
#bachelors
bachelors = get_lead_data(wiki, bach_home, bach_cols)

           season                             dates                bachelor  \
0               1         March 25 – April 25, 2002             Alex Michel   
1               2  September 25 – November 20, 2002            Aaron Buerge   
2               3           March 24 – May 21, 2003        Andrew Firestone   
3               4  September 24 – November 20, 2003              Bob Guiney   
4               5            April 7 – May 26, 2004            Jesse Palmer   
5               6  September 22 – November 24, 2004           Byron Velvick   
6               7           March 28 – May 16, 2005       Charlie O'Connell   
7               8     January 9 – February 27, 2006       Travis Lane Stork   
8               9     October 2 – November 27, 2006        Lorenzo Borghese   
9              10            April 2 – May 22, 2007          Andrew Baldwin   
10             11  September 24 – November 20, 2007             Brad Womack   
11  DeAnna Pappas                              None 

In [34]:
#bachelor contestants
bach_contestants = get_contestant_data(wiki, bach_seasons, contestant_cols)

In [35]:
#bachelorettes
ettes = get_lead_data(wiki, ette_home, ette_cols)

In [36]:
ettes

In [73]:
#bachelorette contestants
ette_contestants = get_contestant_data(wiki, ette_seasons, contestant_cols)

In [87]:
#add show type
bachelors['type'] = 'Bachelor'
bach_contestants['type'] = 'Bachelor'
ettes['type'] = 'Bachelorette'
ette_contestants['type'] = 'Bachelorette'

In [98]:
#remove hyperlinks and concatenate dataframes
leads = pd.DataFrame(columns=bachelors.columns)
contestants = pd.DataFrame(columns=bach_contestants.columns)
dfs = [bachelors, ettes, bach_contestants, ette_contestants]

for d in dfs:
    d = rem_brack(d)
    
    if 'result' in d.columns:
        contestants = contestants.append(d)
    else:
        leads = leads.append(d)