In [7]:
# import Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from sqlalchemy import create_engine

In [8]:
# create a function that can be reused for scraping the data
def get_html_data(url):
    response = requests.get(url)
    return BeautifulSoup(response.text, "html5lib")

### Source: The Washington Post

* http://stats.washingtonpost.com/fb/glossary.asp

In [9]:
player_position_url = 'http://stats.washingtonpost.com/fb/glossary.asp'

pp_response = get_html_data(player_position_url)
print(pp_response.prettify())

<html>
 <head>
  <script type="text/javascript">
   var strSport = "NFL";
	var strSportName = "NFL";
	var sSHSSportName = "Football";
	var sSHSLeagueName = "NFL";
	var sSHSPageName = "Glossary";
  </script>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0, user-scalable=yes, minimum-scale=0.5, maximum-scale=2.0" id="viewport" name="viewport"/>
  <title>
  </title>
  <script type="text/javascript">
   document.title = sSHSSportName + " | " + sSHSLeagueName + " | " + sSHSPageName + " - The Washington Post";
  </script>
  <meta content="unsafe-url" name="referrer"/>
  <meta content="Washington Capitals, Washington Nationals, Washington Redskins, DC United , Washington Wizards, baseball, football, hockey, soccer, golf, basketball, college football, college basketball, NBA news, MLB news, NFL news, NHL news, MLS news, Nationals, Redskins, Capitals, Wizards, D.C. United, high school sports,colle

In [10]:
# results are returned as an iterable list
pp_results = pp_response.find_all('table', class_="shsTable")

# print(pp_results)
# print(pp_results[1].prettify())   

pp_df = pd.read_html(str(pp_results[1]))
pp_df = pp_df[0]
pp_df.head()

Unnamed: 0,0,1
0,Position Abbreviations,Position Abbreviations
1,Abbreviation,Position
2,QB,Quarterback
3,RB,Running Back
4,FB,Fullback


In [11]:
# rename columns
pp_df = pp_df.rename(columns={0: "abbreviation", 1 : "position"}) 

# set index to abbreviation
pp_df.set_index("abbreviation", inplace=True)
pp_df.drop(['Position Abbreviations', 'Abbreviation'], inplace=True)

pp_df.head()

Unnamed: 0_level_0,position
abbreviation,Unnamed: 1_level_1
QB,Quarterback
RB,Running Back
FB,Fullback
WR,Wide Receiver
TE,Tight End


In [12]:
# pp_df.reset_index(inplace=True)
# pp_df.head()

### Super Bowl Winners
### Source: Topend Sports
* The Super Bowl is the annual championship game of the National Football League (NFL). 
* Below is a list of all the winners of the Super Bowl since Green Bay won the first in 1967. 
* The years listed indicate the year the Super Bowl was played, the regular season is played during the year before.

In [13]:
# Superbowl winning teams with scores 
sb_url = 'https://www.topendsports.com/events/super-bowl/winners-list.htm'
sb_response = get_html_data(sb_url)

# print(sb_response.prettify())

In [14]:
# results are returned as an iterable list
sb_results = sb_response.find_all('table', class_="list")

# print(sb_results[0].prettify())

# read html to convert the data to dataframe - but this return a list
sb_df = pd.read_html(str(sb_results))

# convert the list to a dataframe
sb_df = sb_df[0]

# display the first 5 rows of dataframe
sb_df.head()

Unnamed: 0,Year,No.,Winner,Opposition,Score,Venue
0,2020,LIV,Kansas City Chiefs,San Francisco 49ers,31-20,Miami
1,2019,LIII,New England Patriots,Los Angeles Rams,13-3,Atlanta
2,2018,LII,Philadelphia Eagles,New England Patriots,41-33,Minnesota
3,2017,LI,New England Patriots,Atlanta Falcons,34-28,Texas
4,2016,L,Denver Broncos,Carolina Panthers,24-10,California


In [15]:
# rename columns
sb_df = sb_df.rename(columns={'Year' : 'year',
                              'No.' : 'sb_no', 
                              'Winner': "winner_team", 
                              "Opposition" : "loser_team",
                              }) 
sb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,Score,Venue
0,2020,LIV,Kansas City Chiefs,San Francisco 49ers,31-20,Miami
1,2019,LIII,New England Patriots,Los Angeles Rams,13-3,Atlanta
2,2018,LII,Philadelphia Eagles,New England Patriots,41-33,Minnesota
3,2017,LI,New England Patriots,Atlanta Falcons,34-28,Texas
4,2016,L,Denver Broncos,Carolina Panthers,24-10,California


In [16]:
# Data has some special characters in Score field. But we need to retain the '-' as is to split the score into 2 fields
sb_df['Score'] = sb_df['Score'].str.replace(r"[^a-zA-Z0-9]+", '-')
sb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,Score,Venue
0,2020,LIV,Kansas City Chiefs,San Francisco 49ers,31-20,Miami
1,2019,LIII,New England Patriots,Los Angeles Rams,13-3,Atlanta
2,2018,LII,Philadelphia Eagles,New England Patriots,41-33,Minnesota
3,2017,LI,New England Patriots,Atlanta Falcons,34-28,Texas
4,2016,L,Denver Broncos,Carolina Panthers,24-10,California


In [17]:
# split the score in to 2 fields: score_won and score_lost
def get_winner_score(Score):
    return Score.split("-")[0]

def get_loser_score(Score):
    return Score.split("-")[1]

# Use the labda function to apply the above function to split the score in the dataframe
sb_df['winner_score'] = sb_df['Score'].apply(lambda x: f"{get_winner_score(x)}")
sb_df['loser_score'] = sb_df['Score'].apply(lambda x: f"{get_loser_score(x)}")

sb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,Score,Venue,winner_score,loser_score
0,2020,LIV,Kansas City Chiefs,San Francisco 49ers,31-20,Miami,31,20
1,2019,LIII,New England Patriots,Los Angeles Rams,13-3,Atlanta,13,3
2,2018,LII,Philadelphia Eagles,New England Patriots,41-33,Minnesota,41,33
3,2017,LI,New England Patriots,Atlanta Falcons,34-28,Texas,34,28
4,2016,L,Denver Broncos,Carolina Panthers,24-10,California,24,10


In [18]:
# As we split the score into 2 different columns we don't need original score column
# sb_df.drop(['Score', 'Year'],axis='columns',inplace=True)

sb_df.drop(['Score', 'Venue'],axis='columns',inplace=True)

# set index to sb_no 
sb_df.set_index("sb_no", inplace=True)
sb_df.head()

Unnamed: 0_level_0,year,winner_team,loser_team,winner_score,loser_score
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIV,2020,Kansas City Chiefs,San Francisco 49ers,31,20
LIII,2019,New England Patriots,Los Angeles Rams,13,3
LII,2018,Philadelphia Eagles,New England Patriots,41,33
LI,2017,New England Patriots,Atlanta Falcons,34,28
L,2016,Denver Broncos,Carolina Panthers,24,10


In [19]:
# rearrange the columns
sb_df = sb_df[['year', 'winner_team', 'winner_score', 'loser_team', 'loser_score']]
sb_df.head()

Unnamed: 0_level_0,year,winner_team,winner_score,loser_team,loser_score
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LIV,2020,Kansas City Chiefs,31,San Francisco 49ers,20
LIII,2019,New England Patriots,13,Los Angeles Rams,3
LII,2018,Philadelphia Eagles,41,New England Patriots,33
LI,2017,New England Patriots,34,Atlanta Falcons,28
L,2016,Denver Broncos,24,Carolina Panthers,10


In [20]:
sb_df.dtypes

year             int64
winner_team     object
winner_score    object
loser_team      object
loser_score     object
dtype: object

### Source: ESPN
### NFL History - Super Bowl MVPs
* http://www.espn.com/nfl/superbowl/history/mvps

In [21]:
mvp_url = 'http://www.espn.com/nfl/superbowl/history/mvps'
mvp_response = get_html_data(mvp_url)
# print(mvp_response.prettify())

In [22]:
# results are returned as an iterable list
mvp_results = mvp_response.find_all('table', class_="tablehead")

print(mvp_results[0].prettify())      

<table cellpadding="3" cellspacing="1" class="tablehead">
 <tbody>
  <tr class="stathead">
   <td colspan="3">
    Super Bowl Most Valuable Players
   </td>
  </tr>
  <tr class="colhead" valign="top">
   <td width="10%">
    NO.
   </td>
   <td width="45%">
    PLAYER
   </td>
   <td width="45%">
    HIGHLIGHTS
   </td>
  </tr>
  <tr class="oddrow">
   <td>
    I
   </td>
   <td>
    Bart Starr, QB, Green Bay
   </td>
   <td>
    Two touchdown passes
   </td>
  </tr>
  <tr class="evenrow">
   <td>
    II
   </td>
   <td>
    Bart Starr, QB, Green Bay
   </td>
   <td>
    202 yards passing, 1 TD
   </td>
  </tr>
  <tr class="oddrow">
   <td>
    III
   </td>
   <td>
    Joe Namath, QB, New York Jets
   </td>
   <td>
    206 yards passing
   </td>
  </tr>
  <tr class="evenrow">
   <td>
    IV
   </td>
   <td>
    Len Dawson, QB, Kansas City
   </td>
   <td>
    142 yards passing, 1 TD
   </td>
  </tr>
  <tr class="oddrow">
   <td>
    V
   </td>
   <td>
    Chuck Howley, LB, Dallas
   </

In [23]:
# read html and convert it to pandas dataframe
mvp_df = pd.read_html(str(mvp_results))
mvp_df = mvp_df[0]

# rename columns
mvp_df = mvp_df.rename(columns={0: "sb_no", 1 : "mv_player", 2 : "highlights"}) 

# set the value 50 to letter L to be consistent with other super bowl numbers
mvp_df['sb_no'] = mvp_df['sb_no'].replace('50','L', regex=True)
mvp_df.head()

Unnamed: 0,sb_no,mv_player,highlights
0,Super Bowl Most Valuable Players,Super Bowl Most Valuable Players,Super Bowl Most Valuable Players
1,NO.,PLAYER,HIGHLIGHTS
2,I,"Bart Starr, QB, Green Bay",Two touchdown passes
3,II,"Bart Starr, QB, Green Bay","202 yards passing, 1 TD"
4,III,"Joe Namath, QB, New York Jets",206 yards passing


In [24]:
# set index to sb_no
mvp_df.set_index("sb_no", inplace=True)

# drop the unwanted data
mvp_df.drop(['Super Bowl Most Valuable Players', 'NO.'], inplace=True)

#dispaly the data
mvp_df.head()

Unnamed: 0_level_0,mv_player,highlights
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1
I,"Bart Starr, QB, Green Bay",Two touchdown passes
II,"Bart Starr, QB, Green Bay","202 yards passing, 1 TD"
III,"Joe Namath, QB, New York Jets",206 yards passing
IV,"Len Dawson, QB, Kansas City","142 yards passing, 1 TD"
V,"Chuck Howley, LB, Dallas","Two interceptions, fumble recovery"


In [25]:
# Split the mvp_player column into name of the player, role, and the team
def get_player(mv_player):
    return mv_player.split(",")[0]

def get_position(mv_player):
    return mv_player.split(",")[1]

def get_team(mv_player):
    return mv_player.split(",")[2]

mvp_df['player'] = mvp_df['mv_player'].apply(lambda x: f"{get_player(x)}")
mvp_df['position_abbr'] = mvp_df['mv_player'].apply(lambda x: f"{get_position(x)}")
mvp_df['team'] = mvp_df['mv_player'].apply(lambda x: f"{get_team(x)}")

mvp_df.head()

Unnamed: 0_level_0,mv_player,highlights,player,position_abbr,team
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I,"Bart Starr, QB, Green Bay",Two touchdown passes,Bart Starr,QB,Green Bay
II,"Bart Starr, QB, Green Bay","202 yards passing, 1 TD",Bart Starr,QB,Green Bay
III,"Joe Namath, QB, New York Jets",206 yards passing,Joe Namath,QB,New York Jets
IV,"Len Dawson, QB, Kansas City","142 yards passing, 1 TD",Len Dawson,QB,Kansas City
V,"Chuck Howley, LB, Dallas","Two interceptions, fumble recovery",Chuck Howley,LB,Dallas


In [26]:
# drop mv_player
mvp_df.drop(['mv_player'],axis='columns',inplace=True)
mvp_df.head()

Unnamed: 0_level_0,highlights,player,position_abbr,team
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I,Two touchdown passes,Bart Starr,QB,Green Bay
II,"202 yards passing, 1 TD",Bart Starr,QB,Green Bay
III,206 yards passing,Joe Namath,QB,New York Jets
IV,"142 yards passing, 1 TD",Len Dawson,QB,Kansas City
V,"Two interceptions, fumble recovery",Chuck Howley,LB,Dallas


In [27]:
mvp_df = mvp_df[['player', 'position_abbr', 'team', 'highlights']]
mvp_df.head()

Unnamed: 0_level_0,player,position_abbr,team,highlights
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I,Bart Starr,QB,Green Bay,Two touchdown passes
II,Bart Starr,QB,Green Bay,"202 yards passing, 1 TD"
III,Joe Namath,QB,New York Jets,206 yards passing
IV,Len Dawson,QB,Kansas City,"142 yards passing, 1 TD"
V,Chuck Howley,LB,Dallas,"Two interceptions, fumble recovery"


In [28]:
mvpf_df = mvp_df
mvpf_df

Unnamed: 0_level_0,player,position_abbr,team,highlights
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I,Bart Starr,QB,Green Bay,Two touchdown passes
II,Bart Starr,QB,Green Bay,"202 yards passing, 1 TD"
III,Joe Namath,QB,New York Jets,206 yards passing
IV,Len Dawson,QB,Kansas City,"142 yards passing, 1 TD"
V,Chuck Howley,LB,Dallas,"Two interceptions, fumble recovery"
VI,Roger Staubach,QB,Dallas,"119 yards passing, 2 TDs"
VII,Jake Scott,S,Miami,Two interceptions
VIII,Larry Csonka,FB,Miami,"33 carries, 145 yards rushing, 2 TDs"
IX,Franco Harris,RB,Pittsburgh,"158 yards rushing, 1 TD"
X,Lynn Swann,WR,Pittsburgh,"4 catches, 161 yards, 1 TD"


In [29]:
# mvp_df.reset_index(inplace=True)
# mvp_df.head()
mvp_df.dtypes

player           object
position_abbr    object
team             object
highlights       object
dtype: object

### Source: Wikipedia
### List of Super Bowl starting quarterbacks
* https://en.wikipedia.org/wiki/List_of_Super_Bowl_starting_quarterbacks

In [30]:
qb_url = 'https://en.wikipedia.org/wiki/List_of_Super_Bowl_starting_quarterbacks'

qb_response = get_html_data(qb_url)
# print(qb_response.prettify())

In [31]:
# results are returned as an iterable list
qb_results = qb_response.find_all('table', class_="wikitable")

# print(qb_results[1].prettify())      

qb_df = pd.read_html(str(qb_results[1]))
qb_df = qb_df[0]
qb_df.head()

Unnamed: 0,Season,Super Bowl,Winning QB,Team,Losing QB,Team.1
0,1966 AFL/NFL,I,Bart Starr*MVP,Green Bay Packersn,Len Dawson*,Kansas City Chiefsa
1,1967 AFL/NFL,II,Bart Starr*MVP,Green Bay Packersn,Daryle Lamonica,Oakland Raidersa
2,1968 AFL/NFL,III,Joe Namath*MVP,New York Jetsa,Earl Morrall,Baltimore Coltsn
3,1969 AFL/NFL,IV,Len Dawson*MVP,Kansas City Chiefsa,Joe Kapp,Minnesota Vikingsn
4,1970,V,Johnny Unitas*,Baltimore ColtsA,Craig Morton,Dallas CowboysN


In [32]:
# remove the special charachaters from data
qb_df['winner_qb'] = qb_df['Winning QB'].str.replace('\W', ' ')
qb_df['loser_qb'] = qb_df['Losing QB'].str.replace('\W', ' ')
qb_df.head()

Unnamed: 0,Season,Super Bowl,Winning QB,Team,Losing QB,Team.1,winner_qb,loser_qb
0,1966 AFL/NFL,I,Bart Starr*MVP,Green Bay Packersn,Len Dawson*,Kansas City Chiefsa,Bart Starr MVP,Len Dawson
1,1967 AFL/NFL,II,Bart Starr*MVP,Green Bay Packersn,Daryle Lamonica,Oakland Raidersa,Bart Starr MVP,Daryle Lamonica
2,1968 AFL/NFL,III,Joe Namath*MVP,New York Jetsa,Earl Morrall,Baltimore Coltsn,Joe Namath MVP,Earl Morrall
3,1969 AFL/NFL,IV,Len Dawson*MVP,Kansas City Chiefsa,Joe Kapp,Minnesota Vikingsn,Len Dawson MVP,Joe Kapp
4,1970,V,Johnny Unitas*,Baltimore ColtsA,Craig Morton,Dallas CowboysN,Johnny Unitas,Craig Morton


In [33]:
qb_df.drop(['Winning QB', 'Losing QB'],axis='columns',inplace=True)
qb_df.head()

Unnamed: 0,Season,Super Bowl,Team,Team.1,winner_qb,loser_qb
0,1966 AFL/NFL,I,Green Bay Packersn,Kansas City Chiefsa,Bart Starr MVP,Len Dawson
1,1967 AFL/NFL,II,Green Bay Packersn,Oakland Raidersa,Bart Starr MVP,Daryle Lamonica
2,1968 AFL/NFL,III,New York Jetsa,Baltimore Coltsn,Joe Namath MVP,Earl Morrall
3,1969 AFL/NFL,IV,Kansas City Chiefsa,Minnesota Vikingsn,Len Dawson MVP,Joe Kapp
4,1970,V,Baltimore ColtsA,Dallas CowboysN,Johnny Unitas,Craig Morton


In [34]:
# rename columns
qb_df = qb_df.rename(columns={"Season" : "year", "Super Bowl" : 'sb_no', 'Team': "winner_team", "Team.1" : "loser_team"}) 
qb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,winner_qb,loser_qb
0,1966 AFL/NFL,I,Green Bay Packersn,Kansas City Chiefsa,Bart Starr MVP,Len Dawson
1,1967 AFL/NFL,II,Green Bay Packersn,Oakland Raidersa,Bart Starr MVP,Daryle Lamonica
2,1968 AFL/NFL,III,New York Jetsa,Baltimore Coltsn,Joe Namath MVP,Earl Morrall
3,1969 AFL/NFL,IV,Kansas City Chiefsa,Minnesota Vikingsn,Len Dawson MVP,Joe Kapp
4,1970,V,Baltimore ColtsA,Dallas CowboysN,Johnny Unitas,Craig Morton


In [35]:
qb_df = qb_df.replace('MVP','', regex=True)
qb_df = qb_df.replace('AFL/NFL','', regex=True)

qb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,winner_qb,loser_qb
0,1966,I,Green Bay Packersn,Kansas City Chiefsa,Bart Starr,Len Dawson
1,1967,II,Green Bay Packersn,Oakland Raidersa,Bart Starr,Daryle Lamonica
2,1968,III,New York Jetsa,Baltimore Coltsn,Joe Namath,Earl Morrall
3,1969,IV,Kansas City Chiefsa,Minnesota Vikingsn,Len Dawson,Joe Kapp
4,1970,V,Baltimore ColtsA,Dallas CowboysN,Johnny Unitas,Craig Morton


In [36]:
# the last character of the team name has 'a' or 'n' to nitity afc or nfc. Remove that extra character from team names 
qb_df['winner_team'] = [sub[ : -1] for sub in qb_df['winner_team']] 
qb_df['loser_team'] = [sub[ : -1] for sub in qb_df['winner_team']]

# Update the super bowl number 50 to L
qb_df['sb_no'] = qb_df['sb_no'].replace('50','L', regex=True)

qb_df.head()

Unnamed: 0,year,sb_no,winner_team,loser_team,winner_qb,loser_qb
0,1966,I,Green Bay Packers,Green Bay Packer,Bart Starr,Len Dawson
1,1967,II,Green Bay Packers,Green Bay Packer,Bart Starr,Daryle Lamonica
2,1968,III,New York Jets,New York Jet,Joe Namath,Earl Morrall
3,1969,IV,Kansas City Chiefs,Kansas City Chief,Len Dawson,Joe Kapp
4,1970,V,Baltimore Colts,Baltimore Colt,Johnny Unitas,Craig Morton


In [37]:
# set index to sb_no
qb_df.set_index("sb_no", inplace=True)
qb_df.tail()

Unnamed: 0_level_0,year,winner_team,loser_team,winner_qb,loser_qb
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LI,2016,New England Patriots,New England Patriot,Tom Brady,Matt Ryan
LII,2017,Philadelphia Eagles,Philadelphia Eagle,Nick Foles,Tom Brady
LIII,2018,New England Patriots,New England Patriot,Tom Brady,Jared Goff
LIV,2019,Kansas City Chiefs,Kansas City Chief,Patrick Mahomes,Jimmy Garoppolo
Super Bowl,Season,Tea,Te,Winning QB,Losing QB


In [38]:
qb_df.drop(['Super Bowl'], inplace=True)
#dispaly the data
qb_df.tail()

Unnamed: 0_level_0,year,winner_team,loser_team,winner_qb,loser_qb
sb_no,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
L,2015,Denver Broncos,Denver Bronco,Peyton Manning,Cam Newton
LI,2016,New England Patriots,New England Patriot,Tom Brady,Matt Ryan
LII,2017,Philadelphia Eagles,Philadelphia Eagle,Nick Foles,Tom Brady
LIII,2018,New England Patriots,New England Patriot,Tom Brady,Jared Goff
LIV,2019,Kansas City Chiefs,Kansas City Chief,Patrick Mahomes,Jimmy Garoppolo


In [39]:
qb_df['year'].astype(str).map(len).head()

sb_no
I      5
II     5
III    5
IV     5
V      4
Name: year, dtype: int64

In [40]:
qb_df['year'] = qb_df['year'].str.strip()

In [41]:
qb_df.dtypes

year           object
winner_team    object
loser_team     object
winner_qb      object
loser_qb       object
dtype: object

## Source: Wikipedia
List of NFL Hall of Famers
https://en.wikipedia.org/wiki/List_of_Pro_Football_Hall_of_Fame_inductees

In [50]:
#Query of Hall of Famers 
hf_url = 'https://en.wikipedia.org/wiki/List_of_Pro_Football_Hall_of_Fame_inductees'
hf_response = get_html_data(hf_url)
print(hf_response.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Pro Football Hall of Fame inductees - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"b8d392b8-0478-47c0-94e2-dc349f79044f","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_Pro_Football_Hall_of_Fame_inductees","wgTitle":"List of Pro Football Hall of Fame inductees","wgCurRevisionId":965559266,"wgRevisionId":965559266,"wgArticleId":8802088,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Articles with hCards","Pro 

In [51]:
# results are returned as an iterable list
hf_results = hf_response.find_all('table', class_="wikitable")
#print(hf_results[1].prettify())

hf_df = pd.read_html(str(hf_results[1]))
hf_df = hf_df[0]
hf_df.head(60)


Unnamed: 0,Inductee,Class,Position,Team(s),Years
0,Herb Adderley,1980,Cornerback,Green Bay Packers,1961–1969
1,Herb Adderley,1980,Cornerback,Dallas Cowboys,1970–1972
2,Troy Aikman**,2006,Quarterback,Dallas Cowboys,1989–2000
3,George Allen[3],2002,Coach,Los Angeles Rams,1966–1970
4,George Allen[3],2002,Coach,Washington Redskins,1971–1977
5,George Allen[3],2002,General Manager,Washington Redskins,1971–1977
6,Larry Allen**,2013,Guard/Tackle,Dallas Cowboys,1994–2005
7,Larry Allen**,2013,Guard,San Francisco 49ers,2006–2007
8,Marcus Allen** [4],2003,Running back,Los Angeles Raiders,1982–1992
9,Marcus Allen** [4],2003,Running back,Kansas City Chiefs,1993–1997


In [52]:
#Clean up special characters attached to inductee
#hf_df['Inductee'] = hf_df['Inductee'].str.split().str[0]
#hf_df['Inductee'] = hf_df['Inductee'].str.replace(r"[^a-zA-Z0-9]+", '')
#hf_df[['First','Last']] = hf_df.str.split("_",expand=True)
#hf_df['Inductee']= hf_df['Inductee]'.str.replace('[3]',' ', regex=True)hf_
hf_df['Inductee'] = hf_df['Inductee'].str.replace(r"[^A-Za-z]+", ' ')
hfamer_df =hf_df[["Inductee","Class"]]
hfamer_df

Unnamed: 0,Inductee,Class
0,Herb Adderley,1980
1,Herb Adderley,1980
2,Troy Aikman,2006
3,George Allen,2002
4,George Allen,2002
...,...,...
697,Steve Young,2005
698,Steve Young,2005
699,Jack Youngblood,2001
700,Gary Zimmerman,2008


In [53]:
hfamerunique_df = hfamer_df.drop_duplicates()
hfamerunique_df = hfamerunique_df.rename(columns={'Inductee' : 'player',
                              'Class' : 'class' 
                              })
hfamerunique_df

Unnamed: 0,player,class
0,Herb Adderley,1980
2,Troy Aikman,2006
3,George Allen,2002
6,Larry Allen,2013
8,Marcus Allen,2003
...,...,...
692,Ron Yary,2001
694,George Young,2021
697,Steve Young,2005
699,Jack Youngblood,2001


In [54]:
#These MVPs are Hall of Famers
mvpf_df = mvp_df
mvpf_df
mvphf_df = mvpf_df.merge(hfamerunique_df,on='player', how='inner')
mvphf_df

Unnamed: 0,player,position_abbr,team,highlights,class
0,Joe Namath,QB,New York Jets,206 yards passing,1985
1,Len Dawson,QB,Kansas City,"142 yards passing, 1 TD",1987
2,Larry Csonka,FB,Miami,"33 carries, 145 yards rushing, 2 TDs",1987
3,Lynn Swann,WR,Pittsburgh,"4 catches, 161 yards, 1 TD",2001
4,Fred Biletnikoff,WR,Oakland,"4 catches, 79 yards",1988
5,John Riggins,RB,Washington,"166 yards rushing, 1 TD",1992
6,Richard Dent,DE,Chicago,"2 sacks, 2 forced fumbles",2011
7,Terrell Davis,RB,Denver,"30 carries, 157 yards, 3 TDs",2017
8,Kurt Warner,QB,St. Louis,"414 yards passing, 2 TDs",2017


### Create database connection

In [44]:
connection_string = "postgres:postgres@localhost:5432/ETL_Project"
engine = create_engine(f'postgresql://{connection_string}')

### Load DataFrames into database

In [45]:
pp_df.to_sql(name='player_position', con=engine, if_exists='append', index=True)

In [46]:
sb_df.to_sql(name='sb_winners', con=engine, if_exists='append', index=True)

In [47]:
mvp_df.to_sql(name='most_valuable_player', con=engine, if_exists='append', index=True)

In [48]:
qb_df.to_sql(name='sb_quarterbacks', con=engine, if_exists='append', index=True)