# ETL Project
### ***NFL Draft Analysis***

In [None]:
#Import Dependencies
import pandas as pd
from sqlalchemy import create_engine
import psycopg2
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect

##### Create a secrets.py file that defines `username`, `password` and `database_name` for the local PostgreSQL (pgAdmin) server

In [None]:
#Import username, password and database name from the secrets.py file
from secrets import username, password, database_name

# Extract

##### Extract the data from Wikipedia and  www.pro-football-reference.com. 
* Note: Because the data we need lives in HTML tables, pandas can scrape it directly; otherwise we would have needed BeautifulSoup and Splinter.

### Extract Combine Info from 2016-2020

In [None]:
# Combine results pages on pro-football-reference.com, one per draft class.
# Only the 2020 address carries the '#combine' fragment.
_combine_page = 'https://www.pro-football-reference.com/draft/{}-combine.htm'
combine_url_2016 = _combine_page.format(2016)
combine_url_2017 = _combine_page.format(2017)
combine_url_2018 = _combine_page.format(2018)
combine_url_2019 = _combine_page.format(2019)
combine_url_2020 = _combine_page.format(2020) + '#combine'

In [None]:
# Download each combine page and let pandas parse every HTML table on it;
# each result is the list of DataFrames found on that page.
(combine_2016,
 combine_2017,
 combine_2018,
 combine_2019,
 combine_2020) = map(pd.read_html, (
    combine_url_2016,
    combine_url_2017,
    combine_url_2018,
    combine_url_2019,
    combine_url_2020,
))

In [None]:
# The combine results are taken from the first table (index 0) parsed from each page.
(combine_df_2016,
 combine_df_2017,
 combine_df_2018,
 combine_df_2019,
 combine_df_2020) = (
    page_tables[0]
    for page_tables in (combine_2016, combine_2017, combine_2018, combine_2019, combine_2020)
)
# Preview the 2016 combine DF - combine extraction complete!
combine_df_2016

### Extract Draft Info from 2016-2020

In [None]:
# Wikipedia article for each draft class, built from one shared template.
_draft_page = 'https://en.wikipedia.org/wiki/{}_NFL_Draft'
draft_url_2016 = _draft_page.format(2016)
draft_url_2017 = _draft_page.format(2017)
draft_url_2018 = _draft_page.format(2018)
draft_url_2019 = _draft_page.format(2019)
draft_url_2020 = _draft_page.format(2020)

In [None]:
# Download each draft article and let pandas parse every HTML table on it;
# each result is the list of DataFrames found on that page.
(draft_2016,
 draft_2017,
 draft_2018,
 draft_2019,
 draft_2020) = map(pd.read_html, (
    draft_url_2016,
    draft_url_2017,
    draft_url_2018,
    draft_url_2019,
    draft_url_2020,
))

In [None]:
# The draft picks are taken from the table at index 4 of each parsed article.
# NOTE(review): this is a positional index into whatever the page currently
# contains - confirm it still points at the picks table if the articles change.
(draft_df_2016,
 draft_df_2017,
 draft_df_2018,
 draft_df_2019,
 draft_df_2020) = (
    page_tables[4]
    for page_tables in (draft_2016, draft_2017, draft_2018, draft_2019, draft_2020)
)
# Preview the 2016 draft DF - draft extraction complete!
draft_df_2016

# Transform

### Concatenate and Clean Up Combine Data

In [None]:
# Stamp every yearly combine and draft frame with its season (kept as a string)
# so the rows stay identifiable once the frames are concatenated.
_frames_by_season = {
    '2016': (combine_df_2016, draft_df_2016),
    '2017': (combine_df_2017, draft_df_2017),
    '2018': (combine_df_2018, draft_df_2018),
    '2019': (combine_df_2019, draft_df_2019),
    '2020': (combine_df_2020, draft_df_2020),
}
for _season, (_combine_frame, _draft_frame) in _frames_by_season.items():
    _combine_frame['year'] = _season
    _draft_frame['year'] = _season

In [None]:
#Preview 2016 Combine Data with Year Column Added
combine_df_2016

In [None]:
# Stack the five yearly combine frames into a single DataFrame
combine_frames = [
    combine_df_2016,
    combine_df_2017,
    combine_df_2018,
    combine_df_2019,
    combine_df_2020,
]

# Columns we do not keep, and the header names used from here on
_combine_unused = ['Drafted (tm/rnd/yr)', 'College']
_combine_headers = {
    'Player': 'name',
    'Pos': 'position',
    'Ht': 'Height',
    'Wt': 'Weight',
    '40yd': 'Forty_Yard',
    'Broad Jump': 'Broad_Jump',
    '3Cone': 'Three_Cone',
    'year': 'Year',
}
combined_combine_df = (
    pd.concat(combine_frames)
    .drop(columns=_combine_unused)
    .rename(columns=_combine_headers)
)
# Preview the combined combine DF
combined_combine_df

In [None]:
#Convert Height from 'feet-inches' text (e.g. '6-2') to total inches
def fix(string):
    """Convert a 'feet-inches' height such as '6-2' into total inches.

    Parameters
    ----------
    string : object
        Normally a str of the form '<feet>-<inches>'. Any other value
        (missing values, header text, already-numeric cells) is tolerated.

    Returns
    -------
    int or object
        ``feet * 12 + inches`` when the first two '-'-separated parts are
        both integers; otherwise the input is handed back unchanged.
    """
    try:
        parts = string.split('-')
        feet = int(parts[0])
        inches = int(parts[1])
    except (AttributeError, IndexError, TypeError, ValueError):
        # Best effort on purpose: non-strings (no .split), values without a
        # '-' and non-numeric parts pass through untouched. Only these parse
        # failures are caught, so KeyboardInterrupt/SystemExit still propagate.
        return string
    return feet * 12 + inches
    
# Run every Height entry through fix() and store the result back in place
_raw_heights = combined_combine_df['Height']
combined_combine_df['Height'] = _raw_heights.map(fix)
# Preview the complete combine DF - combine DF cleanup complete!
combined_combine_df

In [None]:
# Drop the player-description columns; what remains feeds the combine table
_combine_table_excluded = ['position', 'School', 'Height', 'Weight', 'Year']
combine_table_df = combined_combine_df.drop(_combine_table_excluded, axis='columns')
combine_table_df

### Concatenate and Clean Up Draft Data

In [None]:
#Preview the 2016 Draft DF with year column added
draft_df_2016

In [None]:
# Stack the five yearly draft frames into a single DataFrame
draft_frames = [
    draft_df_2016,
    draft_df_2017,
    draft_df_2018,
    draft_df_2019,
    draft_df_2020,
]

# Columns we do not keep, and the header names used from here on
_draft_unused = ['Notes', 'Unnamed: 0']
_draft_headers = {
    "Rnd.": "Round",
    "Pick No.": "Pick_No",
    "NFL team": "NFL_Team",
    "Player": "name",
    "Pos.": "position",
    "College": "School",
    "Conf.": "Conf",
    "year": "Year",
}
combined_draft_df = (
    pd.concat(draft_frames)
    .drop(columns=_draft_unused)
    .rename(columns=_draft_headers)
)
# Preview the combined draft DF
combined_draft_df

In [None]:
# Teams frame: the draft data without round, pick, position, school, conference and year
_teams_excluded = ['Round', 'Pick_No', 'position', 'School', 'Conf', 'Year']
teams_df = combined_draft_df.drop(_teams_excluded, axis='columns')
teams_df

In [None]:
# Draft frame: the draft data without team, position, school, conference and year
_draft_table_excluded = ['NFL_Team', 'position', 'School', 'Conf', 'Year']
draft_table_df = combined_draft_df.drop(_draft_table_excluded, axis='columns')
draft_table_df

In [None]:
# Stack the combine rows on top of the draft rows
info_frames = [combined_combine_df, combined_draft_df]
combined_info_df = pd.concat(info_frames)

# One row per player name (the first occurrence wins), without the drill and
# pick columns that are not used by the info table
_drill_and_pick_columns = [
    'Forty_Yard', 'Vertical', 'Bench', 'Broad_Jump', 'Three_Cone', 'Shuttle',
    'Round', 'Pick_No', 'NFL_Team',
]
_one_row_per_player = (
    combined_info_df
    .drop_duplicates(subset='name', keep='first', ignore_index=True)
    .drop(columns=_drill_and_pick_columns)
)

# School DF: the same rows without position, height, weight and year
schools_df = _one_row_per_player.drop(columns=['position', 'Height', 'Weight', 'Year'])

# Info DF: the same rows without the school and conference columns
info_df = _one_row_per_player.drop(columns=['School', 'Conf'])

# Preview info_df
info_df

#### School DataFrame


In [None]:
schools_df

##### Need to get one single dataframe of just the players

In [None]:
# Single-column frame holding just the player names from info_df
players_table_df = info_df['name'].to_frame()
players_table_df

# Check dataframes before loading

In [None]:
# Check dataframes before loading

# players_table_df
# info_df
# teams_df
# combine_table_df
# draft_table_df

players_table_df

In [None]:
info_df

In [None]:
schools_df


In [None]:
teams_df

In [None]:
combine_table_df

In [None]:
draft_table_df

# Load

In [None]:
# Connect to the local PostgreSQL database through the psycopg2 driver.
# SQLAlchemy URLs have the form dialect+driver://user:password@host:port/database,
# so the scheme must name the dialect ("postgresql"), not the login user.
# NOTE: credentials are interpolated verbatim; a password containing characters
# such as '@', ':' or '/' has to be URL-encoded in secrets.py.
rds_connection_string = f'{username}:{password}@localhost:5432/{database_name}'
engine = create_engine(f'postgresql+psycopg2://{rds_connection_string}')

In [None]:
# List the tables that already exist, to confirm which database we are about to load into.
# Engine.table_names() is deprecated and was removed in SQLAlchemy 2.0; the
# Inspector returns the same list of names and is available on older releases too.
inspect(engine).get_table_names()

In [None]:
# Print the column names (keys) of the target tables.
# Only continue to the next cell when they match the DataFrame column names EXACTLY.
from sqlalchemy import text

# Run the checks on one explicitly managed connection: leaving the block closes
# it, and with it the three result cursors. The previous engine.execute() calls
# never closed their results, which kept their connections checked out, and
# Engine.execute() no longer exists in SQLAlchemy 2.0.
with engine.connect() as connection:
    combine_table = connection.execute(text('SELECT * FROM combine'))
    players_table = connection.execute(text('SELECT * FROM player'))
    draft_table = connection.execute(text('SELECT * FROM draft'))
    print(combine_table.keys())
    print(players_table.keys())
    print(draft_table.keys())

In [None]:
# players_table_df
# info_df
# teams_df
# combine_table_df
# draft_table_df
# schools_df

In [None]:
# Append the player names to the `player` table (change 'name' below if the
# table is called something else). The DataFrame index is not written.
_player_load = {'name': 'player', 'con': engine, 'if_exists': 'append', 'index': False}
players_table_df.to_sql(**_player_load)

In [None]:
# Read the `player` table back to confirm the load; the resulting frame is
# merged into the other DataFrames below to attach the player ids.
_all_players_sql = 'select * from player'
player_id_df = pd.read_sql_query(sql=_all_players_sql, con=engine)
player_id_df

In [None]:
# Attach the player table columns by name, then drop the name itself
info_table_df = (
    info_df
    .merge(player_id_df, how='inner', on='name')
    .drop(columns='name')
)

# Height, Weight and Year become numeric; values that cannot be parsed turn into NaN
for _numeric_column in ('Height', 'Weight', 'Year'):
    info_table_df[_numeric_column] = pd.to_numeric(
        info_table_df[_numeric_column], errors='coerce'
    )

# position is converted to the best-fitting pandas dtype
_positions = info_table_df['position']
info_table_df['position'] = _positions.convert_dtypes(infer_objects=True, convert_string=True)

info_table_df.info()

In [None]:
# Attach the player table columns by name, then drop the name itself
teams_table_df = (
    teams_df
    .merge(player_id_df, how='inner', on='name')
    .drop(columns='name')
)
# NFL_Team is converted to the best-fitting pandas dtype
_team_names = teams_table_df['NFL_Team']
teams_table_df['NFL_Team'] = _team_names.convert_dtypes(infer_objects=True, convert_string=True)
teams_table_df

In [None]:
# Preview of the combine drills joined to the player table on the player name
combine_df_final = combine_table_df.merge(player_id_df, how='inner', on='name')
combine_df_final

In [None]:
# Attach the player table columns by name, then drop the name itself
combine_df_final = (
    combine_table_df
    .merge(player_id_df, how='inner', on='name')
    .drop(columns='name')
)

# Every drill column becomes numeric; values that cannot be parsed turn into NaN
_drill_columns = ('Forty_Yard', 'Bench', 'Vertical', 'Broad_Jump', 'Three_Cone', 'Shuttle')
for _drill in _drill_columns:
    combine_df_final[_drill] = pd.to_numeric(combine_df_final[_drill], errors='coerce')

# Preview the converted dataframe
combine_df_final



In [None]:
# Join the draft rows to the player table on the player name to pick up the
# serial id, then drop the name itself
draft_df_final = (
    draft_table_df
    .merge(player_id_df, how='inner', on='name')
    .drop(columns='name')
)

# Convert types to fit in the database: Round and Pick_No become numeric,
# values that cannot be parsed turn into NaN
for _pick_column in ('Round', 'Pick_No'):
    draft_df_final[_pick_column] = pd.to_numeric(draft_df_final[_pick_column], errors='coerce')

draft_df_final.info()

In [None]:
# Attach the player table columns by name, then drop the name itself
schools_df_final = (
    schools_df
    .merge(player_id_df, how='inner', on='name')
    .drop(columns='name')
)

# School and Conf are converted to the best-fitting pandas dtype
for _text_column in ('School', 'Conf'):
    schools_df_final[_text_column] = schools_df_final[_text_column].convert_dtypes(
        infer_objects=True, convert_string=True
    )

schools_df_final.info()

In [None]:
# Load the remaining DataFrames with pandas. The second item of each pair is
# the target table name; change it if necessary.
_table_loads = [
    (info_table_df, 'info'),
    (teams_table_df, 'teams'),
    (combine_df_final, 'combine'),
    (draft_df_final, 'draft'),
    (schools_df_final, 'college'),
]
# All five appends share one transaction: if any of them fails, everything is
# rolled back, so fixing the problem and re-running this cell cannot duplicate
# rows in the tables that had already been written.
with engine.begin() as connection:
    for _frame, _table_name in _table_loads:
        _frame.to_sql(name=_table_name, con=connection, if_exists='append', index=False)

In [None]:
# Database ready to work

In [None]:
# Check our database: read the `college` table back and display it
_all_colleges_sql = 'select * from college'
schools = pd.read_sql_query(sql=_all_colleges_sql, con=engine)
schools