**Title**: Project Milestone 2  
**Author**: Ryan Weeks  
**Date**: 1/19/2025  
**Description**: I will perform at least 5 data transformation/cleansing steps to my flat file data.

In [2]:
import numpy as np
import pandas as pd

data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\nflhofplayers.csv")
df = pd.DataFrame(data)
print(df.head())

   id  rank           player position  year_inducted  from    to  \
0   1     1     Tony Boselli        T           2022  1995  2001   
1   2     2     Cliff Branch       WR           2022  1972  1985   
2   3     3     LeRoy Butler       DB           2022  1990  2001   
3   4     4        Sam Mills       LB           2022  1986  1997   
4   5     5  Richard Seymour       DE           2022  2001  2012   

   all_pro_selections  pro_bowl_selections  years_as_starter  ...  \
0                   3                    5                 6  ...   
1                   3                    4                11  ...   
2                   4                    4                11  ...   
3                   1                    5                11  ...   
4                   3                    7                11  ...   

   longest_rush  receptions  receiving_yards  receiving_td  longest_reception  \
0           NaN         NaN              NaN           NaN                NaN   
1          20.

In [3]:
# Create position mapping
position_mapping = {
    'C': 'Center',
    'DB': 'Defensive Back',
    'DE': 'Defensive End',
    'DG': 'Defensive Guard',
    'DT': 'Defensive Tackle',
    'E': 'End',
    'FB': 'Full Back',
    'FL': 'Flanker',
    'G': 'Guard',
    'HB': 'Half Back',
    'K': 'Kicker',
    'LB': 'Line Backer',
    'P': 'Punter',
    'QB': 'Quarterback',
    'RB': 'Running Back',
    'SE': 'Split End',
    'T': 'Tackle',
    'TB': 'Tail Back',
    'TE': 'Tight End',
    'WB': 'Wing Back',
    'WR': 'Wide Receiver' 
}

# Map 'position' column to new 'position_full' column
df['position_full'] = df['position'].map(position_mapping)

# Verify result
print(df[['player', 'position', 'position_full']])

              player position   position_full
0       Tony Boselli        T          Tackle
1       Cliff Branch       WR   Wide Receiver
2       LeRoy Butler       DB  Defensive Back
3          Sam Mills       LB     Line Backer
4    Richard Seymour       DE   Defensive End
..               ...      ...             ...
301      Cal Hubbard        T          Tackle
302       Don Hutson        E             End
303  Bronko Nagurski       FB       Full Back
304     Ernie Nevers       FB       Full Back
305       Jim Thorpe       TB       Tail Back

[306 rows x 3 columns]


In [4]:
# Let's drop the id & rank columns (essentially 2 duplicated index columns)
df = df.drop(columns = ['id', 'rank'])

# Verify
print(df.head())

            player position  year_inducted  from    to  all_pro_selections  \
0     Tony Boselli        T           2022  1995  2001                   3   
1     Cliff Branch       WR           2022  1972  1985                   3   
2     LeRoy Butler       DB           2022  1990  2001                   4   
3        Sam Mills       LB           2022  1986  1997                   1   
4  Richard Seymour       DE           2022  2001  2012                   3   

   pro_bowl_selections  years_as_starter  approx_value  games  ...  \
0                    5                 6            66     91  ...   
1                    4                11            87    183  ...   
2                    4                11            87    181  ...   
3                    5                11           102    181  ...   
4                    7                11            91    164  ...   

   receptions  receiving_yards  receiving_td  longest_reception  \
0         NaN              NaN           Na

In [5]:
# Let's rearrange the column order so my new 'position_full' column is directly after the 'position' column
columns_order = ['player', 'position', 'position_full'] + [col for col in df.columns if col not in ['player', 'position', 'position_full']]
df = df[columns_order]

# Verify
print(df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   all_pro_selections  pro_bowl_selections  years_as_starter  approx_value  \
0                   3                    5                 6            66   
1                   3                    4                11            87   
2                   4                    4                11            87   
3                   1                    5                11           102   
4                   3                    7                11            91   

   ...  longest_rush  receptions  receiving_yards  receiving_td  \
0  ...           Na

In [6]:
# Maybe let's add a 'years_in_NFL' column so we can compare to 'years_as_starter'???
df['years_in_NFL'] = df['to'] - df['from']

# Rearrange the columns again
columns_order = ['player', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter'] + [
    col for col in df.columns if col not in [
        'player', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter']]

df = df[columns_order]
pd.set_option('display.max_columns', None)
print(df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   years_in_NFL  years_as_starter  all_pro_selections  pro_bowl_selections  \
0             6                 6                   3                    5   
1            13                11                   3                    4   
2            11                11                   4                    4   
3            11                11                   1                    5   
4            11                11                   3                    7   

   approx_value  games  completions  pass_attempts  pass_yards  pass_td  \
0          

In [7]:
# Drop another column I don't see necessary
df = df.drop(columns = ['approx_value'])

pd.set_option('display.max_columns', None)
print(df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   years_in_NFL  years_as_starter  all_pro_selections  pro_bowl_selections  \
0             6                 6                   3                    5   
1            13                11                   3                    4   
2            11                11                   4                    4   
3            11                11                   1                    5   
4            11                11                   3                    7   

   games  completions  pass_attempts  pass_yards  pass_td  \
0     91          NaN    

In [8]:
# Let's create another new column that will group players into either Offense, Defense, or Special Teams depending on their position
def classify_position(position):
    defense_positions = ['DB', 'DE', 'DG', 'DT', 'LB']
    special_teams_positions = ['K', 'P']
    
    # Classify
    if position in defense_positions:
        return 'Defense'
    elif position in special_teams_positions:
        return 'Special Teams'
    else:
        return 'Offense'

# Apply function to create new 'position_group' column

df['position_group'] = df['position'].apply(classify_position)

# Rearrange column order ... AGAIN!!! ahh
columns_order = ['player', 'position_group', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter'] + [
    col for col in df.columns if col not in [
        'player', 'position_group', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter']]

df = df[columns_order]
print(df.head())

            player position_group position   position_full  year_inducted  \
0     Tony Boselli        Offense        T          Tackle           2022   
1     Cliff Branch        Offense       WR   Wide Receiver           2022   
2     LeRoy Butler        Defense       DB  Defensive Back           2022   
3        Sam Mills        Defense       LB     Line Backer           2022   
4  Richard Seymour        Defense       DE   Defensive End           2022   

   from    to  years_in_NFL  years_as_starter  all_pro_selections  \
0  1995  2001             6                 6                   3   
1  1972  1985            13                11                   3   
2  1990  2001            11                11                   4   
3  1986  1997            11                11                   1   
4  2001  2012            11                11                   3   

   pro_bowl_selections  games  completions  pass_attempts  pass_yards  \
0                    5     91          NaN       

In [9]:
# Probably should've done this first... but let's check all our data types
print(df.dtypes)

player                     object
position_group             object
position                   object
position_full              object
year_inducted               int64
from                        int64
to                          int64
years_in_NFL                int64
years_as_starter            int64
all_pro_selections          int64
pro_bowl_selections         int64
games                       int64
completions               float64
pass_attempts             float64
pass_yards                float64
pass_td                   float64
longest_completed_pass    float64
int_thrown                float64
times_sacked              float64
sack_yards                float64
rush_attempts             float64
rush_yards                float64
rush_td                   float64
longest_rush              float64
receptions                float64
receiving_yards           float64
receiving_td              float64
longest_reception         float64
all_purpose_yards           int64
td_total      

In [10]:
# Besides the 'sacks' column, all other numerical columns don't contain decimals
# I think Python is still identifying them as float64 dtypes due to the NaN cells, though
# Should I fill all NaN's with 0??? Since they all represent no activity or absence of events (the player did not have any rush yards)
# I probably will fill all NaN's in the future

# I've also noticed with some of the older players inducted into the HOF, they must have played multiple positions 
# For 1 example, even though their position isn't listed as a Quarterback, they might have a substantial amount of pass attempts, yards, tds, etc.
# I plan on investigating this further and performing some analysis 
# Filtering individuals who played more than 1 position

# More calculated columns I'm thinking about implementing depending on any analysis I'd wanna perform:
#  - Pass yards per game
#  - Rush yards per game
#  - Receiving yards per game
#  - Sacks per game
#  - Etc.