**Title**: Project Milestone 3  
**Author**: Ryan Weeks  
**Date**: 2/1/2025  
**Description**: I will collect and perform at least 5 data transformation/cleansing steps to my API data.

In [2]:
import numpy as np
import pandas as pd

data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\nflhofplayers.csv")
hof_df = pd.DataFrame(data)
print(hof_df.head())

   id  rank           player position  year_inducted  from    to  \
0   1     1     Tony Boselli        T           2022  1995  2001   
1   2     2     Cliff Branch       WR           2022  1972  1985   
2   3     3     LeRoy Butler       DB           2022  1990  2001   
3   4     4        Sam Mills       LB           2022  1986  1997   
4   5     5  Richard Seymour       DE           2022  2001  2012   

   all_pro_selections  pro_bowl_selections  years_as_starter  ...  \
0                   3                    5                 6  ...   
1                   3                    4                11  ...   
2                   4                    4                11  ...   
3                   1                    5                11  ...   
4                   3                    7                11  ...   

   longest_rush  receptions  receiving_yards  receiving_td  longest_reception  \
0           NaN         NaN              NaN           NaN                NaN   
1          20.

In [3]:
# Create position mapping
position_mapping = {
    'C': 'Center',
    'DB': 'Defensive Back',
    'DE': 'Defensive End',
    'DG': 'Defensive Guard',
    'DT': 'Defensive Tackle',
    'E': 'End',
    'FB': 'Full Back',
    'FL': 'Flanker',
    'G': 'Guard',
    'HB': 'Half Back',
    'K': 'Kicker',
    'LB': 'Line Backer',
    'P': 'Punter',
    'QB': 'Quarterback',
    'RB': 'Running Back',
    'SE': 'Split End',
    'T': 'Tackle',
    'TB': 'Tail Back',
    'TE': 'Tight End',
    'WB': 'Wing Back',
    'WR': 'Wide Receiver' 
}

# Map 'position' column to new 'position_full' column
hof_df['position_full'] = hof_df['position'].map(position_mapping)

# Verify result
print(hof_df[['player', 'position', 'position_full']])

              player position   position_full
0       Tony Boselli        T          Tackle
1       Cliff Branch       WR   Wide Receiver
2       LeRoy Butler       DB  Defensive Back
3          Sam Mills       LB     Line Backer
4    Richard Seymour       DE   Defensive End
..               ...      ...             ...
301      Cal Hubbard        T          Tackle
302       Don Hutson        E             End
303  Bronko Nagurski       FB       Full Back
304     Ernie Nevers       FB       Full Back
305       Jim Thorpe       TB       Tail Back

[306 rows x 3 columns]


In [4]:
# Let's drop the id & rank columns (essentially 2 duplicated index columns)
hof_df = hof_df.drop(columns = ['id', 'rank'])

# Verify
print(hof_df.head())

            player position  year_inducted  from    to  all_pro_selections  \
0     Tony Boselli        T           2022  1995  2001                   3   
1     Cliff Branch       WR           2022  1972  1985                   3   
2     LeRoy Butler       DB           2022  1990  2001                   4   
3        Sam Mills       LB           2022  1986  1997                   1   
4  Richard Seymour       DE           2022  2001  2012                   3   

   pro_bowl_selections  years_as_starter  approx_value  games  ...  \
0                    5                 6            66     91  ...   
1                    4                11            87    183  ...   
2                    4                11            87    181  ...   
3                    5                11           102    181  ...   
4                    7                11            91    164  ...   

   receptions  receiving_yards  receiving_td  longest_reception  \
0         NaN              NaN           Na

In [5]:
# Let's rearrange the column order so my new 'position_full' column is directly after the 'position' column
columns_order = ['player', 'position', 'position_full'] + [col for col in hof_df.columns if col not in ['player', 'position', 'position_full']]
hof_df = hof_df[columns_order]

# Verify
print(hof_df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   all_pro_selections  pro_bowl_selections  years_as_starter  approx_value  \
0                   3                    5                 6            66   
1                   3                    4                11            87   
2                   4                    4                11            87   
3                   1                    5                11           102   
4                   3                    7                11            91   

   ...  longest_rush  receptions  receiving_yards  receiving_td  \
0  ...           Na

In [6]:
# Maybe let's add a 'years_in_NFL' column so we can compare to 'years_as_starter'???
hof_df['years_in_NFL'] = hof_df['to'] - hof_df['from']

# Rearrange the columns again
columns_order = ['player', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter'] + [
    col for col in hof_df.columns if col not in [
        'player', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter']]

hof_df = hof_df[columns_order]
pd.set_option('display.max_columns', None)
print(hof_df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   years_in_NFL  years_as_starter  all_pro_selections  pro_bowl_selections  \
0             6                 6                   3                    5   
1            13                11                   3                    4   
2            11                11                   4                    4   
3            11                11                   1                    5   
4            11                11                   3                    7   

   approx_value  games  completions  pass_attempts  pass_yards  pass_td  \
0          

In [7]:
# Drop another column I don't see necessary
hof_df = hof_df.drop(columns = ['approx_value'])

pd.set_option('display.max_columns', None)
print(hof_df.head())

            player position   position_full  year_inducted  from    to  \
0     Tony Boselli        T          Tackle           2022  1995  2001   
1     Cliff Branch       WR   Wide Receiver           2022  1972  1985   
2     LeRoy Butler       DB  Defensive Back           2022  1990  2001   
3        Sam Mills       LB     Line Backer           2022  1986  1997   
4  Richard Seymour       DE   Defensive End           2022  2001  2012   

   years_in_NFL  years_as_starter  all_pro_selections  pro_bowl_selections  \
0             6                 6                   3                    5   
1            13                11                   3                    4   
2            11                11                   4                    4   
3            11                11                   1                    5   
4            11                11                   3                    7   

   games  completions  pass_attempts  pass_yards  pass_td  \
0     91          NaN    

In [8]:
# Let's create another new column that will group players into either Offense, Defense, or Special Teams depending on their position
def classify_position(position):
    defense_positions = ['DB', 'DE', 'DG', 'DT', 'LB']
    special_teams_positions = ['K', 'P']
    
    # Classify
    if position in defense_positions:
        return 'Defense'
    elif position in special_teams_positions:
        return 'Special Teams'
    else:
        return 'Offense'

# Apply function to create new 'position_group' column

hof_df['position_group'] = hof_df['position'].apply(classify_position)

# Rearrange column order ... AGAIN!!! ahh
columns_order = ['player', 'position_group', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter'] + [
    col for col in hof_df.columns if col not in [
        'player', 'position_group', 'position', 'position_full', 'year_inducted', 'from', 'to', 'years_in_NFL', 'years_as_starter']]

hof_df = hof_df[columns_order]
print(hof_df.head())

            player position_group position   position_full  year_inducted  \
0     Tony Boselli        Offense        T          Tackle           2022   
1     Cliff Branch        Offense       WR   Wide Receiver           2022   
2     LeRoy Butler        Defense       DB  Defensive Back           2022   
3        Sam Mills        Defense       LB     Line Backer           2022   
4  Richard Seymour        Defense       DE   Defensive End           2022   

   from    to  years_in_NFL  years_as_starter  all_pro_selections  \
0  1995  2001             6                 6                   3   
1  1972  1985            13                11                   3   
2  1990  2001            11                11                   4   
3  1986  1997            11                11                   1   
4  2001  2012            11                11                   3   

   pro_bowl_selections  games  completions  pass_attempts  pass_yards  \
0                    5     91          NaN       

In [9]:
# Probably should've done this first... but let's check all our data types
print(hof_df.dtypes)

player                     object
position_group             object
position                   object
position_full              object
year_inducted               int64
from                        int64
to                          int64
years_in_NFL                int64
years_as_starter            int64
all_pro_selections          int64
pro_bowl_selections         int64
games                       int64
completions               float64
pass_attempts             float64
pass_yards                float64
pass_td                   float64
longest_completed_pass    float64
int_thrown                float64
times_sacked              float64
sack_yards                float64
rush_attempts             float64
rush_yards                float64
rush_td                   float64
longest_rush              float64
receptions                float64
receiving_yards           float64
receiving_td              float64
longest_reception         float64
all_purpose_yards           int64
td_total      

In [10]:
# Besides the 'sacks' column, all other numerical columns don't contain decimals
# I think Python is still identifying them as float64 dtypes due to the NaN cells, though
# Should I fill all NaN's with 0??? Since they all represent no activity or absence of events (the player did not have any rush yards)
# I probably will fill all NaN's in the future

# I've also noticed with some of the older players inducted into the HOF, they must have played multiple positions 
# For 1 example, even though their position isn't listed as a Quarterback, they might have a substantial amount of pass attempts, yards, tds, etc.
# I plan on investigating this further and performing some analysis 
# Filtering individuals who played more than 1 position

# More calculated columns I'm thinking about implementing depending on any analysis I'd wanna perform:
#  - Pass yards per game
#  - Rush yards per game
#  - Receiving yards per game
#  - Sacks per game
#  - Etc.

# **Working with the JSON Data**

In [12]:
# From what it appears, you cannot directly search for a specific player through the API, only ALL players for a specific week or Year

import requests

# Base URL for Fantasy Football Data Pros API
BASE_URL = 'https://www.fantasyfootballdatapros.com/api/players/'

# Function to get all players for a specific year
def get_players_for_year(year):
    url = f'{BASE_URL}{year}/all'

    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching players for {year}: {response.status_code}")
        return None

# Function to fetch player stats for a specific year
def search_player_stats(player_name, year):
    players_data = get_players_for_year(year)
    
    if players_data:
        # Search for the player in the data
        for player in players_data:
            if player['player_name'].lower() == player_name.lower():  # case-insensitive matching
                return player
        print(f"Player {player_name} not found in {year} data.")
        return None
    return None

# Function to search player stats for multiple years
def get_player_stats(player_name, start_year, end_year):
    player_stats = {}

    for year in range(start_year, end_year + 1):
        print(f"Fetching stats for {player_name} in {year}...")
        stats = search_player_stats(player_name, year)
        if stats:
            player_stats[year] = stats
        else:
            print(f"No stats found for {player_name} in {year}.")

    return player_stats

# Example usage:
player_name = 'Peyton Manning' 
start_year = 1998  # Starting year
end_year = 2015    # Ending year

player_data = get_player_stats(player_name, start_year, end_year)

# Print the fetched stats for each year
if player_data:
    for year, stats in player_data.items():
        print(f"Stats for {player_name} in {year}: {stats}")

Fetching stats for Peyton Manning in 1998...
Fetching stats for Peyton Manning in 1999...
Fetching stats for Peyton Manning in 2000...
Fetching stats for Peyton Manning in 2001...
Fetching stats for Peyton Manning in 2002...
Fetching stats for Peyton Manning in 2003...
Fetching stats for Peyton Manning in 2004...
Fetching stats for Peyton Manning in 2005...
Fetching stats for Peyton Manning in 2006...
Fetching stats for Peyton Manning in 2007...
Fetching stats for Peyton Manning in 2008...
Fetching stats for Peyton Manning in 2009...
Fetching stats for Peyton Manning in 2010...
Fetching stats for Peyton Manning in 2011...
Player Peyton Manning not found in 2011 data.
No stats found for Peyton Manning in 2011.
Fetching stats for Peyton Manning in 2012...
Fetching stats for Peyton Manning in 2013...
Fetching stats for Peyton Manning in 2014...
Fetching stats for Peyton Manning in 2015...
Stats for Peyton Manning in 1998: {'fumbles_lost': 1.0, 'games_played': 16.0, 'player_name': 'Peyton 

# Data Considerations

Upon trial and error, the fantasy football **API** only tracks back statistics as far as **1992**. And because most fantasy football points are scored/recorded through solely **"Offensive"** positions, I will have to take this into consideration when I plan on merging with my Hall Of Fame CSV data since that data includes MANY **"Defensive"** positions and players that have played before **1992**.

In [98]:
# Filter players to only include those whose 'to' year is 1992 or greater
hof_df_filtered = hof_df[hof_df['to'] >= 1992]

hof_df_filtered

Unnamed: 0,player,position_group,position,position_full,year_inducted,from,to,years_in_NFL,years_as_starter,all_pro_selections,pro_bowl_selections,games,completions,pass_attempts,pass_yards,pass_td,longest_completed_pass,int_thrown,times_sacked,sack_yards,rush_attempts,rush_yards,rush_td,longest_rush,receptions,receiving_yards,receiving_td,longest_reception,all_purpose_yards,td_total,tackles,sacks,interception
0,Tony Boselli,Offense,T,Tackle,2022,1995,2001,6,6,3,5,91,,,,,,,,,,,,,,,,,2,0,1,0.0,0
2,LeRoy Butler,Defense,DB,Defensive Back,2022,1990,2001,11,11,4,4,181,,,,,,,,,,,,,,,,,609,3,889,20.5,38
3,Sam Mills,Defense,LB,Line Backer,2022,1986,1997,11,11,1,5,181,,,,,,,,,,,,,,,,,285,4,1265,20.5,11
4,Richard Seymour,Defense,DE,Defensive End,2022,2001,2012,11,11,3,7,164,,,,,,,,,,,,,,,,,87,1,498,57.5,2
5,Bryant Young,Defense,DT,Defensive Tackle,2022,1994,2007,13,14,1,4,208,,,,,,,,,,,,,,,,,47,0,627,89.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,Joe Montana,Offense,QB,Quarterback,2000,1979,1994,15,12,3,8,192,3409.0,5391.0,40551.0,273.0,96.0,139.0,313.0,2095.0,457.0,1676.0,20.0,21.0,,,,,1642,20,0,0.0,0
138,Eric Dickerson,Offense,RB,Running Back,1999,1983,1993,10,10,5,6,146,1.0,2.0,15.0,1.0,15.0,1.0,1.0,6.0,2996.0,13259.0,90.0,85.0,281.0,2137.0,6.0,50.0,15411,96,0,0.0,0
142,Lawrence Taylor,Defense,LB,Line Backer,1999,1981,1993,12,13,8,10,184,,,,,,,,,,,,,,,,,168,2,0,142.0,9
145,Anthony Munoz,Offense,T,Tackle,1998,1980,1992,12,12,9,11,185,,,,,,,,,,,,,7.0,18.0,4.0,12.0,18,4,0,0.0,0


I ran the following code and realized I still needed to filter more players out since the code was still spending A LOT of time searching for player stats that weren't found. (Defensive players and some Offensive players, specifically linemen)

So, I'm going to continue to filter and search for only "Offensive" positions, specifically only QBs, RBs, WRs, TEs, and Ks.

In [23]:
"""

# Loop through filtered Hall of Fame CSV and search for players in the API
for index, row in hof_df_filtered.iterrows():
    player_name = row['player']
    start_year = int(row['from'])  # 'from' is the start year
    end_year = int(row['to'])      # 'to' is the end year

    # Get player stats for the range of years they played
    player_data = get_player_stats(player_name, start_year, end_year)

    # If player data was found, you can print or store the result
    if player_data:
        print(f"Stats for {player_name}:")
        for year, stats in player_data.items():
            print(f"Year {year}: {stats}")

"""

SyntaxError: incomplete input (2925157977.py, line 1)

## Updated Code for only the Relevant Positions

In [29]:
# Define the list of relevant positions
relevant_positions = ['QB', 'RB', 'WR', 'TE', 'K']

# Filter players to only include those whose 'to' year is 1992 or greater and position is in relevant positions
hof_df_filtered = hof_df[(hof_df['to'] >= 1992) & (hof_df['position'].isin(relevant_positions))]

# Loop through filtered Hall of Fame DataFrame and search for players in the API
for index, row in hof_df_filtered.iterrows():
    player_name = row['player']
    start_year = int(row['from'])  # 'from' is the start year
    end_year = int(row['to'])      # 'to' is the end year

    # Get player stats for the range of years they played
    player_data = get_player_stats(player_name, start_year, end_year)

    # If player data was found, print results
    if player_data:
        print(f"Stats for {player_name}:")
        for year, stats in player_data.items():
            print(f"Year {year}: {stats}")

Fetching stats for Calvin Johnson in 2007...
Fetching stats for Calvin Johnson in 2008...
Fetching stats for Calvin Johnson in 2009...
Fetching stats for Calvin Johnson in 2010...
Fetching stats for Calvin Johnson in 2011...
Fetching stats for Calvin Johnson in 2012...
Fetching stats for Calvin Johnson in 2013...
Fetching stats for Calvin Johnson in 2014...
Fetching stats for Calvin Johnson in 2015...
Stats for Calvin Johnson:
Year 2007: {'fumbles_lost': 0.0, 'games_played': 15.0, 'player_name': 'Calvin Johnson', 'position': 'WR', 'stats': {'passing': {'int': 0.0, 'passing_att': 0.0, 'passing_cmp': 0.0, 'passing_td': 0.0, 'passing_yds': 0.0}, 'receiving': {'receiving_td': 4.0, 'receiving_yds': 756.0, 'receptions': 48.0, 'targets': 93.0}, 'rushing': {'rushing_att': 4.0, 'rushing_td': 1.0, 'rushing_yds': 52.0}}, 'team': 'DET'}
Year 2008: {'fumbles_lost': 2.0, 'games_played': 16.0, 'player_name': 'Calvin Johnson', 'position': 'WR', 'stats': {'passing': {'int': 0.0, 'passing_att': 0.0, 'pa

# Full Code to capture only relevant data into new DataFrame and save as its own CSV

In [41]:
import pandas as pd
import requests

def get_players_for_year(year):
    url = f'{BASE_URL}{year}/all'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching players for {year}: {response.status_code}")
        return None

# Function to fetch player stats for a specific year
def search_player_stats(player_name, year):
    players_data = get_players_for_year(year)
    if players_data:
        for player in players_data:
            if player['player_name'].lower() == player_name.lower():  # case-insensitive matching
                return player
        print(f"Player {player_name} not found in {year} data.")
        return None
    return None

# Function to search player stats for multiple years
def get_player_stats(player_name, start_year, end_year):
    player_stats = []

    for year in range(start_year, end_year + 1):
        print(f"Fetching stats for {player_name} in {year}...")
        stats = search_player_stats(player_name, year)
        if stats:
            print(f"Found stats for {player_name} in {year}: {stats}")  # Debug print
            # Add the year to the player's stats before appending to the list
            stats['year'] = year
            stats['player_name'] = player_name
            player_stats.append(stats)
        else:
            print(f"No stats found for {player_name} in {year}.")

    return player_stats

# Create an empty list to collect all player stats data
all_player_stats = []

# Loop through filtered Hall of Fame CSV and search for players in the API
for index, row in hof_df_filtered.iterrows():
    player_name = row['player']
    start_year = int(row['from'])  # 'from' is the start year
    end_year = int(row['to'])      # 'to' is the end year

    # Get player stats for the range of years they played
    player_data = get_player_stats(player_name, start_year, end_year)

    # If player data was found, append to the all_player_stats list
    if player_data:
        all_player_stats.extend(player_data)

# Check if data was collected properly
if all_player_stats:
    # Convert the collected data into a DataFrame
    df_player_stats = pd.DataFrame(all_player_stats)
    # Optionally, save this DataFrame to a CSV file
    df_player_stats.to_csv('API_player_stats.csv', index=False)
    # Show the new DataFrame with player stats
    print(df_player_stats.head())
else:
    print("No player stats were collected.")

Fetching stats for Calvin Johnson in 2007...
Found stats for Calvin Johnson in 2007: {'fumbles_lost': 0.0, 'games_played': 15.0, 'player_name': 'Calvin Johnson', 'position': 'WR', 'stats': {'passing': {'int': 0.0, 'passing_att': 0.0, 'passing_cmp': 0.0, 'passing_td': 0.0, 'passing_yds': 0.0}, 'receiving': {'receiving_td': 4.0, 'receiving_yds': 756.0, 'receptions': 48.0, 'targets': 93.0}, 'rushing': {'rushing_att': 4.0, 'rushing_td': 1.0, 'rushing_yds': 52.0}}, 'team': 'DET'}
Fetching stats for Calvin Johnson in 2008...
Found stats for Calvin Johnson in 2008: {'fumbles_lost': 2.0, 'games_played': 16.0, 'player_name': 'Calvin Johnson', 'position': 'WR', 'stats': {'passing': {'int': 0.0, 'passing_att': 0.0, 'passing_cmp': 0.0, 'passing_td': 0.0, 'passing_yds': 0.0}, 'receiving': {'receiving_td': 12.0, 'receiving_yds': 1331.0, 'receptions': 78.0, 'targets': 150.0}, 'rushing': {'rushing_att': 3.0, 'rushing_td': 0.0, 'rushing_yds': -1.0}}, 'team': 'DET'}
Fetching stats for Calvin Johnson in 

### I realized I could still optimize my API request code for the players who played before AND after 1992 to NOT search for their stats if the year is less than 1992 because it is spending time searching for data I know doesn't exist. But since I had already run the code and saved the data into a new DataFrame, I didn't bother going forward.

In [55]:
df_player_stats.to_csv('API_player_stats.csv', index=False)

FINALLY I have got all my relevant data into CSV format! Now let's make it more readable.

In [57]:
df_player_stats

Unnamed: 0,fumbles_lost,games_played,player_name,position,stats,team,year
0,0.0,15.0,Calvin Johnson,WR,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",DET,2007
1,2.0,16.0,Calvin Johnson,WR,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",DET,2008
2,2.0,14.0,Calvin Johnson,WR,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",DET,2009
3,0.0,15.0,Calvin Johnson,WR,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",DET,2010
4,1.0,16.0,Calvin Johnson,WR,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",DET,2011
...,...,...,...,...,...,...,...
350,0.0,1.0,Joe Montana,QB,"{'passing': {'int': 0.0, 'passing_att': 21.0, ...",SFO,1992
351,0.0,11.0,Joe Montana,QB,"{'passing': {'int': 7.0, 'passing_att': 298.0,...",KAN,1993
352,5.0,14.0,Joe Montana,QB,"{'passing': {'int': 9.0, 'passing_att': 493.0,...",KAN,1994
353,0.0,16.0,Eric Dickerson,RB,"{'passing': {'int': 0.0, 'passing_att': 0.0, '...",RAI,1992


### The **'stats'** column is a disaster. ^^^  

Let's try and fix that.

In [61]:
import pandas as pd
import requests

BASE_URL = 'https://www.fantasyfootballdatapros.com/api/players/'

def get_players_for_year(year):
    url = f'{BASE_URL}{year}/all'
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching players for {year}: {response.status_code}")
        return None

# Function to fetch player stats for a specific year
def search_player_stats(player_name, year):
    players_data = get_players_for_year(year)
    if players_data:
        for player in players_data:
            if player['player_name'].lower() == player_name.lower():  # case-insensitive matching
                return player
        print(f"Player {player_name} not found in {year} data.")
        return None
    return None

# Function to search player stats for multiple years
def get_player_stats(player_name, start_year, end_year):
    player_stats = []

    for year in range(start_year, end_year + 1):
        print(f"Fetching stats for {player_name} in {year}...")
        stats = search_player_stats(player_name, year)
        if stats:
            print(f"Found stats for {player_name} in {year}")  # Debug print
            # Add the year to the player's stats before appending to the list
            stats['year'] = year
            stats['player_name'] = player_name
            player_stats.append(stats)
        else:
            print(f"No stats found for {player_name} in {year}.")

    return player_stats

# Create an empty list to collect all player stats data
all_player_stats = []

# Loop through filtered Hall of Fame CSV and search for players in the API
for index, row in hof_df_filtered.iterrows():
    player_name = row['player']
    start_year = int(row['from'])  # 'from' is the start year
    end_year = int(row['to'])      # 'to' is the end year

    # Get player stats for the range of years they played
    player_data = get_player_stats(player_name, start_year, end_year)

    # If player data was found, append to the all_player_stats list
    if player_data:
        all_player_stats.extend(player_data)

# Check if data was collected properly
if all_player_stats:
    # Convert the collected data into a DataFrame
    df_player_stats = pd.DataFrame(all_player_stats)

    # Ensure 'stats' column is a dictionary type (if it's not already)
    df_player_stats = df_player_stats[df_player_stats['stats'].apply(lambda x: isinstance(x, dict))]

    # Flatten the 'stats' dictionary into separate columns
    stats_expanded = pd.json_normalize(df_player_stats['stats'])

    # Drop the original 'stats' column and merge expanded stats
    df_player_stats = df_player_stats.drop(columns=['stats']).join(stats_expanded)

    # Save the cleaned CSV with expanded stats
    df_player_stats.to_csv('API_player_stats_expanded.csv', index=False)

    # Show the new DataFrame with expanded player stats
    print(df_player_stats.head())
else:
    print("No player stats were collected.")

Fetching stats for Calvin Johnson in 2007...
Found stats for Calvin Johnson in 2007
Fetching stats for Calvin Johnson in 2008...
Found stats for Calvin Johnson in 2008
Fetching stats for Calvin Johnson in 2009...
Found stats for Calvin Johnson in 2009
Fetching stats for Calvin Johnson in 2010...
Found stats for Calvin Johnson in 2010
Fetching stats for Calvin Johnson in 2011...
Found stats for Calvin Johnson in 2011
Fetching stats for Calvin Johnson in 2012...
Found stats for Calvin Johnson in 2012
Fetching stats for Calvin Johnson in 2013...
Found stats for Calvin Johnson in 2013
Fetching stats for Calvin Johnson in 2014...
Found stats for Calvin Johnson in 2014
Fetching stats for Calvin Johnson in 2015...
Found stats for Calvin Johnson in 2015
Fetching stats for Peyton Manning in 1998...
Found stats for Peyton Manning in 1998
Fetching stats for Peyton Manning in 1999...
Found stats for Peyton Manning in 1999
Fetching stats for Peyton Manning in 2000...
Found stats for Peyton Manning 

#### Great, we've expanded the 'stats' column that was still holding all the statistics in dictionary form into their own separate columns.  
#### Let's rename some of the headers and rearrange some columns for the data to be more appropriate and readable. 

In [69]:
# Rename columns
rename_dict = {
    'passing.int': 'interceptions',
    'passing.passing_att': 'passing_att',
    'passing.passing_cmp': 'passing_cmp',
    'passing.passing_td': 'passing_td',
    'passing.passing_yds': 'passing_yds',
    'receiving.receiving_td': 'receiving_td',
    'receiving.receiving_yds': 'receiving_yds',
    'receiving.receptions': 'receptions',
    'receiving.targets': 'targets',
    'rushing.rushing_att': 'rushing_att',
    'rushing.rushing_td': 'rushing_td',
    'rushing.rushing_yds': 'rushing_yds'
}

df_player_stats = df_player_stats.rename(columns = rename_dict)

print(df_player_stats.columns)

Index(['fumbles_lost', 'games_played', 'player_name', 'position', 'team',
       'year', 'interceptions', 'passing_att', 'passing_cmp', 'passing_td',
       'passing_yds', 'receiving_td', 'receiving_yds', 'receptions', 'targets',
       'rushing_att', 'rushing_td', 'rushing_yds'],
      dtype='object')


In [73]:
# Move 'games_played' after 'year' and 'fumbles_lost' to the end
cols = list(df_player_stats.columns)
cols.remove('games_played') # Remove so we can reinsert
cols.insert(cols.index('year') + 1, 'games_played') # Insert after 'year'

cols.remove('fumbles_lost')
cols.append('fumbles_lost')

# Reorder DataFrame
df_player_stats = df_player_stats[cols]

print(df_player_stats.head())

      player_name position team  year  games_played  interceptions  \
0  Calvin Johnson       WR  DET  2007          15.0            0.0   
1  Calvin Johnson       WR  DET  2008          16.0            0.0   
2  Calvin Johnson       WR  DET  2009          14.0            0.0   
3  Calvin Johnson       WR  DET  2010          15.0            0.0   
4  Calvin Johnson       WR  DET  2011          16.0            0.0   

   passing_att  passing_cmp  passing_td  passing_yds  receiving_td  \
0          0.0          0.0         0.0          0.0           4.0   
1          0.0          0.0         0.0          0.0          12.0   
2          0.0          0.0         0.0          0.0           5.0   
3          0.0          0.0         0.0          0.0          12.0   
4          0.0          0.0         0.0          0.0          16.0   

   receiving_yds  receptions  targets  rushing_att  rushing_td  rushing_yds  \
0          756.0        48.0     93.0          4.0         1.0         52.0   


### I noticed some possible inconsistencies in the 'team' data column... let's take a closer look.

In [79]:
unique_teams = df_player_stats['team'].unique()
print(unique_teams)

['DET' 'IND' 'DEN' 'RAM' 'STL' 'SFO' 'ARI' 'SEA' 'KAN' 'ATL' 'MIN' 'OAK'
 'NWE' '3TM' 'PHI' 'DAL' 'BUF' 'CIN' 'SDG' 'NYJ' 'NYG' 'GNB' 'PIT' 'RAI'
 'TAM' 'WAS' 'MIA' 'BAL' '2TM' 'HOU']


### Possible Issues
1. **Team Name Variations:**
    - 'RAM' vs. 'STL' (Rams/St. Louis, now the L.A. Rams)
    - 'RAI' vs. 'OAK' (Raiders/Oakland, now the Las Vegas Raiders)
    - 'SFO' instead of 'SF' (San Francaisco 49ers)
    - 'KAN' instead of 'KC' (Kansas City Chiefs)
    - 'TAM' instead of 'TB' (Tampa Bay Buccaneers)
    - 'NWE' instead of 'NE' (New England Patriots)
2. **Multi-Team Codes:**
    - '3TM' (Played for 3 teams in a season??)
    - '2TM' (Played for 2 teams in a season??)
  

BECAUSE all the Raider & Ram players in this dataset played BEFORE their teams moved locations, I know the 2 different values for each are only differences in the naming and not locations. So I will just adjust all these to be consistent with eachother. 

For the 3TM and 2TM names, I can do further investigation depending on the player AND year to figure out who they specifically played for. But I'll save that for another day because this data has already given me enough headaches! LOL

In [96]:
# Fix team's inconsistent abbreviations
team_corrections = {
    'RAM': 'STL',
    'RAI': 'OAK',
    'KAN': 'KC',
    'SFO': 'SF',
    'TAM': 'TB',
    'NWE': 'NE'
}

# Apply corrections
df_player_stats['team'] = df_player_stats['team'].replace(team_corrections)

# Print values again to verify
print(df_player_stats['team'].unique())

['DET' 'IND' 'DEN' 'STL' 'SF' 'ARI' 'SEA' 'KC' 'ATL' 'MIN' 'OAK' 'NE'
 '3TM' 'PHI' 'DAL' 'BUF' 'CIN' 'SDG' 'NYJ' 'NYG' 'GNB' 'PIT' 'TB' 'WAS'
 'MIA' 'BAL' '2TM' 'HOU']


# Final DataFrame after:  

- **Collection of Data**  
- **Identifying Irrelevant Data:**  
    - Defensive Players  
    - Offensive Linemen  
    - Players that only played before 1992 (No API data available)  
- **Renamed Headers in Columns**  
- **Formatted Data for Readability**  
- **Fixed Inconsistent Values in 'Team' Column** 

In [93]:
df_player_stats

Unnamed: 0,player_name,position,team,year,games_played,interceptions,passing_att,passing_cmp,passing_td,passing_yds,receiving_td,receiving_yds,receptions,targets,rushing_att,rushing_td,rushing_yds,fumbles_lost
0,Calvin Johnson,WR,DET,2007,15.0,0.0,0.0,0.0,0.0,0.0,4.0,756.0,48.0,93.0,4.0,1.0,52.0,0.0
1,Calvin Johnson,WR,DET,2008,16.0,0.0,0.0,0.0,0.0,0.0,12.0,1331.0,78.0,150.0,3.0,0.0,-1.0,2.0
2,Calvin Johnson,WR,DET,2009,14.0,0.0,0.0,0.0,0.0,0.0,5.0,984.0,67.0,137.0,7.0,0.0,73.0,2.0
3,Calvin Johnson,WR,DET,2010,15.0,0.0,0.0,0.0,0.0,0.0,12.0,1120.0,77.0,137.0,4.0,0.0,32.0,0.0
4,Calvin Johnson,WR,DET,2011,16.0,0.0,0.0,0.0,0.0,0.0,16.0,1681.0,96.0,158.0,1.0,0.0,11.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,Joe Montana,QB,SF,1992,1.0,0.0,21.0,15.0,2.0,126.0,0.0,0.0,0.0,0.0,3.0,0.0,28.0,0.0
351,Joe Montana,QB,KC,1993,11.0,7.0,298.0,181.0,13.0,2144.0,0.0,0.0,0.0,0.0,25.0,0.0,64.0,0.0
352,Joe Montana,QB,KC,1994,14.0,9.0,493.0,299.0,16.0,3283.0,0.0,0.0,0.0,0.0,18.0,0.0,17.0,5.0
353,Eric Dickerson,RB,OAK,1992,16.0,0.0,0.0,0.0,0.0,0.0,1.0,85.0,14.0,22.0,187.0,2.0,729.0,0.0
