**Title**: Project Milestone 4  
**Author**: Ryan Weeks  
**Date**: 2/15/2025  
**Description**: I will collect and perform at least 5 data transformation/cleansing steps to my html/website data.

In [26]:
import pandas as pd
import requests
from io import StringIO

def construct_url(position, year):
    """Construct URL based on the player's position and year."""
    base_url = "https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/{year}/player/"
    if position == 'QB':
        return base_url.format(year=year) + "passing.csv"
    elif position == 'RB':
        return base_url.format(year=year) + "rushing.csv"
    elif position in ['WR', 'TE']:
        return base_url.format(year=year) + "receiving.csv"
    elif position == 'K':
        return base_url.format(year=year) + "field-goals.csv"
    return None

def scrape_data(player_name, position, start_year, end_year):
    """Scrape data for each player and year."""
    scraped_data = []
    
    for year in range(int(start_year), int(end_year) + 1):
        url = construct_url(position, year)
        if url:
            print(f"Scraping data for {player_name} ({position}) from year {year} at {url}")
            try:
                response = requests.get(url)
                
                if response.status_code == 200:
                    # Use StringIO to read the content as if it were a file
                    csv_data = StringIO(response.text)
                    
                    try:
                        # Try reading the CSV with a comma delimiter
                        df = pd.read_csv(csv_data, delimiter=',')
                        scraped_data.append({'player': player_name, 'position': position, 'year': year, 'data': df})
                        print(f"Data for {player_name} ({position}) in {year} successfully scraped.")
                    except pd.errors.ParserError:
                        # If comma delimiter fails, try reading with a semicolon
                        df = pd.read_csv(csv_data, delimiter=';')
                        scraped_data.append({'player': player_name, 'position': position, 'year': year, 'data': df})
                        print(f"Data for {player_name} ({position}) in {year} scraped with semicolon delimiter.")
                else:
                    print(f"Failed to retrieve data for {player_name} ({position}) from year {year} at {url}")
            except Exception as e:
                print(f"Error scraping data for {player_name} ({position}) from year {year}: {e}")
    
    return scraped_data

# Example usage:
hof_df = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\hof_df_filtered.csv")
for index, row in hof_df.iterrows():
    player_name = row['player']
    position = row['position']
    start_year = row['from']
    end_year = row['to']
    
    scraped_data = scrape_data(player_name, position, start_year, end_year)
    
    # You can now process or save scraped_data for the current player here

Scraping data for Calvin Johnson (WR) from year 2007 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2007/player/receiving.csv
Data for Calvin Johnson (WR) in 2007 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2008 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2008/player/receiving.csv
Data for Calvin Johnson (WR) in 2008 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2009 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2009/player/receiving.csv
Data for Calvin Johnson (WR) in 2009 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2010 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2010/player/receiving.csv
Data for Calvin Johnson (WR) in 2010 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2011 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2011/player/receiving.csv
Data for Calvi

In [28]:
print(scraped_data)

[{'player': 'Eric Dickerson', 'position': 'RB', 'year': 1983, 'data':               Player  Rush Yds  Att  TD  20+  40+  Lng  Rush 1st  Rush 1st%  \
0     Eric Dickerson      1808  390  18   11    3   85       116       29.7   
1    William Andrews      1567  331   7    8    0   27        77       23.3   
2        Curt Warner      1449  335  13    7    1   60        80       23.9   
3      Walter Payton      1421  314   6    7    1   49        83       26.4   
4       John Riggins      1347  375  24    3    1   44        98       26.1   
..               ...       ...  ...  ..  ...  ...  ...       ...        ...   
303        Ed Luther       -14    9   0    0    0    8         0        0.0   
304      Ken Stabler       -14    9   0    0    0    0         0        0.0   
305    Kenny Duckett       -16    2   0    0    0    2         0        0.0   
306       Don Strock       -16    6   0    0    0   -2         0        0.0   
307         Art Monk       -19    3   0    0    0    2       

In [30]:
import pandas as pd
import requests
from io import StringIO

def construct_url(position, year):
    """Construct URL based on the player's position and year."""
    base_url = "https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/{year}/player/"
    if position == 'QB':
        return base_url.format(year=year) + "passing.csv"
    elif position == 'RB':
        return base_url.format(year=year) + "rushing.csv"
    elif position in ['WR', 'TE']:
        return base_url.format(year=year) + "receiving.csv"
    elif position == 'K':
        return base_url.format(year=year) + "field-goals.csv"
    return None

def scrape_data(player_name, position, start_year, end_year):
    """Scrape data for each player and year."""
    scraped_data = []
    
    for year in range(int(start_year), int(end_year) + 1):
        url = construct_url(position, year)
        if url:
            print(f"Scraping data for {player_name} ({position}) from year {year} at {url}")
            try:
                response = requests.get(url)
                
                if response.status_code == 200:
                    # Use StringIO to read the content as if it were a file
                    csv_data = StringIO(response.text)
                    
                    try:
                        # Try reading the CSV with a comma delimiter
                        df = pd.read_csv(csv_data, delimiter=',')
                        df['player'] = player_name
                        df['position'] = position
                        df['year'] = year
                        scraped_data.append(df)
                        print(f"Data for {player_name} ({position}) in {year} successfully scraped.")
                    except pd.errors.ParserError:
                        # If comma delimiter fails, try reading with a semicolon
                        df = pd.read_csv(csv_data, delimiter=';')
                        df['player'] = player_name
                        df['position'] = position
                        df['year'] = year
                        scraped_data.append(df)
                        print(f"Data for {player_name} ({position}) in {year} scraped with semicolon delimiter.")
                else:
                    print(f"Failed to retrieve data for {player_name} ({position}) from year {year} at {url}")
            except Exception as e:
                print(f"Error scraping data for {player_name} ({position}) from year {year}: {e}")
    
    # Combine the data for all years into a single DataFrame
    return pd.concat(scraped_data, ignore_index=True)

# Load your 'hof_df_filtered' CSV
hof_df = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\hof_df_filtered.csv")

# Initialize an empty list to store the scraped data for all players
all_player_data = []

# Loop through each player in your 'hof_df_filtered' CSV
for index, row in hof_df.iterrows():
    player_name = row['player']
    position = row['position']
    start_year = row['from']
    end_year = row['to']
    
    # Scrape data for the current player
    player_data = scrape_data(player_name, position, start_year, end_year)
    
    # Append the data for this player to the list
    all_player_data.append(player_data)

# Combine all the data into one single DataFrame
final_df = pd.concat(all_player_data, ignore_index=True)

# Save the final DataFrame to a CSV file
final_df.to_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\combined_player_data.csv", index=False)

print("Scraping complete. Data saved to 'combined_player_data.csv'.")

Scraping data for Calvin Johnson (WR) from year 2007 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2007/player/receiving.csv
Data for Calvin Johnson (WR) in 2007 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2008 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2008/player/receiving.csv
Data for Calvin Johnson (WR) in 2008 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2009 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2009/player/receiving.csv
Data for Calvin Johnson (WR) in 2009 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2010 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2010/player/receiving.csv
Data for Calvin Johnson (WR) in 2010 successfully scraped.
Scraping data for Calvin Johnson (WR) from year 2011 at https://raw.githubusercontent.com/MarcLinderGit/NFL_Stats/main/data/2011/player/receiving.csv
Data for Calvi

In [36]:
# Load the data
hof_df = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\hof_df_filtered.csv")
combined_player_data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\combined_player_data.csv")

# Create an empty list to store matching rows
matching_data = []

# Loop through the hof_df and check for matches in combined_player_data
for player in hof_df['player']:
    matching_rows = combined_player_data[combined_player_data['Player'] == player]
    
    # If matches are found, append them to the list
    if not matching_rows.empty:
        matching_data.append(matching_rows)

# Combine the matching rows into a single DataFrame
final_df = pd.concat(matching_data, ignore_index=True)

# Save the result to a new CSV
final_df.to_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\matching_player_data.csv", index=False)

print("CSV with matching player data has been created!")

  combined_player_data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\combined_player_data.csv")


CSV with matching player data has been created!


#### --I'll look into/fix the DtypeWarning for those columns later.--

# First Glimpse at the Scraped Data

In [38]:
matching_player_data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\matching_player_data.csv")
print(matching_player_data.head())

           Player   Rec     Yds    TD   20+   40+   LNG  Rec 1st  1st%  \
0  Calvin Johnson  48.0   756.0   4.0  12.0   1.0  49.0     38.0  79.2   
1  Calvin Johnson  78.0  1331.0  12.0  21.0   7.0  96.0     52.0  66.7   
2  Calvin Johnson  67.0   984.0   5.0  15.0   4.0  75.0     46.0  68.7   
3  Calvin Johnson  77.0  1120.0  12.0  16.0   3.0  87.0     57.0  74.0   
4  Calvin Johnson  96.0  1681.0  16.0  32.0  10.0  73.0     77.0  80.2   

   Rec FUM  ...  Rush FUM  FGM FG %  1-19 > A-M  20-29 > A-M  30-39 > A-M  \
0      1.0  ...       NaN  NaN  NaN         NaN          NaN          NaN   
1      2.0  ...       NaN  NaN  NaN         NaN          NaN          NaN   
2      3.0  ...       NaN  NaN  NaN         NaN          NaN          NaN   
3      1.0  ...       NaN  NaN  NaN         NaN          NaN          NaN   
4      1.0  ...       NaN  NaN  NaN         NaN          NaN          NaN   

   40-49 > A-M  50-59 > A-M  60+ > A-M  FG Blk  
0          NaN          NaN        NaN     

In [40]:
print(matching_player_data.shape)

(5837, 38)


In [44]:
duplicates = matching_player_data.duplicated()

duplicate_rows = matching_player_data[duplicates]
print(duplicate_rows)

              Player   Rec     Yds    TD   20+  40+   LNG  Rec 1st  1st%  \
9     Calvin Johnson  48.0   756.0   4.0  12.0  1.0  49.0     38.0  79.2   
10    Calvin Johnson  78.0  1331.0  12.0  21.0  7.0  96.0     52.0  66.7   
11    Calvin Johnson  67.0   984.0   5.0  15.0  4.0  75.0     46.0  68.7   
22    Calvin Johnson  48.0   756.0   4.0  12.0  1.0  49.0     38.0  79.2   
23    Calvin Johnson  78.0  1331.0  12.0  21.0  7.0  96.0     52.0  66.7   
...              ...   ...     ...   ...   ...  ...   ...      ...   ...   
5832  Eric Dickerson   NaN     NaN   7.0   2.0  0.0   NaN      NaN   NaN   
5833  Eric Dickerson   NaN     NaN   4.0   3.0  1.0   NaN      NaN   NaN   
5834  Eric Dickerson   NaN     NaN   2.0   3.0  0.0   NaN      NaN   NaN   
5835  Eric Dickerson   NaN     NaN   2.0   5.0  1.0   NaN      NaN   NaN   
5836  Eric Dickerson   NaN     NaN   0.0   0.0  0.0   NaN      NaN   NaN   

      Rec FUM  ...  Rush FUM  FGM FG %  1-19 > A-M  20-29 > A-M  30-39 > A-M  \
9      

# Remove duplicates

In [48]:
matching_player_data = matching_player_data.drop_duplicates()

print(matching_player_data.shape)

(1269, 38)


In [50]:
matching_player_data.to_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\matching_player_data.csv", index=False)

# Column Dtype Fixing
It appears with the field goal data, pandas has mistaken the attempts/makes formatting as date formats. (3/4 as 4-Mar)

In [62]:
# Re-load the CSV with specified columns as strings to prevent date parsing
matching_player_data = pd.read_csv(
    r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\matching_player_data.csv",
    dtype={ '1-19 > A-M': str, '20-29 > A-M': str, '30-39 > A-M': str, 
            '40-49 > A-M': str, '50-59 > A-M': str, '60+ > A-M': str }
)

# View rows from 254 to 278 (index 253 to 277)
matching_player_data.iloc[253:278]

Unnamed: 0,Player,Rec,Yds,TD,20+,40+,LNG,Rec 1st,1st%,Rec FUM,...,Rush FUM,FGM,FG %,1-19 > A-M,20-29 > A-M,30-39 > A-M,40-49 > A-M,50-59 > A-M,60+ > A-M,FG Blk
253,Morten Andersen,,,,,,,,,,...,,18.0,75.0,2-Feb,8-Aug,4-Mar,6-Feb,0/0,0/0,2.0
254,Morten Andersen,,,,,,,,,,...,,20.0,74.1,0/0,9-Sep,5-Apr,10-May,0/0,0/0,1.0
255,Morten Andersen,,,,,,,,,,...,,31.0,88.6,0/0,5-Apr,13/14,12-Nov,0/0,0/0,1.0
256,Morten Andersen,,,,,,,,,,...,,26.0,86.7,1-Jan,11-Nov,7-Jun,6-Jun,0/0,0/0,0.0
257,Morten Andersen,,,,,,,,,,...,,28.0,77.8,3-Mar,6-Jun,9-Sep,12-Aug,0/0,0/0,0.0
258,Morten Andersen,,,,,,,,,,...,,26.0,72.2,1-Jan,12-Nov,11-Aug,8-May,0/0,0/0,1.0
259,Morten Andersen,,,,,,,,,,...,,20.0,69.0,0/0,8-Jul,11-Oct,6-Mar,0/0,0/0,1.0
260,Morten Andersen,,,,,,,,,,...,,21.0,77.8,0/0,5-May,6-May,12-Aug,0/0,0/0,1.0
261,Morten Andersen,,,,,,,,,,...,,25.0,78.1,0/0,6-Jun,13-Nov,9-Jun,0/0,0/0,0.0
262,Morten Andersen,,,,,,,,,,...,,29.0,85.3,0/0,10-Oct,10-Aug,11-Aug,0/0,0/0,0.0


In [78]:
import re

# Dictionary to map month names to their respective numeric values
month_map = {
    'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
    'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
}

# Specify columns that need the transformation
columns_to_fix = [
    '1-19 > A-M', '20-29 > A-M', '30-39 > A-M', '40-49 > A-M', 
    '50-59 > A-M', '60+ > A-M'
]

# Function to transform the values like '5-May' to '5/5' (month/day)
def reformat_months(val):
    if isinstance(val, str):  # Only apply regex if it's a string
        # Match patterns like '5-May' and '11-Aug'
        match = re.match(r'(\d{1,2})-(\w{3})', val)
        if match:
            day = match.group(1)
            month_name = match.group(2)
            # Replace month name with numeric month from the map
            month_number = month_map.get(month_name, '')
            return f"{day}/{month_number}"  # Correct the order to day/month
    return val  # Return the value as is if it's not a string or doesn't match

# Apply transformation to the relevant columns
for col in columns_to_fix:
    matching_player_data[col] = matching_player_data[col].apply(reformat_months)

# Check a few rows to verify
matching_player_data.iloc[253:278]

Unnamed: 0,Player,Rec,Yds,TD,20+,40+,LNG,Rec 1st,1st%,Rec FUM,...,Rush FUM,FGM,FG %,1-19 > A-M,20-29 > A-M,30-39 > A-M,40-49 > A-M,50-59 > A-M,60+ > A-M,FG Blk
253,Morten Andersen,,,,,,,,,,...,,18.0,75.0,2/2,8/8,4/3,6/2,0/0,0/0,2.0
254,Morten Andersen,,,,,,,,,,...,,20.0,74.1,0/0,9/9,5/4,10/5,0/0,0/0,1.0
255,Morten Andersen,,,,,,,,,,...,,31.0,88.6,0/0,5/4,13/14,12/11,0/0,0/0,1.0
256,Morten Andersen,,,,,,,,,,...,,26.0,86.7,1/1,11/11,7/6,6/6,0/0,0/0,0.0
257,Morten Andersen,,,,,,,,,,...,,28.0,77.8,3/3,6/6,9/9,12/8,0/0,0/0,0.0
258,Morten Andersen,,,,,,,,,,...,,26.0,72.2,1/1,12/11,11/8,8/5,0/0,0/0,1.0
259,Morten Andersen,,,,,,,,,,...,,20.0,69.0,0/0,8/7,11/10,6/3,0/0,0/0,1.0
260,Morten Andersen,,,,,,,,,,...,,21.0,77.8,0/0,5/5,6/5,12/8,0/0,0/0,1.0
261,Morten Andersen,,,,,,,,,,...,,25.0,78.1,0/0,6/6,13/11,9/6,0/0,0/0,0.0
262,Morten Andersen,,,,,,,,,,...,,29.0,85.3,0/0,10/10,10/8,11/8,0/0,0/0,0.0


# Both WR and TE positions listed for same player and year, giving identical receiving stats

In [85]:
# Duplicates based on player, year, and all the receiving stats columns
receiving_columns = ['Rec', 'Yds', 'TD', '20+', '40+', 'Lng', 'Rec 1st', '1st%', 'Rec FUM', 'Rec YAC/R', 'Tgts']
duplicates = matching_player_data[matching_player_data.duplicated(subset=['Player', 'year'] + receiving_columns, keep=False)]

# Display the duplicate rows
print(duplicates)

              Player   Rec     Yds    TD   20+   40+   LNG  Rec 1st  1st%  \
0     Calvin Johnson  48.0   756.0   4.0  12.0   1.0  49.0     38.0  79.2   
1     Calvin Johnson  78.0  1331.0  12.0  21.0   7.0  96.0     52.0  66.7   
2     Calvin Johnson  67.0   984.0   5.0  15.0   4.0  75.0     46.0  68.7   
3     Calvin Johnson  77.0  1120.0  12.0  16.0   3.0  87.0     57.0  74.0   
4     Calvin Johnson  96.0  1681.0  16.0  32.0  10.0  73.0     77.0  80.2   
...              ...   ...     ...   ...   ...   ...   ...      ...   ...   
1246  Eric Dickerson   6.0    58.0   0.0   1.0   0.0  30.0      2.0  33.3   
1250  Eric Dickerson  18.0    92.0   0.0   0.0   0.0  17.0      0.0   0.0   
1251  Eric Dickerson  41.0   269.0   1.0   1.0   0.0  26.0     13.0  31.7   
1252  Eric Dickerson  14.0    85.0   1.0   0.0   0.0  15.0      5.0  35.7   
1253  Eric Dickerson   6.0    58.0   0.0   1.0   0.0  30.0      2.0  33.3   

      Rec FUM  ...  Rush FUM  FGM FG %  1-19 > A-M  20-29 > A-M  30-39 > A-

# Two specific examples

In [92]:
print(matching_player_data.loc[1, ['Player', 'year', 'Rec', 'Yds', 'TD', 'position']])  # Row 2
print(matching_player_data.loc[13, ['Player', 'year', 'Rec', 'Yds', 'TD', 'position']])  # Row 14

Player      Calvin Johnson
year                  2008
Rec                   78.0
Yds                 1331.0
TD                    12.0
position                WR
Name: 1, dtype: object
Player      Calvin Johnson
year                  2008
Rec                   78.0
Yds                 1331.0
TD                    12.0
position                TE
Name: 13, dtype: object


So, you can clearly see the identical statistics listed for two different positions.

# Keep only the WR or TE rows for players depending on their position listed in the Hall of Fame CSV

In [97]:
# Loop over each player in 'hof_df'
for index, row in hof_df.iterrows():
    player = row['player']
    position = row['position']

    # Filter out 'WR' or 'TE' rows based on the position in 'hof_df'
    if position == 'WR':
        # Keep WR rows and remove TE rows for this player in 'matching_player_data'
        matching_player_data = matching_player_data[(matching_player_data['Player'] != player) | (matching_player_data['position'] == 'WR')]
    elif position == 'TE':
        # Keep TE rows and remove WR rows for this player in 'matching_player_data'
        matching_player_data = matching_player_data[(matching_player_data['Player'] != player) | (matching_player_data['position'] == 'TE')]

# Check the resulting dataframe
print(matching_player_data)

              Player   Rec     Yds    TD   20+   40+   LNG  Rec 1st  1st%  \
0     Calvin Johnson  48.0   756.0   4.0  12.0   1.0  49.0     38.0  79.2   
1     Calvin Johnson  78.0  1331.0  12.0  21.0   7.0  96.0     52.0  66.7   
2     Calvin Johnson  67.0   984.0   5.0  15.0   4.0  75.0     46.0  68.7   
3     Calvin Johnson  77.0  1120.0  12.0  16.0   3.0  87.0     57.0  74.0   
4     Calvin Johnson  96.0  1681.0  16.0  32.0  10.0  73.0     77.0  80.2   
...              ...   ...     ...   ...   ...   ...   ...      ...   ...   
1264  Eric Dickerson   NaN     NaN  18.0  11.0   3.0   NaN      NaN   NaN   
1265  Eric Dickerson   NaN     NaN  14.0  18.0   4.0   NaN      NaN   NaN   
1266  Eric Dickerson   NaN     NaN  12.0   4.0   2.0   NaN      NaN   NaN   
1267  Eric Dickerson   NaN     NaN  11.0  10.0   2.0   NaN      NaN   NaN   
1268  Eric Dickerson   NaN     NaN   6.0   7.0   2.0   NaN      NaN   NaN   

      Rec FUM  ...  Rush FUM  FGM FG %  1-19 > A-M  20-29 > A-M  30-39 > A-

# Replace Headers
This scraped dataframe uses camelCase for the headers. My other two datasets I've been working with during this project use snake_case, so let's change the camelCase to snake_case to be consistent with the other datasets.

In [128]:
# Define the mapping of old column names to snake_case
column_mapping = {
    "Player": "player",
    "Rec": "rec",
    "Yds": "yards",
    "TD": "td",
    "20+": "rec_20_plus",
    "40+": "rec_40_plus",
    "LNG": "long",
    "Rec 1st": "rec_1st",
    "1st%": "rec_1st_pct",
    "Rec FUM": "rec_fum",
    "Rec YAC/R": "rec_yac",
    "Tgts": "targts",
    "position": "position",
    "year": "year",
    "Pass Yds": "pass_yds",
    "Yds/Att": "yds_per_att",
    "Att": "att",
    "Cmp": "cmp",
    "Cmp %": "cmp_pct",
    "INT": "int",
    "Rate": "rate",
    "1st": "pass_1st",
    "Lng": "pass_long",
    "Sck": "sacks",
    "SckY": "sack_yards",
    "Rush Yds": "rush_yds",
    "Rush 1st": "rush_1st",
    "Rush 1st%": "rush_1st_pct",
    "Rush FUM": "rush_fum",
    "FGM": "fgm",
    "FG %": "fg_pct",
    "1-19 > A-M": "fg_1_19",
    "20-29 > A-M": "fg_20_29",
    "30-39 > A-M": "fg_30_39",
    "40-49 > A-M": "fg_40_49",
    "50-59 > A-M": "fg_50_59",
    "60+ > A-M": "fg_60_plus",
    "FG Blk": "fg_blk"
}

# Rename the columns in the DataFrame
matching_player_data.rename(columns=column_mapping, inplace=True)

# Display the updated column names
print(matching_player_data.head())  # Verify the changes

           player   rec   yards    td  rec_20_plus  rec_40_plus  long  \
0  Calvin Johnson  48.0   756.0   4.0         12.0          1.0  49.0   
1  Calvin Johnson  78.0  1331.0  12.0         21.0          7.0  96.0   
2  Calvin Johnson  67.0   984.0   5.0         15.0          4.0  75.0   
3  Calvin Johnson  77.0  1120.0  12.0         16.0          3.0  87.0   
4  Calvin Johnson  96.0  1681.0  16.0         32.0         10.0  73.0   

   rec_1st  rec_1st_pct  rec_fum  ...  rush_fum  fgm fg_pct  fg_1_19  \
0     38.0         79.2      1.0  ...       NaN  NaN    NaN      NaN   
1     52.0         66.7      2.0  ...       NaN  NaN    NaN      NaN   
2     46.0         68.7      3.0  ...       NaN  NaN    NaN      NaN   
3     57.0         74.0      1.0  ...       NaN  NaN    NaN      NaN   
4     77.0         80.2      1.0  ...       NaN  NaN    NaN      NaN   

   fg_20_29  fg_30_39  fg_40_49  fg_50_59  fg_60_plus  fg_blk  
0       NaN       NaN       NaN       NaN         NaN     NaN  


In [142]:
scraped_data = pd.read_csv(r"C:\Users\Weekseey\Documents\Bellevue Work\Data Wrangling\cleaned_matching_player_data.csv")

In [144]:
print(scraped_data)

          player   rec   yards    td  rec_20_plus  rec_40_plus  long  rec_1st  \
0     Andre Reed  48.0   637.0   4.0         10.0          0.0  32.0     31.0   
1     Andre Reed  53.0   739.0   7.0         11.0          1.0  55.0     38.0   
2     Andre Reed  57.0   752.0   5.0         10.0          1.0  40.0     38.0   
3     Andre Reed  71.0   968.0   6.0         16.0          2.0  65.0     40.0   
4     Andre Reed  88.0  1312.0   9.0         18.0          5.0  78.0     62.0   
..           ...   ...     ...   ...          ...          ...   ...      ...   
513  Warren Moon   NaN     NaN   7.0         19.0          2.0   NaN      NaN   
514  Warren Moon   NaN     NaN  25.0         52.0          9.0   NaN      NaN   
515  Warren Moon   NaN     NaN  11.0         22.0          3.0   NaN      NaN   
516  Warren Moon   NaN     NaN   0.0          1.0          0.0   NaN      NaN   
517  Warren Moon   NaN     NaN   1.0          3.0          1.0   NaN      NaN   

     rec_1st_pct  rec_fum  

# Rearrange columns so the first 3 are player, position, and year

I think this gives the dataset better readability.

In [146]:
# Reorder columns so that 'player', 'position', 'year' are first
cols = ['player', 'position', 'year'] + [col for col in scraped_data.columns if col not in ['player', 'position', 'year']]
scraped_data = scraped_data[cols]

In [148]:
print(scraped_data)

          player position  year   rec   yards    td  rec_20_plus  rec_40_plus  \
0     Andre Reed       WR  1985  48.0   637.0   4.0         10.0          0.0   
1     Andre Reed       WR  1986  53.0   739.0   7.0         11.0          1.0   
2     Andre Reed       WR  1987  57.0   752.0   5.0         10.0          1.0   
3     Andre Reed       WR  1988  71.0   968.0   6.0         16.0          2.0   
4     Andre Reed       WR  1989  88.0  1312.0   9.0         18.0          5.0   
..           ...      ...   ...   ...     ...   ...          ...          ...   
513  Warren Moon       QB  1996   NaN     NaN   7.0         19.0          2.0   
514  Warren Moon       QB  1997   NaN     NaN  25.0         52.0          9.0   
515  Warren Moon       QB  1998   NaN     NaN  11.0         22.0          3.0   
516  Warren Moon       QB  1999   NaN     NaN   0.0          1.0          0.0   
517  Warren Moon       QB  2000   NaN     NaN   1.0          3.0          1.0   

     long  rec_1st  ...  ru

# Data Transformations Performed:  
1. **Removed Duplicates:** Identified and removed duplicate rows to ensure unique entries across the dataset.

2. **Data Type Fixing:** Addressed inconsistencies in data types, particularly for columns related to field goal statistics, ensuring proper formats for analysis.

3. **Duplicate Statistical Entries:** Eliminated redundant statistics for players listed under multiple positions (e.g., WR & TE) where the stats were identical, retaining the correct position as per the Hall of Fame data.

4. **Standardized Column Headers:** Replaced column headers with snake_case formatting for consistency across datasets and improved readability.

5. **Rearranged Columns:** Adjusted the column order to enhance the logical flow, with key columns like player, position, and year positioned at the beginning for better accessibility and clarity.