In [1]:
#from splinter import Browser
from bs4 import BeautifulSoup as Soup
import numpy as np
import pandas as pd
from urllib.parse import urlparse, parse_qs
from splinter import Browser
import html5lib
import openpyxl
import xarray
import hvplot.pandas
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
#import lux


In [2]:
def lookup(start, finish, subject):
    df = pd.DataFrame()
    for i in range(start, finish + 1):
        url = 'https://www.pro-football-reference.com/years/{}/{}.htm'.format(i, subject)
        # Read HTML table and handle multi-level header if necessary
        site_data = pd.read_html(url)[0]
        if isinstance(site_data.columns, pd.MultiIndex):
            # If there is a multi-level header, drop the first level
            site_data.columns = site_data.columns.droplevel(0)
        site_data["Season"] = i
        df = pd.concat([df, site_data], ignore_index=True)
        # Check for special characters in player names and remove them
        if any(('+' in player) or ('*' in player) for player in df['Player']):
            df['Player'] = df['Player'].str.replace("+", "", case=False, regex=False)
            df['Player'] = df['Player'].str.replace("*", "", case=False, regex=False)
            print("Data for season", i, "retrieved from", url)
        else:
            print("Data for season", i, "retrieved.", url)
    return df


## Rushing

k -- Rank
This is a count of the rows from top to bottom.
It is recalculated following the sorting of a column.
Age -- Player's age on December 31st of that year
Pos -- Position
Games
G -- Games played
GS -- Games started as an offensive or defensive player
Rushing
Att -- Rushing Attempts (sacks not included in NFL)
Yds -- Rushing Yards Gained (sack yardage is not included by NFL)
TD -- Rushing Touchdowns
1D -- First downs rushing
YBC -- Rushing yards before contact

In [3]:
#rushing_2019_2022_df = lookup(2019, 2022, "rushing")
#rushing_2019_df = lookup(2019, 2019, "rushing")
#rushing_2020_df = lookup(2020, 2020, "rushing")
#rushing_2021_df = lookup(2021, 2022, "rushing")
rushing_2022_df = lookup(2022, 2022, "rushing")

#rushing_2019_2022_df.head()

Data for season 2022 retrieved from https://www.pro-football-reference.com/years/2022/rushing.htm


In [4]:
rush_reg_df = rushing_2022_df.copy()
rush_reg_df.head()

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Att,Yds,TD,1D,Succ%,Lng,Y/A,Y/G,Fmb,Season
0,1,Derrick Henry,TEN,28,RB,16,16,349,1538,13,65,46.7,56,4.4,96.1,6,2022
1,2,Josh Jacobs,LVR,24,RB,17,17,340,1653,12,93,57.4,86,4.9,97.2,3,2022
2,3,Nick Chubb,CLE,27,RB,17,17,302,1525,12,69,50.0,41,5.0,89.7,1,2022
3,4,Saquon Barkley,NYG,25,RB,16,16,295,1312,10,62,47.5,68,4.4,82.0,1,2022
4,5,Najee Harris,PIT,24,RB,17,17,272,1034,7,45,46.0,36,3.8,60.8,3,2022


In [5]:
rush_reg_df = rush_reg_df[rush_reg_df['Pos'] == "RB"]
rush_reg_df = rush_reg_df.drop(columns=['Tm','Rk', 'Season','Pos'])
rush_reg_df.set_index('Player', inplace=True)
rush_reg_df.head()


Unnamed: 0_level_0,Age,G,GS,Att,Yds,TD,1D,Succ%,Lng,Y/A,Y/G,Fmb
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Derrick Henry,28,16,16,349,1538,13,65,46.7,56,4.4,96.1,6
Josh Jacobs,24,17,17,340,1653,12,93,57.4,86,4.9,97.2,3
Nick Chubb,27,17,17,302,1525,12,69,50.0,41,5.0,89.7,1
Saquon Barkley,25,16,16,295,1312,10,62,47.5,68,4.4,82.0,1
Najee Harris,24,17,17,272,1034,7,45,46.0,36,3.8,60.8,3


In [6]:
rush_reg_scaled = StandardScaler().fit_transform(rush_reg_df[["Age","G","GS","Att","TD","1D","Succ%","Lng","Y/A","Y/G","Fmb"]])

In [7]:
rush_reg_scaled = pd.DataFrame(
    rush_reg_scaled,
    columns=["Age","G","GS","Att","TD","1D","Succ%","Lng","Y/A","Y/G","Fmb"]
)

rush_reg_scaled["Player"]=rush_reg_df.index
rush_reg_scaled=rush_reg_scaled.set_index("Player")
rush_reg_scaled.head()

Unnamed: 0_level_0,Age,G,GS,Att,TD,1D,Succ%,Lng,Y/A,Y/G,Fmb
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Derrick Henry,1.006351,0.853121,2.236394,3.078524,3.167708,2.335162,0.006556,1.494485,0.251509,2.724525,3.832494
Josh Jacobs,-0.532774,1.051489,2.421359,2.974582,2.867918,3.736749,0.712938,3.049504,0.604808,2.768179,1.595569
Nick Chubb,0.62157,1.051489,2.421359,2.535716,2.867918,2.535389,0.224412,0.716976,0.675468,2.470539,0.104286
Saquon Barkley,-0.147993,0.853121,2.236394,2.454872,2.268339,2.184992,0.059369,2.116493,0.251509,2.164961,0.104286
Najee Harris,-0.532774,1.051489,2.421359,2.189243,1.36897,1.334028,-0.039656,0.457806,-0.172449,1.323632,1.595569


In [8]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 11))

for i in k:
    k_model = KMeans(n_clusters=i, random_state=1, n_init=10)
    k_model.fit(rush_reg_scaled)
    inertia.append(k_model.inertia_)

# Create a dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data dictionary
df_elbow = pd.DataFrame(elbow_data)

# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

In [9]:
model = KMeans(n_clusters=3)

In [10]:
model.fit(rush_reg_scaled)

In [11]:
rush_reg_clusters = model.predict(rush_reg_scaled)
print(rush_reg_clusters)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 2 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 1 0 1 1 0 0 1 0 0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1
 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 1 1 1 0 1 1 0 1]


In [12]:
rush_reg_scaled["rush_reg_clusters"]=rush_reg_clusters

In [13]:
rush_reg_scaled.head()

Unnamed: 0_level_0,Age,G,GS,Att,TD,1D,Succ%,Lng,Y/A,Y/G,Fmb,rush_reg_clusters
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Derrick Henry,1.006351,0.853121,2.236394,3.078524,3.167708,2.335162,0.006556,1.494485,0.251509,2.724525,3.832494,2
Josh Jacobs,-0.532774,1.051489,2.421359,2.974582,2.867918,3.736749,0.712938,3.049504,0.604808,2.768179,1.595569,2
Nick Chubb,0.62157,1.051489,2.421359,2.535716,2.867918,2.535389,0.224412,0.716976,0.675468,2.470539,0.104286,2
Saquon Barkley,-0.147993,0.853121,2.236394,2.454872,2.268339,2.184992,0.059369,2.116493,0.251509,2.164961,0.104286,2
Najee Harris,-0.532774,1.051489,2.421359,2.189243,1.36897,1.334028,-0.039656,0.457806,-0.172449,1.323632,1.595569,2


In [14]:
rush_reg_scaled.hvplot.scatter(
    x="Att",
    y="TD",
    by="rush_reg_clusters",
    hover_cols=["Player", "Age", "Att", "TD", "Y/G"]
)


In [15]:
pca = PCA(n_components=2)

In [16]:

rush_reg_pca_data = pca.fit_transform(rush_reg_scaled)

rush_reg_pca_data[:5]

array([[ 7.04848693, -1.62453141],
       [ 7.59504031, -0.19307691],
       [ 5.61541497, -0.69987802],
       [ 5.39189689, -0.63681016],
       [ 4.24455288, -1.18495317]])

In [17]:
pca.explained_variance_ratio_

array([0.53983696, 0.15550146])

In [18]:
# Creating a DataFrame with the PCA data
rush_reg_pca_df = pd.DataFrame(rush_reg_pca_data, columns=["PC1", "PC2"])

# Review the DataFrame
rush_reg_pca_df.head()

Unnamed: 0,PC1,PC2
0,7.048487,-1.624531
1,7.59504,-0.193077
2,5.615415,-0.699878
3,5.391897,-0.63681
4,4.244553,-1.184953


In [19]:
model = KMeans(n_clusters=4, random_state=67, n_init=10)
model.fit(rush_reg_pca_df)
rush_reg_clusters = model.predict(rush_reg_pca_df)
print(rush_reg_clusters)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 3 2 0 3 3 3 0 3 0 0 0 3 2
 2 3 3 2 3 3 2 3 3 2 2 3 2 2 2 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 3 3 2 3 2 3
 3 2 3 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 3 3 2 3 3 3 2 2 2 3 2 2 3 2]


In [20]:
rush_reg_pca_predictions = rush_reg_pca_df.copy()
rush_reg_pca_predictions["rush_reg_clusters"] = rush_reg_clusters
rush_reg_pca_predictions['Player'] = rush_reg_scaled.index
rush_reg_pca_predictions.head()

Unnamed: 0,PC1,PC2,rush_reg_clusters,Player
0,7.048487,-1.624531,1,Derrick Henry
1,7.59504,-0.193077,1,Josh Jacobs
2,5.615415,-0.699878,1,Nick Chubb
3,5.391897,-0.63681,1,Saquon Barkley
4,4.244553,-1.184953,1,Najee Harris


In [21]:
# Create the scatter plot with x="PC1" and y="PC2"
rush_reg_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="rush_reg_clusters",
    hover_cols=["Player"]
)