In [1]:
#from splinter import Browser
from bs4 import BeautifulSoup as Soup
import numpy as np
import pandas as pd
from urllib.parse import urlparse, parse_qs
from splinter import Browser
import html5lib
import openpyxl
import xarray
import hvplot.pandas
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [2]:
def lookup(start, finish, subject):
    df = pd.DataFrame()
    for i in range(start, finish + 1):
        url = 'https://www.pro-football-reference.com/years/{}/{}.htm'.format(i, subject)
        # Read HTML table and handle multi-level header if necessary
        site_data = pd.read_html(url)[0]
        if isinstance(site_data.columns, pd.MultiIndex):
            # If there is a multi-level header, drop the first level
            site_data.columns = site_data.columns.droplevel(0)
        site_data["Season"] = i
        df = pd.concat([df, site_data], ignore_index=True)
        # Check for special characters in player names and remove them
        if any(('+' in player) or ('*' in player) for player in df['Player']):
            df['Player'] = df['Player'].str.replace("+", "", case=False, regex=False)
            df['Player'] = df['Player'].str.replace("*", "", case=False, regex=False)
            print("Data for season", i, "retrieved from", url)
        else:
            print("Data for season", i, "retrieved.", url)
    return df


## Passing

In [3]:
#passing_2019_2022_df = lookup(2019, 2022, "passing")
#passing_2019_df = lookup(2019, 2019, "passing")
#passing_2020_df = lookup(2020, 2020, "passing")
#passing_2021_df = lookup(2021, 2021, "passing")
#passing_2022_df = lookup(2022, 2022, "passing")

Passing Advanced
Age -- Player's age on December 31st of that year
Pos -- Position
Games
G -- Games played
GS -- Games started as an offensive or defensive player
Passing
Cmp -- Passes completed
Att -- Passes attempted
Yds -- Yards Gained by Passing
For teams, sack yardage is deducted from this total
IAY -- Intended air yards - Air yards on all pass attempts, whether completed or incomplete
IAY/PA -- Intended air yards per pass attempt - Average depth of target, whether completed or not
CAY -- Completed air yards - Total yards completed passes traveled in the air past the line of scrimmage before being caught
CAY/Cmp -- Completed air yards per completion - yards the ball traveled in the air past the line of scrimmage prior to a completion
CAY/PA -- Completed air yards per pass attempt - Air yards (on completed passes) per pass attempt
YAC -- Pass yards after catch
YAC/Cmp -- Pass yards after catch per completion


In [4]:
#passing_adv_2019_2022_df = lookup(2019, 2022, "passing_advanced")
#passing_adv_2019_df = lookup(2019, 2019, "passing_advanced")
passing_adv_2020_df = lookup(2020, 2020, "passing_advanced")
#passing_adv_2021_df = lookup(2021, 2021, "passing_advanced")
#passing_adv_2022_df = lookup(2022, 2022, "passing_advanced")

Data for season 2020 retrieved from https://www.pro-football-reference.com/years/2020/passing_advanced.htm


In [5]:
pass_adv_df = passing_adv_2020_df.copy()
pass_adv_df.head()

Unnamed: 0,Rk,Player,Tm,Age,Pos,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp,Season
0,1,Deshaun Watson,HOU,25,QB,16,16,382,544,4823,4840,8.9,2847,7.5,5.2,1976,5.2,2020
1,2,Patrick Mahomes,KAN,25,QB,15,15,390,588,4740,4910,8.4,2473,6.3,4.2,2267,5.8,2020
2,3,Tom Brady,TAM,43,QB,16,16,401,610,4633,5531,9.1,2833,7.1,4.6,1800,4.5,2020
3,4,Matt Ryan,ATL,35,QB,16,16,407,626,4581,5337,8.5,2939,7.2,4.7,1642,4.0,2020
4,5,Josh Allen,BUF,24,QB,16,16,396,572,4544,4857,8.5,2692,6.8,4.7,1852,4.7,2020


In [6]:
pass_adv_df = pass_adv_df[pass_adv_df['Pos'] == "QB"]
pass_adv_df = pass_adv_df.drop(columns=['Tm','Rk'])
pass_adv_df.set_index('Player', inplace=True)
pass_adv_df.head()


Unnamed: 0_level_0,Age,Pos,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp,Season
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Deshaun Watson,25,QB,16,16,382,544,4823,4840,8.9,2847,7.5,5.2,1976,5.2,2020
Patrick Mahomes,25,QB,15,15,390,588,4740,4910,8.4,2473,6.3,4.2,2267,5.8,2020
Tom Brady,43,QB,16,16,401,610,4633,5531,9.1,2833,7.1,4.6,1800,4.5,2020
Matt Ryan,35,QB,16,16,407,626,4581,5337,8.5,2939,7.2,4.7,1642,4.0,2020
Josh Allen,24,QB,16,16,396,572,4544,4857,8.5,2692,6.8,4.7,1852,4.7,2020


In [7]:
pass_adv_scaled = StandardScaler().fit_transform(pass_adv_df[["Age","G","GS","Cmp","Att","Yds","IAY","IAY/PA","CAY","CAY/Cmp","CAY/PA","YAC","YAC/Cmp"]])

In [8]:
pass_adv_scaled = pd.DataFrame(
    pass_adv_scaled,
    columns=["Age","G","GS","Cmp","Att","Yds","IAY","IAY/PA","CAY","CAY/Cmp","CAY/PA","YAC","YAC/Cmp"]
)

pass_adv_scaled["Player"]=pass_adv_df.index
pass_adv_scaled=pass_adv_scaled.set_index('Player')
pass_adv_scaled.head()

Unnamed: 0_level_0,Age,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Deshaun Watson,-0.690843,1.424208,1.475484,1.594167,1.444182,1.91785,1.728428,0.405449,2.122213,0.636502,0.752882,1.613872,-0.018824
Patrick Mahomes,-0.690843,1.246466,1.32017,1.648957,1.64577,1.867619,1.768268,0.207793,1.715018,0.215763,0.3015,1.998885,0.248129
Tom Brady,3.011233,1.424208,1.475484,1.724292,1.746564,1.802864,2.121709,0.484512,2.106971,0.496256,0.482053,1.381012,-0.330268
Matt Ryan,1.365866,1.424208,1.475484,1.765385,1.819868,1.771394,2.011294,0.247324,2.222379,0.531317,0.527191,1.171967,-0.552728
Josh Allen,-0.896514,1.424208,1.475484,1.690049,1.572465,1.749002,1.738103,0.247324,1.953456,0.391071,0.527191,1.449812,-0.241284


In [9]:
model = KMeans(n_clusters=3)

In [10]:
model.fit(pass_adv_scaled)

In [11]:
pass_clusters = model.predict(pass_adv_scaled)
print(pass_clusters)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0
 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0]


In [12]:
pass_adv_scaled["pass_clusters"]=pass_clusters

In [13]:
pass_adv_scaled.head()

Unnamed: 0_level_0,Age,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp,pass_clusters
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Deshaun Watson,-0.690843,1.424208,1.475484,1.594167,1.444182,1.91785,1.728428,0.405449,2.122213,0.636502,0.752882,1.613872,-0.018824,1
Patrick Mahomes,-0.690843,1.246466,1.32017,1.648957,1.64577,1.867619,1.768268,0.207793,1.715018,0.215763,0.3015,1.998885,0.248129,1
Tom Brady,3.011233,1.424208,1.475484,1.724292,1.746564,1.802864,2.121709,0.484512,2.106971,0.496256,0.482053,1.381012,-0.330268,1
Matt Ryan,1.365866,1.424208,1.475484,1.765385,1.819868,1.771394,2.011294,0.247324,2.222379,0.531317,0.527191,1.171967,-0.552728,1
Josh Allen,-0.896514,1.424208,1.475484,1.690049,1.572465,1.749002,1.738103,0.247324,1.953456,0.391071,0.527191,1.449812,-0.241284,1


In [14]:
pass_adv_scaled.hvplot.scatter(
    x="IAY/PA",
    y="CAY/PA",
    by="pass_clusters",
    hover_cols=["Player", "Age", "Cmp", "Att", "IAY"]
)


In [15]:
pass_adv_scaled.hvplot.scatter(
    x="IAY",
    y="CAY",
    by="pass_clusters",
    hover_cols=["Player", "Age", "Cmp", "Att", "IAY"]
)


In [16]:
pass_adv_scaled.hvplot.scatter(
    x="IAY",
    y="Age",
    by="pass_clusters",
    hover_cols=["Player", "Age", "Cmp", "Att", "IAY"]
)


In [17]:
pca = PCA(n_components=2)

In [18]:
# Fit the df_stocks_scaled data to the PCA
pass_pca_data = pca.fit_transform(pass_adv_scaled)

# Review the first five rose of the PCA data
# using bracket notation ([0:5])
pass_pca_data[:5]

array([[ 4.69798818,  0.51880631],
       [ 4.58060475, -0.15254036],
       [ 5.11469712,  0.49962575],
       [ 4.96038416,  0.41172402],
       [ 4.55952024,  0.20237425]])

In [19]:
pca.explained_variance_ratio_

array([0.59988155, 0.18478269])

In [20]:
# Creating a DataFrame with the PCA data
pass_pca_df = pd.DataFrame(pass_pca_data, columns=["PC1", "PC2"])

# Copy the tickers names from the original data
#df_stocks_pca["Ticker"] = df_stocks.index

# Set the Ticker column as index
#df_stocks_pca = df_stocks_pca.set_index("Ticker")

# Review the DataFrame
pass_pca_df.head()

Unnamed: 0,PC1,PC2
0,4.697988,0.518806
1,4.580605,-0.15254
2,5.114697,0.499626
3,4.960384,0.411724
4,4.55952,0.202374


In [21]:
# Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=4, random_state=67, n_init=10)

# Fit the model for the df_stocks_pca DataFrame
model.fit(pass_pca_df)

# Predict the model segments (clusters)
pass_clusters = model.predict(pass_pca_df)

# Print the stock segments
print(pass_clusters)

[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 2 1 0 2 2 2 0 2 2 2 2 2 0 0 0 2 2 2 0 2 2 2 2 2 2 0 2 0 0 0 0 0 0 0 2 0 0
 0 2 0 0]


In [22]:
# Create a copy of the df_stocks_pca DataFrame and name it as df_stocks_pca_predictions
pass_pca_predictions = pass_pca_df.copy()

# Create a new column in the DataFrame with the predicted clusters
pass_pca_predictions["PassClusters"] = pass_clusters

# Ensure the indices match or merge the DataFrames based on a common column
pass_pca_predictions['Player'] = pass_adv_scaled.index


# Review the DataFrame
pass_pca_predictions.head()

Unnamed: 0,PC1,PC2,PassClusters,Player
0,4.697988,0.518806,3,Deshaun Watson
1,4.580605,-0.15254,3,Patrick Mahomes
2,5.114697,0.499626,3,Tom Brady
3,4.960384,0.411724,3,Matt Ryan
4,4.55952,0.202374,3,Josh Allen


In [23]:
# Create the scatter plot with x="PC1" and y="PC2"
pass_pca_predictions.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="PassClusters",
    hover_cols=["Player"]
)

In [24]:
pass_adv_scaled.head()

Unnamed: 0_level_0,Age,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp,pass_clusters
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Deshaun Watson,-0.690843,1.424208,1.475484,1.594167,1.444182,1.91785,1.728428,0.405449,2.122213,0.636502,0.752882,1.613872,-0.018824,1
Patrick Mahomes,-0.690843,1.246466,1.32017,1.648957,1.64577,1.867619,1.768268,0.207793,1.715018,0.215763,0.3015,1.998885,0.248129,1
Tom Brady,3.011233,1.424208,1.475484,1.724292,1.746564,1.802864,2.121709,0.484512,2.106971,0.496256,0.482053,1.381012,-0.330268,1
Matt Ryan,1.365866,1.424208,1.475484,1.765385,1.819868,1.771394,2.011294,0.247324,2.222379,0.531317,0.527191,1.171967,-0.552728,1
Josh Allen,-0.896514,1.424208,1.475484,1.690049,1.572465,1.749002,1.738103,0.247324,1.953456,0.391071,0.527191,1.449812,-0.241284,1


In [25]:
"""
Cell generated by Data Wrangler.
"""
def clean_data(pass_adv_df):
    # Drop column: 'Season'
    pass_adv_df = pass_adv_df.drop(columns=['Season'])
    # Drop column: 'Pos'
    pass_adv_df = pass_adv_df.drop(columns=['Pos'])
    return pass_adv_df

pass_adv_df_clean = clean_data(pass_adv_df.copy())
pass_adv_df_clean.head()

Unnamed: 0_level_0,Age,G,GS,Cmp,Att,Yds,IAY,IAY/PA,CAY,CAY/Cmp,CAY/PA,YAC,YAC/Cmp
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Deshaun Watson,25,16,16,382,544,4823,4840,8.9,2847,7.5,5.2,1976,5.2
Patrick Mahomes,25,15,15,390,588,4740,4910,8.4,2473,6.3,4.2,2267,5.8
Tom Brady,43,16,16,401,610,4633,5531,9.1,2833,7.1,4.6,1800,4.5
Matt Ryan,35,16,16,407,626,4581,5337,8.5,2939,7.2,4.7,1642,4.0
Josh Allen,24,16,16,396,572,4544,4857,8.5,2692,6.8,4.7,1852,4.7


In [26]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 11))

# Create a for loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the home_sales_df DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the K-means model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1, n_init=10)
    k_model.fit(pass_adv_df_clean)
    inertia.append(k_model.inertia_)

# Create a dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data dictionary
df_elbow = pd.DataFrame(elbow_data)

# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)