In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans



In [2]:
# One combine CSV per draft year, read newest-first (2024 down to 2000) so the
# concatenated row order is the same as listing the years by hand.
years = list(range(2024, 1999, -1))

dfs = [pd.read_csv(f'../data/{year}_combine.csv') for year in years]

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

print(df.head(10))

               Player  Pos       School        College    Ht     Wt  40yd  \
0  Kris Abrams-Draine   CB     Missouri  College Stats  5-11  179.0  4.44   
1        Isaiah Adams    G     Illinois  College Stats   6-4  315.0  5.22   
2         Rasheen Ali   RB     Marshall  College Stats  5-11  206.0   NaN   
3           Erick All   TE         Iowa  College Stats   6-4  252.0   NaN   
4       Braelon Allen   RB    Wisconsin  College Stats   6-1  235.0   NaN   
5             Joe Alt   OT   Notre Dame  College Stats   6-9  321.0  5.05   
6     Kiran Amegadjie   OT         Yale            NaN   6-5  323.0   NaN   
7     Daijahn Anthony  SAF  Mississippi  College Stats   6-0  195.0  4.55   
8      Terrion Arnold   CB      Alabama  College Stats   6-0  189.0  4.50   
9     Gottlieb Ayedze    G     Maryland  College Stats   6-4  308.0  5.01   

   Vertical  Bench  Broad Jump  3Cone  Shuttle  \
0      33.5    NaN         NaN    NaN      NaN   
1      24.5    NaN       102.0   7.77     4.73   
2 

In [3]:
def change_Ht_to_int(input):
    """Convert a ``'<feet>-<inches>'`` height string to total inches.

    Parameters
    ----------
    input : object
        Raw height value, e.g. ``'6-4'``. (The parameter name shadows the
        ``input`` builtin; it is kept so existing callers keep working.)

    Returns
    -------
    int or float
        ``feet * 12 + inches`` as an ``int`` for a well-formed string.
        ``np.nan`` for non-string values (missing data) and for strings that
        are not exactly two integer fields separated by ``'-'``.
    """
    if not isinstance(input, str):
        return np.nan

    parts = input.split('-')
    # Anything other than exactly "<feet>-<inches>" is treated as missing
    # rather than aborting the whole column conversion with a ValueError.
    if len(parts) != 2:
        return np.nan

    try:
        feet, inches = (int(part) for part in parts)
    except ValueError:
        # Non-numeric field such as '' or 'Apr'.
        return np.nan

    return feet * 12 + inches

In [4]:
# 1 if the player has a draft entry (team/round/year), 0 otherwise.
df['Drafted'] = df['Drafted (tm/rnd/yr)'].notna().astype(int)

# Keep quarterbacks only. The explicit .copy() makes the result an independent
# frame, so the column assignment below does not write into a filtered slice
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['Pos'] == 'QB'].copy()

# Height arrives as 'feet-inches' text; convert it to total inches.
# NOTE: re-running this cell without reloading df turns Ht into NaN, because
# the already-converted numeric values are no longer strings.
df['Ht'] = df['Ht'].apply(change_Ht_to_int)

In [5]:
# Preview the first 10 rows after the QB filter and the height conversion.
print(df.head(10))

                Player Pos          School        College    Ht     Wt  40yd  \
56      Jayden Daniels  QB             LSU  College Stats  76.0  210.0   NaN   
112        Sam Hartman  QB      Notre Dame  College Stats  73.0  211.0  4.80   
147  Michael Penix Jr.  QB      Washington  College Stats  74.0  216.0   NaN   
167        Devin Leary  QB        Kentucky  College Stats  73.0  215.0   NaN   
179         Drake Maye  QB  North Carolina  College Stats  76.0  223.0   NaN   
181      J.J. McCarthy  QB        Michigan  College Stats  75.0  219.0   NaN   
195         Joe Milton  QB       Tennessee  College Stats  77.0  235.0   NaN   
209             Bo Nix  QB          Oregon  College Stats  74.0  214.0   NaN   
227      Michael Pratt  QB          Tulane  College Stats  75.0  217.0   NaN   
233    Spencer Rattler  QB  South Carolina  College Stats  72.0  211.0  4.95   

     Vertical  Bench  Broad Jump  3Cone  Shuttle  \
56        NaN    NaN         NaN    NaN      NaN   
112      28.5  

In [6]:
# Select the measurable columns explicitly rather than dropping the known
# non-feature columns: a drop-list goes stale as soon as another column is
# added to df (e.g. the 'cluster' label assigned further down), so re-running
# this cell would otherwise feed that label back into the features.
X = df[['Ht', 'Wt', '40yd', 'Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle']]

In [7]:
# Inspect the feature matrix that is passed to the imputer below.
X

Unnamed: 0,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle
56,76.0,210.0,,,,,,
112,73.0,211.0,4.80,28.5,,109.0,7.19,4.34
147,74.0,216.0,,,,,,
167,73.0,215.0,,,,,,
179,76.0,223.0,,,,,,
...,...,...,...,...,...,...,...,...
8215,74.0,207.0,4.75,,,,,
8225,75.0,229.0,4.81,33.5,,111.0,7.12,4.16
8237,72.0,215.0,4.91,25.5,,100.0,7.34,4.21
8239,75.0,222.0,5.37,26.5,,98.0,7.80,4.78


In [8]:
# Number of rows in X that have at least one missing value.
X.isna().any(axis=1).sum()

433

In [9]:
# Fill missing measurables using a KNN imputer configured with 5 neighbours.
# NOTE(review): the imputer is fitted on the raw, unscaled columns, so
# large-valued columns such as Wt and Broad Jump presumably dominate the
# neighbour distances. Consider standardizing first; confirm.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a NumPy array (see the output of the next cell), so
# column names and the row index of X are not carried over.
X_imputed = imputer.fit_transform(X)

In [10]:
# Inspect the imputed feature array.
X_imputed

array([[ 76.   , 210.   ,   4.822, ..., 114.8  ,   7.038,   4.232],
       [ 73.   , 211.   ,   4.8  , ..., 109.   ,   7.19 ,   4.34 ],
       [ 74.   , 216.   ,   4.836, ..., 110.4  ,   7.048,   4.252],
       ...,
       [ 72.   , 215.   ,   4.91 , ..., 100.   ,   7.34 ,   4.21 ],
       [ 75.   , 222.   ,   5.37 , ...,  98.   ,   7.8  ,   4.78 ],
       [ 75.   , 229.   ,   4.91 , ..., 108.   ,   7.71 ,   4.59 ]])

In [11]:
# Elbow method: fit KMeans for k = 1..10 and record the within-cluster sum of
# squares (inertia_) for each k.
# NOTE(review): the saved output of this cell is
# "AttributeError: 'NoneType' object has no attribute 'split'". Nothing in this
# cell calls .split, so it presumably originates inside a library call; possibly
# an outdated threadpoolctl next to a newer numpy/BLAS build. Confirm in the
# environment.
# NOTE(review): X_imputed is unscaled, so large-valued columns likely dominate
# the clustering; confirm whether standardizing is intended.
wcss = []
for n_clusters in range(1, 11):
    k_means = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
    k_means.fit(X_imputed)
    wcss.append(k_means.inertia_)

fig, ax = plt.subplots()
ax.plot(np.arange(1, 11), wcss)
ax.set_xlabel('Clusters')
ax.set_ylabel('SSE')
plt.show()

AttributeError: 'NoneType' object has no attribute 'split'

In [None]:
# Final clustering with k = 2.
# NOTE(review): k is presumably read off the elbow plot; confirm.
k_means_optimum = KMeans(n_clusters=2, init='k-means++', random_state=42)
y = k_means_optimum.fit_predict(X_imputed)

# Attach the labels positionally (row i of X_imputed corresponds to row i of df).
# assign() returns a new frame instead of writing a column into df in place,
# which avoids SettingWithCopyWarning when df is a filtered slice.
df = df.assign(cluster=y)

In [None]:
# Share of drafted QBs per cluster (mean of the 0/1 Drafted flag).
df.groupby("cluster")["Drafted"].mean()

cluster
0    0.663677
1    0.551570
Name: Drafted, dtype: float64

In [None]:
# Per-cluster median of each measurable, computed on the values stored in df
# (missing values included), not on the imputed array.
measurables = ['Ht', 'Wt', '40yd', 'Vertical', 'Bench', 'Broad Jump', '3Cone', 'Shuttle']
df.groupby("cluster")[measurables].median()

Unnamed: 0_level_0,Ht,Wt,40yd,Vertical,Bench,Broad Jump,3Cone,Shuttle
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,76.0,229.0,4.84,31.0,19.5,110.0,7.165,4.37
1,74.0,214.0,4.79,31.5,18.5,112.0,7.09,4.31
