In [3]:
import pandas as pd

import numpy as np

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn

from sklearn.impute import KNNImputer



In [4]:
def change_Ht_to_int(input):
    """Convert a ``feet-inches`` height string to total inches.

    Parameters
    ----------
    input : object
        Height such as ``'6-4'``. The name shadows the builtin ``input`` but
        is kept so that existing keyword callers continue to work.

    Returns
    -------
    int or float
        Total inches as an ``int`` for a well-formed string, otherwise
        ``np.nan``: for non-string values (e.g. a missing cell) and for
        strings that are not exactly two integer fields separated by ``-``.
    """
    if not isinstance(input, str):
        return np.nan

    parts = input.split('-')
    # Anything other than exactly "<feet>-<inches>" is treated as missing
    # instead of raising, so a single bad cell cannot abort a whole .apply().
    if len(parts) != 2:
        return np.nan

    try:
        return (int(parts[0]) * 12) + int(parts[1])
    except ValueError:
        # Non-numeric feet or inches field.
        return np.nan

In [5]:
# One combine CSV per draft class, listed newest first (2024 down to 2000).
years = list(range(2024, 1999, -1))

# Read each season's file in turn and collect the frames.
dfs = []
for year in years:
    path = f'../data/{year}_combine.csv'
    df_year = pd.read_csv(path)
    dfs.append(df_year)

# Stack every season into a single frame with a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

print(df.head(10))

               Player  Pos       School        College    Ht     Wt  40yd  \
0  Kris Abrams-Draine   CB     Missouri  College Stats  5-11  179.0  4.44   
1        Isaiah Adams    G     Illinois  College Stats   6-4  315.0  5.22   
2         Rasheen Ali   RB     Marshall  College Stats  5-11  206.0   NaN   
3           Erick All   TE         Iowa  College Stats   6-4  252.0   NaN   
4       Braelon Allen   RB    Wisconsin  College Stats   6-1  235.0   NaN   
5             Joe Alt   OT   Notre Dame  College Stats   6-9  321.0  5.05   
6     Kiran Amegadjie   OT         Yale            NaN   6-5  323.0   NaN   
7     Daijahn Anthony  SAF  Mississippi  College Stats   6-0  195.0  4.55   
8      Terrion Arnold   CB      Alabama  College Stats   6-0  189.0  4.50   
9     Gottlieb Ayedze    G     Maryland  College Stats   6-4  308.0  5.01   

   Vertical  Bench  Broad Jump  3Cone  Shuttle  \
0      33.5    NaN         NaN    NaN      NaN   
1      24.5    NaN       102.0   7.77     4.73   
2 

In [6]:
# Binary target: 1 when the draft-result column is populated, 0 when it is missing.
df['Drafted'] = (~df['Drafted (tm/rnd/yr)'].isna()).astype(int)

In [7]:
# Keep quarterbacks only. The explicit .copy() makes the result an independent
# frame, so the column assignment to df['Ht'] that follows does not write into
# a slice of the combined frame (which makes pandas emit SettingWithCopyWarning).
df = df[df['Pos'] == 'QB'].copy()

In [8]:
# Replace the 'feet-inches' strings with total inches, element by element.
# NOTE: not idempotent. change_Ht_to_int returns NaN for anything that is not
# a string, so running this cell a second time turns every height into NaN.
df['Ht'] = df['Ht'].map(change_Ht_to_int)

In [9]:
# Preview the first ten rows of the quarterback-only frame after the Ht conversion.
print(df.head(10))

                Player Pos          School        College    Ht     Wt  40yd  \
56      Jayden Daniels  QB             LSU  College Stats  76.0  210.0   NaN   
112        Sam Hartman  QB      Notre Dame  College Stats  73.0  211.0  4.80   
147  Michael Penix Jr.  QB      Washington  College Stats  74.0  216.0   NaN   
167        Devin Leary  QB        Kentucky  College Stats  73.0  215.0   NaN   
179         Drake Maye  QB  North Carolina  College Stats  76.0  223.0   NaN   
181      J.J. McCarthy  QB        Michigan  College Stats  75.0  219.0   NaN   
195         Joe Milton  QB       Tennessee  College Stats  77.0  235.0   NaN   
209             Bo Nix  QB          Oregon  College Stats  74.0  214.0   NaN   
227      Michael Pratt  QB          Tulane  College Stats  75.0  217.0   NaN   
233    Spencer Rattler  QB  South Carolina  College Stats  72.0  211.0  4.95   

     Vertical  Bench  Broad Jump  3Cone  Shuttle  \
56        NaN    NaN         NaN    NaN      NaN   
112      28.5  

In [10]:
# Target vector: the 0/1 'Drafted' flag.
y = df['Drafted']

# Feature matrix: everything left after removing the target, the identifier
# and text columns, and the raw draft-result column the target was built from.
X = df.drop(
    columns=[
        'Drafted',
        'Player',
        'Pos',
        'School',
        'College',
        'Drafted (tm/rnd/yr)',
        'Player-additional',
    ]
)

In [11]:
# Fill missing measurements from each row's 5 nearest neighbours.
# NOTE(review): the imputer is fitted on every row of X (no hold-out), and the
# columns are on very different scales (e.g. Wt vs. 40yd), so neighbour
# distances are presumably dominated by the large-valued columns — consider
# scaling before imputing; confirm against the KNNImputer documentation.
imputer = KNNImputer(n_neighbors=5)

# fit_transform returns a NumPy array, so column labels are not carried forward.
X_imputed = imputer.fit_transform(X)

In [12]:
# NOTE: despite the names, data_set_train is the complete feature matrix and
# data_set_test is the complete label vector; no train/test split happens here.
# Features are stored as float32 and labels as int64 class indices.
data_set_train = torch.tensor(X_imputed, dtype=torch.float32)
data_set_test = torch.tensor(y.to_numpy(), dtype=torch.long)

In [13]:
# Pair each feature row with its label so the two are indexed and batched together.
dataset = TensorDataset(data_set_train, data_set_test)

In [14]:
# Sanity check: the first five (features, label) pairs.
dataset[0:5]

(tensor([[ 76.0000, 210.0000,   4.8220,  33.2000,  17.0000, 114.8000,   7.0380,
            4.2320],
         [ 73.0000, 211.0000,   4.8000,  28.5000,  18.8000, 109.0000,   7.1900,
            4.3400],
         [ 74.0000, 216.0000,   4.8360,  32.4000,  17.0000, 110.4000,   7.0480,
            4.2520],
         [ 73.0000, 215.0000,   4.8340,  31.2000,  17.0000, 110.8000,   7.1400,
            4.3200],
         [ 76.0000, 223.0000,   4.8840,  31.0000,  20.4000, 109.0000,   7.0400,
            4.3140]]),
 tensor([1, 0, 1, 1, 1]))

In [15]:
# Seed torch's global RNG so that runs are repeatable.
torch.manual_seed(1)

# Mini-batches of 64 rows, reshuffled on every pass over the data.
batch_size = 64
train_dl = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [16]:
# Widths of the hidden layers, and the number of output logits
# (one per class of the 0/1 'Drafted' label).
hidden_units = [32, 16]
num_classes = 2

# One network input per feature column.
input_size = data_set_train.shape[1]

# Walk consecutive (in, out) width pairs; every hidden Linear is followed by a ReLU.
all_layers = []
layer_widths = [input_size, *hidden_units]
for n_in, n_out in zip(layer_widths, layer_widths[1:]):
    all_layers += [nn.Linear(n_in, n_out), nn.ReLU()]

# Final projection to raw class scores; no activation is applied here.
all_layers.append(nn.Linear(hidden_units[-1], num_classes))

model = nn.Sequential(*all_layers)
print(model)

Sequential(
  (0): Linear(in_features=8, out_features=32, bias=True)
  (1): ReLU()
  (2): Linear(in_features=32, out_features=16, bias=True)
  (3): ReLU()
  (4): Linear(in_features=16, out_features=2, bias=True)
)


In [17]:
# Cross-entropy over the two output logits; targets are integer class labels.
loss_fn = nn.CrossEntropyLoss() 
# Adam over every model parameter with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) 

In [18]:
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    correct, total = 0, 0

    for x_batch, y_batch in train_dl:
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        pred = model(x_batch)
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()

        # Accuracy is tallied from the logits computed before this step's weight update.
        predicted_labels = pred.argmax(dim=1)
        correct += int((predicted_labels == y_batch).sum())
        total += len(y_batch)

    # Fraction of rows classified correctly over the epoch (training data only).
    acc = correct / total
    print(f"Epoch {epoch} | Train accuracy: {acc:.4f}")

Epoch 0 | Train accuracy: 0.3924
Epoch 1 | Train accuracy: 0.3924
Epoch 2 | Train accuracy: 0.3946
Epoch 3 | Train accuracy: 0.6211
Epoch 4 | Train accuracy: 0.6076
Epoch 5 | Train accuracy: 0.6076
Epoch 6 | Train accuracy: 0.5314
Epoch 7 | Train accuracy: 0.4574
Epoch 8 | Train accuracy: 0.6009
Epoch 9 | Train accuracy: 0.6076
Epoch 10 | Train accuracy: 0.6099
Epoch 11 | Train accuracy: 0.6076
Epoch 12 | Train accuracy: 0.6054
Epoch 13 | Train accuracy: 0.6076
Epoch 14 | Train accuracy: 0.6076
Epoch 15 | Train accuracy: 0.6076
Epoch 16 | Train accuracy: 0.6076
Epoch 17 | Train accuracy: 0.6054
Epoch 18 | Train accuracy: 0.6054
Epoch 19 | Train accuracy: 0.6076
