# preprocessing

In [None]:
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
# scikit-learn baseline classifiers (logistic regression, tree, forest, naive Bayes, SVM)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


In [None]:
# Load the raw CSV tables; the first column of every file is used as the index.
data_dir = "data/"

# NOTE(review): college_players_stats and not_drafted are loaded but not
# referenced in the code visible here -- confirm they are needed elsewhere.
college_players_stats = pd.read_csv(f'{data_dir}college_players_stats.csv', index_col=0)
drafted_players = pd.read_csv(f'{data_dir}drafted_players.csv', index_col=0)
not_drafted = pd.read_csv(f'{data_dir}not_drafted.csv', index_col=0)
players_stats = pd.read_csv(f'{data_dir}players_stats.csv', index_col=0)

# Add the binary target: 1 when the player's name appears in the drafted
# table, 0 otherwise. (This step used to be written out twice; once suffices.)
# NOTE(review): the match is on name alone, so different players sharing a
# name would receive the same label -- confirm this is acceptable.
players_stats['drafted'] = players_stats['name'].isin(drafted_players['name']).astype(int)

# The name is only needed to derive the label; drop it so it is not a feature.
players_stats = players_stats.drop(columns=['name'])

# convert "feet-inches" height strings to decimal feet
def convert_height(height_str):
    """Turn a "feet-inches" string such as "6-3" into decimal feet (6.25)."""
    pieces = height_str.split('-')
    # Exactly two integer fields are expected; anything else raises ValueError.
    whole_feet, extra_inches = (int(piece) for piece in pieces)
    return whole_feet + extra_inches / 12

# Replace every "feet-inches" string in the height column with the numeric
# value returned by convert_height (feet plus inches / 12).
players_stats['height'] = players_stats['height'].apply(convert_height)

# cell-level cleaning helpers (the binary position columns are added further below)
def str_to_int(maybe_bool):
    """Return 0/1 for a Python bool; hand back any other value untouched."""
    return int(maybe_bool) if isinstance(maybe_bool, bool) else maybe_bool

def weight_to_int(input):
    """Parse a weight value as an int, special-casing the literal "22l" as 221."""
    # "22l" (lower-case L in the last position) is mapped to 221 explicitly;
    # everything else goes through int().
    return 221 if input == "22l" else int(input)
# Parse the weight column into integers (weight_to_int handles the "22l" entry).
players_stats['weight'] = players_stats['weight'].apply(weight_to_int)

# One-hot encode the position label: one indicator column per position value.
players_stats = pd.get_dummies(players_stats, columns=['position'])

# Convert any Python bool cells to 0/1 so the feature matrix is numeric.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of the
# equivalent DataFrame.map; prefer the new name when the installed pandas has
# it. The check is made on the class so a column that happens to be called
# "map" cannot be mistaken for the method.
if hasattr(pd.DataFrame, "map"):
    players_stats = players_stats.map(str_to_int)
else:
    players_stats = players_stats.applymap(str_to_int)

# Sanity check: print the column summary and list any column that is still
# non-numeric after the conversions above.
players_stats.info()
non_numeric_columns = players_stats.select_dtypes(exclude=["number"]).columns
print("Non-numeric columns:", non_numeric_columns)

# TODO: add other metrics for training
# `df` is an alias of players_stats (not a copy), so the in-place dropna
# below removes the rows from both names.
df = players_stats

df.dropna(inplace=True)
pd.set_option('display.max_columns', None)
df.head()

In [None]:
# Class balance of the target: distinct 'drafted' values and how often each
# occurs in df (rows containing NaN were already dropped above).
np.unique(df['drafted'], return_counts=True)

# models

In [None]:
# Shared hyperparameters for the experiments below.
seed = 42
epochs = 250
lr = 0.01
batch_size = 32

# get data: every column except the label is a feature
x = df.drop(columns=['drafted']).values
y = df['drafted'].values


from sklearn.model_selection import train_test_split
# 80/20 split of the full (not downsampled) data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)

# downsampling
# separate drafted and non-drafted
df_drafted = df[df['drafted'] == 1]
df_non_drafted = df[df['drafted'] == 0]


# Drop 20000 random samples from non-drafted (i.e. keep all but 20000 rows).
# DataFrame.sample raises ValueError if fewer than 20000 non-drafted rows exist.
df_non_drafted_sampled = df_non_drafted.sample(len(df_non_drafted) - 20000, random_state=seed)

# NOTE: x and y are rebound to the downsampled data from here on.
df_downsampled = pd.concat([df_drafted, df_non_drafted_sampled])
x = df_downsampled.drop(columns=['drafted']).values
y = df_downsampled['drafted'].values

xd_train, xd_test, yd_train, yd_test = train_test_split(x, y, test_size=0.2, random_state=seed)


def _class_counts(*label_arrays):
    """Return (drafted, not_drafted) totals over all the given 0/1 label arrays."""
    labels = np.concatenate(label_arrays)
    drafted = int(np.count_nonzero(labels == 1))
    return drafted, int(labels.size) - drafted


# The earlier loop over zip(y_train, y_test) stopped at the shorter array and
# reported the number of iterations as "not drafted"; count every label in
# both splits instead.
count, count_tot = _class_counts(y_train, y_test)

print("Count drafted:")
print(count)
print("Count not drafted:")
print(count_tot)

count, count_tot = _class_counts(yd_train, yd_test)

print("Count downsample drafted:")
print(count)
print("Count downsample not drafted:")
print(count_tot)

# Label distribution of the downsampled test split (this cell's displayed value).
np.unique(yd_test, return_counts=True)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def explain(y_test, y_pred, average='weighted'):
    """Print and return precision, recall, F1 and accuracy for a set of predictions.

    y_test holds the reference labels and y_pred the predicted labels;
    `average` is forwarded to the precision/recall/F1 scorers. The counts of
    each distinct value in y_pred are printed as well.
    """
    scorer_kwargs = {"average": average, "zero_division": 0}
    metrics = {
        "precision": precision_score(y_test, y_pred, **scorer_kwargs),
        "recall": recall_score(y_test, y_pred, **scorer_kwargs),
        "f1": f1_score(y_test, y_pred, **scorer_kwargs),
        "accuracy": accuracy_score(y_test, y_pred),
    }

    headings = (
        ("Precision: ", "precision"),
        ("Recall: ", "recall"),
        ("F1: ", "f1"),
        ("Accuracy: ", "accuracy"),
    )
    for heading, metric_name in headings:
        print(heading, metrics[metric_name])
    print("Distribution", np.unique(y_pred, return_counts=True))

    return metrics

In [None]:
#NEURAL NET
from sklearn.model_selection import KFold

In [None]:
# mlp
import tqdm

class MLP(torch.nn.Module):
    """Two-layer perceptron for binary classification.

    Architecture: Linear(in_features, hidden) -> ReLU -> Linear(hidden, 1)
    -> Sigmoid, so each input row yields one value in (0, 1), suitable for
    torch.nn.BCELoss.

    Args:
        hidden: width of the hidden layer.
        in_features: number of input features per sample. Defaults to 44,
            the width that was previously hard-coded, so existing
            ``MLP()`` / ``MLP(hidden=...)`` calls behave as before.
    """

    def __init__(self, hidden=16, in_features=44):
        super(MLP, self).__init__()
        self.fc1 = torch.nn.Linear(in_features, hidden)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, in_features) float tensor to (batch, 1) probabilities."""
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.sigmoid(x)
        return x

In [None]:
from collections import defaultdict

# Grid search over MLP hidden width, learning rate and batch size, scored with
# 5-fold cross-validation on the downsampled data.
grid_search_results = {}

# Loop-invariant setup, built once instead of once per configuration. With an
# integer random_state, KFold yields the same folds on every split() call, so
# all configurations are compared on identical folds.
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=10)

# Cross-validate over the whole downsampled set (train and test parts joined).
xdfold = np.concatenate((xd_train, xd_test), axis=0)
ydfold = np.concatenate((yd_train, yd_test), axis=0)

# NOTE(review): the loop variables `lr` and `batch_size` rebind the
# module-level hyperparameters of the same name, so any cell executed after
# this one sees the last grid values (lr=0.001, batch_size=64).
for hidden in [16, 32, 64]:
    for lr in [0.01, 0.001]:
        for batch_size in [32, 64]:

            fold_results = []

            for train_idx, test_idx in kfold.split(xdfold):
                x_t, y_t = xdfold[train_idx], ydfold[train_idx]
                x_v, y_v = xdfold[test_idx], ydfold[test_idx]

                train_dataset = torch.utils.data.TensorDataset(
                    torch.tensor(x_t, dtype=torch.float32),
                    torch.tensor(y_t, dtype=torch.float32).reshape(-1, 1)
                )
                # Use the batch size under evaluation (it was fixed at 32 before,
                # which made the batch_size axis of the grid meaningless).
                train_dl = torch.utils.data.DataLoader(
                    train_dataset, batch_size=batch_size, shuffle=True
                )

                # Build the model with the hidden width under evaluation (the
                # default width was used for every configuration before).
                model = MLP(hidden=hidden)
                criterion = torch.nn.BCELoss()
                optimizer = torch.optim.Adam(model.parameters(), lr=lr)

                for epoch in range(epochs):
                    for x_batch, y_batch in train_dl:
                        optimizer.zero_grad()
                        output = model(x_batch)
                        loss = criterion(output, y_batch)
                        loss.backward()
                        optimizer.step()

                # Validation needs no gradients; threshold probabilities at 0.5.
                model.eval()
                with torch.no_grad():
                    val_outputs = model(torch.tensor(x_v, dtype=torch.float32))
                predictions = (val_outputs > 0.5).long().flatten().numpy()

                # explain() takes the reference labels first, predictions second;
                # the reversed order swapped precision and recall.
                results = explain(y_v, predictions, average='binary')
                fold_results.append(results)

            #avg explain results across all folds

            print("AVERAGE RESULTS")
            print("\n\n")

            # Mean of every metric over the folds.
            averaged_dict_mlp = pd.DataFrame(fold_results).mean().to_dict()

            print(averaged_dict_mlp)

            grid_search_results[(hidden, lr, batch_size)] = averaged_dict_mlp

pd.DataFrame(grid_search_results).T.head()

In [None]:
# Tabulate the grid search: one row per (hidden, lr, batch_size) combination,
# keeping every metric except precision and recall.
dropped_metrics = ['precision', 'recall']
qrs = pd.DataFrame(grid_search_results).T.drop(columns=dropped_metrics)

# Show up to 100 rows.
qrs.head(100)

In [None]:
#DOWNSAMPLED 

In [None]:
# K FOLD EVALUATION ON DOWNSAMPLED
from collections import defaultdict

In [None]:
# 5-fold cross-validation of the default MLP on the downsampled data.
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=10)

fold_results = []

# Cross-validate over the whole downsampled set (train and test parts joined).
xdfold = np.concatenate((xd_train, xd_test), axis=0)
ydfold = np.concatenate((yd_train, yd_test), axis=0)

# NOTE(review): `lr` is read from module scope; if the grid-search cell ran
# before this one it holds that loop's last value rather than the original
# setting -- confirm which is intended.
for train_idx, test_idx in kfold.split(xdfold):
    x_t, y_t = xdfold[train_idx], ydfold[train_idx]
    x_v, y_v = xdfold[test_idx], ydfold[test_idx]

    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(x_t, dtype=torch.float32),
        torch.tensor(y_t, dtype=torch.float32).reshape(-1, 1)
    )
    train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Fresh model and optimizer for every fold.
    model = MLP()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        for x_batch, y_batch in train_dl:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

    # Validation needs no gradients; threshold probabilities at 0.5.
    model.eval()
    with torch.no_grad():
        val_outputs = model(torch.tensor(x_v, dtype=torch.float32))
    predictions = (val_outputs > 0.5).long().flatten().numpy()

    # explain() takes the reference labels first, predictions second; the
    # reversed order swapped precision and recall.
    results = explain(y_v, predictions, average='binary')
    fold_results.append(results)

#avg explain results across all folds

print("AVERAGE RESULTS")
print("\n\n")

# Mean of every metric over the folds.
averaged_dict_mlp = pd.DataFrame(fold_results).mean().to_dict()

print(averaged_dict_mlp)

In [None]:
#TOTAL DOWNSAMPLED RESULTS
# Fit each scikit-learn baseline on the downsampled training split, score it
# on the downsampled test split, then chart it next to the averaged MLP scores.
down_models = {
    "Logistic Regression": LogisticRegression(max_iter=250, random_state=seed),
    "Decision Tree": DecisionTreeClassifier(random_state=seed),
    "Random Forest": RandomForestClassifier(random_state=seed),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(random_state=seed),
}

results = {}
for clf_name, clf in down_models.items():
    print(clf_name)

    clf.fit(xd_train, yd_train)
    test_predictions = clf.predict(xd_test)

    results[clf_name] = explain(yd_test, test_predictions, average='binary')
    print("\n\n")

# The MLP entry is the cross-validated average computed earlier.
results["2 layer MLP"] = averaged_dict_mlp

# Bar chart: one group per metric, one bar per model.
results_df = pd.DataFrame(results)
results_df.plot(kind='bar', figsize=(10, 6))

In [None]:
#K FOLD CROSS WITHOUT DOWNSAMPLING

In [None]:

# 5-fold cross-validation of the default MLP on the training split of the
# full (not downsampled) data.
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=20)

fold_results = []

# NOTE(review): `lr` is read from module scope; if the grid-search cell ran
# before this one it holds that loop's last value rather than the original
# setting -- confirm which is intended.
for train_idx, test_idx in kfold.split(x_train):
    x_t, y_t = x_train[train_idx], y_train[train_idx]
    x_v, y_v = x_train[test_idx], y_train[test_idx]

    train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(x_t, dtype=torch.float32),
        torch.tensor(y_t, dtype=torch.float32).reshape(-1, 1)
    )
    train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Fresh model and optimizer for every fold.
    model = MLP()
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        for x_batch, y_batch in train_dl:
            optimizer.zero_grad()
            output = model(x_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            optimizer.step()

    # Validation needs no gradients; threshold probabilities at 0.5.
    model.eval()
    with torch.no_grad():
        val_outputs = model(torch.tensor(x_v, dtype=torch.float32))
    predictions = (val_outputs > 0.5).long().flatten().numpy()

    # explain() takes the reference labels first, predictions second; the
    # reversed order swapped precision and recall.
    results = explain(y_v, predictions, average='macro')
    fold_results.append(results)

#avg explain results across all folds

print("AVERAGE RESULTS")
print("\n\n")


In [None]:
# Average each metric over the per-fold result dicts collected in fold_results.
countdict = defaultdict(lambda: {'sum': 0, 'count': 0})
for fold_metrics in fold_results:
    for metric_name in fold_metrics:
        running = countdict[metric_name]
        running['sum'] += fold_metrics[metric_name]
        running['count'] += 1

# mean = accumulated sum / number of folds that reported the metric
averaged_dict_mlp_nd = {}
for metric_name, running in countdict.items():
    averaged_dict_mlp_nd[metric_name] = running['sum'] / running['count']

print(averaged_dict_mlp_nd)

In [None]:
# TOTAL NON-DOWNSAMPLED RESULTS
# Fit each scikit-learn baseline on the full training split, score it on the
# full test split with macro averaging, then chart it next to the averaged
# MLP scores.
models = {
    "Logistic Regression": LogisticRegression(max_iter=250, random_state=seed),
    "Decision Tree": DecisionTreeClassifier(random_state=seed),
    "Random Forest": RandomForestClassifier(random_state=seed),
    "Naive Bayes": GaussianNB(),
    "SVM": SVC(random_state=seed),
}

results = {}
for clf_name, clf in models.items():
    print(clf_name)

    clf.fit(x_train, y_train)
    test_predictions = clf.predict(x_test)

    results[clf_name] = explain(y_test, test_predictions, average='macro')
    print("\n\n")

# The MLP entry is the cross-validated average computed earlier.
results["2 layer MLP"] = averaged_dict_mlp_nd

# Bar chart: one group per metric, one bar per model.
results_df = pd.DataFrame(results)
results_df.plot(kind='bar', figsize=(10, 6))