In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import torch

df = pd.read_csv('basketball.csv')


In [3]:
#set the seed
np.random.seed(0)
torch.manual_seed(0)

In [4]:
# preprocess and scale the dataset
def preprocess(df):
    # Drop rows with missing values in specified columns
    df.loc[:,"SEED"]=df["SEED"].fillna(0)
    df = df.dropna(subset=["POSTSEASON"])#, "SEED"])
    
    # Map values in the "POSTSEASON" column
    #Can use FACTORIZE here
    postseason_mapping = {"Champions": 0, "2ND": 1, "F4": 2, "E8": 3, "S16": 4, "R32": 5, "R64": 6, "R68" : 7}
    df.loc[:,"POSTSEASON"] = df["POSTSEASON"].replace(postseason_mapping)
    # df.loc[:,"POSTSEASON"],_ = pd.factorize(df["POSTSEASON"])
    
    # Convert object type columns to numerical labels, excluding "POSTSEASON"
    for column in df.select_dtypes(include=["object"]).columns:
        if column != "POSTSEASON":
            df.loc[:,column], _ = pd.factorize(df[column])
                
    # Convert object data type to float as pytorch needs float/int
    conv_map=dict()
    for col in df.loc[:, df.dtypes =='O'].columns:
        conv_map[col]=float
    df = df.astype(conv_map)
   
    sorted_df = df.sort_values(by="POSTSEASON")
    return sorted_df

In [5]:
#create test, val and training dataframes
train_val = (df[df["YEAR"]<2022])
train,val=train_test_split(train_val,test_size=0.2)
# train = (df[df["YEAR"]==2019])
# val = (df[df["YEAR"] == 2021])
test = (df[df["YEAR"] >= 2022])
train = preprocess(train)
test = preprocess(test)
val = preprocess(val)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["POSTSEASON"] = df["POSTSEASON"].replace(postseason_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column], _ = pd.factorize(df[column])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column], _ = pd.factorize(df[column])
A value is trying to be set on a copy of a slice from a Dat

In [6]:
#Drop features which have low correlation to POSTSEASON column
features_to_drop=["POSTSEASON", "TEAM"]
for feature in train.columns[train.corr()['POSTSEASON'].abs()<0.10]:
    features_to_drop.append(feature)
print("Dropping Features: ",features_to_drop)
    
feature_names = train.drop(features_to_drop, axis=1).columns.tolist()
num_vars=len(feature_names)
num_classes=train["POSTSEASON"].unique().shape[0]

#Scale the data based on training (columns_to_scale == feature_names)
columns_to_scale = [col for col in train.columns if col not in features_to_drop]
scaler = StandardScaler()
#scaler=MinMaxScaler()
scaler.fit(train[columns_to_scale])


#Transform the selected columns of train/val/test
for df_tran in [train,val,test]:
    df_tran.loc[:,columns_to_scale]=scaler.transform(df_tran[columns_to_scale])

In [7]:
#utility function to get dataframes/tensors
def make_data(df:pd.DataFrame,feature_names,get_tensor:bool=True):
    X_train = df[feature_names]
    Y_train = df["POSTSEASON"].astype(np.int64)
    if get_tensor:
        X_train = torch.tensor(X_train.values, dtype=torch.float32)
        # Adjust labels to start from 0
        Y_train = torch.tensor(Y_train.values, dtype=torch.int64)  # Use long tensor for classification labels
    return X_train,Y_train

[-3.36536354e-16 -1.66533454e-16 -8.32667268e-17  1.38777878e-16
 -1.38777878e-17  1.66533454e-16 -9.71445147e-17 -6.93889390e-17
  2.42861287e-17  1.07552856e-16  4.85722573e-17 -5.20417043e-18
 -1.38777878e-16  1.38777878e-17 -1.07552856e-16  9.71445147e-17
  1.38777878e-17 -3.46944695e-17 -1.66533454e-16  6.93889390e-18
 -9.71445147e-17]
(0.27956349206349207, 0.27734375, 0.2773612454129461, 0.5294117647058824)


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Run logistic regression
def fit_predict(train_fname: pd.DataFrame, test_fname: pd.DataFrame,feature_names:list) -> np.array:
    classifier = LogisticRegression(max_iter=1000)
    X_train,Y_train=make_data(train,feature_names,False)
    X_test,_=make_data(test,feature_names,False)
    
    classifier.fit(X_train, Y_train)
    Y_pred = classifier.predict(X_test)
    coefs = [0]*num_vars
    for i in range(num_classes):
        for j in range(num_vars):
            coefs[j] += classifier.coef_[i][j]
    #print(coefs)
    feature_names = X_train.columns.tolist()
    weights = np.mean(classifier.coef_, axis = 0)
    weights=weights/np.abs(weights).sum()
    feature_importance = {feature_names[i]: abs(weights[i]) for i in range(len(feature_names))}

    sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)

    top_n = 10
    print(f"Top {top_n} features from Logistic Regressopn:")
    for feature, importance in sorted_features[:top_n]:
        print(f"{feature}: {importance:.4f}")
    return Y_pred.astype(np.int64),sorted_features


In [8]:
def score(data: pd.DataFrame, Y_pred: np.array) -> list[float]:
    Y_true = data["POSTSEASON"].astype(np.int64).values
    precision = metrics.precision_score(Y_true, Y_pred, average = "macro",zero_division=np.nan)
    recall = metrics.recall_score(Y_true, Y_pred, average = "macro")
    f1 = metrics.f1_score(Y_true, Y_pred, average = "macro")
    accuracy=metrics.accuracy_score(Y_true, Y_pred)
    #accuracy = (Y_pred == Y_true).mean() * 100
    return accuracy,precision, recall, f1

Y_pred,sorted_features_LR = fit_predict(train, test,feature_names)
print("="*80)
print("LOGISTIC REGRESSION")
acc,prec,rec,f1=score(test,Y_pred)
print(f"Accuracy={acc:.4f} Precision={prec:.4f} Recall={rec:.4f} F1={f1:.4f}")
print("="*80)

In [None]:
class FNNModel(torch.nn.Module):
    def __init__(self, input_size,hidden_size):
        num_nodes=hidden_size
        layer=[num_nodes,num_nodes//2]
        super(FNNModel, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, layer[0])
        self.fc2 = torch.nn.Linear(layer[0], layer[1])
        self.fc3 = torch.nn.Linear(layer[1], num_classes)  # Adjust output units to num_classes
        
        
    def forward(self, x):
        x = torch.nn.functional.relu(self.fc1(x))
        x = torch.nn.functional.relu(self.fc2(x))
        x = self.fc3(x)
        return x

[[-0.9145177   1.73949932  2.29117439 ... -2.77589493  1.77912944
  -1.67116778]
 [-1.02841368  1.73949932  1.34262122 ... -0.34304084  0.98941238
  -1.24233982]
 [-0.80062172  2.23859617  1.57975951 ...  0.05050908  1.70208388
  -1.4567538 ]
 ...
 [ 2.27456966 -1.75417864 -1.74017659 ... -0.73659077 -2.45837676
   1.54504191]
 [ 2.38846564 -0.25688809 -0.79162342 ...  1.33849065 -1.72644387
   1.54504191]
 [-0.45893379 -0.75598494 -0.55448513 ...  0.62294534 -0.01218
   0.47297201]]


In [None]:
def init_weights(m):
    if isinstance(m, torch.nn.Linear):
       # torch.nn.init.xavier_uniform_(m.weight)
        torch.nn.init.kaiming_uniform_(m.weight, a=0, mode="fan_in", nonlinearity="relu") 
        # torch.nn.init.normal_(m.weight, mean=0, std=1.0e-4) 
        m.bias.data.fill_(0.0)

In [None]:
def compute_accuracy(X,Y,model):
    # Forward pass to get predictions
    with torch.no_grad():
        output = model(X)
    
    # Get the predicted labels
    _, Y_pred = torch.max(output, -1)
    return (Y==Y_pred).float().mean().item()

In [None]:
#Create training,validation and testing data
best_overall_acc=-torch.inf
best_overall_loss=torch.inf
best_feature_set=None
best_metrics=None
debug=False
#hyper parameter loop
for num_features in [num_vars]:
   
   
    fnn_features=[]  
    for feature in feature_names:
        fnn_features.append(feature)
    
    X_tensor_train,Y_tensor_train=make_data(train,fnn_features)
    X_tensor_val,Y_tensor_val=make_data(val,fnn_features)
    X_tensor_test,Y_tensor_test=make_data(test,fnn_features)
   
    for hidden_size in [32,64,128,256,512,1024]:
    
        model = FNNModel(X_tensor_train.shape[1],hidden_size)  # Adjust input_size according to your data
        model.apply(init_weights)
        # Define loss function and optimizer
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.0001,weight_decay=1.0e-2)
        # optimizer = torch.optim.SGD(model.parameters(), momentum=0.9,nesterov=True,lr=0.0001,weight_decay=1.0e-2)
        # Training loop
        epochs = 500
        best_loss=torch.inf
        best_acc=-torch.inf
        for epoch in range(epochs):
           
           
            
            # Backward pass and optimization
            batch_size=16
            num_samples=X_tensor_train.shape[0]
            num_batches=num_samples//batch_size
            for i in range(num_batches):
                batch_start=i*batch_size
                batch_end= min(batch_start+batch_size,num_samples)
                # Forward pass
                output = model(X_tensor_train[batch_start:batch_end])
                # Calculate the loss
                loss = criterion(output, Y_tensor_train[batch_start:batch_end])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            #calculate loss on val data
            with torch.no_grad():
                output = model(X_tensor_val)
                # Calculate the loss
                loss_val = criterion(output, Y_tensor_val)
                val_acc=compute_accuracy(X_tensor_val,Y_tensor_val, model)
                #Prevent overfit
                # if val_acc<best_acc: #ACCURACY AS CRIT.
                if loss_val>best_loss:    #LOSS AS CRIT.
                    if debug:
                        print(f'Epoch [{epoch+1:3d}/{epochs:3d}], Training Loss: {loss.item():.4f} Val Loss:{loss_val.item():.4f}')
                        print("Stopping since val loss increasing")
                    #load the best params (https://discuss.pytorch.org/t/how-to-save-the-best-model/84608)
                    model.load_state_dict(torch.load('best-model.pt'))
                    break
                else:
                    best_acc=val_acc
                    best_loss=loss_val
                    #save the best params
                    torch.save(model.state_dict(), 'best-model.pt')
                    
                
            
            # Print training loss every few epochs
            if (epoch+1) % 10 == 0 and debug:
                print(f'Epoch [{epoch+1:3d}/{epochs:3d}], Training Loss: {loss.item():.4f} Val Loss:{loss_val.item():.4f}')
        
        #use loss on val data to track best model
        #val_acc=compute_accuracy(X_tensor_val,Y_tensor_val,model)
        # if (best_acc>best_overall_acc):
        if (best_loss<best_overall_loss):
            best_overall_acc=best_acc
            best_overall_loss=best_loss
            # Extract avergae weights from the first layer of the model
            weights = model.fc1.weight.detach().numpy()
            weights=weights.mean(0).squeeze()
            weights=weights/np.abs(weights).sum()
            
            
            # Calculate feature importance
            feature_importance = {fnn_features[i]: abs(weights[i]) for i in range(len(fnn_features))}
            
            # Sort features by importance
            sorted_features_NN = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
           
            # Print top N features
            if debug:
                print(f"Top {num_features} features from FFN:")
                for feature, importance in sorted_features_NN:
                    print(f"{feature}:{importance:8.4f}")
            
            # Put the model in evaluation mode
            model.eval()
            
            # Forward pass to get predictions
            with torch.no_grad():
                output = model(X_tensor_test)
            
            # Get the predicted labels
            _, predicted_labels = torch.max(output, -1)
            
            
            # Calculate accuracy using score utility
            Y_pred=predicted_labels.numpy()
            #Y_pred[:]+=1 # add 1 since index is 0 based and class 1 based
            print(f"Found a better model with  val loss {best_loss:.4f} and accuracy {best_acc:.4f}")
            print("Feature Set: ",fnn_features)
            print("Hidden Size: ",hidden_size)
            acc,prec,rec,f1=score(test,Y_pred)
            if debug:
                if debug:
                    print("-"*80)
                    
                    print(f"Test Accuracy={acc:.4f} Precision={prec:.4f} Recall={rec:.4f} F1={f1:.4f}, Loss={best_loss:.4f}")
                    print("-"*80)

        
        
            
            best_feature_set=sorted_features_NN
            best_metrics=(acc,prec,rec,f1,best_loss,num_features,hidden_size)
    

                

In [None]:
print("="*80)
(acc,prec,rec,f1,best_loss,num_features,hidden_size)=best_metrics
print(f"FFN Best Fit with {num_features} features {hidden_size} hidden size")
print(f"Validation Accuracy {best_overall_acc:.4f}")
print(f"Test Accuracy={acc:.4f} Precision={prec:.4f} Recall={rec:.4f} F1={f1:.4f}, Loss={best_loss:.4f}")
print("Best feature set by importance: ")
for feature, importance in best_feature_set:
    print(f"{feature}:{importance:8.4f}")
print("="*80)


def graphs(sorted_features,method): 
    features = [feat[0] for feat in sorted_features]
    importance = [feat[1] for feat in sorted_features]
    # Plot the feature importance
    plt.figure(figsize=(10, 6))
    plt.barh(features, importance, color='skyblue')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.title('Feature Importance for '+method)
    plt.gca().invert_yaxis()  # Invert y-axis to show the most important features at the top
    plt.show()

graphs(sorted_features_LR,"LR")
graphs(sorted_features_NN,"NN")