In [32]:
!pip install matplotlib 



In [33]:
!pip install scikit-learn



In [34]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest

# Load the dataset
data = pd.read_csv('all_nba_standing_2021-2025.csv')

# Check initial shape of the data
print(f'Initial shape of data: {data.shape}')

# Preprocess the dataset
data.dropna(inplace=True)  # Remove rows with missing values
print(f'Shape after dropping NaNs: {data.shape}')

# Select features for the model
features = [
    'W', 'L', 'PCT', 'GB', 'HOME', 'AWAY', 
    'DIV', 'CONF', 'PPG', 'OPP PPG', 'DIFF', 'STRK', 'L10'
]

# Check for non-numeric values and convert them
for feature in features:
    data[feature] = pd.to_numeric(data[feature], errors='coerce')  # Convert to numeric, set errors to NaN

# Drop rows with NaN values after conversion
data.dropna(subset=features, inplace=True)
print(f'Shape after converting to numeric and dropping NaNs: {data.shape}')

# Target variable
data['target'] = np.where(data['W'] > data['L'], 1, 0)  # 1 if wins > losses, else 0

# Prepare feature matrix and target vector
X = data[features]
y = data['target']

# Check if X is empty before proceeding
if X.empty:
    raise ValueError("Feature matrix X is empty after preprocessing.")

# Detect and remove outliers using Isolation Forest
iso_forest = IsolationForest(contamination=0.1)  # Adjust contamination as needed
outliers = iso_forest.fit_predict(X)

# Filter the data to remove outliers
X = X[outliers != -1]
y = y[outliers != -1]

# Check the shapes of X and y
print(f'Shape of X after outlier removal: {X.shape}, Shape of y after outlier removal: {y.shape}')

# Ensure that the lengths match
if len(X) != len(y):
    raise ValueError("The number of samples in X and y do not match after outlier removal.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Gradient Boosting model (XGBoost)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_scaled, y_train)

# Predict the outcomes
predictions = model.predict(X_test_scaled)

# Print accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

# Get feature importances
importances = model.feature_importances_

# Visualize feature importances
plt.figure(figsize=(10, 6))
plt.barh(features, importances, color='skyblue')
plt.xlabel('Importance Score')
plt.title('Feature Importances from XGBoost Model')
plt.axvline(0, color='black', lw=0.8)
plt.show()

Initial shape of data: (120, 16)
Shape after dropping NaNs: (120, 16)
Shape after converting to numeric and dropping NaNs: (0, 16)


ValueError: Feature matrix X is empty after preprocessing.