# NBA Salary Prediction Based on Player Age

This notebook demonstrates how to build simple machine learning models (a linear regression and a small neural network) that predict the salary of an NBA player based on the player's age.

## Step 1: Import Libraries and Load the Data

In [None]:
# import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

from snowflake.ml.modeling.linear_model import LinearRegression
from snowflake.ml.modeling.neural_network import MLPRegressor
from snowflake.ml.modeling.metrics import mean_squared_error, r2_score
from snowflake.ml.modeling.preprocessing import StandardScaler

# Reuse the Snowflake session that this notebook is already running in
from snowflake.snowpark.context import get_active_session

session = get_active_session()

# Fully qualified name of the table that holds the salary data
SALARIES_TABLE = 'SNOWPARK_PLAYGROUND.NBA.SALARIES'

# Reference the table as a Snowpark DataFrame (no conversion to pandas here)
data = session.table(SALARIES_TABLE)

# Preview the data
data.show()

## Step 2: Data Exploration and Visualization

In [None]:
# Walk the DataFrame schema so the notebook displays one entry per field
[field for field in data.schema]

In [None]:
from snowflake.snowpark import DataFrame
from snowflake.snowpark.functions import lit, col, count

def null_columns(df: DataFrame) -> DataFrame:
    """Count the NULL values in every column of ``df``.

    All columns are aggregated together in a single ``select``. Replacing the
    columns one at a time with ``withColumn`` would place an aggregate next to
    the remaining non-aggregated columns in the same projection, which is not
    a valid query without a GROUP BY.

    Args:
        df: Snowpark DataFrame to inspect.

    Returns:
        A one-row Snowpark DataFrame with the same column names as ``df``;
        each value is the number of rows in which that column is NULL.
    """
    # count(<constant>) counts every row while count(<column>) skips NULLs,
    # so their difference is the number of NULLs in that column.
    null_counts = [
        (count(lit(1)) - count(col(name))).alias(name)
        for name in df.columns
    ]
    return df.select(null_counts)


null_columns(data)

In [None]:
import altair as alt

# Summary statistics of the table, rendered through Streamlit
st.write(data.describe())

# Scatter plot of salary against age, drawn on an explicit Axes
salaries_pdf = data.to_pandas()
fig, ax = plt.subplots()
sns.scatterplot(data=salaries_pdf, x='AGE', y='SALARY', ax=ax)
ax.set_xlabel('AGE')
ax.set_ylabel('SALARY')
ax.set_title('Age vs Salary')
plt.show()

In [None]:
# Drop rows with missing values (default dropna settings), then rerun the
# per-column NULL report on the cleaned frame
data = data.na.drop()
null_columns(data)

## Step 3: Prepare the Data

In [None]:
# Feature frame (independent variable) and target column (dependent variable)
X = data.select(['AGE'])
y = data.col('SALARY')

# Seeded 80/20 split into training and test partitions
split_weights = [0.8, 0.2]
data_train, data_test = data.random_split(weights=split_weights, seed=0)

# Summary statistics of the training partition
data_train.describe()

## Step 4: Train the Model

In [None]:
# Feature and label column names handed to the model constructors
input_cols, output_cols = ['AGE'], ['SALARY']

In [None]:
# Linear regression over the configured feature and label columns
model = LinearRegression(input_cols=input_cols, label_cols=output_cols)

# Fit on the training partition only
model.fit(data_train)

# Convert to the local scikit-learn object to read the fitted parameters
model_local = model.to_sklearn()

# Report the fitted intercept and the first coefficient entry
print(f'Intercept: {model_local.intercept_}')
print(f'Coefficient: {model_local.coef_[0]}')

## Step 5: Evaluate the Model

In [None]:
# Make predictions on the test set with the fitted linear model.
# The evaluation cell that follows reads the 'AGE' and 'OUTPUT_SALARY'
# columns from `pred`.
pred = model.predict(data_test)

In [None]:


# Evaluate the linear model on the test predictions.
# The metrics are computed from `pred`, which is expected to carry the actual
# 'SALARY' next to the predicted 'OUTPUT_SALARY' for every test row.
mse = mean_squared_error(
    df=pred,
    y_true_col_names='SALARY',
    y_pred_col_names='OUTPUT_SALARY',
)
r2 = r2_score(
    df=pred,
    y_true_col_name='SALARY',
    y_pred_col_name='OUTPUT_SALARY',
)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Fetch ages, actual salaries and predictions in ONE query so every value
# stays paired with its own row; sort by age so the regression line is drawn
# as one continuous stroke.
pred_pdf = (
    pred.select('AGE', 'SALARY', 'OUTPUT_SALARY')
    .sort(col('AGE').desc())
    .to_pandas()
)
x_test = pred_pdf['AGE'].to_numpy()
y_pred = pred_pdf['OUTPUT_SALARY'].to_numpy()

# Actual salaries as points, fitted regression line on top
fig, ax = plt.subplots()
ax.scatter(pred_pdf['AGE'], pred_pdf['SALARY'], color='blue', label='Actual')
ax.plot(x_test, y_pred, color='red', linewidth=2, label='Predicted')
ax.set_xlabel('AGE')
ax.set_ylabel('SALARY')
ax.set_title('Age vs Salary')
ax.legend()
plt.show()




## Step 6: Make Predictions

In [None]:
# Predict the salary for a player with a certain age using the linear model.
# `player_age` is also read by the neural-network prediction cell (Step 9).
player_age=25
# One-row Snowpark DataFrame whose 'AGE' column matches the model's input column
age = session.create_dataframe([player_age], schema=['AGE'])
predicted_salary = model.predict(age)
# Bare expression so the notebook displays the prediction DataFrame
predicted_salary

## Step 7: Fit a Neural Network Model

In [None]:
# Scale the data (standardization): AGE -> AGE_SCALED
scaler = StandardScaler(
    input_cols=['AGE'],
    output_cols=['AGE_SCALED']
)

# Fit the scaler on the TRAINING split only. Fitting it on the test split
# would leak test-set statistics into preprocessing and scale the training
# data with parameters it was never meant to see.
scaler.fit(data_train)
data_train_scaled = scaler.transform(data_train)
data_test_scaled = scaler.transform(data_test)

# Create the MLPRegressor model: two hidden layers of 64 units each,
# with a fixed random_state
mlp = MLPRegressor(
    input_cols=['AGE_SCALED'],
    label_cols=output_cols,
    hidden_layer_sizes=(64, 64),
    activation='relu',
    solver='lbfgs',
    max_iter=500,
    random_state=1,
)

# Train the model on the scaled training split
res = mlp.fit(data_train_scaled)

# Predict on the scaled test split
pred_mlp = mlp.predict(data_test_scaled)

## Step 8: Evaluate the Neural Network Model

In [None]:
# Display the DataFrame of MLP predictions for the scaled test split
pred_mlp

In [None]:
# Plot the MLP prediction curve over the actual test salaries.
# Everything is read from `pred_mlp` in one query so each prediction stays
# paired with its own AGE and SALARY; sorting by age lets the predictions be
# drawn as a single continuous line.
pred_mlp_pdf = (
    pred_mlp.select('AGE', 'SALARY', 'OUTPUT_SALARY')
    .sort(col('AGE').desc())
    .to_pandas()
)
x_test_mlp = pred_mlp_pdf['AGE'].to_numpy()
y_test_mlp = pred_mlp_pdf['SALARY'].to_numpy()
y_pred_mlp = pred_mlp_pdf['OUTPUT_SALARY'].to_numpy()

fig, ax = plt.subplots()
ax.scatter(x_test_mlp, y_test_mlp, color='blue', label='Actual')
ax.plot(x_test_mlp, y_pred_mlp, color='red', linewidth=2, label='Predicted')
ax.set_xlabel('Age')
ax.set_ylabel('Salary')
ax.set_title('MLPRegressor: Age vs Salary')
ax.legend()
plt.show()

In [None]:
# Evaluate the MLP on the test predictions.
# The metrics are computed from `pred_mlp`, which is expected to carry the
# actual 'SALARY' next to the predicted 'OUTPUT_SALARY' for every test row.
mse_mlp = mean_squared_error(
    df=pred_mlp,
    y_true_col_names='SALARY',
    y_pred_col_names='OUTPUT_SALARY',
)
r2_mlp = r2_score(
    df=pred_mlp,
    y_true_col_name='SALARY',
    y_pred_col_name='OUTPUT_SALARY',
)

print(f'Mean Squared Error: {mse_mlp}')
print(f'R^2 Score: {r2_mlp}')

# Read ages, actuals and predictions from the SAME DataFrame in one query.
# Collecting x from one DataFrame and y from another gives no guarantee that
# the two results come back in the same row order.
mlp_eval_pdf = pred_mlp.select('AGE', 'SALARY', 'OUTPUT_SALARY').to_pandas()

# Visualize the results
fig, ax = plt.subplots()
ax.scatter(mlp_eval_pdf['AGE'], mlp_eval_pdf['SALARY'], color='blue', label='Actual')
ax.scatter(mlp_eval_pdf['AGE'], mlp_eval_pdf['OUTPUT_SALARY'], color='red', label='Predicted')
ax.set_xlabel('Age')
ax.set_ylabel('Salary')
ax.set_title('MLPRegressor: Age vs Salary')
ax.legend()
plt.show()


## Step 9: Make Predictions with the Neural Network

In [None]:
# Predict the salary for a player of age `player_age` (set in Step 6) with the neural network
age = session.create_dataframe([player_age], schema=['AGE'])

# Apply the fitted scaler first: the MLP was configured with 'AGE_SCALED' as its input column
age_scaled = scaler.transform(age)

predicted_salary = mlp.predict(age_scaled)
predicted_salary



## Summary
This notebook provides a simple implementation of a linear regression model and a neural network (MLP) regressor to predict NBA salaries based on player age. Depending on the available data, the models could be enhanced by incorporating additional features or by using more complex algorithms.
