In [None]:
# PANDAS IS FOR DATA WRANGLING
import pandas as pd
import numpy as np

# SEABORN IS A PLOTTING LIBRARY
import seaborn as sns

# MATPLOT LIB IS ALSO A PLOTTING LIBRARY
import matplotlib.pyplot as plt

# Normal distribution (used for the playoff win threshold)
from scipy.stats import norm

# Good ol statsmodels
import statsmodels.api as sm

# Specific root mean squared error for stats models
from statsmodels.tools.eval_measures import rmse

# CTP Library
from CTPLIB import CTP_LinReg


In [None]:
# Read the training split, then preview the first rows transposed
# so that each column of the frame is shown as a row.
df = pd.read_csv('data/NBA_train.csv')
df.head().transpose()

# Finding how many wins are needed to make the playoffs

* Compute the mean and standard deviation of the number of wins for teams that DID NOT make the playoffs.
* Determine the win total below which 95% of the non-playoff teams fall (the 95th percentile of a normal distribution with that mean and standard deviation).

In [None]:
# Split win totals by playoff outcome (Playoffs == 1: made the playoffs, 0: did not).
# A single .loc lookup replaces the chained df[mask]['W'] indexing.
playoff_wins = df.loc[df['Playoffs'] == 1, 'W'].values
non_playoff_wins = df.loc[df['Playoffs'] == 0, 'W'].values

# Mean and standard deviation of wins among non-playoff teams.
# np.std defaults to the population standard deviation (ddof=0).
mean_non_playoff_wins = np.mean(non_playoff_wins)
std_non_playoff_wins = np.std(non_playoff_wins)

# 95th percentile of a normal distribution with that mean and standard deviation:
# roughly 95% of non-playoff teams are expected to finish BELOW this win total.
# `norm` is imported with the other dependencies at the top of the notebook.
threshold_wins_95 = norm.ppf(0.95, loc=mean_non_playoff_wins, scale=std_non_playoff_wins)
threshold_wins_95


In [None]:
# Wins per team, coloured by playoff outcome, with the win threshold marked in red.
sns.set_theme()
fig, ax = plt.subplots(figsize=(13, 8))

# Pass the frame by keyword: positional `data` is only accepted by newer seaborn releases.
sns.scatterplot(data=df, x='W', y='Team', hue='Playoffs', ax=ax)

# The threshold is truncated to a whole number of wins, matching the win target used later.
ax.axvline(x=int(threshold_wins_95), color='red')
ax.set(
    title='Wins by team, with the 95% non-playoff win threshold',
    xlabel='Wins (W)',
    ylabel='Team',
);

In [None]:
# Variables for the simple wins regression: wins explained by point differential.
dependent_var = 'W'
independent_vars = 'points_diff'

# Point differential = points scored minus opponent points.
df['points_diff'] = df['PTS'].sub(df['oppPTS'])

In [None]:
# Distribution of the point differential across the training rows.
df['points_diff'].hist()

In [None]:
# Build the wins ~ points_diff regression with the project-local CTP_LinReg helper.
# NOTE(review): `simple=True` presumably selects a single-predictor model — confirm in CTPLIB.
model_wins = CTP_LinReg(df, independent_vars, dependent_var, simple=True)
model_wins.build_model()

# Optional: uncomment the line below to run all of the helper's checks ("gotchas").
# model_wins.run_all()

## Use the model coefficients to find the points_diff we need to safely reach our target of 42 wins and make the playoffs

In [None]:
# Win target: the non-playoff 95% threshold, truncated to a whole number of wins.
wins_needed_to_make_playoffs = int(threshold_wins_95)

# Fitted line: W = const + slope * points_diff
wins_params = model_wins.model.params
coef_y_int = wins_params['const']
coef_points_diff = wins_params['points_diff']
# Legacy (misleading) name kept as an alias so any cell still referring to it keeps working.
coef_runs_diff = coef_points_diff

print(wins_needed_to_make_playoffs, coef_y_int, coef_points_diff)

# Invert the fitted line to get the point differential that predicts the win target.
min_pts_diff = (wins_needed_to_make_playoffs - coef_y_int) / coef_points_diff
min_pts_diff


## Here we need to score about 30.68 more points than we allow in order to win 42 games and make the playoffs

In [None]:
# Wins vs. point differential with the fitted regression line; the red lines mark
# the win target (horizontal) and the point differential that predicts it (vertical).
fig, ax = plt.subplots()

# Pass the frame by keyword: positional `data` is only accepted by newer seaborn releases.
sns.regplot(data=df, x='points_diff', y='W', ax=ax)
ax.axvline(x=min_pts_diff, color='red')
ax.axhline(y=wins_needed_to_make_playoffs, color='red')
ax.set(
    title='Wins vs. point differential',
    xlabel='Point differential (PTS - oppPTS)',
    ylabel='Wins (W)',
);

# KITCHEN SINK MODEL 
* Build a model for points scored using everything we have.

In [None]:
# Show the available columns, then model points scored (PTS) from the listed
# attempt, rebound, assist, steal, block and turnover columns.
print(df.columns)
iv = ['2PA', '3PA', 'FTA', 'ORB', 'DRB', 'AST', 'STL', 'BLK', 'TOV']
dv = 'PTS'
# NOTE(review): run_all() is a project-local CTP_LinReg method; presumably it fits the model
# and runs its checks (later cells read model_pts.model.pvalues) — confirm in CTPLIB.
model_pts = CTP_LinReg(df, iv, dv)
model_pts.run_all()



# Extract only the statistically significant features (i.e., features with a p-value < 0.05)

In [None]:


# Keep the terms whose p-value is below 0.05.
# The comparison is made on the unrounded p-values: rounding first would turn a value
# such as 0.0497 into 0.05 and wrongly drop it. Rounding is applied only for display.
pvalues = model_pts.model.pvalues
significant_vars = pvalues[pvalues < 0.05].round(3)

# The intercept is not a feature; filter it out by name so nothing is raised
# when 'const' is not among the significant terms.
sig_ivs = [name for name in significant_vars.index if name != 'const']
print(sig_ivs)

# Refit the points model using only the significant features.
dv = 'PTS'
model_pts2 = CTP_LinReg(df, sig_ivs, dv)
model_pts2.build_model()


# Model Interpretation

* r^2 score is very high
* The model's RMSE is 184, which means its predictions are on average about 184 points away from the actual values. That sounds high; however, the average actual value is about 8370, and 184 is about 2% of that, so on average we are about 2% off our target, which is great.

### Coefficient Interpretation

* 3PA seems to be the most important feature when it comes to scoring. This makes sense, as three-pointers are worth the most.
* Next is FTA. Free throws are only worth one point each; however, they are made at such a high rate that this is probably why they carry more weight than 2PA.
* Next is 2PA, which also makes sense, as two-pointers account for the majority of scoring in the NBA.
* The next biggest factor is assists, which also makes sense, as an assist records helping a teammate score.
* What is surprising to me is that ORB (offensive rebounds) actually lowers points scored. This doesn't make much sense at first, but it may be because you can only get an offensive rebound after a MISSED shot: teams that miss more shots will have a higher ORB, and missing shots obviously means scoring fewer points.

### Side note: Interesting Model
* Notice that the coefficients are exactly how many points each type of made shot is worth....

In [None]:
# Side-note model: points scored (PTS) from the '2P', '3P' and 'FT' columns.
# Note that this rebinds the notebook-level names `ivs` and `dv`.
ivs = ['2P', '3P', 'FT']
dv = 'PTS'
trippy = CTP_LinReg(df, ivs, dv)
trippy.build_model()

# Test our model

In [None]:
# Recall which features were kept as significant (the inputs of model_pts2).
print(sig_ivs)

In [None]:
# Load the held-out test split and add the same engineered column as the training frame.
test_df = pd.read_csv('data/NBA_test.csv')
test_df['points_diff'] = test_df['PTS'] - test_df['oppPTS']

# Take the predictors from the fitted model itself rather than a hand-typed list,
# so the test design matrix always matches whatever model_pts2 was fitted on.
fitted_params = model_pts2.model.params
ivs = [name for name in fitted_params.index if name != 'const']
dv = 'PTS'

# has_constant='add' always adds the intercept column (the default 'skip' silently omits it
# when some test column happens to be constant); selecting by the fitted parameter index
# keeps the columns in the same order as the coefficients.
X_test = sm.add_constant(test_df[ivs], has_constant='add')[fitted_params.index]

y_test = test_df[dv]
y_pred = model_pts2.model.predict(X_test).round(1)

avg_error = rmse(y_test, y_pred)
print('Root Mean Squared Error: ', avg_error)

# Express the RMSE relative to the average points scored in the test data.
test_data_pts_avg = test_df[dv].mean()
print('Average Points Scored of test data', test_data_pts_avg)
# Built-in round() works for both plain Python floats and NumPy scalars.
print('RMSE Percentage off average:', round((avg_error / test_data_pts_avg) * 100, 2))
