In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Objective: create a model to predict points scored

In [None]:
# Location of the NBA dataset inside the Kaggle input directory.
NBA_CSV_PATH = '../input/beginner-datasets/beginner_datasets/nba.csv'
df = pd.read_csv(NBA_CSV_PATH)

In [None]:
# Observe the data: preview the first rows to see which columns are available
df.head()

In [None]:
# Check that the data is clean: df.info() reports each column's dtype and
# non-null count, which makes missing values easy to spot
df.info()

In [None]:
#data is mostly clean, some missing values in 3P%

In [None]:
#visualize the data, as we are trying to predict points scored, let's check the
#relations between points scored and some of the variables we think might have the most 
#effect on this: MIN, FGM, FG%, 3PM, 3P%, FTM, FT%

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Scatter of points scored against minutes played, labelled so the figure
# can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='MIN', y='PTS', data=df)
ax.set(xlabel='MIN', ylabel='PTS', title='PTS vs MIN');

In [None]:
# Scatter of points scored against field goals made, labelled so the figure
# can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='FGM', y='PTS', data=df)
ax.set(xlabel='FGM', ylabel='PTS', title='PTS vs FGM');

In [None]:
# Scatter of points scored against field goal percentage, labelled so the
# figure can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='FG%', y='PTS', data=df)
ax.set(xlabel='FG%', ylabel='PTS', title='PTS vs FG%');

In [None]:
# Scatter of points scored against three-pointers made, labelled so the
# figure can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='3P Made', y='PTS', data=df)
ax.set(xlabel='3P Made', ylabel='PTS', title='PTS vs 3P Made');

In [None]:
# Scatter of points scored against three-point percentage, labelled so the
# figure can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='3P%', y='PTS', data=df)
ax.set(xlabel='3P%', ylabel='PTS', title='PTS vs 3P%');

In [None]:
# Scatter of points scored against free throws made, labelled so the figure
# can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='FTM', y='PTS', data=df)
ax.set(xlabel='FTM', ylabel='PTS', title='PTS vs FTM');

In [None]:
# Scatter of points scored against free throw percentage, labelled so the
# figure can be read on its own; the trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(x='FT%', y='PTS', data=df)
ax.set(xlabel='FT%', ylabel='PTS', title='PTS vs FT%');

In [None]:
#After plotting the data we can note a mostly positive linear relationship between PTS and MIN as would 
#be expected, the more minutes you play the more likely you are to score more points.
#We note a near perfectly correlated linear relationship between PTS AND FGM
#A weaker positive association is noted between PTS and FG%, as players can score a low number of 
#points at a high field goal percentage
#The association between PTS and 3P Made and 3P% is weak as players can score a high number of points
#without shooting any three point shots
#FTM and PTS have a strong positive linear relationship
#FT% and PTS have a weaker positive relationship, and percentages are clustered around the 40%-80% range
#for low and high point scorers

In [None]:
#For our MLE model we will use the variables FGM, FTM and 3P Made to predict PTS, as
#FGM and FTM have a strong positive linear correlation with PTS, and 3P Made contributes to the
#points total despite having a weaker relationship on the scatterplot.
#Theoretically a points scored model would be points = 2*FGM + 1*FTM + 3*3P Made
#Let's see if we can replicate this with our practical model from the data

In [None]:
#import linear model package
from sklearn import linear_model

In [None]:
# Build the linear model: ordinary least-squares regression of PTS on the
# three "made shot" counts.
feature_cols = ['FGM', 'FTM', '3P Made']
points = linear_model.LinearRegression()
points.fit(df[feature_cols], df['PTS'])

In [None]:
# Model evaluation
# Print the intercept of the fitted `points` regression
print(points.intercept_)

In [None]:
# Print the fitted coefficients; they are in the order the features were
# passed to fit(): FGM, FTM, 3P Made
print(points.coef_)

In [None]:
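#Interpreting the coefficients
#Holding all other features fixed, a 1 unit increase in FGM
#is associated with a PTS increase of 2
#Holding all other features fixed, a 1 unit increase in FTM
#is associated with a PTS increase of 1
#Holding all other features fixed, a 1 unit increase in 3P Made
#is associated with a PTS increase of 0.99 - This needs to be scaled as it doesn't make
#sense: a 3P Made should be associated with a points increase of 3
#NOTE(review): if FGM already counts made three-pointers (the usual box-score convention), then
#with FGM held fixed each 3P Made adds only 1 extra point on top of the 2 already counted via FGM,
#so a coefficient near 1 would be expected - confirm against the dataset's column definitions.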

In [None]:
# Scale the 3P Made column by 3.
# Keep an untouched copy of the original values the first time this cell runs,
# and always scale from that copy: re-running the cell then yields 3x the raw
# counts every time instead of compounding (3x, 9x, 27x, ...).
if '3P Made (raw)' not in df.columns:
    df['3P Made (raw)'] = df['3P Made']
df['3P Made'] = 3 * df['3P Made (raw)']

In [None]:
# Check to see if it worked: preview the first rows and inspect the
# '3P Made' column after the rescaling
df.head()

In [None]:
# Scaling worked, rebuild the model on the rescaled '3P Made' feature.
# `fit` has to be *called*: assigning a tuple to `newpoints.fit` only
# overwrites the method on this instance and leaves the estimator untrained.
newpoints = linear_model.LinearRegression()
newpoints.fit(df[['FGM','FTM','3P Made']], df.PTS)

In [None]:
#Train Test Split
#Now let's split the data into a training set and a testing set. 
#We will train our model on the training set and then use the test set to evaluate the model.

In [None]:
# Feature matrix (the three "made shot" counts) and target vector (points).
predictor_cols = ['FGM', 'FTM', '3P Made']
X, y = df[predictor_cols], df['PTS']

In [None]:
from sklearn.model_selection import train_test_split

# Fix the seed so the split, and every metric computed from it, is
# reproducible across Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn import metrics
from sklearn.model_selection import cross_val_score

def cross_val(model, features=None, target=None, cv=10):
    """Return the mean cross-validation score of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator passed straight through to ``cross_val_score``.
    features, target : optional
        Data to cross-validate on. When omitted, the notebook-level ``X`` and
        ``y`` are used, which keeps existing ``cross_val(model)`` calls working.
    cv : int, default 10
        Value forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    The ``.mean()`` of the scores returned by ``cross_val_score``.
    """
    # Fall back to the notebook globals only when the caller did not pass data,
    # so the helper no longer depends on hidden state by necessity.
    if features is None:
        features = X
    if target is None:
        target = y
    fold_scores = cross_val_score(model, features, target, cv=cv)
    return fold_scores.mean()

def print_evaluate(true, predicted):
    """Print MAE, MSE, RMSE and R^2 for ``predicted`` against ``true``.

    All metrics are computed first and then printed one per line, followed by
    a separator line of underscores. Returns ``None``.
    """
    squared_error = metrics.mean_squared_error(true, predicted)
    report = [
        ('MAE:', metrics.mean_absolute_error(true, predicted)),
        ('MSE:', squared_error),
        # RMSE is the square root of the MSE computed above.
        ('RMSE:', np.sqrt(squared_error)),
        ('R2 Square', metrics.r2_score(true, predicted)),
    ]
    for label, value in report:
        print(label, value)
    print('__________________________________')
    
def evaluate(true, predicted):
    """Return ``(mae, mse, rmse, r2)`` for ``predicted`` against ``true``.

    The metrics come from ``sklearn.metrics``; RMSE is the square root of the
    mean squared error.
    """
    squared_error = metrics.mean_squared_error(true, predicted)
    return (
        metrics.mean_absolute_error(true, predicted),
        squared_error,
        np.sqrt(squared_error),
        metrics.r2_score(true, predicted),
    )

In [None]:
#Predictions from our model
#Let's grab predictions off of our test set and see how the model performs

In [None]:
# `points` was fit on the full frame before '3P Made' was rescaled, while
# X_test is drawn from the rescaled frame and its rows were part of that fit.
# Predicting with it would mix feature scales and leak test data, so train a
# fresh model on the training split only and predict the held-out rows.
split_model = linear_model.LinearRegression().fit(X_train, y_train)
pred = split_model.predict(X_test)

In [None]:
# Predicted vs actual points on the test set; points close to the diagonal
# indicate accurate predictions. The trailing ';' suppresses the repr output.
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set(xlabel='Actual PTS', ylabel='Predicted PTS', title='Predicted vs actual PTS (test set)');

In [None]:
import seaborn as sns

In [None]:
# Distribution of the test-set residuals (actual minus predicted PTS).
residuals = y_test - pred
sns.displot(residuals, bins=50);

In [None]:
# Evaluate a model trained on the training split only. `points` was fit on the
# full frame before '3P Made' was rescaled, so using it here would mix feature
# scales and score the model on rows it has already seen.
split_model = linear_model.LinearRegression().fit(X_train, y_train)
test_pred = split_model.predict(X_test)
train_pred = split_model.predict(X_train)

print('Test set evaluation:\n_____________________________________')
print_evaluate(y_test, test_pred)
print('Train set evaluation:\n_____________________________________')
print_evaluate(y_train, train_pred)

In [None]:
#R^2 is 0.96, model is accurate.