In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load in data
data = pd.read_csv('Cleaned Data.csv')

In [136]:
# Set up training and testing data
# 2020 will be the testing data. All years before 2020 will be the training data

X_train = data.loc[data['Year']!=2020].drop(['Share','MVP'], axis=1)
X_test = data.loc[data['Year']==2020].drop(['Share','MVP'], axis=1)

y_train = data.loc[data['Year']!=2020, 'Share']
y_test = data.loc[data['Year']==2020, 'Share']

names_test = data.loc[data['Year']==2020, 'Player'].reset_index(drop=True)

In [None]:
#X_train.head()
#X_train.dtypes

In [192]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [5]:
class ColumnExtractor(TransformerMixin):
    
    def __init__(self, cols):
        self.cols = cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        Xcols = X[self.cols]
        return Xcols

In [73]:
#  Numerical features
numeric_columns = X_train.select_dtypes(exclude='object').columns

#  Categorical features
categorical_columns = X_train.select_dtypes('object').columns

#  Define the pipeline to fix the data
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numeric', Pipeline([
            ('extract', ColumnExtractor(numeric_columns)),
            ('scale', MinMaxScaler())
            #('scale', StandardScaler())
        ])),
        ('categorical', Pipeline([
            ('extract', ColumnExtractor(categorical_columns)),
            ('encode', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])),
    ]))
])

# Regression

### Linear Regression

In [91]:
from sklearn.linear_model import LinearRegression

In [92]:
lr_pipeline = Pipeline([('pipeline', pipeline),
                         ('lr', LinearRegression())])

lr_pipeline.fit(X_train, y_train)
;

''

In [193]:
yhat_train = lr_pipeline.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, yhat_train))
print(rmse)
print(r2_score(y_train, yhat_train))
print(mean_absolute_error(y_train, yhat_train))

0.25082797497312215
0.03893070684915245
0.179229628871864


In [150]:
yhat = lr_pipeline.predict(X_test)
pd.concat([names_test,pd.Series(yhat).rename('Predicted Share')], axis=1)

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.8114624
1,LeBron James,0.3086395
2,James Harden,0.6339569
3,Luka Dončić,4461899000000.0
4,Kawhi Leonard,-6370279000.0
5,Anthony Davis,2182202000000.0
6,Chris Paul,0.4285126
7,Damian Lillard,0.08000183
8,Nikola Jokić,0.02507019
9,Pascal Siakam,2723052000000.0


### Lasso

In [38]:
from sklearn.linear_model import LassoCV

In [84]:
lasso_pipeline = Pipeline([('pipeline', pipeline),
                         ('lasso', LassoCV())])

lasso_pipeline.fit(X_train, y_train)
;

''

In [194]:
yhat_train = lasso_pipeline.predict(X_train)

rmse = np.sqrt(mean_squared_error(y_train, yhat_train))
print(rmse)
print(r2_score(y_train, yhat_train))
print(mean_absolute_error(y_train, yhat_train))

0.25361993262591986
0.017416372711779426
0.20236858419540926


In [146]:
yhat = lasso_pipeline.predict(X_test)
pd.concat([names_test,pd.Series(yhat).rename('Predicted Share')], axis=1)

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.398631
1,LeBron James,0.394039
2,James Harden,0.374683
3,Luka Dončić,0.1715
4,Kawhi Leonard,0.15216
5,Anthony Davis,0.260719
6,Chris Paul,0.039916
7,Damian Lillard,0.196596
8,Nikola Jokić,0.156222
9,Pascal Siakam,-0.007998


### KNN

In [65]:
from sklearn.neighbors import KNeighborsRegressor

In [83]:
knn_pipeline = Pipeline([('pipeline', pipeline),
                         ('knn', KNeighborsRegressor())])

knn_pipeline.fit(X_train, y_train)
;

''

In [195]:
yhat_train = knn_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))
print(rmse)
print(r2_score(y_train, yhat_train))
print(mean_absolute_error(y_train, yhat_train))

0.2707033465455976
-0.11941212584848482
0.19350633802816902


In [147]:
yhat = knn_pipeline.predict(X_test)
pd.concat([names_test,pd.Series(yhat).rename('Predicted Share')], axis=1)

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.2042
1,LeBron James,0.3938
2,James Harden,0.6392
3,Luka Dončić,0.0294
4,Kawhi Leonard,0.199
5,Anthony Davis,0.403
6,Chris Paul,0.195
7,Damian Lillard,0.0608
8,Nikola Jokić,0.0534
9,Pascal Siakam,0.0168


### Ridge

In [85]:
from sklearn.linear_model import Ridge

In [86]:
ridge_pipeline = Pipeline([('pipeline', pipeline),
                         ('ridge', Ridge())])

ridge_pipeline.fit(X_train, y_train)
;

''

In [196]:
yhat_train = ridge_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))
print(rmse)
print(r2_score(y_train, yhat_train))
print(mean_absolute_error(y_train, yhat_train))

0.25177505238141085
0.031659385559457465
0.19098163520437195


In [148]:
yhat = ridge_pipeline.predict(X_test)
pd.concat([names_test,pd.Series(yhat).rename('Predicted Share')], axis=1)

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.560758
1,LeBron James,0.376962
2,James Harden,0.50705
3,Luka Dončić,0.0722
4,Kawhi Leonard,0.2128
5,Anthony Davis,0.20175
6,Chris Paul,0.052421
7,Damian Lillard,0.130088
8,Nikola Jokić,0.122743
9,Pascal Siakam,-0.169191


### Trees

In [102]:
from sklearn.tree import DecisionTreeRegressor

In [103]:
tree_pipeline = Pipeline([('pipeline', pipeline),
                         ('tree', DecisionTreeRegressor())])

tree_pipeline.fit(X_train, y_train)
;

''

In [197]:
yhat_train = tree_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))
print(rmse)
print(r2_score(y_train, yhat_train))
print(mean_absolute_error(y_train, yhat_train))

0.2318611015727632
0.17878183484848476
0.1246619718309859


In [149]:
yhat = tree_pipeline.predict(X_test)
pd.concat([names_test,pd.Series(yhat).rename('Predicted Share')], axis=1)

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.393
1,LeBron James,0.029
2,James Harden,0.441
3,Luka Dončić,0.271
4,Kawhi Leonard,0.029
5,Anthony Davis,0.049
6,Chris Paul,0.001
7,Damian Lillard,0.271
8,Nikola Jokić,0.001
9,Pascal Siakam,0.001


# Classification

In [168]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

In [151]:
y_train = data.loc[data['Year']!=2020, 'MVP']
y_test = data.loc[data['Year']==2020, 'MVP']

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [157]:
nb = Pipeline([('pipeline', pipeline),
                ('nb', MultinomialNB())])

nb.fit(X_train, y_train)
;

''

In [177]:
yhat_train = nb.predict(X_train)

print(accuracy_score(y_train, yhat_train))
print(f1_score(y_train, yhat_train))

confusion_matrix(y_train, yhat_train)

0.9295774647887324
0.0


array([[264,   0],
       [ 20,   0]], dtype=int64)

In [153]:
yhat = nb.predict(X_test)
yhat

array([False, False, False, False, False, False, False, False, False,
       False, False, False])

In [159]:
from sklearn.ensemble import RandomForestClassifier

In [186]:
rf = Pipeline([('pipeline', pipeline),
                ('rf', RandomForestClassifier())])

rf.fit(X_train, y_train)
;

''

In [187]:
yhat_train = rf.predict(X_train)

print(accuracy_score(y_train, yhat_train))
print(f1_score(y_train, yhat_train))

confusion_matrix(y_train, yhat_train)

1.0
1.0


array([[264,   0],
       [  0,  20]], dtype=int64)

In [188]:
yhat = rf.predict(X_test)
yhat

array([False, False, False, False, False, False, False, False, False,
       False, False, False])

### Boosted Forest

In [154]:
from sklearn.ensemble import AdaBoostClassifier

In [189]:
ada = Pipeline([('pipeline', pipeline),
                ('ada', AdaBoostClassifier())])

ada.fit(X_train, y_train)
;

''

In [190]:
yhat_train = ada.predict(X_train)

print(accuracy_score(y_train, yhat_train))
print(f1_score(y_train, yhat_train))

confusion_matrix(y_train, yhat_train)

1.0
1.0


array([[264,   0],
       [  0,  20]], dtype=int64)

In [191]:
yhat = ada.predict(X_test)
yhat

array([ True, False, False, False, False, False, False, False, False,
       False, False, False])