In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read 'Cleaned Data.csv' (path is relative to the current working directory)
# into a DataFrame with pandas' default integer index.
data = pd.read_csv(filepath_or_buffer='Cleaned Data.csv')

In [3]:
# Set up training and testing data.
# The TEST_YEAR season is held out for testing; every other row is training data.
# NOTE(review): the training rows are selected with "!= TEST_YEAR", so seasons
# later than TEST_YEAR (if the file ever contains any) would also be trained on —
# confirm the data stops at TEST_YEAR.
TEST_YEAR = 2020

# 'Share' is the regression target and 'MVP' the classification target used
# further down; both are removed from the feature frames.
TARGET_COLUMNS = ['Share', 'MVP']

# One boolean mask drives every split so features, targets and names stay aligned.
is_test = data['Year'] == TEST_YEAR

# NOTE(review): 'Player' and 'Year' are NOT dropped, so they remain features;
# an object-dtype 'Player' column ends up one-hot encoded by the preprocessing
# pipeline — confirm that is intended.
X_train = data.loc[~is_test].drop(columns=TARGET_COLUMNS)
X_test = data.loc[is_test].drop(columns=TARGET_COLUMNS)

y_train = data.loc[~is_test, 'Share']
y_test = data.loc[is_test, 'Share']

# Player names of the test rows, re-indexed from 0 so they line up with the
# positional prediction arrays returned by `.predict(X_test)`.
names_test = data.loc[is_test, 'Player'].reset_index(drop=True)

In [4]:
#X_train.head()
#X_train.dtypes

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

In [6]:
class ColumnExtractor(TransformerMixin, BaseEstimator):
    """Stateless transformer that selects a fixed set of columns from its input.

    Inheriting from ``BaseEstimator`` (listed after the mixin, as scikit-learn
    recommends) provides ``get_params`` / ``set_params``, so the transformer can
    be cloned and used with model-selection utilities.

    Parameters
    ----------
    cols : label or list-like of labels
        Column selection applied as ``X[cols]`` in :meth:`transform`.
    """

    def __init__(self, cols):
        # Stored unmodified: get_params()/clone() read the attribute back by the
        # same name as the constructor argument.
        self.cols = cols

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` (present for pipeline compatibility)."""
        return self

    def transform(self, X):
        """Return ``X[self.cols]``; ``X`` must support that indexing (e.g. a DataFrame)."""
        return X[self.cols]

In [7]:
#  Numerical features: every column whose dtype is not `object`
numeric_columns = X_train.select_dtypes(exclude='object').columns

#  Categorical features: every `object`-dtype column
categorical_columns = X_train.select_dtypes('object').columns

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the notebook runs on both old and new versions.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

#  Define the preprocessing pipeline: both branches receive the full frame,
#  each extracts its own columns, and FeatureUnion concatenates the results.
pipeline = Pipeline([
    ('features', FeatureUnion([
        ('numeric', Pipeline([
            ('extract', ColumnExtractor(numeric_columns)),
            # Min-max scaling is fitted on whatever data the pipeline is fitted on
            ('scale', MinMaxScaler())
        ])),
        ('categorical', Pipeline([
            ('extract', ColumnExtractor(categorical_columns)),
            # Categories unseen during fit are encoded as all zeros, not an error
            ('encode', one_hot_encoder)
        ])),
    ]))
])

# Regression

### Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
# Shared preprocessing followed by a default LinearRegression,
# fitted on the training rows.
lr_pipeline = Pipeline(steps=[
    ('pipeline', pipeline),
    ('lr', LinearRegression()),
])
lr_pipeline.fit(X_train, y_train);

In [10]:
# In-sample metrics: the model is scored on the same rows it was fitted on.
yhat_train = lr_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))

train_scores = (
    ('RMSE:', rmse),
    ('R^2:', r2_score(y_train, yhat_train)),
    ('MAE:', mean_absolute_error(y_train, yhat_train)),
)
for label, score in train_scores:
    print(label, score)

RMSE: 0.1412517006044717
R^2: 0.7460840204920596
MAE: 0.09921997242242517


In [11]:
# Predict the held-out rows and show them next to the player names
# (names_test is indexed from 0, matching the positional prediction array).
yhat = lr_pipeline.predict(X_test)
names_test.to_frame().assign(**{'Predicted Share': yhat})

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.8114624
1,LeBron James,0.3086395
2,James Harden,0.6339569
3,Luka Dončić,4461899000000.0
4,Kawhi Leonard,-6370279000.0
5,Anthony Davis,2182202000000.0
6,Chris Paul,0.4285126
7,Damian Lillard,0.08000183
8,Nikola Jokić,0.02507019
9,Pascal Siakam,2723052000000.0


### Lasso

In [12]:
from sklearn.linear_model import LassoCV

In [13]:
# Shared preprocessing followed by a default LassoCV,
# fitted on the training rows.
lasso_pipeline = Pipeline(steps=[
    ('pipeline', pipeline),
    ('lasso', LassoCV()),
])
lasso_pipeline.fit(X_train, y_train);

In [14]:
# In-sample metrics: the model is scored on the same rows it was fitted on.
yhat_train = lasso_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))

train_scores = (
    ('RMSE:', rmse),
    ('R^2:', r2_score(y_train, yhat_train)),
    ('MAE:', mean_absolute_error(y_train, yhat_train)),
)
for label, score in train_scores:
    print(label, score)

RMSE: 0.195423973913882
R^2: 0.5139752974964701
MAE: 0.14882369082377478


In [15]:
# Predict the held-out rows and show them next to the player names.
yhat = lasso_pipeline.predict(X_test)
names_test.to_frame().assign(**{'Predicted Share': yhat})

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.398631
1,LeBron James,0.394039
2,James Harden,0.374683
3,Luka Dončić,0.1715
4,Kawhi Leonard,0.15216
5,Anthony Davis,0.260719
6,Chris Paul,0.039916
7,Damian Lillard,0.196596
8,Nikola Jokić,0.156222
9,Pascal Siakam,-0.007998


### KNN

In [16]:
from sklearn.neighbors import KNeighborsRegressor

In [17]:
# Shared preprocessing followed by a default KNeighborsRegressor,
# fitted on the training rows.
knn_pipeline = Pipeline(steps=[
    ('pipeline', pipeline),
    ('knn', KNeighborsRegressor()),
])
knn_pipeline.fit(X_train, y_train);

In [18]:
# In-sample metrics: the model is scored on the same rows it was fitted on.
yhat_train = knn_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))

train_scores = (
    ('RMSE:', rmse),
    ('R^2:', r2_score(y_train, yhat_train)),
    ('MAE:', mean_absolute_error(y_train, yhat_train)),
)
for label, score in train_scores:
    print(label, score)

RMSE: 0.2005289810813418
R^2: 0.4882510542819948
MAE: 0.1366823943661972


In [19]:
# Predict the held-out rows and show them next to the player names.
yhat = knn_pipeline.predict(X_test)
names_test.to_frame().assign(**{'Predicted Share': yhat})

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.2042
1,LeBron James,0.3938
2,James Harden,0.6392
3,Luka Dončić,0.0294
4,Kawhi Leonard,0.199
5,Anthony Davis,0.403
6,Chris Paul,0.195
7,Damian Lillard,0.0608
8,Nikola Jokić,0.0534
9,Pascal Siakam,0.0168


### Ridge

In [20]:
from sklearn.linear_model import Ridge

In [21]:
# Shared preprocessing followed by a default Ridge regression,
# fitted on the training rows.
ridge_pipeline = Pipeline(steps=[
    ('pipeline', pipeline),
    ('ridge', Ridge()),
])
ridge_pipeline.fit(X_train, y_train);

In [22]:
# In-sample metrics: the model is scored on the same rows it was fitted on.
yhat_train = ridge_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))

train_scores = (
    ('RMSE:', rmse),
    ('R^2:', r2_score(y_train, yhat_train)),
    ('MAE:', mean_absolute_error(y_train, yhat_train)),
)
for label, score in train_scores:
    print(label, score)

RMSE: 0.15907222121209463
R^2: 0.6779736917654926
MAE: 0.117696635051458


In [23]:
# Predict the held-out rows and show them next to the player names.
yhat = ridge_pipeline.predict(X_test)
names_test.to_frame().assign(**{'Predicted Share': yhat})

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.560758
1,LeBron James,0.376962
2,James Harden,0.50705
3,Luka Dončić,0.0722
4,Kawhi Leonard,0.2128
5,Anthony Davis,0.20175
6,Chris Paul,0.052421
7,Damian Lillard,0.130088
8,Nikola Jokić,0.122743
9,Pascal Siakam,-0.169191


### Trees

In [24]:
from sklearn.tree import DecisionTreeRegressor

In [25]:
# Shared preprocessing followed by a decision-tree regressor.
# random_state is fixed because the tree permutes features at every split;
# without a seed, ties between equally good splits can be broken differently
# on each run, so Restart & Run All would not reproduce the same predictions.
tree_pipeline = Pipeline([('pipeline', pipeline),
                         ('tree', DecisionTreeRegressor(random_state=0))])

tree_pipeline.fit(X_train, y_train);

In [26]:
# In-sample metrics: the tree is scored on the same rows it was fitted on,
# so these numbers say nothing about how it generalises.
yhat_train = tree_pipeline.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, yhat_train))

train_scores = (
    ('RMSE:', rmse),
    ('R^2:', r2_score(y_train, yhat_train)),
    ('MAE:', mean_absolute_error(y_train, yhat_train)),
)
for label, score in train_scores:
    print(label, score)

RMSE: 1.8910716083357921e-19
R^2: 1.0
MAE: 5.497363128095515e-20


In [27]:
# Predict the held-out rows and show them next to the player names.
yhat = tree_pipeline.predict(X_test)
names_test.to_frame().assign(**{'Predicted Share': yhat})

Unnamed: 0,Player,Predicted Share
0,Giannis Antetokounmpo,0.271
1,LeBron James,0.013
2,James Harden,0.441
3,Luka Dončić,0.271
4,Kawhi Leonard,0.001
5,Anthony Davis,0.049
6,Chris Paul,0.001
7,Damian Lillard,0.393
8,Nikola Jokić,0.001
9,Pascal Siakam,0.001


# Classification

In [28]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

In [29]:
# Switch the targets from 'Share' to the 'MVP' label, using the same 2020 split.
# Note: this rebinds y_train / y_test, so re-running a regression cell after
# this one would fit against 'MVP' instead of 'Share'.
train_rows = data['Year'].ne(2020)

y_train = data.loc[train_rows, 'MVP']
y_test = data.loc[~train_rows, 'MVP']

### Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB

In [31]:
# Shared preprocessing followed by a default MultinomialNB classifier,
# fitted on the training rows.
nb = Pipeline(steps=[
    ('pipeline', pipeline),
    ('nb', MultinomialNB()),
])
nb.fit(X_train, y_train);

In [32]:
# In-sample classification metrics (scored on the rows used for fitting).
yhat_train = nb.predict(X_train)

for label, metric in (('Accuracy Score:', accuracy_score),
                      ('F1 Score:', f1_score)):
    print(label, metric(y_train, yhat_train))

# The matrix is the cell's last expression so the notebook displays it.
print('Confusion Matrix:')
confusion_matrix(y_train, yhat_train)

Accuracy Score: 0.9295774647887324
F1 Score: 0.0
Confusion Matrix:


array([[264,   0],
       [ 20,   0]], dtype=int64)

In [33]:
# Predict the held-out rows and show the label next to each player name.
yhat = nb.predict(X_test)
names_test.to_frame().assign(MVP=yhat)

Unnamed: 0,Player,MVP
0,Giannis Antetokounmpo,False
1,LeBron James,False
2,James Harden,False
3,Luka Dončić,False
4,Kawhi Leonard,False
5,Anthony Davis,False
6,Chris Paul,False
7,Damian Lillard,False
8,Nikola Jokić,False
9,Pascal Siakam,False


### Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
# Shared preprocessing followed by a random-forest classifier.
# random_state is fixed because the forest draws bootstrap samples and random
# feature subsets; without a seed every run trains a different model and the
# predictions below could change between Restart & Run All executions.
rf = Pipeline([('pipeline', pipeline),
                ('rf', RandomForestClassifier(random_state=0))])

rf.fit(X_train, y_train);

In [36]:
# In-sample classification metrics (scored on the rows used for fitting).
yhat_train = rf.predict(X_train)

for label, metric in (('Accuracy Score:', accuracy_score),
                      ('F1 Score:', f1_score)):
    print(label, metric(y_train, yhat_train))

# The matrix is the cell's last expression so the notebook displays it.
print('Confusion Matrix:')
confusion_matrix(y_train, yhat_train)

Accuracy Score: 1.0
F1 Score: 1.0
Confusion Matrix:


array([[264,   0],
       [  0,  20]], dtype=int64)

In [37]:
# Predict the held-out rows and show the label next to each player name.
yhat = rf.predict(X_test)
names_test.to_frame().assign(MVP=yhat)

Unnamed: 0,Player,MVP
0,Giannis Antetokounmpo,False
1,LeBron James,False
2,James Harden,False
3,Luka Dončić,False
4,Kawhi Leonard,False
5,Anthony Davis,False
6,Chris Paul,False
7,Damian Lillard,False
8,Nikola Jokić,False
9,Pascal Siakam,False


### AdaBoost (Boosted Trees)

In [38]:
from sklearn.ensemble import AdaBoostClassifier

In [39]:
# Shared preprocessing followed by an AdaBoost classifier.
# random_state is fixed so the boosted ensemble (and therefore the predicted
# MVP below) is reproducible across Restart & Run All executions.
ada = Pipeline([('pipeline', pipeline),
                ('ada', AdaBoostClassifier(random_state=0))])

ada.fit(X_train, y_train);

In [40]:
# In-sample classification metrics (scored on the rows used for fitting).
yhat_train = ada.predict(X_train)

for label, metric in (('Accuracy Score:', accuracy_score),
                      ('F1 Score:', f1_score)):
    print(label, metric(y_train, yhat_train))

# The matrix is the cell's last expression so the notebook displays it.
print('Confusion Matrix:')
confusion_matrix(y_train, yhat_train)

Accuracy Score: 1.0
F1 Score: 1.0
Confusion Matrix:


array([[264,   0],
       [  0,  20]], dtype=int64)

In [41]:
# Predict the held-out rows and show the label next to each player name.
yhat = ada.predict(X_test)
names_test.to_frame().assign(MVP=yhat)

Unnamed: 0,Player,MVP
0,Giannis Antetokounmpo,True
1,LeBron James,False
2,James Harden,False
3,Luka Dončić,False
4,Kawhi Leonard,False
5,Anthony Davis,False
6,Chris Paul,False
7,Damian Lillard,False
8,Nikola Jokić,False
9,Pascal Siakam,False
