In [24]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
%matplotlib inline 

In [4]:
STATS_CSV = "Seasons_Stats.csv"
PLAYER_DATA_CSV = "player_data.csv"
#Read data
stats = pd.read_csv(STATS_CSV)
#Only keep players that have played many minutes, are in the 5 major positions
pos = ['C', 'PF', 'PG', 'SG', 'SF']
stats = stats[stats['MP'] >= 1750]
stats = stats[stats['Pos'].isin(pos)]
#Only keep useful columns
stats = stats[['Pos', 'Age', 'TS%', '3PAr', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TRB%', 'USG%', 'FG%', '3P%', '2P%', 'FT%']]
#Drop rows with missing data
stats = stats.dropna(0)

In [5]:
for p in pos:
    print(p + ": ", stats[stats['Pos'] == p].shape[0])

C:  722
PF:  1010
PG:  1106
SG:  1133
SF:  1130


In [6]:
#Get and normalize the features
X = stats.drop(['Pos'], axis=1)
feature_labels = X.columns
X = X.to_numpy()
#normalize(X, copy=False)
X_scaled = StandardScaler().fit(X).transform(X)

#Get and normalize the target
y = stats['Pos']
for i in range(5):
    y = y.replace(to_replace=pos[i], value=i)
y = y.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=21)

# Linear Kernel

In [9]:
cVals = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

In [16]:
def run_linear_svc(X_tr, y_tr, X_ts, y_ts):
    for C in cVals:
        linear_model = SVC(kernel='linear', C=C).fit(X_tr, y_tr)
        for p, sv in zip(pos,linear_model.n_support_):
            print(p, " support vectors: ", sv)
        train_pred = linear_model.predict(X_tr)
        test_pred = linear_model.predict(X_ts)
        print("C=", C, ", Train Accuracy=", accuracy_score(y_tr,train_pred))
        print(classification_report(y_tr,train_pred,labels=[0,1,2,3,4],target_names=pos))
        print("C=", C, ", Test Accuracy=", accuracy_score(y_ts,test_pred))
        print(classification_report(y_ts,test_pred,labels=[0,1,2,3,4],target_names=pos))

In [17]:
run_linear_svc(X_train, y_train, X_test, y_test)

C  support vectors:  479
PF  support vectors:  672
PG  support vectors:  629
SG  support vectors:  749
SF  support vectors:  762
C= 0.001 , Train Accuracy= 0.6599356160374598
              precision    recall  f1-score   support

           C       0.87      0.34      0.49       492
          PF       0.55      0.74      0.63       673
          PG       0.88      0.83      0.85       741
          SG       0.59      0.81      0.68       749
          SF       0.62      0.48      0.54       762

    accuracy                           0.66      3417
   macro avg       0.70      0.64      0.64      3417
weighted avg       0.69      0.66      0.65      3417

C= 0.001 , Test Accuracy= 0.6656769596199525
              precision    recall  f1-score   support

           C       0.85      0.30      0.44       230
          PF       0.56      0.72      0.63       337
          PG       0.90      0.84      0.87       365
          SG       0.61      0.81      0.69       384
          SF       0

# Radial Basis Kernel

In [25]:
# adopted from JakeVDP
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
rbf_model = make_pipeline(SVC(kernel='rbf', class_weight='balanced'))

In [26]:
grid = GridSearchCV(rbf_model,param_grid)
grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('svc', SVC(class_weight='balanced'))]),
             param_grid={'svc__C': [1, 5, 10, 50],
                         'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]})

In [27]:
print(grid.best_params_)

{'svc__C': 50, 'svc__gamma': 0.005}


In [28]:
rbf_train_pred = grid.predict(X_train)
rbf_test_pred = grid.predict(X_test)

In [33]:
print(classification_report(y_train,rbf_train_pred))

              precision    recall  f1-score   support

           0       0.68      0.76      0.72       492
           1       0.66      0.61      0.64       673
           2       0.89      0.90      0.89       741
           3       0.72      0.72      0.72       749
           4       0.67      0.67      0.67       762

    accuracy                           0.73      3417
   macro avg       0.73      0.73      0.73      3417
weighted avg       0.73      0.73      0.73      3417



In [34]:
print(classification_report(y_test,rbf_test_pred))

              precision    recall  f1-score   support

           0       0.68      0.70      0.69       230
           1       0.64      0.62      0.63       337
           2       0.87      0.90      0.88       365
           3       0.71      0.68      0.70       384
           4       0.64      0.66      0.65       368

    accuracy                           0.71      1684
   macro avg       0.71      0.71      0.71      1684
weighted avg       0.71      0.71      0.71      1684



In [45]:
best_rbf_model = SVC(kernel='rbf', C=50, gamma=0.005, class_weight='balanced')
best_rbf_model.fit(X_train,y_train)

SVC(C=50, class_weight='balanced', gamma=0.005)

In [46]:
best_rbf_model.n_support_

array([299, 573, 206, 521, 623])

# Polynomial Kernel

In [35]:
poly_model = make_pipeline(SVC(kernel='poly', class_weight='balanced'))

In [36]:
poly_grid = GridSearchCV(poly_model,param_grid)
poly_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('svc',
                                        SVC(class_weight='balanced',
                                            kernel='poly'))]),
             param_grid={'svc__C': [1, 5, 10, 50],
                         'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]})

In [37]:
print(poly_grid.best_params_)

{'svc__C': 50, 'svc__gamma': 0.005}


In [38]:
poly_train_pred = poly_grid.predict(X_train)
poly_test_pred = poly_grid.predict(X_test)

In [39]:
print(classification_report(y_train,poly_train_pred))

              precision    recall  f1-score   support

           0       0.83      0.44      0.58       492
           1       0.59      0.31      0.40       673
           2       0.98      0.38      0.55       741
           3       0.45      0.92      0.61       749
           4       0.40      0.52      0.45       762

    accuracy                           0.52      3417
   macro avg       0.65      0.51      0.52      3417
weighted avg       0.63      0.52      0.51      3417



In [40]:
print(classification_report(y_test,poly_test_pred))

              precision    recall  f1-score   support

           0       0.80      0.41      0.54       230
           1       0.57      0.29      0.39       337
           2       0.98      0.43      0.60       365
           3       0.47      0.91      0.62       384
           4       0.38      0.50      0.43       368

    accuracy                           0.52      1684
   macro avg       0.64      0.51      0.52      1684
weighted avg       0.62      0.52      0.52      1684



In [43]:
best_poly_model = SVC(kernel='poly', C=50, gamma=0.005, class_weight='balanced')
best_poly_model.fit(X_train,y_train)

SVC(C=50, class_weight='balanced', gamma=0.005, kernel='poly')

In [44]:
best_poly_model.n_support_

array([388, 673, 684, 749, 762])