# Running Classification Models
by Rachel Shurberg

1. [Import and Check the Data](#Import)
2. [Run the Classifiers](#Class)
3. [K-Nearest Neighbors](#KNN)
4. [Random Forest Decision Trees](#Random)
5. [Decision Tree Classifier](#Decision)
6. [Gaussian Naive Bayes](#Naive)

# 1. Import and Check the Data <a class="anchor" id="Import"></a>
The data is imported and checked to make sure there are no missing values. The data is split into x,y data sets and converted to arrays.

### Import the 4 bucket data sets

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree  # Using sklearn Decision Tree classifier
from sklearn import ensemble  # Using sklearn Random Forest classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn import tree

# Read in the 4 bucket data sets.
# Alternative inputs (the plain bucket files), kept for reference:
#b1 = pd.read_csv('b1.csv')
#b2 = pd.read_csv('b2.csv')
#b3 = pd.read_csv('b3.csv')
#b4 = pd.read_csv('b4.csv')

# Inputs actually used: the "*Mean.csv" variant of each bucket.
# NOTE(review): presumably these are mean-imputed versions of the files above
# (the missing-value counts below are all 0) -- confirm.
b1 = pd.read_csv('b1Mean.csv')
b2 = pd.read_csv('b2Mean.csv')
b3 = pd.read_csv('b3Mean.csv')
b4 = pd.read_csv('b4Mean.csv')

# Seed NumPy's global random number generator for repeatable runs.
# NOTE(review): the train_test_split calls later pass no random_state, so they
# presumably draw from this global state -- confirm.
np.random.seed(10)

Count the number of missing data values in each bucket:

In [2]:
nan_count1 = b1.isna().sum().sum()
nan_count2 = b2.isna().sum().sum()
nan_count3 = b3.isna().sum().sum()
nan_count4 = b4.isna().sum().sum()
print(nan_count1)
print(nan_count2)
print(nan_count3)
print(nan_count4)

0
0
0
0


Get the shape of each data set:

In [2]:
print(b1.shape)
print(b2.shape)
print(b3.shape)
print(b4.shape)

(591, 15)
(823, 15)
(1040, 15)
(794, 15)


Drop any players with empty values:

In [134]:
# Drop every row (player) that has a missing value in any column.
# The full bucket frames b1..b4 are used here because the feature-only frames
# xb1..xb4 are not created until later in the notebook: referencing them at
# this point raises NameError on a fresh top-to-bottom run. Dropping rows
# before the target column is split off also keeps features and labels aligned.
xb1Drop = b1.dropna(axis=0)
xb2Drop = b2.dropna(axis=0)
xb3Drop = b3.dropna(axis=0)
xb4Drop = b4.dropna(axis=0)

Check that all missing values are dropped:

In [135]:
# Re-count the missing values that remain in each bucket after the drop;
# every count is expected to be 0.
nan_count1 = xb1Drop.isna().sum().sum()
nan_count2 = xb2Drop.isna().sum().sum()
nan_count3 = xb3Drop.isna().sum().sum()
nan_count4 = xb4Drop.isna().sum().sum()
# Print each bucket's own count so that all four buckets are actually verified.
print(nan_count1)
print(nan_count2)
print(nan_count3)
print(nan_count4)

0
0
0
0


Get the dependent variable for each bucket:

In [15]:
#yb1=xb1Drop['CollegePlayerType']
#yb2=xb2Drop['CollegePlayerType']
#yb3=xb3Drop['CollegePlayerType']
#yb4=xb4Drop['CollegePlayerType']
yb1=b1['CollegePlayerType']
yb2=b2['CollegePlayerType']
yb3=b3['CollegePlayerType']
yb4=b4['CollegePlayerType']

Drop the dependent variable for each bucket:

In [16]:
#xb1=xb1Drop.drop(['CollegePlayerType'], axis=1)
#xb2=xb2Drop.drop(['CollegePlayerType'], axis=1)
#xb3=xb3Drop.drop(['CollegePlayerType'], axis=1)
#xb4=xb4Drop.drop(['CollegePlayerType'], axis=1)

xb1=b1.drop(['CollegePlayerType','Unnamed: 0'], axis=1)
xb2=b2.drop(['CollegePlayerType','Unnamed: 0'], axis=1)
xb3=b3.drop(['CollegePlayerType','Unnamed: 0'], axis=1)
xb4=b4.drop(['CollegePlayerType','Unnamed: 0'], axis=1)

Convert the data sets to arrays:

In [17]:
xb1Arr=xb1.to_numpy()
yb1Arr=yb1.to_numpy()
xb2Arr=xb2.to_numpy()
yb2Arr=yb2.to_numpy()
xb3Arr=xb3.to_numpy()
yb3Arr=yb3.to_numpy()
xb4Arr=xb4.to_numpy()
yb4Arr=yb4.to_numpy()

Check the shape of the remaining arrays:

In [18]:
print(xb1Arr.shape)
print(xb2Arr.shape)
print(xb3Arr.shape)
print(xb4Arr.shape)

(591, 13)
(823, 13)
(1040, 13)
(794, 13)


In [19]:
xb1.columns

Index(['Plane', 'Connection', 'Rotation', 'BatSpeed', 'RotationalAcceleration',
       'OnPlaneEfficiency', 'Power', 'TimeToContact', 'PeakHandSpeed', 'H1',
       'HH', 'armVelo', 'exitVelo'],
      dtype='object')

Count the number of each type of player in each bucket:

In [20]:
yb1.value_counts()

Role Player      248
Starter          232
Impact Player    111
Name: CollegePlayerType, dtype: int64

In [21]:
yb2.value_counts()

Starter          344
Role Player      323
Impact Player    156
Name: CollegePlayerType, dtype: int64

In [22]:
yb3.value_counts()

Starter          446
Role Player      387
Impact Player    207
Name: CollegePlayerType, dtype: int64

In [23]:
yb4.value_counts()

Starter          327
Role Player      297
Impact Player    170
Name: CollegePlayerType, dtype: int64

### Split each bucket into training, validation, and testing data:

In [24]:
# Using train_test_split to generate training, validation, and test data.
# Step 1: keep 60% of each bucket for training and hold out the other 40%.
XTrainb1,XTestb1,yTrainb1,yTestb1=train_test_split(xb1Arr,yb1Arr,train_size=.6,test_size=.4)
XTrainb2,XTestb2,yTrainb2,yTestb2=train_test_split(xb2Arr,yb2Arr,train_size=.6,test_size=.4)
XTrainb3,XTestb3,yTrainb3,yTestb3=train_test_split(xb3Arr,yb3Arr,train_size=.6,test_size=.4)
XTrainb4,XTestb4,yTrainb4,yTestb4=train_test_split(xb4Arr,yb4Arr,train_size=.6,test_size=.4)

# Step 2: split each 40% hold-out in half, giving validation and test sets of
# about 20% of the bucket each. XTestb*/yTestb* are overwritten here with the
# final test halves.
XValidb1,XTestb1,yValidb1,yTestb1=train_test_split(XTestb1,yTestb1,train_size=.5,test_size=.5)
XValidb2,XTestb2,yValidb2,yTestb2=train_test_split(XTestb2,yTestb2,train_size=.5,test_size=.5)
XValidb3,XTestb3,yValidb3,yTestb3=train_test_split(XTestb3,yTestb3,train_size=.5,test_size=.5)
XValidb4,XTestb4,yValidb4,yTestb4=train_test_split(XTestb4,yTestb4,train_size=.5,test_size=.5)

### Function that prints the accuracy for each type of player (the fraction of players of that actual type that were predicted correctly):

In [25]:
def roleAccuracy(yActual,yPred):
    """Print the per-class accuracy of a set of predictions.

    For each player type, one line of the form ``"<type>: <accuracy>"`` is
    printed, where the accuracy is the fraction of samples whose actual label
    is that type and whose predicted label matches it.

    The three known types ("Role Player", "Impact Player", "Starter") are
    always reported, in that order. Any other label found in ``yActual`` is
    reported after them. A type with no actual samples has an undefined
    accuracy and is reported as ``nan`` instead of raising ZeroDivisionError.

    Parameters
    ----------
    yActual : sequence of labels
        True player type of each sample.
    yPred : sequence of labels
        Predicted player type of each sample, indexed by the same positions.

    Returns
    -------
    None
        The report is printed, so call this function directly rather than
        wrapping it in ``print``.
    """
    # label -> [number of actual samples, number of those predicted correctly]
    accurDict={"Role Player":[0,0], "Impact Player":[0,0], "Starter":[0,0]}
    for i, actual in enumerate(yActual):
        # setdefault lets labels outside the three known types be tallied
        # instead of raising KeyError.
        tally=accurDict.setdefault(actual,[0,0])
        tally[0]+=1
        if actual==yPred[i]:
            tally[1]+=1
    for player,(total,correct) in accurDict.items():
        # Guard the division: a type may be absent from this split.
        accur=correct/total if total else float("nan")
        print(f"{player}: {accur}")

# 2. Run the Classifiers <a class="anchor" id="Class"></a>

## 3. K-Nearest Neighbors (KNN) Classifier <a class="anchor" id="KNN"></a>

In [26]:
from sklearn.neighbors import KNeighborsClassifier
def sklearn_knn_predict(trainX, trainy, testX, distance_metric, k):
    """Build a brute-force k-nearest-neighbors classifier and fit it.

    Despite the name, no prediction is made here: ``testX`` is accepted but
    never used, and the fitted classifier itself is returned so that the
    caller can invoke ``score`` or ``predict`` on it.

    Parameters: ``trainX``/``trainy`` are the training features and labels,
    ``distance_metric`` is the metric name handed to the classifier, and
    ``k`` is the number of neighbors.
    """
    knn_settings = {
        "n_neighbors": k,
        "algorithm": "brute",
        "metric": distance_metric,
    }
    fitted = KNeighborsClassifier(**knn_settings)
    fitted.fit(trainX, trainy)
    return fitted

In [27]:
def knn_grid_search(trainX, trainy, validationX, validationy, distance_metric_list, n_neighbors_list):
    """Score a KNN classifier for every (metric, k) combination.

    Each metric in ``distance_metric_list`` is paired with each k in
    ``n_neighbors_list``. For every pair a brute-force KNN classifier is fit
    on the training data and scored on the validation data.

    Returns a dictionary keyed by the ``(metric, k)`` tuple whose value is the
    validation accuracy obtained with those hyperparameters. Entries are
    inserted metric by metric, with k varying fastest.
    """
    def _validation_accuracy(metric_name, k):
        # Fit on the training split, then score on the held-out validation split.
        knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute', metric=metric_name)
        knn.fit(trainX, trainy)
        return knn.score(validationX, validationy)

    return {
        (metric_name, k): _validation_accuracy(metric_name, k)
        for metric_name in distance_metric_list
        for k in n_neighbors_list
    }

### a.) Bucket 1:

### Grid search with different distance metrics and k values:

In [31]:
# Hyperparameter search scored on the held-out validation split (a single
# split, not k-fold cross-validation): tries both distance metrics with every
# odd k from 1 to 23 and shows the validation accuracy of each pair.
grid=knn_grid_search(XTrainb1,yTrainb1,XValidb1,yValidb1,['euclidean','manhattan'],[1,3,5,7,9,11,13,15,17,19,21,23])
grid

{('euclidean', 1): 0.3389830508474576,
 ('euclidean', 3): 0.3728813559322034,
 ('euclidean', 5): 0.3644067796610169,
 ('euclidean', 7): 0.3728813559322034,
 ('euclidean', 9): 0.3898305084745763,
 ('euclidean', 11): 0.3728813559322034,
 ('euclidean', 13): 0.3728813559322034,
 ('euclidean', 15): 0.3644067796610169,
 ('euclidean', 17): 0.3389830508474576,
 ('euclidean', 19): 0.3813559322033898,
 ('euclidean', 21): 0.3898305084745763,
 ('euclidean', 23): 0.3644067796610169,
 ('manhattan', 1): 0.3474576271186441,
 ('manhattan', 3): 0.3898305084745763,
 ('manhattan', 5): 0.3728813559322034,
 ('manhattan', 7): 0.3644067796610169,
 ('manhattan', 9): 0.3644067796610169,
 ('manhattan', 11): 0.423728813559322,
 ('manhattan', 13): 0.3389830508474576,
 ('manhattan', 15): 0.3559322033898305,
 ('manhattan', 17): 0.3983050847457627,
 ('manhattan', 19): 0.4067796610169492,
 ('manhattan', 21): 0.3983050847457627,
 ('manhattan', 23): 0.3728813559322034}

In [32]:
# manhattan, k=11 has highest accuracy
print(max(grid.values()))

0.423728813559322


### b.) Bucket 2:

In [33]:
grid=knn_grid_search(XTrainb2,yTrainb2,XValidb2,yValidb2,['euclidean','manhattan'],[1,3,5,7,9,11,13,15,17,19,21,23])
grid

{('euclidean', 1): 0.3878787878787879,
 ('euclidean', 3): 0.3333333333333333,
 ('euclidean', 5): 0.3939393939393939,
 ('euclidean', 7): 0.42424242424242425,
 ('euclidean', 9): 0.45454545454545453,
 ('euclidean', 11): 0.38181818181818183,
 ('euclidean', 13): 0.3696969696969697,
 ('euclidean', 15): 0.3878787878787879,
 ('euclidean', 17): 0.4121212121212121,
 ('euclidean', 19): 0.4,
 ('euclidean', 21): 0.40606060606060607,
 ('euclidean', 23): 0.4121212121212121,
 ('manhattan', 1): 0.3878787878787879,
 ('manhattan', 3): 0.34545454545454546,
 ('manhattan', 5): 0.37575757575757573,
 ('manhattan', 7): 0.3696969696969697,
 ('manhattan', 9): 0.4,
 ('manhattan', 11): 0.3939393939393939,
 ('manhattan', 13): 0.3696969696969697,
 ('manhattan', 15): 0.36363636363636365,
 ('manhattan', 17): 0.3878787878787879,
 ('manhattan', 19): 0.38181818181818183,
 ('manhattan', 21): 0.3939393939393939,
 ('manhattan', 23): 0.40606060606060607}

In [34]:
# Euclidean, k=9 is best
print(max(grid.values()))

0.45454545454545453


### c.) Bucket 3:

In [35]:
grid=knn_grid_search(XTrainb3,yTrainb3,XValidb3,yValidb3,['euclidean','manhattan'],[1,3,5,7,9,11,13,15,17,19,21,23])
grid

{('euclidean', 1): 0.40865384615384615,
 ('euclidean', 3): 0.40384615384615385,
 ('euclidean', 5): 0.4519230769230769,
 ('euclidean', 7): 0.47596153846153844,
 ('euclidean', 9): 0.4182692307692308,
 ('euclidean', 11): 0.4807692307692308,
 ('euclidean', 13): 0.5096153846153846,
 ('euclidean', 15): 0.4807692307692308,
 ('euclidean', 17): 0.4855769230769231,
 ('euclidean', 19): 0.4855769230769231,
 ('euclidean', 21): 0.49038461538461536,
 ('euclidean', 23): 0.49038461538461536,
 ('manhattan', 1): 0.38461538461538464,
 ('manhattan', 3): 0.3894230769230769,
 ('manhattan', 5): 0.4375,
 ('manhattan', 7): 0.4519230769230769,
 ('manhattan', 9): 0.47115384615384615,
 ('manhattan', 11): 0.4567307692307692,
 ('manhattan', 13): 0.46153846153846156,
 ('manhattan', 15): 0.46153846153846156,
 ('manhattan', 17): 0.4423076923076923,
 ('manhattan', 19): 0.46153846153846156,
 ('manhattan', 21): 0.46634615384615385,
 ('manhattan', 23): 0.4951923076923077}

In [36]:
# Euclidean, k=13 is best
print(max(grid.values()))

0.5096153846153846


### d.) Bucket 4:

In [37]:
grid=knn_grid_search(XTrainb4,yTrainb4,XValidb4,yValidb4,['euclidean','manhattan'],[1,3,5,7,9,11,13,15,17,19,21,23])
grid

{('euclidean', 1): 0.3710691823899371,
 ('euclidean', 3): 0.3584905660377358,
 ('euclidean', 5): 0.3836477987421384,
 ('euclidean', 7): 0.389937106918239,
 ('euclidean', 9): 0.3836477987421384,
 ('euclidean', 11): 0.4025157232704403,
 ('euclidean', 13): 0.4025157232704403,
 ('euclidean', 15): 0.4528301886792453,
 ('euclidean', 17): 0.4528301886792453,
 ('euclidean', 19): 0.4591194968553459,
 ('euclidean', 21): 0.4339622641509434,
 ('euclidean', 23): 0.4528301886792453,
 ('manhattan', 1): 0.3522012578616352,
 ('manhattan', 3): 0.2830188679245283,
 ('manhattan', 5): 0.37735849056603776,
 ('manhattan', 7): 0.3710691823899371,
 ('manhattan', 9): 0.3584905660377358,
 ('manhattan', 11): 0.3836477987421384,
 ('manhattan', 13): 0.4276729559748428,
 ('manhattan', 15): 0.41509433962264153,
 ('manhattan', 17): 0.44654088050314467,
 ('manhattan', 19): 0.44025157232704404,
 ('manhattan', 21): 0.41509433962264153,
 ('manhattan', 23): 0.4276729559748428}

In [38]:
# Euclidean, k=19
print(max(grid.values()))

0.4591194968553459


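### Fit the models with one shared setting: Euclidean, k=13

Euclidean with k=13 was the best pair for Bucket 3 only; the grid searches above favored Manhattan k=11 for Bucket 1, Euclidean k=9 for Bucket 2, and Euclidean k=19 for Bucket 4.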

In [39]:
b1Model=sklearn_knn_predict(XTrainb1,yTrainb1,XTestb1,'euclidean',13)
print('Training Accuracy: '+str(b1Model.score(XTrainb1,yTrainb1)))
print('Validation Accuracy: '+str(b1Model.score(XValidb1,yValidb1)))
print('Testing Accuracy: '+str(b1Model.score(XTestb1,yTestb1)))

testPred = b1Model.predict(XTestb1)

Training Accuracy: 0.5225988700564972
Validation Accuracy: 0.3728813559322034
Testing Accuracy: 0.46218487394957986


In [40]:
roleAccuracy(yTestb1, testPred)

Role Player: 0.5714285714285714
Impact Player: 0.13043478260869565
Starter: 0.5106382978723404


In [41]:
# Fit the Bucket 2 KNN model (Euclidean distance, k=13) and report its
# accuracy on the training, validation, and test splits.
b2Model=sklearn_knn_predict(XTrainb2,yTrainb2,XTestb2,'euclidean',13)
testPred = b2Model.predict(XTestb2)
print('Training Accuracy: '+str(b2Model.score(XTrainb2,yTrainb2)))
print('Validation Accuracy: '+str(b2Model.score(XValidb2,yValidb2)))
print('Testing Accuracy: '+str(b2Model.score(XTestb2,yTestb2)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb2, testPred)

Training Accuracy: 0.5436105476673428
Validation Accuracy: 0.3696969696969697
Testing Accuracy: 0.42424242424242425
Role Player: 0.5072463768115942
Impact Player: 0.038461538461538464
Starter: 0.4857142857142857
None


In [42]:
# Fit the Bucket 3 KNN model (Euclidean distance, k=13) and report its
# accuracy on the training, validation, and test splits.
b3Model=sklearn_knn_predict(XTrainb3,yTrainb3,XTestb3,'euclidean',13)
testPred = b3Model.predict(XTestb3)
print('Training Accuracy: '+str(b3Model.score(XTrainb3,yTrainb3)))
print('Validation Accuracy: '+str(b3Model.score(XValidb3,yValidb3)))
print('Testing Accuracy: '+str(b3Model.score(XTestb3,yTestb3)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb3, testPred)

Training Accuracy: 0.5528846153846154
Validation Accuracy: 0.5096153846153846
Testing Accuracy: 0.4182692307692308
Role Player: 0.42857142857142855
Impact Player: 0.07317073170731707
Starter: 0.5783132530120482
None


In [43]:
# Fit the Bucket 4 KNN model (Euclidean distance, k=13) and report its
# accuracy on the training, validation, and test splits.
b4Model=sklearn_knn_predict(XTrainb4,yTrainb4,XTestb4,'euclidean',13)
testPred = b4Model.predict(XTestb4)
print('Training Accuracy: '+str(b4Model.score(XTrainb4,yTrainb4)))
print('Validation Accuracy: '+str(b4Model.score(XValidb4,yValidb4)))
print('Testing Accuracy: '+str(b4Model.score(XTestb4,yTestb4)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb4, testPred)

Training Accuracy: 0.5483193277310925
Validation Accuracy: 0.4025157232704403
Testing Accuracy: 0.4025157232704403
Role Player: 0.4166666666666667
Impact Player: 0.1388888888888889
Starter: 0.5396825396825397
None


## 4. Random Forest Decision Trees <a class="anchor" id="Random"></a>
Found in exercise 1
1600 is the best `n_estimators` parameter.

In [44]:
np.random.seed(10)

### a.) Bucket 1

In [45]:
nEstimates=[200,400,600,800,1000,1200,1400,1600,1800,2000,2200]

for n in nEstimates:
    clf = ensemble.RandomForestClassifier(n_estimators=n) 
    clf.fit(XTrainb1, yTrainb1)
    print("Validation accuracy "+str(n)+": "+str(clf.score(XValidb1,yValidb1)))
# 600, 1000,1200,1600, 2200

Validation accuracy 200: 0.4152542372881356
Validation accuracy 400: 0.4491525423728814
Validation accuracy 600: 0.4661016949152542
Validation accuracy 800: 0.4576271186440678
Validation accuracy 1000: 0.4661016949152542
Validation accuracy 1200: 0.4661016949152542
Validation accuracy 1400: 0.4491525423728814
Validation accuracy 1600: 0.4661016949152542
Validation accuracy 1800: 0.4491525423728814
Validation accuracy 2000: 0.4491525423728814
Validation accuracy 2200: 0.4661016949152542


In [98]:
# Notes / TODO:
# show train, validation, test accuracies
# see where the tree is splitting first: those are most important features
# Present results in table, balance with visualizations
# Include more major takeaways
# random forest account for how correlated variables actually are, less biased
# decision tree could be using the same variables each time
# Kmeans clustering, tried to do it by predicting bucket

# Fit a random forest with n_estimators=600 on Bucket 1 and report its
# accuracy on the training, validation, and test splits.
clf = ensemble.RandomForestClassifier(n_estimators=600)
clf.fit(XTrainb1, yTrainb1)
testPred=clf.predict(XTestb1)
print("Training accuracy: "+str(clf.score(XTrainb1,yTrainb1)))
print("Validation accuracy: "+str(clf.score(XValidb1,yValidb1)))
print("Testing accuracy: "+str(clf.score(XTestb1,yTestb1)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb1, testPred)

Training accuracy: 1.0
Validation accuracy: 0.4576271186440678
Testing accuracy: 0.44537815126050423
Role Player: 0.5102040816326531
Impact Player: 0.13043478260869565
Starter: 0.5319148936170213
None


In [100]:
# Baseline for comparison: a random forest with the library's default
# settings, fit on Bucket 1.
clf = ensemble.RandomForestClassifier()
clf.fit(XTrainb1, yTrainb1)
testPred=clf.predict(XTestb1)
print("Training accuracy: "+str(clf.score(XTrainb1,yTrainb1)))
print("Validation accuracy: "+str(clf.score(XValidb1,yValidb1)))
print("Testing accuracy: "+str(clf.score(XTestb1,yTestb1)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb1, testPred)

Training accuracy: 1.0
Validation accuracy: 0.4406779661016949
Testing accuracy: 0.453781512605042
Role Player: 0.4897959183673469
Impact Player: 0.21739130434782608
Starter: 0.5319148936170213
None


### b.) Bucket 2:

In [46]:
nEstimates=[200,400,600,800,1000,1200,1400,1600,1800,2000,2200]

for n in nEstimates:
    clf = ensemble.RandomForestClassifier(n_estimators=n) 
    clf.fit(XTrainb2, yTrainb2)
    print("Validation accuracy "+str(n)+": "+str(clf.score(XValidb2,yValidb2)))

# 200, 400, 600, 1800

Validation accuracy 200: 0.4666666666666667
Validation accuracy 400: 0.4666666666666667
Validation accuracy 600: 0.4666666666666667
Validation accuracy 800: 0.4303030303030303
Validation accuracy 1000: 0.45454545454545453
Validation accuracy 1200: 0.44242424242424244
Validation accuracy 1400: 0.45454545454545453
Validation accuracy 1600: 0.4484848484848485
Validation accuracy 1800: 0.4666666666666667
Validation accuracy 2000: 0.45454545454545453
Validation accuracy 2200: 0.46060606060606063


In [101]:
# Fit a random forest with n_estimators=600 on Bucket 2 and report its
# accuracy on the training, validation, and test splits.
clf = ensemble.RandomForestClassifier(n_estimators=600)
clf.fit(XTrainb2, yTrainb2)
testPred=clf.predict(XTestb2)

print("Training accuracy: "+str(clf.score(XTrainb2,yTrainb2)))
print("Validation accuracy: "+str(clf.score(XValidb2,yValidb2)))
print("Testing accuracy: "+str(clf.score(XTestb2,yTestb2)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb2, testPred)

Training accuracy: 1.0
Validation accuracy: 0.4727272727272727
Testing accuracy: 0.4666666666666667
Role Player: 0.6086956521739131
Impact Player: 0.11538461538461539
Starter: 0.45714285714285713
None


In [53]:
clf = ensemble.RandomForestClassifier() 
clf.fit(XTrainb2, yTrainb2)

print("Training accuracy: "+str(clf.score(XTrainb2,yTrainb2)))
print("Validation accuracy: "+str(clf.score(XValidb2,yValidb2)))
print("Testing accuracy: "+str(clf.score(XTestb2,yTestb2)))

Training accuracy: 1.0
Validation accuracy: 0.46060606060606063
Testing accuracy: 0.5515151515151515


### c.) Bucket 3:

In [47]:
nEstimates=[200,400,600,800,1000,1200,1400,1600,1800,2000,2200]

for n in nEstimates:
    clf = ensemble.RandomForestClassifier(n_estimators=n) 
    clf.fit(XTrainb3, yTrainb3)
    print("Validation accuracy "+str(n)+": "+str(clf.score(XValidb3,yValidb3)))
    
# 800, 

Validation accuracy 200: 0.4326923076923077
Validation accuracy 400: 0.4519230769230769
Validation accuracy 600: 0.4375
Validation accuracy 800: 0.46153846153846156
Validation accuracy 1000: 0.4567307692307692
Validation accuracy 1200: 0.4326923076923077
Validation accuracy 1400: 0.44711538461538464
Validation accuracy 1600: 0.4519230769230769
Validation accuracy 1800: 0.4423076923076923
Validation accuracy 2000: 0.4423076923076923
Validation accuracy 2200: 0.44711538461538464


In [107]:
# Fit a random forest with n_estimators=600 on Bucket 3 and report its
# accuracy on the training, validation, and test splits.
clf = ensemble.RandomForestClassifier(n_estimators=600)
clf.fit(XTrainb3, yTrainb3)
testPred=clf.predict(XTestb3)

print("Training accuracy: "+str(clf.score(XTrainb3,yTrainb3)))
print("Validation accuracy: "+str(clf.score(XValidb3,yValidb3)))
print("Testing accuracy: "+str(clf.score(XTestb3,yTestb3)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb3, testPred)

Training accuracy: 1.0
Validation accuracy: 0.46153846153846156
Testing accuracy: 0.44711538461538464
Role Player: 0.5
Impact Player: 0.17073170731707318
Starter: 0.5301204819277109
None


In [103]:
clf = ensemble.RandomForestClassifier() 
clf.fit(XTrainb3, yTrainb3)

print("Training accuracy: "+str(clf.score(XTrainb3,yTrainb3)))
print("Validation accuracy: "+str(clf.score(XValidb3,yValidb3)))
print("Testing accuracy: "+str(clf.score(XTestb3,yTestb3)))

Training accuracy: 1.0
Validation accuracy: 0.44711538461538464
Testing accuracy: 0.4230769230769231


### d.) Bucket 4:

In [48]:
nEstimates=[200,400,600,800,1000,1200,1400,1600,1800,2000,2200]

for n in nEstimates:
    clf = ensemble.RandomForestClassifier(n_estimators=n) 
    clf.fit(XTrainb4, yTrainb4)
    print("Validation accuracy "+str(n)+": "+str(clf.score(XValidb4,yValidb4)))
    
# 200

Validation accuracy 200: 0.37735849056603776
Validation accuracy 400: 0.33962264150943394
Validation accuracy 600: 0.3584905660377358
Validation accuracy 800: 0.3522012578616352
Validation accuracy 1000: 0.36477987421383645
Validation accuracy 1200: 0.3584905660377358
Validation accuracy 1400: 0.34591194968553457
Validation accuracy 1600: 0.36477987421383645
Validation accuracy 1800: 0.3270440251572327
Validation accuracy 2000: 0.3522012578616352
Validation accuracy 2200: 0.3710691823899371


In [113]:
# Fit a random forest with n_estimators=600 on Bucket 4 and report its
# accuracy on the training, validation, and test splits.
clf = ensemble.RandomForestClassifier(n_estimators=600)
clf.fit(XTrainb4, yTrainb4)
testPred=clf.predict(XTestb4)

print("Training accuracy: "+str(clf.score(XTrainb4,yTrainb4)))
print("Validation accuracy: "+str(clf.score(XValidb4,yValidb4)))
print("Testing accuracy: "+str(clf.score(XTestb4,yTestb4)))
# roleAccuracy prints its own report and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
roleAccuracy(yTestb4, testPred)

Training accuracy: 1.0
Validation accuracy: 0.36477987421383645
Testing accuracy: 0.44654088050314467
Role Player: 0.43333333333333335
Impact Player: 0.2222222222222222
Starter: 0.5873015873015873
None


In [110]:
clf = ensemble.RandomForestClassifier() 
clf.fit(XTrainb4, yTrainb4)

print("Training accuracy: "+str(clf.score(XTrainb4,yTrainb4)))
print("Validation accuracy: "+str(clf.score(XValidb4,yValidb4)))
print("Testing accuracy: "+str(clf.score(XTestb4,yTestb4)))

Training accuracy: 1.0
Validation accuracy: 0.3522012578616352
Testing accuracy: 0.44025157232704404


## 5. Decision Tree Classifier <a class="anchor" id="Decision"></a>

### a.) Bucket 1

In [68]:
clf=tree.DecisionTreeClassifier()
clf=clf.fit(XTrainb1,yTrainb1)
y_test=clf.predict(XTestb1)
y_valid=clf.predict(XValidb1)
y_train=clf.predict(XTrainb1)

In [69]:
accTest=accuracy_score(yTestb1, y_test)
accValid=accuracy_score(yValidb1, y_valid)
accTrain=accuracy_score(yTrainb1, y_train)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 1.0
Validation Accuracy: 0.4152542372881356
Test Accuracy: 0.3445378151260504


### b.) Bucket 2

In [70]:
clf=tree.DecisionTreeClassifier()
clf=clf.fit(XTrainb2,yTrainb2)
y_test=clf.predict(XTestb2)
y_valid=clf.predict(XValidb2)
y_train=clf.predict(XTrainb2)

In [71]:
accTest=accuracy_score(yTestb2, y_test)
accValid=accuracy_score(yValidb2, y_valid)
accTrain=accuracy_score(yTrainb2, y_train)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 1.0
Validation Accuracy: 0.3515151515151515
Test Accuracy: 0.3575757575757576


### c.) Bucket 3

In [72]:
clf=tree.DecisionTreeClassifier()
clf=clf.fit(XTrainb3,yTrainb3)
y_test=clf.predict(XTestb3)
y_valid=clf.predict(XValidb3)
y_train=clf.predict(XTrainb3)

In [73]:
accTest=accuracy_score(yTestb3, y_test)
accValid=accuracy_score(yValidb3, y_valid)
accTrain=accuracy_score(yTrainb3, y_train)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 1.0
Validation Accuracy: 0.40384615384615385
Test Accuracy: 0.3894230769230769


### d.) Bucket 4

In [76]:
clf=tree.DecisionTreeClassifier()
clf=clf.fit(XTrainb4,yTrainb4)
y_test=clf.predict(XTestb4)
y_valid=clf.predict(XValidb4)
y_train=clf.predict(XTrainb4)

In [77]:
accTest=accuracy_score(yTestb4, y_test)
accValid=accuracy_score(yValidb4, y_valid)
accTrain=accuracy_score(yTrainb4, y_train)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 1.0
Validation Accuracy: 0.3270440251572327
Test Accuracy: 0.39622641509433965


## 6. Gaussian Naive Bayes <a class="anchor" id="Naive"></a>

### a.) Bucket 1

In [87]:
gnb = GaussianNB()
gnb.fit(XTrainb1, yTrainb1)
y_test = gnb.predict(XTestb1)
y_valid = gnb.predict(XValidb1)
y_train = gnb.predict(XTrainb1)

In [88]:
accTest=accuracy_score(yTestb1, y_test)
accTrain=accuracy_score(yTrainb1, y_train)
accValid=accuracy_score(yValidb1, y_valid)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 0.4745762711864407
Validation Accuracy: 0.4152542372881356
Test Accuracy: 0.36134453781512604


### b.) Bucket 2

In [89]:
gnb = GaussianNB()
gnb.fit(XTrainb2, yTrainb2)
y_test = gnb.predict(XTestb2)
y_valid = gnb.predict(XValidb2)
y_train = gnb.predict(XTrainb2)

In [90]:
accTest=accuracy_score(yTestb2, y_test)
accTrain=accuracy_score(yTrainb2, y_train)
accValid=accuracy_score(yValidb2, y_valid)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 0.5050709939148073
Validation Accuracy: 0.4121212121212121
Test Accuracy: 0.4727272727272727


### c.) Bucket 3

In [91]:
gnb = GaussianNB()
gnb.fit(XTrainb3, yTrainb3)
y_test = gnb.predict(XTestb3)
y_valid = gnb.predict(XValidb3)
y_train = gnb.predict(XTrainb3)

In [92]:
accTest=accuracy_score(yTestb3, y_test)
accTrain=accuracy_score(yTrainb3, y_train)
accValid=accuracy_score(yValidb3, y_valid)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 0.5048076923076923
Validation Accuracy: 0.5336538461538461
Test Accuracy: 0.4375


### d.) Bucket 4

In [96]:
gnb = GaussianNB()
gnb.fit(XTrainb4, yTrainb4)
y_test = gnb.predict(XTestb4)
y_valid = gnb.predict(XValidb4)
y_train = gnb.predict(XTrainb4)

In [97]:
accTest=accuracy_score(yTestb4, y_test)
accTrain=accuracy_score(yTrainb4, y_train)
accValid=accuracy_score(yValidb4, y_valid)

print("Train Accuracy: "+str(accTrain))
print("Validation Accuracy: "+str(accValid))
print("Test Accuracy: "+str(accTest))

Train Accuracy: 0.4957983193277311
Validation Accuracy: 0.4276729559748428
Test Accuracy: 0.389937106918239
