In [1]:
# import the data

import pandas as pd

original = pd.read_csv('nba.csv', header = 0)

In [2]:
total_rows=len(original.axes[0])
total_cols=len(original.axes[1])
print("Number of Rows: "+str(total_rows))
print("Number of Columns: "+str(total_cols))

Number of Rows: 1340
Number of Columns: 21


In [3]:
# Checks to see which columns have missing values

original.isnull().any()

Name    False
GP      False
MIN     False
PPT     False
FGM     False
FGA     False
FG%     False
3PM     False
3PA     False
3P%      True
FTM     False
FTA     False
FT%     False
OREB    False
DREB    False
REB     False
AST     False
STL     False
BLK     False
TOV     False
TAR     False
dtype: bool

In [4]:
# Replace the missing values with the median value of the column

train2 = original.sort_values(by='3P%')
nA = train2.iloc[:, 9].isnull().sum()
vr = total_rows-nA
if vr%2 != 0:
    vr = vr + 1
med = int(vr/2)
medA = train2.iloc[med, 9]
print(nA) # Number of null values in the column
print(med) # Median index
print(medA) # Median value
original['3P%'] = original['3P%'].fillna(medA)

11
665
22.4


In [5]:
# Import the necessary sklearn packages

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn import metrics

In [6]:
# Here we remove the "Name" column, because the name of the player would not add any value in determining if the player
# will last over 5 years

target = original.iloc[:, -1]
temp = original.iloc[:, 1:20]

In [7]:
# Here we create another dataframe where we remove/add other attributes and later compare its performace with the other
# dataframe which only has the "Name" attribute removed

temp2 = original.iloc[:, 1:20]
temp2 = temp2.drop(['FGM', 'FGA', '3PM', '3PA', 'FTM', 'FTA', 'OREB', 'DREB'], axis = 1)

temp2['P/MIN'] = temp2['GP']*temp2['PPT']/temp2['MIN']
temp2 = temp2.drop(['GP', 'PPT'], axis = 1)

temp2['REB/MIN'] = temp2['REB']/temp2['MIN']
temp2['AST/MIN'] = temp2['AST']/temp2['MIN']
temp2['STL/MIN'] = temp2['STL']/temp2['MIN']
temp2['BLK/MIN'] = temp2['BLK']/temp2['MIN']
temp2['TOV/MIN'] = temp2['TOV']/temp2['MIN']

temp2.drop(['REB', 'AST', 'STL', 'BLK', 'TOV'], axis = 1)
temp2 = temp2.drop(['REB', 'AST', 'STL', 'BLK', 'TOV'], axis = 1)

In [8]:
# We split the dataset into 80% training and 20% test

X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

test_rows=len(X_test.axes[0])
test_cols=len(X_test.axes[1])
print("Number of Test Rows: "+str(test_rows))
print("Number of Test Columns: "+str(test_cols))

train_rows=len(X_train.axes[0])
train_cols=len(X_train.axes[1])
print("Number of Training Rows: "+str(train_rows))
print("Number of Training Columns: "+str(train_cols))

Number of Test Rows: 268
Number of Test Columns: 19
Number of Training Rows: 1072
Number of Training Columns: 19


# Cross Validation Method

In [9]:
# Custom method that implements 10-fold cross validation

def crossValidation(X, y, choice):
    
    length = int(train_rows/10)
    
    # The folds are set at these specific intervals to ensure a degree of sameness across the models.
    # These folds are the validation set
    
    X1 = X.iloc[0:length]
    X2 = X.iloc[length:length*2]
    X3 = X.iloc[length*2:length*3]
    X4 = X.iloc[length*3:length*4]
    X5 = X.iloc[length*4:length*5]
    X6 = X.iloc[length*5:length*6]
    X7 = X.iloc[length*6:length*7]
    X8 = X.iloc[length*7:length*8]
    X9 = X.iloc[length*8:length*9]
    X10 = X.iloc[length*9:train_rows]
    
    y1 = y.iloc[0:length]
    y2 = y.iloc[length:length*2]
    y3 = y.iloc[length*2:length*3]
    y4 = y.iloc[length*3:length*4]
    y5 = y.iloc[length*4:length*5]
    y6 = y.iloc[length*5:length*6]
    y7 = y.iloc[length*6:length*7]
    y8 = y.iloc[length*7:length*8]
    y9 = y.iloc[length*8:length*9]
    y10 = y.iloc[length*9:train_rows]
    
    # These folds are the training sets for their respective validation sets
    
    fold2 = []
    
    for i in range(0, length):
        fold2.append(i)
        
    for i in range(length*2, train_rows):
        fold2.append(i)
    
    fold3 = []
    
    for i in range(0, length*2):
        fold3.append(i)
        
    for i in range(length*3, train_rows):
        fold3.append(i)
    
    fold4 = []
    
    for i in range(0, length*3):
        fold4.append(i)
        
    for i in range(length*4, train_rows):
        fold4.append(i)
    
    fold5 = []
    
    for i in range(0, length*4):
        fold5.append(i)
        
    for i in range(length*5, train_rows):
        fold5.append(i)
    
    fold6 = []
    
    for i in range(0, length*5):
        fold6.append(i)
        
    for i in range(length*6, train_rows):
        fold6.append(i)
    
    fold7 = []
    
    for i in range(0, length*6):
        fold7.append(i)
        
    for i in range(length*7, train_rows):
        fold7.append(i)
    
    fold8 = []
    
    for i in range(0, length*7):
        fold8.append(i)
        
    for i in range(length*8, train_rows):
        fold8.append(i)
    
    fold9 = []
    
    for i in range(0, length*8):
        fold9.append(i)
        
    for i in range(length*9, train_rows):
        fold9.append(i)
    
    X11 = X.iloc[length*2:train_rows]
    X22 = X.iloc[fold2]
    X33 = X.iloc[fold3]
    X44 = X.iloc[fold4]
    X55 = X.iloc[fold5]
    X66 = X.iloc[fold6]
    X77 = X.iloc[fold7]
    X88 = X.iloc[fold8]
    X99 = X.iloc[fold9]
    X100 = X.iloc[0:length*9]
    
    y11 = y.iloc[length*2:train_rows]
    y22 = y.iloc[fold2]
    y33 = y.iloc[fold3]
    y44 = y.iloc[fold4]
    y55 = y.iloc[fold5]
    y66 = y.iloc[fold6]
    y77 = y.iloc[fold7]
    y88 = y.iloc[fold8]
    y99 = y.iloc[fold9]
    y100 = y.iloc[0:length*9]
    
    # Choice of "1" implements 10-fold cross validation for K-Nearest Neighbors
    
    if(choice == 1):
        great = 0.0
        gm = 0
        knn1 = KNeighborsClassifier(n_neighbors=5)
        
        knn1.fit(X11, y11)
        
        y_pred = knn1.predict(X1)
        f1 = metrics.f1_score(y1, y_pred)
        
        if f1>great:
            great = f1
            gm = 1
            
        knn2 = KNeighborsClassifier(n_neighbors=5)
        
        knn2.fit(X22, y22)
        
        y_pred = knn2.predict(X2)
        f2 = metrics.f1_score(y2, y_pred)
        
        if f2>great:
            great = f2
            gm = 2
            
        knn3 = KNeighborsClassifier(n_neighbors=5)
        
        knn3.fit(X33, y33)
        
        y_pred = knn3.predict(X3)
        f3 = metrics.f1_score(y3, y_pred)
        
        if f3>great:
            great = f3
            gm = 3
            
        knn4 = KNeighborsClassifier(n_neighbors=5)
        
        knn4.fit(X44, y44)
        
        y_pred = knn1.predict(X4)
        f4 = metrics.f1_score(y4, y_pred)
        
        if f4>great:
            great = f4
            gm = 4
            
        knn5 = KNeighborsClassifier(n_neighbors=5)
        
        knn5.fit(X55, y55)
        
        y_pred = knn5.predict(X5)
        f5 = metrics.f1_score(y5, y_pred)
        
        if f5>great:
            great = f5
            gm = 5
            
        knn6 = KNeighborsClassifier(n_neighbors=5)
        
        knn6.fit(X66, y66)
        
        y_pred = knn6.predict(X6)
        f6 = metrics.f1_score(y6, y_pred)
        
        if f6>great:
            great = f6
            gm = 6
            
        knn7 = KNeighborsClassifier(n_neighbors=5)
        
        knn7.fit(X77, y77)
        
        y_pred = knn7.predict(X7)
        f7 = metrics.f1_score(y7, y_pred)
        
        if f7>great:
            great = f7
            gm = 7
            
        knn8 = KNeighborsClassifier(n_neighbors=5)
        
        knn8.fit(X88, y88)
        
        y_pred = knn8.predict(X8)
        f8 = metrics.f1_score(y8, y_pred)
        
        if f8>great:
            great = f8
            gm = 8
            
        knn9 = KNeighborsClassifier(n_neighbors=5)
        
        knn9.fit(X99, y99)
        
        y_pred = knn9.predict(X9)
        f9 = metrics.f1_score(y9, y_pred)
        
        if f9>great:
            great = f9
            gm = 9
            
        knn10 = KNeighborsClassifier(n_neighbors=5)
        
        knn10.fit(X100, y100)
        
        y_pred = knn10.predict(X10)
        f10 = metrics.f1_score(y10, y_pred)
        
        if f10>great:
            great = f10
            gm = 10
            
            
        if(gm==1):
            return knn1
        if(gm==2):
            return knn2
        if(gm==3):
            return knn3
        if(gm==4):
            return knn4
        if(gm==5):
            return knn5
        if(gm==6):
            return knn6
        if(gm==7):
            return knn7
        if(gm==8):
            return knn8
        if(gm==9):
            return knn9
        if(gm==10):
            return knn10
    
    # Choice of "2" implements 10-fold cross validation for Random Forest
    
    if(choice == 2):
        great = 0.0
        gm = 0
        knn1 = RandomForestClassifier(n_estimators=100)
        
        knn1.fit(X11, y11)
        
        y_pred = knn1.predict(X1)
        f1 = metrics.f1_score(y1, y_pred)
        
        if f1>great:
            great = f1
            gm = 1
            
        knn2 = RandomForestClassifier(n_estimators=100)
        
        knn2.fit(X22, y22)
        
        y_pred = knn2.predict(X2)
        f2 = metrics.f1_score(y2, y_pred)
        
        if f2>great:
            great = f2
            gm = 2
            
        knn3 = RandomForestClassifier(n_estimators=100)
        
        knn3.fit(X33, y33)
        
        y_pred = knn3.predict(X3)
        f3 = metrics.f1_score(y3, y_pred)
        
        if f3>great:
            great = f3
            gm = 3
            
        knn4 = RandomForestClassifier(n_estimators=100)
        
        knn4.fit(X44, y44)
        
        y_pred = knn1.predict(X4)
        f4 = metrics.f1_score(y4, y_pred)
        
        if f4>great:
            great = f4
            gm = 4
            
        knn5 = RandomForestClassifier(n_estimators=100)
        
        knn5.fit(X55, y55)
        
        y_pred = knn5.predict(X5)
        f5 = metrics.f1_score(y5, y_pred)
        
        if f5>great:
            great = f5
            gm = 5
            
        knn6 = RandomForestClassifier(n_estimators=100)
        
        knn6.fit(X66, y66)
        
        y_pred = knn6.predict(X6)
        f6 = metrics.f1_score(y6, y_pred)
        
        if f6>great:
            great = f6
            gm = 6
            
        knn7 = RandomForestClassifier(n_estimators=100)
        
        knn7.fit(X77, y77)
        
        y_pred = knn7.predict(X7)
        f7 = metrics.f1_score(y7, y_pred)
        
        if f7>great:
            great = f7
            gm = 7
            
        knn8 = RandomForestClassifier(n_estimators=100)
        
        knn8.fit(X88, y88)
        
        y_pred = knn8.predict(X8)
        f8 = metrics.f1_score(y8, y_pred)
        
        if f8>great:
            great = f8
            gm = 8
            
        knn9 = RandomForestClassifier(n_estimators=100)
        
        knn9.fit(X99, y99)
        
        y_pred = knn9.predict(X9)
        f9 = metrics.f1_score(y9, y_pred)
        
        if f9>great:
            great = f9
            gm = 9
            
        knn10 = RandomForestClassifier(n_estimators=100)
        
        knn10.fit(X100, y100)
        
        y_pred = knn10.predict(X10)
        f10 = metrics.f1_score(y10, y_pred)
        
        if f10>great:
            great = f10
            gm = 10
            
            
        if(gm==1):
            return knn1
        if(gm==2):
            return knn2
        if(gm==3):
            return knn3
        if(gm==4):
            return knn4
        if(gm==5):
            return knn5
        if(gm==6):
            return knn6
        if(gm==7):
            return knn7
        if(gm==8):
            return knn8
        if(gm==9):
            return knn9
        if(gm==10):
            return knn10
        
    # Choice of "3" implements 10-fold cross validation for Logistic Regression
    
    if(choice == 3):
        great = 0.0
        gm = 0
        knn1 = LogisticRegression(max_iter=20000)
        
        knn1.fit(X11, y11)
        
        y_pred = knn1.predict(X1)
        f1 = metrics.f1_score(y1, y_pred)
        
        if f1>great:
            great = f1
            gm = 1
            
        knn2 = LogisticRegression(max_iter=20000)
        
        knn2.fit(X22, y22)
        
        y_pred = knn2.predict(X2)
        f2 = metrics.f1_score(y2, y_pred)
        
        if f2>great:
            great = f2
            gm = 2
            
        knn3 = LogisticRegression(max_iter=20000)
        
        knn3.fit(X33, y33)
        
        y_pred = knn3.predict(X3)
        f3 = metrics.f1_score(y3, y_pred)
        
        if f3>great:
            great = f3
            gm = 3
            
        knn4 = LogisticRegression(max_iter=20000)
        
        knn4.fit(X44, y44)
        
        y_pred = knn1.predict(X4)
        f4 = metrics.f1_score(y4, y_pred)
        
        if f4>great:
            great = f4
            gm = 4
            
        knn5 = LogisticRegression(max_iter=20000)
        
        knn5.fit(X55, y55)
        
        y_pred = knn5.predict(X5)
        f5 = metrics.f1_score(y5, y_pred)
        
        if f5>great:
            great = f5
            gm = 5
            
        knn6 = LogisticRegression(max_iter=20000)
        
        knn6.fit(X66, y66)
        
        y_pred = knn6.predict(X6)
        f6 = metrics.f1_score(y6, y_pred)
        
        if f6>great:
            great = f6
            gm = 6
            
        knn7 = LogisticRegression(max_iter=20000)
        
        knn7.fit(X77, y77)
        
        y_pred = knn7.predict(X7)
        f7 = metrics.f1_score(y7, y_pred)
        
        if f7>great:
            great = f7
            gm = 7
            
        knn8 = LogisticRegression(max_iter=20000)
        
        knn8.fit(X88, y88)
        
        y_pred = knn8.predict(X8)
        f8 = metrics.f1_score(y8, y_pred)
        
        if f8>great:
            great = f8
            gm = 8
            
        knn9 = LogisticRegression(max_iter=20000)
        
        knn9.fit(X99, y99)
        
        y_pred = knn9.predict(X9)
        f9 = metrics.f1_score(y9, y_pred)
        
        if f9>great:
            great = f9
            gm = 9
            
        knn10 = LogisticRegression(max_iter=20000)
        
        knn10.fit(X100, y100)
        
        y_pred = knn10.predict(X10)
        f10 = metrics.f1_score(y10, y_pred)
        
        if f10>great:
            great = f10
            gm = 10
            
            
        if(gm==1):
            return knn1
        if(gm==2):
            return knn2
        if(gm==3):
            return knn3
        if(gm==4):
            return knn4
        if(gm==5):
            return knn5
        if(gm==6):
            return knn6
        if(gm==7):
            return knn7
        if(gm==8):
            return knn8
        if(gm==9):
            return knn9
        if(gm==10):
            return knn10
        
    # Choice of "4" implements 10-fold cross validation for MLP Classifier
    
    if(choice == 4):
        great = 0.0
        gm = 0
        knn1 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn1.fit(X11, y11)
        
        y_pred = knn1.predict(X1)
        f1 = metrics.f1_score(y1, y_pred)
        
        if f1>great:
            great = f1
            gm = 1
            
        knn2 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn2.fit(X22, y22)
        
        y_pred = knn2.predict(X2)
        f2 = metrics.f1_score(y2, y_pred)
        
        if f2>great:
            great = f2
            gm = 2
            
        knn3 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn3.fit(X33, y33)
        
        y_pred = knn3.predict(X3)
        f3 = metrics.f1_score(y3, y_pred)
        
        if f3>great:
            great = f3
            gm = 3
            
        knn4 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn4.fit(X44, y44)
        
        y_pred = knn1.predict(X4)
        f4 = metrics.f1_score(y4, y_pred)
        
        if f4>great:
            great = f4
            gm = 4
            
        knn5 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn5.fit(X55, y55)
        
        y_pred = knn5.predict(X5)
        f5 = metrics.f1_score(y5, y_pred)
        
        if f5>great:
            great = f5
            gm = 5
            
        knn6 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn6.fit(X66, y66)
        
        y_pred = knn6.predict(X6)
        f6 = metrics.f1_score(y6, y_pred)
        
        if f6>great:
            great = f6
            gm = 6
            
        knn7 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn7.fit(X77, y77)
        
        y_pred = knn7.predict(X7)
        f7 = metrics.f1_score(y7, y_pred)
        
        if f7>great:
            great = f7
            gm = 7
            
        knn8 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn8.fit(X88, y88)
        
        y_pred = knn8.predict(X8)
        f8 = metrics.f1_score(y8, y_pred)
        
        if f8>great:
            great = f8
            gm = 8
            
        knn9 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn9.fit(X99, y99)
        
        y_pred = knn9.predict(X9)
        f9 = metrics.f1_score(y9, y_pred)
        
        if f9>great:
            great = f9
            gm = 9
            
        knn10 = MLPClassifier(max_iter=20000, early_stopping = True)
        
        knn10.fit(X100, y100)
        
        y_pred = knn10.predict(X10)
        f10 = metrics.f1_score(y10, y_pred)
        
        if f10>great:
            great = f10
            gm = 10
            
            
        if(gm==1):
            return knn1
        if(gm==2):
            return knn2
        if(gm==3):
            return knn3
        if(gm==4):
            return knn4
        if(gm==5):
            return knn5
        if(gm==6):
            return knn6
        if(gm==7):
            return knn7
        if(gm==8):
            return knn8
        if(gm==9):
            return knn9
        if(gm==10):
            return knn10

# F1 Scores for the Data Frame without any Removed/Added Attributes

In [10]:
%%time
# This cell calls the custom cross validation method and GridSearchCV to run the models and get an F1 score
# The dataset without removed/added attributes is used here

X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42) # 80% training and 20% test

# K-Nearest Neighbors

# Calls custom cross validation
knn = crossValidation(X_train, y_train, 1)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print("F1 Score for K-Nearest Neighbors using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

knn = KNeighborsClassifier()

# Uses the cross validation available in Grid Search
# Grid Search is run with the default parameters
gs_random = GridSearchCV(estimator = knn, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_

y_pred = knn_best_model.predict(X_test)
print("F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Random Forests

clf = crossValidation(X_train, y_train, 2)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
print("\nF1 Score for Random Forests using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

clf = RandomForestClassifier(n_estimators=100)

gs_random = GridSearchCV(estimator = clf, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
clf_best_model = gs_random.best_estimator_

y_pred = clf_best_model.predict(X_test)
print("F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Logistic Regression

logreg = crossValidation(X_train, y_train, 3)

logreg.fit(X_train,y_train)

y_pred=logreg.predict(X_test)
print("\nF1 Score for Logistic Regression using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Artificial Neural Networks

mlpc = crossValidation(X_train, y_train, 4)

mlpc.fit(X_train,y_train)

y_pred=mlpc.predict(X_test)
print("\nF1 Score for Artificial Neural Network using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

mlpc = MLPClassifier(max_iter=20000, early_stopping = True)

gs_random = GridSearchCV(estimator = mlpc, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_

y_pred = mlpc_best_model.predict(X_test)
print("F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

F1 Score for K-Nearest Neighbors using Custom Cross Validation Method: 0.7374301675977653
F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter: 0.7374301675977653

F1 Score for Random Forests using Custom Cross Validation Method: 0.7777777777777778
F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter: 0.7777777777777778

F1 Score for Logistic Regression using Custom Cross Validation Method: 0.8111111111111111
F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter: 0.8111111111111111

F1 Score for Artificial Neural Network using Custom Cross Validation Method: 0.7764127764127764
F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter: 0.792022792022792
Wall time: 13.7 s


# F1 Scores for the Data Frame with Removed/Added Attributes

In [11]:
%%time
# This cell calls the custom cross validation method and GridSearchCV to run the models and get an F1 score
# The dataset with removed/added attributes is used here

X_train, X_test, y_train, y_test = train_test_split(temp2, target, test_size=0.2, random_state = 42) # 80% training and 20% test

# K-Nearest Neighbors

knn = crossValidation(X_train, y_train, 1)

knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
print("F1 Score for K-Nearest Neighbors using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

knn = KNeighborsClassifier()

gs_random = GridSearchCV(estimator = knn, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_

y_pred = knn_best_model.predict(X_test)
print("F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Random Forests

clf = crossValidation(X_train, y_train, 2)

clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
print("\nF1 Score for Random Forests using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

clf = RandomForestClassifier(n_estimators=100)

gs_random = GridSearchCV(estimator = clf, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
clf_best_model = gs_random.best_estimator_

y_pred = clf_best_model.predict(X_test)
print("F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Logistic Regression

logreg = crossValidation(X_train, y_train, 3)

logreg.fit(X_train,y_train)

y_pred=logreg.predict(X_test)
print("\nF1 Score for Logistic Regression using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Artificial Neural Networks

mlpc = crossValidation(X_train, y_train, 4)

mlpc.fit(X_train,y_train)

y_pred=mlpc.predict(X_test)
print("\nF1 Score for Artificial Neural Network using Custom Cross Validation Method:",metrics.f1_score(y_test, y_pred))

mlpc = MLPClassifier(max_iter=20000, early_stopping = True)

gs_random = GridSearchCV(estimator = mlpc, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_

y_pred = mlpc_best_model.predict(X_test)
print("F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

F1 Score for K-Nearest Neighbors using Custom Cross Validation Method: 0.7811634349030471
F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter: 0.7811634349030471

F1 Score for Random Forests using Custom Cross Validation Method: 0.787709497206704
F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter: 0.7875354107648724

F1 Score for Logistic Regression using Custom Cross Validation Method: 0.7978142076502731
F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter: 0.7978142076502731

F1 Score for Artificial Neural Network using Custom Cross Validation Method: 0.8044077134986226
F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter: 0.7660818713450293
Wall time: 8.43 s


# F1 Scores with Normalization

In [12]:
%%time
# This cell runs the models after the data has been normalized
# GridSearchCV with default parameters is used for cross validation

temp = temp2 # The dataframe with the best average F1 score across models was picked. In this case, it was the dataframe with
# the attributes removed/added

ntemp = preprocessing.normalize(temp) # Performs normalization on the dataset
X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42) # 80% training and 20% test

print("F1 Scores after Normalization\n")

# K-Nearest Neighbors

knn = KNeighborsClassifier()

gs_random = GridSearchCV(estimator = knn, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_

y_pred = knn_best_model.predict(X_test)
print("F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Random Forests

clf = RandomForestClassifier(n_estimators=100)

gs_random = GridSearchCV(estimator = clf, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
clf_best_model = gs_random.best_estimator_

y_pred = clf_best_model.predict(X_test)
print("F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Logistic Regression

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Artificial Neural Networks

mlpc = MLPClassifier(max_iter=20000, early_stopping = True)

gs_random = GridSearchCV(estimator = mlpc, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_

y_pred = mlpc_best_model.predict(X_test)
print("F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

F1 Scores after Normalization

F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter: 0.777142857142857


KeyboardInterrupt: 

# F1 Scores with Standardization

In [13]:
%%time
# This cell runs the models after the data has been standardized
# GridSearchCV with default parameters is used for cross validation

temp = temp2
stemp = preprocessing.scale(temp) # Performs standardization on the dataset
X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42) # 80% training and 20% test

print("F1 Scores after Standardization\n")

# K-Nearest Neighbors

knn = KNeighborsClassifier()

gs_random = GridSearchCV(estimator = knn, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_

y_pred = knn_best_model.predict(X_test)
print("F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Random Forests

clf = RandomForestClassifier(n_estimators=100)

gs_random = GridSearchCV(estimator = clf, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
clf_best_model = gs_random.best_estimator_

y_pred = clf_best_model.predict(X_test)
print("F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Logistic Regression

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Artificial Neural Networks

mlpc = MLPClassifier(max_iter=20000, early_stopping = True)

gs_random = GridSearchCV(estimator = mlpc, param_grid = {}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_

y_pred = mlpc_best_model.predict(X_test)
print("F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

F1 Scores after Standardization

F1 Score for K-Nearest Neighbors using GridSearchCV with Default Scoring Parameter: 0.7768595041322313
F1 Score for Random Forests using GridSearchCV with Default Scoring Parameter: 0.7909604519774012
F1 Score for Logistic Regression using GridSearchCV with Default Scoring Parameter: 0.8033707865168538
F1 Score for Artificial Neural Network using GridSearchCV with Default Scoring Parameter: 0.7967914438502673
Wall time: 3.76 s


# F1 Scores for Logistic Regression Using the Various Regularization Penalties

In [14]:
%%time
# This cell runs the various regularization penalties on the Logistic Regression model
# It is run for when the data has not been scaled, has been normalized, and for when it has been standardized

temp = temp2
ntemp = preprocessing.normalize(temp)
stemp = preprocessing.scale(temp)
print("Without Data Manipulation\n")

X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

# No penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['none'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
# The 'saga' solver is used here because it is the only one which supports all the regularization penalties

gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with no penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L1 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l1'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l1 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L2 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l2'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l2 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Elasticnet penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with elasticnet penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

print("\nWith Normalization\n")

X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42)

# No penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['none'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with no penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L1 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l1'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l1 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L2 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l2'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l2 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Elasticnet penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with elasticnet penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

print("\nWith Standardization\n")

X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42)

# No penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['none'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with no penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L1 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l1'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l1 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# L2 penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['l2'], 'solver': ['saga']}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with l2 penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

# Elasticnet penalty

logreg = LogisticRegression(max_iter=20000)

gs_random = GridSearchCV(estimator = logreg, param_grid = {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_

y_pred = logreg_best_model.predict(X_test)
print("F1 Score for Logistic Regression with elasticnet penaltly using GridSearchCV with Default Scoring Parameter:",metrics.f1_score(y_test, y_pred))

Without Data Manipulation

F1 Score for Logistic Regression with no penaltly using GridSearchCV with Default Scoring Parameter: 0.8032786885245901
F1 Score for Logistic Regression with l1 penaltly using GridSearchCV with Default Scoring Parameter: 0.7989130434782609
F1 Score for Logistic Regression with l2 penaltly using GridSearchCV with Default Scoring Parameter: 0.7956403269754767
F1 Score for Logistic Regression with elasticnet penaltly using GridSearchCV with Default Scoring Parameter: 0.8010899182561307

With Normalization

F1 Score for Logistic Regression with no penaltly using GridSearchCV with Default Scoring Parameter: 0.782608695652174
F1 Score for Logistic Regression with l1 penaltly using GridSearchCV with Default Scoring Parameter: 0.7795698924731184
F1 Score for Logistic Regression with l2 penaltly using GridSearchCV with Default Scoring Parameter: 0.78125
F1 Score for Logistic Regression with elasticnet penaltly using GridSearchCV with Default Scoring Parameter: 0.77044

# Using GridSearch to select hyperparameters for models

In [15]:
%%time
# Using GridSearch to select the best parameters for the K-Nearest Neighbors model

temp = temp2
X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

knn = KNeighborsClassifier()

knn_params = { 
    'n_neighbors': [2, 5, 7, 10, 12, 15, 17, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2]
}

gs_random = GridSearchCV(estimator = knn, param_grid = knn_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_
knn_best_params = gs_random.best_params_

print ("Best Model:", knn_best_model)
print ("\nBest Parameters:", knn_best_params)

y_pred = knn_best_model.predict(X_test)
print("\nF1 Score for K-Nearest Neighbors:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [16]:
%%time
# Using GridSearch to select the best parameters for the Random Forests model

temp = temp2
X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

rf = RandomForestClassifier()

rf_params = { 
    'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
}

gs_random = GridSearchCV(estimator = rf, param_grid = rf_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
rf_best_model = gs_random.best_estimator_
rf_best_params = gs_random.best_params_

print ("Best Model:", rf_best_model)
print ("\nBest Parameters:", rf_best_params)

y_pred = rf_best_model.predict(X_test)
print("\nF1 Score for Random Forest:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [17]:
%%time
# Using GridSearch to select the best parameters for the Logistic Regression model

temp = temp2
X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

logreg = LogisticRegression(max_iter = 20000)

logreg_params = {
    'C': [0.1, 0.3, 0.5, 0.7, 1.0],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

gs_random = GridSearchCV(estimator = logreg, param_grid = logreg_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_
logreg_best_params = gs_random.best_params_

print ("Best Model:", logreg_best_model)
print ("\nBest Parameters:", logreg_best_params)

y_pred = logreg_best_model.predict(X_test)
print("\nF1 Score for Logistic Regression:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [18]:
%%time
# Using GridSearch to select the best parameters for the Artificial Neural Networks model

temp = temp2
X_train, X_test, y_train, y_test = train_test_split(temp, target, test_size=0.2, random_state = 42)

mlpc = MLPClassifier(max_iter=20000)

mlpc_params = { 
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'early_stopping': [True]
}

gs_random = GridSearchCV(estimator = mlpc, param_grid = mlpc_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_
mlpc_best_params = gs_random.best_params_

print ("Best Model:", mlpc_best_model)
print ("\nBest Parameters:", mlpc_best_params)

y_pred = mlpc_best_model.predict(X_test)
print("\nF1 Score for Artificial Neural Network:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

# Using Grid Search to select hyperparameters after data has been Normalized

In [19]:
%%time
# Using GridSearch to select the best parameters for the K-Nearest Neighbors model
# The data here has been normalized

temp = temp2
ntemp = preprocessing.normalize(temp)
X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42)

knn = KNeighborsClassifier()

knn_params = { 
    'n_neighbors': [2, 5, 7, 10, 12, 15, 17, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2]
}

gs_random = GridSearchCV(estimator = knn, param_grid = knn_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_
knn_best_params = gs_random.best_params_

print ("Best Model:", knn_best_model)
print ("\nBest Parameters:", knn_best_params)

y_pred = knn_best_model.predict(X_test)
print("\nF1 Score for K-Nearest Neighbors:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [20]:
%%time
# Using GridSearch to select the best parameters for the Random Forests model
# The data here has been normalized

temp = temp2
ntemp = preprocessing.normalize(temp)
X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42)

rf = RandomForestClassifier()

rf_params = { 
    'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
}

gs_random = GridSearchCV(estimator = rf, param_grid = rf_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
rf_best_model = gs_random.best_estimator_
rf_best_params = gs_random.best_params_

print ("Best Model:", rf_best_model)
print ("\nBest Parameters:", rf_best_params)

y_pred = rf_best_model.predict(X_test)
print("\nF1 Score for Random Forest:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [21]:
%%time
# Using GridSearch to select the best parameters for the Logistic Regression model
# The data here has been normalized

temp = temp2
ntemp = preprocessing.normalize(temp)
X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42)

logreg = LogisticRegression(max_iter=20000)

logreg_params = {
    'C': [0.1, 0.3, 0.5, 0.7, 1.0],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

gs_random = GridSearchCV(estimator = logreg, param_grid = logreg_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_
logreg_best_params = gs_random.best_params_

print ("Best Model:", logreg_best_model)
print ("\nBest Parameters:", logreg_best_params)

y_pred = logreg_best_model.predict(X_test)
print("\nF1 Score for Logistic Regression:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [22]:
%%time
# Using GridSearch to select the best parameters for the Artificial Neural Networks model
# The data here has been normalized

temp = temp2
ntemp = preprocessing.normalize(temp)
X_train, X_test, y_train, y_test = train_test_split(ntemp, target, test_size=0.2, random_state = 42)

mlpc = MLPClassifier(max_iter=20000)

mlpc_params = { 
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'early_stopping': [True]
}

gs_random = GridSearchCV(estimator = mlpc, param_grid = mlpc_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_
mlpc_best_params = gs_random.best_params_

print ("Best Model:", mlpc_best_model)
print ("\nBest Parameters:", mlpc_best_params)

y_pred = mlpc_best_model.predict(X_test)
print("\nF1 Score for Artificial Neural Network:",metrics.f1_score(y_test, y_pred))



KeyboardInterrupt: 

# Using Grid Search to select hyperparameters after data has been Standardized

In [23]:
%%time
# Using GridSearch to select the best parameters for the K-Nearest Neighbors model
# The data here has been standardized

temp = temp2
stemp = preprocessing.scale(temp)
X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42)

#Create KNN Classifier
knn = KNeighborsClassifier()

knn_params = { 
    'n_neighbors': [2, 5, 7, 10, 12, 15, 17, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50],
    'p': [1, 2]
}

gs_random = GridSearchCV(estimator = knn, param_grid = knn_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
knn_best_model = gs_random.best_estimator_
knn_best_params = gs_random.best_params_

print ("Best Model:", knn_best_model)
print ("\nBest Parameters:", knn_best_params)

y_pred = knn_best_model.predict(X_test)
print("\nF1 Score for K-Nearest Neighbors:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [24]:
%%time
# Using GridSearch to select the best parameters for the Random Forests model
# The data here has been standardized

temp = temp2
stemp = preprocessing.scale(temp)
X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42)

rf = RandomForestClassifier()

rf_params = { 
    'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2'],
}

gs_random = GridSearchCV(estimator = rf, param_grid = rf_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
rf_best_model = gs_random.best_estimator_
rf_best_params = gs_random.best_params_

print ("Best Model:", rf_best_model)
print ("\nBest Parameters:", rf_best_params)

y_pred = rf_best_model.predict(X_test)
print("\nF1 Score for Random Forest:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [25]:
%%time
# Using GridSearch to select the best parameters for the Logistic Regression model
# The data here has been standardized

temp = temp2
stemp = preprocessing.scale(temp)
X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42)

logreg = LogisticRegression(max_iter=20000)

logreg_params = {
    'C': [0.1, 0.3, 0.5, 0.7, 1.0],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
}

gs_random = GridSearchCV(estimator = logreg, param_grid = logreg_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
logreg_best_model = gs_random.best_estimator_
logreg_best_params = gs_random.best_params_

print ("Best Model:", logreg_best_model)
print ("\nBest Parameters:", logreg_best_params)

y_pred = logreg_best_model.predict(X_test)
print("\nF1 Score for Logistic Regression:",metrics.f1_score(y_test, y_pred))

KeyboardInterrupt: 

In [26]:
%%time
# Using GridSearch to select the best parameters for the Artificial Neural Networks model
# The data here has been standardized

temp = temp2
stemp = preprocessing.scale(temp)
X_train, X_test, y_train, y_test = train_test_split(stemp, target, test_size=0.2, random_state = 42)

mlpc = MLPClassifier(max_iter=20000)

mlpc_params = { 
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4],
    'tol': [1e-1, 1e-2, 1e-3, 1e-4],
    'early_stopping': [True]
}

gs_random = GridSearchCV(estimator = mlpc, param_grid = mlpc_params, scoring = 'f1', cv= 10)
gs_random.fit(X_train, y_train)
mlpc_best_model = gs_random.best_estimator_
mlpc_best_params = gs_random.best_params_

print ("Best Model:", mlpc_best_model)
print ("\nBest Parameters:", mlpc_best_params)

y_pred = mlpc_best_model.predict(X_test)
print("\nF1 Score for Artificial Neural Network:",metrics.f1_score(y_test, y_pred))

Traceback (most recent call last):
  File "C:\Users\rajendren\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\rajendren\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 1027, in fit
    return self._fit(X, y, incremental=(self.warm_start and
  File "C:\Users\rajendren\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 370, in _fit
    self._fit_stochastic(X, y, activations, deltas, coef_grads,
  File "C:\Users\rajendren\anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py", line 591, in _fit_stochastic
    self.coefs_ = self._best_coefs
AttributeError: 'MLPClassifier' object has no attribute '_best_coefs'



KeyboardInterrupt: 