In [None]:
#Importing required Python packages 
%matplotlib inline 
import numpy as np 
from pprint import pprint 
import pandas as pd 
import zipfile 
import requests 
import io 
import datetime 
import seaborn as sns
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy import stats
from sklearn.model_selection import train_test_split

In [None]:
# Load the abalone dataset and build a binary target: 1 when the label is > 9, else 0.
abalone = pd.read_csv('abalone.data', header=None)
# One-hot encode the non-numeric column(s); numeric columns keep their integer labels.
abalone = pd.get_dummies(abalone)
# Column 8 is the label. pop() returns it and removes it from the frame, so only
# feature columns are left in `abalone`.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
Y_abalone_pre = abalone.pop(8).to_numpy()
Y_abalone = (Y_abalone_pre > 9).astype(int)
count1 = int(Y_abalone.sum())             # samples labelled 1 (label > 9)
count2 = int(Y_abalone.size) - count1     # samples labelled 0
print(count1)
print(count2)
# Force a float matrix: newer pandas emits bool dummy columns, which would otherwise
# turn the mixed frame into an object array.
X_abalone = abalone.to_numpy(dtype=float)

In [None]:
# Load the wine dataset: columns 0-10 are the features, column 11 is the quality score.
# Binary target: 1 for quality 6, 0 for everything else.
# NOTE(review): np.loadtxt cannot parse a textual header row — confirm the local CSV
# has none (otherwise skiprows=1 is needed).
wine = np.loadtxt('winequality-white.csv', delimiter=';')
X_wine, Y_wine_pre = wine[:, :11], wine[:, 11]
Y_wine = np.where(Y_wine_pre == 6, 1, 0)
count1 = int(np.count_nonzero(Y_wine))    # samples with quality == 6
count2 = int(Y_wine.size) - count1        # all other samples
print(count1)
print(count2)

In [None]:
# Load the covtype dataset (comma-separated numeric table).
covtype = np.loadtxt('covtype.data', delimiter=',')
# Shuffle the rows in place with a dedicated, seeded generator: any subset taken from
# the top of the array is then the same on every run, and the global NumPy random
# state is left untouched.
np.random.RandomState(0).shuffle(covtype)
print(covtype.shape)

In [None]:
from collections import Counter

# Keep the first 8000 rows: columns 0-53 are the features, column 54 is the class.
cov_head = covtype[:8000]
X_cov = cov_head[:, :54]
Y_cov_pre = cov_head[:, 54]
print(type(Y_cov_pre))

# Find the most frequent class value; it becomes the positive label.
data = Counter(list(Y_cov_pre))
HighestFreq, _top_count = data.most_common(1)[0]
print(HighestFreq)

# 1 for the most frequent class, 0 for every other class.
Y_cov = (Y_cov_pre == HighestFreq).astype(int)
count1 = int(Y_cov.sum())
count2 = int(Y_cov.size) - count1
print(count1)
print(count2)
print(X_cov.shape, Y_cov.shape)

In [None]:
# Load the adult dataset and one-hot encode its non-numeric columns.
adult = pd.read_csv('adultData.csv', header=None)
adult = pd.get_dummies(adult)
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its replacement.
# dtype=float keeps the matrix numeric even when get_dummies emits bool columns, and
# copy=True guarantees a writable array for the in-place shuffle below.
adult_matrix = adult.to_numpy(dtype=float, copy=True)
# Shuffle the rows with a dedicated, seeded generator so the 12000-row subset is the
# same on every run and the global NumPy random state is left untouched.
np.random.RandomState(0).shuffle(adult_matrix)

# Features are columns 0-103; column 104 is skipped.
# NOTE(review): column 104 is presumably the complementary income dummy (an extra
# y label) — confirm against the encoded column order.
X_adult = adult_matrix[:12000, :104]
Y_adult = adult_matrix[:12000, 105] # binary: 1 represents > 50k, 0 represents <= 50k
count1 = int(np.count_nonzero(Y_adult == 0))   # samples labelled 0
count2 = int(Y_adult.size) - count1            # samples labelled 1
print(count1)
print(count2)
print(X_adult.shape, Y_adult.shape)

# Classifier 1: Decision Tree Classifier

In [None]:
# Abalone Data Set
#
# Decision tree: 5-fold grid search over max_depth 1..9 on three train/test
# partitions (90/10, 50/50, 10/90), then the mean of the three held-out accuracies.
param_grid = {'max_depth': np.arange(1,10)}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_abalone, Y_abalone, test_size=held_out, random_state=42)
    print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

    # A fresh search object per partition, refit on that partition's training part.
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid, return_train_score=True, cv=5)
    tree.fit(X_train_val, Y_train_val)
    print(best_msg, tree.best_params_)
    test_scores.append(tree.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = tree.cv_results_['mean_train_score']
    val_acc = tree.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_abalone = (test_acc + test_acc2 + test_acc3)/3
print("average testing accuracy is", test_avg_abalone)

In [None]:
# Wine quality Data Set
#
# Decision tree: grid search over max_depth 1..5 on three train/test partitions
# (90/10, 50/50, 10/90), then the mean of the three held-out accuracies.
# cv=5 is passed explicitly: the GridSearchCV default fold count differs between
# scikit-learn releases (3 before 0.22, 5 from 0.22 on), and every search in this
# notebook that states a fold count uses 5.
param_grid = {'max_depth': np.arange(1,6)}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_wine, Y_wine, test_size=held_out, random_state=42)
    print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

    # A fresh search object per partition, refit on that partition's training part.
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid, return_train_score=True, cv=5)
    tree.fit(X_train_val, Y_train_val)
    print(best_msg, tree.best_params_)
    test_scores.append(tree.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = tree.cv_results_['mean_train_score']
    val_acc = tree.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_wine = (test_acc + test_acc2 + test_acc3)/3
print("average testing accuracy is", test_avg_wine)

In [None]:
# COV_TYPE Data Set
#
# Decision tree: grid search over max_depth 1..9 on three train/test partitions
# (90/10, 50/50, 10/90), then the mean of the three held-out accuracies.
# cv=5 is passed explicitly: the GridSearchCV default fold count differs between
# scikit-learn releases (3 before 0.22, 5 from 0.22 on), and every search in this
# notebook that states a fold count uses 5.
param_grid = {'max_depth': np.arange(1,10)}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_cov, Y_cov, test_size=held_out, random_state=0)
    print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

    # A fresh search object per partition, refit on that partition's training part.
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid, return_train_score=True, cv=5)
    tree.fit(X_train_val, Y_train_val)
    print(best_msg, tree.best_params_)
    test_scores.append(tree.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = tree.cv_results_['mean_train_score']
    val_acc = tree.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_cov = (test_acc + test_acc2 + test_acc3)/3
print("average testing accuracy is", test_avg_cov)

In [None]:
# Adult Data Set
#
# Decision tree: grid search over max_depth 1..29 on three train/test partitions
# (90/10, 50/50, 10/90), then the mean of the three held-out accuracies.
# cv=5 is passed explicitly: the GridSearchCV default fold count differs between
# scikit-learn releases (3 before 0.22, 5 from 0.22 on), and every search in this
# notebook that states a fold count uses 5.
param_grid = {'max_depth': np.arange(1,30)}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_adult, Y_adult, test_size=held_out, random_state=0)
    print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)

    # A fresh search object per partition, refit on that partition's training part.
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid, return_train_score=True, cv=5)
    tree.fit(X_train_val, Y_train_val)
    print(best_msg, tree.best_params_)
    test_scores.append(tree.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = tree.cv_results_['mean_train_score']
    val_acc = tree.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_adult = (test_acc + test_acc2 + test_acc3)/3
print("average testing accuracy is", test_avg_adult)

In [None]:
# Summary of Decision Tree
# Prints whatever the four test_avg_* variables currently hold; run this right after
# the decision-tree cells, since other cells in the notebook assign the same names.
for caption, average in (
    ("Average abalone test accuaracy:", test_avg_abalone),
    ("Average wine quality test accuaracy:", test_avg_wine),
    ("Average cov type test accuaracy:", test_avg_cov),
    ("Average adult test accuaracy:", test_avg_adult),
):
    print(caption, average)

# Classifier 2: KNN

In [None]:
# KNN base estimator; algorithm='brute' selects brute-force neighbour search.
classifier  = KNeighborsClassifier(algorithm='brute')


In [None]:
# Abalone Data Set
#
# k-NN: randomized search over the neighbour count with 5-fold CV on three
# train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# The estimator is built here instead of read from the notebook-level `classifier`
# name, which other cells rebind to different model types; the cell therefore works
# regardless of execution order.
knn = KNeighborsClassifier(algorithm='brute')
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_abalone, Y_abalone, test_size=held_out, random_state=42)
    # 26 evenly spaced candidate k values, capped at 4/5 of the training rows minus 4
    # so that k never exceeds the number of samples in a 5-fold training fold.
    partition = [int(k) for k in np.linspace(1, X_train_val.shape[0]/5*4-4, num=26)]
    param_grid = {'n_neighbors': partition}
    # random_state pins which candidates are sampled, making the search repeatable.
    clf = RandomizedSearchCV(knn, param_grid, return_train_score=True, cv=5,
                             random_state=42)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_abalone = (test_acc + test_acc2 + test_acc3)/3
print("average abalone testing accuracy is", test_avg_abalone)

In [None]:
# Wine Data Set
#
# k-NN: randomized search over the neighbour count with 5-fold CV on three
# train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# The estimator is built here instead of read from the notebook-level `classifier`
# name, which other cells rebind to different model types; the cell therefore works
# regardless of execution order.
knn = KNeighborsClassifier(algorithm='brute')
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_wine, Y_wine, test_size=held_out, random_state=42)
    # 26 evenly spaced candidate k values, capped at 4/5 of the training rows minus 4
    # so that k never exceeds the number of samples in a 5-fold training fold.
    partition = [int(k) for k in np.linspace(1, X_train_val.shape[0]/5*4-4, num=26)]
    param_grid = {'n_neighbors': partition}
    # random_state pins which candidates are sampled, making the search repeatable.
    clf = RandomizedSearchCV(knn, param_grid, return_train_score=True, cv=5,
                             random_state=42)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_wine = (test_acc + test_acc2 + test_acc3)/3
print("average wine testing accuracy is", test_avg_wine)

In [None]:
# cov Data Set
#
# k-NN: randomized search over the neighbour count with 5-fold CV on three
# train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# The estimator is built here instead of read from the notebook-level `classifier`
# name, which other cells rebind to different model types; the cell therefore works
# regardless of execution order.
knn = KNeighborsClassifier(algorithm='brute')
test_scores = []
for split_no, (held_out, best_msg) in enumerate((
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_cov, Y_cov, test_size=held_out, random_state=42)
    # 26 evenly spaced candidate k values, capped at 4/5 of the training rows minus 4
    # so that k never exceeds the number of samples in a 5-fold training fold.
    partition = [int(k) for k in np.linspace(1, X_train_val.shape[0]/5*4-4, num=26)]
    param_grid = {'n_neighbors': partition}
    # random_state pins which candidates are sampled, making the search repeatable.
    clf = RandomizedSearchCV(knn, param_grid, return_train_score=True, cv=5,
                             random_state=42)
    if split_no == 0:
        # Shapes are reported for the 90/10 partition only.
        print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_cov = (test_acc + test_acc2 + test_acc3)/3
print("average cov testing accuracy is", test_avg_cov)

In [None]:
# Adult Data Set
#
# k-NN: randomized search over the neighbour count with 5-fold CV on three
# train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# The estimator is built here instead of read from the notebook-level `classifier`
# name, which other cells rebind to different model types; the cell therefore works
# regardless of execution order.
knn = KNeighborsClassifier(algorithm='brute')
test_scores = []
for split_no, (held_out, best_msg) in enumerate((
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_adult, Y_adult, test_size=held_out, random_state=42)
    # 26 evenly spaced candidate k values, capped at 4/5 of the training rows minus 4
    # so that k never exceeds the number of samples in a 5-fold training fold.
    partition = [int(k) for k in np.linspace(1, X_train_val.shape[0]/5*4-4, num=26)]
    param_grid = {'n_neighbors': partition}
    # random_state pins which candidates are sampled, making the search repeatable.
    clf = RandomizedSearchCV(knn, param_grid, return_train_score=True, cv=5,
                             random_state=42)
    if split_no == 0:
        # Shapes are reported for the 90/10 partition only.
        print(X_train_val.shape, X_test.shape, Y_train_val.shape, Y_test.shape)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_adult = (test_acc + test_acc2 + test_acc3)/3
print("average adult testing accuracy is", test_avg_adult)

In [None]:
# Summary of KNN
# Prints whatever the four test_avg_* variables currently hold; run this right after
# the KNN cells, since other cells in the notebook assign the same names.
for caption, average in (
    ("Average abalone test accuaracy:", test_avg_abalone),
    ("Average wine quality test accuaracy:", test_avg_wine),
    ("Average cov type test accuaracy:", test_avg_cov),
    ("Average adult test accuaracy:", test_avg_adult),
):
    print(caption, average)

# 3rd Classifier: Random Forest

In [None]:
#Abalone Dataset
#
# Random forest: 5-fold grid search over max_features on three train/test
# partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# NOTE(review): only the 90/10 partition uses a 1200-tree forest; the other two use
# the library default tree count, so the average mixes different models. This looks
# unintentional — confirm which setting is wanted.
maxf = [2,4,6,8]
param_grid = {'max_features': maxf}
test_scores = []
for held_out, best_msg, forest in (
    (0.1, "Best params with 90/10 split", RandomForestClassifier(n_estimators=1200)),
    (0.5, "Best params with 50/50 split", RandomForestClassifier()),
    (0.9, "Best params with 10/90 split", RandomForestClassifier()),
):
    classifier = forest
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_abalone, Y_abalone, test_size=held_out, random_state=42)
    clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training scores:", train_acc)
    print("validation scores:", val_acc)
    print()

test_acc, test_acc2, test_acc3 = test_scores
test_avg_abalone = (test_acc+test_acc2+test_acc3)/3
print("average test acc out of all 3 paritions is:", test_avg_abalone)

In [None]:
# wine quality data set
#
# Random forest: 5-fold grid search over max_features on three train/test
# partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
# The estimator and the search object are rebuilt for every partition, so each
# assignment below actually feeds the fit that follows it.
maxf = [2,4,6,8,10]
param_grid = {'max_features': maxf}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    classifier = RandomForestClassifier()
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_wine, Y_wine, test_size=held_out, random_state=42)
    clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

test_acc, test_acc1, test_acc2 = test_scores
test_avg_wine = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_wine)

In [None]:
# COV_type  data set
#
# Random forest: 5-fold grid search over max_features on three train/test
# partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
maxf = [2,4,6,8,12,16,20]
param_grid = {'max_features': maxf}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    classifier = RandomForestClassifier()
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_cov, Y_cov, test_size=held_out, random_state=42)
    clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

test_acc, test_acc1, test_acc2 = test_scores
test_avg_cov = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_cov)

In [None]:
# Adult  data set
#
# Random forest: 5-fold grid search over max_features on three train/test
# partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
maxf = [2,4,6,8,12,16,20]
param_grid = {'max_features': maxf}
test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    classifier = RandomForestClassifier()
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_adult, Y_adult, test_size=held_out, random_state=42)
    clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

test_acc, test_acc1, test_acc2 = test_scores
test_avg_adult = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_adult)

In [None]:
# Summary of Random Forest
# Prints whatever the four test_avg_* variables currently hold; run this right after
# the random-forest cells, since other cells in the notebook assign the same names.
for caption, average in (
    ("Average abalone test accuaracy:", test_avg_abalone),
    ("Average wine quality test accuaracy:", test_avg_wine),
    ("Average cov type test accuaracy:", test_avg_cov),
    ("Average adult test accuaracy:", test_avg_adult),
):
    print(caption, average)

# Classifier 4: Logistic Regression

In [None]:
# Abalone Data Set
#
# Logistic regression: 5-fold grid search over the regularisation parameter C on
# three train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
classifier = LogisticRegression()
# C candidates: powers of ten from 10**-8 up to 10**4.
maxf = [10**exponent for exponent in range(-8, 5)]
param_grid = {'C': maxf}
# One search object, refit from scratch on each partition's training part.
clf = GridSearchCV(classifier, param_grid, return_train_score=True,cv=5)

test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_abalone, Y_abalone, test_size=held_out, random_state=42)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

test_acc, test_acc1, test_acc2 = test_scores
test_avg_abalone = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_abalone)

In [None]:
# wine quality data set
#
# Logistic regression: 5-fold grid search over the regularisation parameter C on
# three train/test partitions (90/10, 50/50, 10/90), then the mean held-out accuracy.
classifier = LogisticRegression()
# C candidates: powers of ten from 10**-8 up to 10**4.
maxf = [10**exponent for exponent in range(-8, 5)]
param_grid = {'C': maxf}
# One search object, refit from scratch on each partition's training part.
clf = GridSearchCV(classifier, param_grid, return_train_score=True,cv=5)

test_scores = []
for held_out, best_msg in (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
):
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_wine, Y_wine, test_size=held_out, random_state=42)
    clf.fit(X_train_val, Y_train_val)
    print(best_msg, clf.best_params_)
    test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", test_scores[-1])

    # Per-candidate mean accuracies over the CV folds.
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

test_acc, test_acc1, test_acc2 = test_scores
test_avg_wine = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_wine)

In [None]:
# COV_TYPE data set: logistic regression, C tuned by 5-fold grid search,
# evaluated on three train/test partitions (90/10, 50/50, 10/90).

classifier = LogisticRegression()
# Candidate C values: powers of ten from 10**-8 through 10**4
maxf = [10**-8, 10**-7, 10**-6, 10**-5, 10**-4, 10**-3,\
        10**-2, 10**-1, 1, 10, 10**2, 10**3, 10**4]
param_grid = {'C': maxf}
# cv=5 is stated explicitly so this search uses the same number of folds as
# the wine and adult searches regardless of the installed library default.
clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)

# (fraction held out for testing, headline printed for that partition)
cov_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
cov_test_scores = []
for held_out, headline in cov_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_cov, Y_cov, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    cov_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", cov_test_scores[-1])
    # Mean fold scores, one entry per candidate C
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = cov_test_scores
test_avg_cov = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_cov)

In [None]:
# Adult data set: logistic regression, C tuned by 5-fold grid search,
# evaluated on three train/test partitions (90/10, 50/50, 10/90).

# Candidate C values: powers of ten from 10**-8 through 10**4
maxf = [10**exponent for exponent in range(-8, 5)]
param_grid = {'C': maxf}
classifier = LogisticRegression()
clf = GridSearchCV(classifier, param_grid, cv=5, return_train_score=True)

# (fraction held out for testing, headline printed for that partition)
adult_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
adult_test_scores = []
for held_out, headline in adult_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_adult, Y_adult, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    adult_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", adult_test_scores[-1])
    # Mean fold scores, one entry per candidate C
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = adult_test_scores
test_avg_adult = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_adult)

In [None]:
# Summary of Logistic Regression: test accuracy averaged over the three
# partitions, one line per data set.

lr_summary_rows = (
    ("Average abalone test accuaracy:", test_avg_abalone),
    ("Average wine quality test accuaracy:", test_avg_wine),
    ("Average cov type test accuaracy:", test_avg_cov),
    ("Average adult test accuaracy:", test_avg_adult),
)
for row_label, row_value in lr_summary_rows:
    print(row_label, row_value)

# Classifier 5: Artificial Neural Nets

In [None]:
# Abalone Data Set: MLP classifier tuned by randomized search with 5-fold CV,
# evaluated on three train/test partitions (90/10, 50/50, 10/90).

classifier = MLPClassifier(solver='sgd', max_iter=20000)
# Momentum values to try for the SGD solver
m_list = [0,0.2,0.5,0.9]
# One hidden layer of each width. Every entry is a 1-tuple: the trailing
# comma matters, because `(128)` would be the plain int 128.
h_list = [(1,), (2,), (4,), (8,), (16,), (32,), (64,), (128,)]
param_grid = {'momentum': m_list, 'hidden_layer_sizes': h_list}
# Randomized search tries a random subset of the grid; the number of
# candidates (n_iter) is left at the library default.
clf = RandomizedSearchCV(classifier, param_grid, return_train_score=True,cv=5)

# (fraction held out for testing, headline printed for that partition)
abalone_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
abalone_test_scores = []
for held_out, headline in abalone_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_abalone, Y_abalone, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    abalone_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", abalone_test_scores[-1])
    # Mean fold scores, one entry per sampled candidate
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = abalone_test_scores
test_avg_abalone = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_abalone)

In [None]:
# Wine Quality Data Set: MLP classifier tuned by randomized search with
# 5-fold CV, evaluated on three train/test partitions (90/10, 50/50, 10/90).

# Build the search in this cell instead of relying on whatever `clf` an
# earlier cell left in the kernel, so the cell gives the same kind of search
# no matter which cell was executed last.
classifier = MLPClassifier(solver='sgd', max_iter=20000)
# Momentum values to try for the SGD solver
m_list = [0,0.2,0.5,0.9]
# One hidden layer of each width; every entry is a 1-tuple
h_list = [(1,), (2,), (4,), (8,), (16,), (32,), (64,), (128,)]
param_grid = {'momentum': m_list, 'hidden_layer_sizes': h_list}
clf = RandomizedSearchCV(classifier, param_grid, return_train_score=True,cv=5)

# (fraction held out for testing, headline printed for that partition)
wine_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
wine_test_scores = []
for held_out, headline in wine_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_wine, Y_wine, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    wine_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", wine_test_scores[-1])
    # Mean fold scores, one entry per sampled candidate
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = wine_test_scores
test_avg_wine = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_wine)

In [None]:
# cov Data Set: MLP classifier tuned by randomized search with 5-fold CV,
# evaluated on three train/test partitions (90/10, 50/50, 10/90).

# Build the search in this cell instead of relying on whatever `clf` an
# earlier cell left in the kernel, so the cell gives the same kind of search
# no matter which cell was executed last.
classifier = MLPClassifier(solver='sgd', max_iter=20000)
# Momentum values to try for the SGD solver
m_list = [0,0.2,0.5,0.9]
# One hidden layer of each width; every entry is a 1-tuple
h_list = [(1,), (2,), (4,), (8,), (16,), (32,), (64,), (128,)]
param_grid = {'momentum': m_list, 'hidden_layer_sizes': h_list}
clf = RandomizedSearchCV(classifier, param_grid, return_train_score=True,cv=5)

# (fraction held out for testing, headline printed for that partition)
cov_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
cov_test_scores = []
for held_out, headline in cov_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_cov, Y_cov, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    cov_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", cov_test_scores[-1])
    # Mean fold scores, one entry per sampled candidate
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = cov_test_scores
test_avg_cov = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_cov)

In [None]:
# Adult Data Set: MLP classifier tuned by exhaustive grid search with
# 5-fold CV, evaluated on three train/test partitions (90/10, 50/50, 10/90).

classifier = MLPClassifier(solver='sgd', max_iter=20000)
# Momentum values to try for the SGD solver
m_list = [0,0.2,0.5,0.9]
# One hidden layer of each width. Every entry is a 1-tuple: the trailing
# comma matters, because `(128)` would be the plain int 128.
h_list = [(1,), (2,), (4,), (8,), (16,), (32,), (64,), (128,)]
param_grid = {'momentum': m_list, 'hidden_layer_sizes': h_list}
# cv=5 is stated explicitly so the fold count matches the other searches in
# this notebook regardless of the installed library default.
# NOTE(review): the abalone neural-net cell uses RandomizedSearchCV while this
# one searches the full grid -- confirm the difference is intended.
clf = GridSearchCV(classifier, param_grid, return_train_score=True, cv=5)

# (fraction held out for testing, headline printed for that partition)
adult_partitions = (
    (0.1, "Best params with 90/10 split"),
    (0.5, "Best params with 50/50 split"),
    (0.9, "Best params with 10/90 split"),
)
adult_test_scores = []
for held_out, headline in adult_partitions:
    X_train_val, X_test, Y_train_val, Y_test = train_test_split(
        X_adult, Y_adult, test_size=held_out, random_state=42)
    # Calling fit() again repeats the whole search on this partition
    clf.fit(X_train_val, Y_train_val)
    print(headline, clf.best_params_)
    adult_test_scores.append(clf.score(X_test, Y_test))
    print("testing accuracy is", adult_test_scores[-1])
    # Mean fold scores, one entry per grid candidate
    train_acc = clf.cv_results_['mean_train_score']
    val_acc = clf.cv_results_['mean_test_score']
    print("training acc is:", train_acc)
    print("validation acc is:", val_acc)
    print()

# Expose the same per-partition names the rest of the notebook may read
test_acc, test_acc1, test_acc2 = adult_test_scores
test_avg_adult = (test_acc+test_acc1+test_acc2)/3
print("average test acc out of all 3 paritions is:", test_avg_adult)

In [None]:
# Summary of Neural Nets: test accuracy averaged over the three partitions,
# one line per data set.

nn_summary_rows = (
    ("Average abalone test accuaracy:", test_avg_abalone),
    ("Average wine quality test accuaracy:", test_avg_wine),
    ("Average cov type test accuaracy:", test_avg_cov),
    ("Average adult test accuaracy:", test_avg_adult),
)
for row_label, row_value in nn_summary_rows:
    print(row_label, row_value)