### Exercise 1.1 : Data Preprocessing (10 points)

In [3]:
import numpy as np
import scipy
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# reading dataset
df = pd.read_csv('Dry_Beans_Dataset.csv')
# df.head()

# encode as integers
myData_encoder = LabelEncoder()
myData_encoded =  myData_encoder.fit_transform(df.Class) 
# print (myData_encoded)
 
# binary encode
onehot_encoder = OneHotEncoder(sparse=False) # disable sparse return type

# reshape the array
myData_encoded = myData_encoded.reshape(len(myData_encoded), 1) 
onehot_encoded = onehot_encoder.fit_transform(myData_encoded) 

#turn encoded data into DataFrame
encoded_class = pd.DataFrame(onehot_encoded)

#get unique headers into class_head variable
class_head = np.unique(np.concatenate(df['Class'].str.split(';\s*').values))

#put headers on encoded_class Datatframe
encoded_class.columns = class_head

#set scaler for normailze data
scaler = MinMaxScaler()

#drop 'Class' header from source DataFrame
df.drop(['Class'], axis = 1, inplace = True)

#transform data into normalized decimal between 0 and 1
for x in df.columns:
    cur = df[x]
    cur = [[i] for i in cur]
    scaled = scaler.fit_transform(cur)
    df[x] = scaled


### Exercise 1.2 : Training and Testing the Neural Network

In [4]:
# Tensorflow / Keras
from tensorflow import keras
from keras.models import Sequential
from keras import Input
from keras.layers import Dense
from keras_hist_graph import plot_history

# Sklearn
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

#x,y labels
y = encoded_class
x = df

#split training set and test set
X_train, X_test, y_train, y_test = train_test_split(x,y,random_state=1, test_size=0.1)

#regression model
clf = MLPClassifier(hidden_layer_sizes=(12,3), activation="logistic", solver = 'sgd', learning_rate_init=0.3, max_iter = 500, random_state=1)

#fitting training set to regression model
hist = clf.fit(X_train, y_train)

In [3]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#function to find mse
def mse(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.square(np.subtract(actual,pred)).mean() 

#predict y using x_test
y_pred = clf.predict(X_test) 

#turn results into dataframe and give headers
y_pred = pd.DataFrame(y_pred)
y_pred.columns = class_head

#format for confusion matrix
y_test = np.argmax(y_test.values, axis = 1)
y_pred = np.argmax(y_pred.values, axis = 1)

#print confusion matrix and mse
print("confusion matrix = \n", confusion_matrix(y_test, y_pred))
print("MSE of the model = ", mse(y_test,y_pred))

# format dataframe for precision
precision_df = pd.DataFrame(class_head, precision_score(y_test,y_pred, pos_label=1, average = None))
precision_df.columns = ['Precision per Bean type']

# format dataframe for recall
recall_df = pd.DataFrame(class_head, recall_score(y_test,y_pred, pos_label=1, average = None))
recall_df.columns = ['Recall per Bean type']

#print precision and recall
print(precision_df)
print(recall_df)


confusion matrix = 
 [[140   0   8   0   0   0   0]
 [  0  45   0   0   0   0   0]
 [  7   0 142   0   1   0   0]
 [ 25   0   0 292   0   2  29]
 [  5   0   2   0 179   0   2]
 [ 10   0   0   0   0 190   6]
 [ 28   0   1   2   7   2 237]]
MSE of the model =  1.4552129221732746
         Precision per Bean type
0.651163                BARBUNYA
1.000000                  BOMBAY
0.928105                    CALI
0.993197                DERMASON
0.957219                   HOROZ
0.979381                   SEKER
0.864964                    SIRA
         Recall per Bean type
0.945946             BARBUNYA
1.000000               BOMBAY
0.946667                 CALI
0.839080             DERMASON
0.952128                HOROZ
0.922330                SEKER
0.855596                 SIRA


## Exercise 2 : k-fold Cross Validation 

In [6]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

from numpy import mean
from numpy import absolute

#define cross-validation method to use 
cv = KFold(n_splits=10, random_state=1, shuffle=True)

#use k-fold CV to evaluate model (random sampling to ensure there is no bias)
scores = cross_validate(clf, x, y, cv=cv, scoring=('accuracy','neg_mean_squared_error'), return_train_score=True)


In [5]:

# print(scores.keys())

print('MSE for each train set: \n', abs(scores['train_neg_mean_squared_error']), "\n")
print('Average MSE for train set: \n', mean(abs(scores['train_neg_mean_squared_error'])), "\n")
print('Accuracy for each train set: \n', scores['train_accuracy'], "\n")
print('Average Accuracy for train set: \n', mean(scores['train_accuracy']), "\n")

print('MSE for each test set: \n', abs(scores['test_neg_mean_squared_error']), "\n")
print('Average MSE for test set: \n', mean(abs(scores['test_neg_mean_squared_error'])), "\n")
print('Accuracy for each test set: \n', scores['test_accuracy'], "\n")
print('Average Accuracy for test set: \n', mean(scores['test_accuracy']), "\n")


MSE for each train set: 
 [0.02326721 0.02298542 0.02509621 0.0248863  0.02218076 0.02697376
 0.0357551  0.02635569 0.03297959 0.0238484 ] 

Average MSE for train set: 
 0.026432842997595806 

Accuracy for each train set: 
 [0.90358397 0.89844898 0.89085714 0.89020408 0.90963265 0.86979592
 0.8157551  0.88644898 0.86481633 0.90114286] 

Average Accuracy for train set: 
 0.8830686006854371 

MSE for each test set: 
 [0.02045312 0.02445681 0.02424688 0.02466674 0.02718589 0.02697596
 0.03516322 0.02813058 0.03264406 0.02708093] 

Average MSE for test set: 
 0.027100418055986487 

Accuracy for each test set: 
 [0.91483113 0.89199118 0.89860397 0.8853784  0.89346069 0.86554004
 0.81631154 0.87729611 0.8662748  0.89272594] 

Average Accuracy for test set: 
 0.8802413790499125 



## Exercise 3 : Hyperparameter Tuning

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt

#generate 7 random ints between 400 and 999
mi = sp_randInt(400,999).rvs(7)

#list of parameters to test with test values in array
hyperparameter_space = {'hidden_layer_sizes':[(14,4),(13,6),(13,5),(14,5)], 
                        'learning_rate_init':[.055,.065,.05, .04, .02, .009, .045],
                        'max_iter': mi}

#model using clf as regression, testing param_distributions, rating by accuracy, n_jobs=-1 to use all cpu cores, 
gridSearch = GridSearchCV(clf, param_grid=hyperparameter_space, scoring = "accuracy", n_jobs=-1, cv=cv, return_train_score=False)

#fit x and y to randomized model
gridSearch.fit(x, y)

GridSearchCV(cv=KFold(n_splits=10, random_state=1, shuffle=True),
             estimator=MLPClassifier(activation='logistic',
                                     hidden_layer_sizes=(12, 3),
                                     learning_rate_init=0.3, max_iter=500,
                                     random_state=1, solver='sgd'),
             n_jobs=-1,
             param_grid={'hidden_layer_sizes': [(14, 4), (13, 6), (13, 5),
                                                (14, 5)],
                         'learning_rate_init': [0.055, 0.065, 0.05, 0.04, 0.02,
                                                0.009, 0.045],
                         'max_iter': array([692, 883, 518, 797, 459, 440, 545])},
             scoring='accuracy')

In [10]:

print(" Results from Random Search: \n" )
print('-----------------------------')
print("The best estimator across ALL searched params:", gridSearch.best_estimator_)
print("The best Accuracy across ALL searched params:", gridSearch.best_score_)
print("The best parameters across ALL searched params:", gridSearch.best_params_)


 Results from Random Search: 

-----------------------------
The best estimator across ALL searched params: MLPClassifier(activation='logistic', hidden_layer_sizes=(13, 5),
              learning_rate_init=0.065, max_iter=692, random_state=1,
              solver='sgd')
The best Accuracy across ALL searched params: 0.9141866835843473
The best parameters across ALL searched params: {'hidden_layer_sizes': (13, 5), 'learning_rate_init': 0.065, 'max_iter': 692}
