## Data modeling using generated data file after performing data preperation

## 1. Setup

In [79]:
import os
import tensorflow as t2
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [80]:
# import numpy and pandas libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
# set random seed to ensure that results are repeatable
np.random.seed(1)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn import datasets
from __future__ import print_function
import matplotlib.pyplot as plt


## 2. Load the data

In [81]:
X_train = pd.read_csv("appledata_train_X.csv")
X_test = pd.read_csv("appledata_test_X.csv")
y_train = pd.read_csv("appledata_train_y.csv")
y_test = pd.read_csv("appledata_test_y.csv")

## 3. creating performance data frame 

In [82]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## 4. Logistic regression using Random search

In [83]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'max_iter':np.arange(500,1000),
    'penalty': ['None','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

log_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = log_reg, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.8397435897435898
... with parameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 576}


1255 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
66 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\linear_model\_logistic.py", line 1160, in fit
    self._validate_params()
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in vali

In [85]:

c_matrix = confusion_matrix(y_test, rand_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression rand search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Logistic regression using grid search

In [86]:
sscore_measure = "recall"
kfolds = 5
max_iter = rand_search.best_params_['max_iter']
penalty = rand_search.best_params_['penalty']
solver = rand_search.best_params_['solver']

param_grid = {
    'max_iter': np.arange(max_iter-10,max_iter+10),  
    'penalty': [penalty],
    'solver': [solver]
}

log_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = log_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallLogistic = grid_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
The best recall score is 0.8397435897435898
... with parameters: {'max_iter': 566, 'penalty': 'l1', 'solver': 'saga'}


  y = column_or_1d(y, warn=True)


In [87]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression grid search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## SVM classification model using Random Search

In [88]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'C': np.arange(1,30),   
    'gamma': ['scale','auto'],
    'kernel':['linear','rbf','poly']
}

svm_model = SVC()
rand_search = RandomizedSearchCV(estimator = svm_model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")



Fitting 5 folds for each of 174 candidates, totalling 870 fits
The best recall score is 0.9371794871794872
... with parameters: {'kernel': 'poly', 'gamma': 'scale', 'C': 1}


  y = column_or_1d(y, warn=True)


In [89]:
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM Random search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## SVM classification model using Grid Search

In [90]:
score_measure = "recall"
kfolds = 5

C = rand_search.best_params_['C']
gamma = rand_search.best_params_['gamma']
kernel = rand_search.best_params_['kernel']

param_grid = {
    'C': np.arange(C-2,C+2),  
    'gamma': [gamma],
    'kernel': [kernel]
    
}

svm_model = SVC()
grid_search = GridSearchCV(estimator = svm_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallSVM = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
The best recall score is 0.9371794871794872
... with parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'poly'}


10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\svm\_base.py", line 180, in fit
    self._validate_params()
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_cons

In [91]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM grid search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Decision tree using RandomSearchCV

In [92]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
random_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")

bestRecallTree = random_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 1.0
... with parameters: {'min_samples_split': 80, 'min_samples_leaf': 36, 'min_impurity_decrease': 0.0011, 'max_leaf_nodes': 74, 'max_depth': 44, 'criterion': 'entropy'}


25 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\tree\_classes.py", line 889, in fit
    super().fit(
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\tree\_classes.py", line 177, in fit
    self._validate_params()
  File "C:\Users\Shanthi\anaconda3\envs\t2\lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  Fi

In [93]:
c_matrix = confusion_matrix(y_test, random_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Dtree_random", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Decision tree using Grid search

In [94]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 9072 candidates, totalling 45360 fits
The best recall score is 0.8551282051282051
... with parameters: {'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 162, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 6, 'min_samples_split': 30}


In [95]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Dtree_grid", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

### Neural Network

In [96]:
%%time

ann = MLPClassifier(hidden_layer_sizes=(60,50,40), solver='adam', max_iter=200)
_ = ann.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


CPU times: total: 1.48 s
Wall time: 542 ms


In [97]:
%%time
y_pred = ann.predict(X_test)

CPU times: total: 0 ns
Wall time: 5.76 ms


## NN With RandomizedSearchCV

In [98]:
%%time

score_measure = "recall"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,),(50,30), (40,20), (60,40, 20), (70,50,40)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [5000]
}

ann = MLPClassifier()
rand_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

bestRecallTree = rand_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 162, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 6, 'min_samples_split': 30}
CPU times: total: 1.45 s
Wall time: 38 s


  y = column_or_1d(y, warn=True)


In [99]:
%%time
X_test=X_test[:len(y_test)]
y_pred = bestRecallTree.predict(X_test)
print(classification_report(y_test, y_pred))
c_matrix = confusion_matrix(y_test, rand_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"NN_Rand", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        14
           1       0.65      1.00      0.79        26

    accuracy                           0.65        40
   macro avg       0.33      0.50      0.39        40
weighted avg       0.42      0.65      0.51        40

CPU times: total: 78.1 ms
Wall time: 35.4 ms


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## NN With GridSearchCV

In [100]:
%%time

score_measure = "recall"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,), (90,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestRecallTree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'activation': 'tanh', 'alpha': 1, 'hidden_layer_sizes': (30,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.005, 'max_iter': 5000, 'solver': 'adam'}
CPU times: total: 2.08 s
Wall time: 34 s


  y = column_or_1d(y, warn=True)


In [101]:
%%time
X_test=X_test[:len(y_test)]
y_pred = bestRecallTree.predict(X_test)

print(classification_report(y_test, y_pred))
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"NN_GRID", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

              precision    recall  f1-score   support

           0       0.50      0.43      0.46        14
           1       0.71      0.77      0.74        26

    accuracy                           0.65        40
   macro avg       0.61      0.60      0.60        40
weighted avg       0.64      0.65      0.64        40

CPU times: total: 0 ns
Wall time: 54.2 ms


# Using Keras

## Deep Network

In [61]:
import tensorflow as t2
from tensorflow import keras

# fix random seed for reproducibility
np.random.seed(1)
t2.random.set_seed(1)

In [62]:
%%time

# create model stucture
model = keras.models.Sequential()
model.add(keras.layers.Input(10))
model.add(keras.layers.Dense(50, activation='relu'))
model.add(keras.layers.Dense(50, activation='relu'))
model.add(keras.layers.Dense(50, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid')) # final layer, 1 categories


# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# if you want to overide the defaults for the optimizer....
#adam = keras.optimizers.Adam(learning_rate=0.01)
#model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])


CPU times: total: 46.9 ms
Wall time: 114 ms


In [63]:
%%time

# fit the model

history = model.fit(X_train, y_train, 
                    validation_data=(X_test[:len(y_test)], y_test), 
                    epochs=20, batch_size=100)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
CPU times: total: 9.02 s
Wall time: 3.4 s


In [64]:
# evaluate the model

scores = model.evaluate(X_test[:len(y_test)],y_test, verbose=0)
scores
# In results, first is loss, second is accuracy

[0.6151880025863647, 0.699999988079071]

In [65]:
# let's format this into a better output...

print("%s: %.2f" % (model.metrics_names[0], scores[0]))
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

loss: 0.62
accuracy: 70.00%


## Wide and Deep Network

In [66]:
#Define the model: for multi-class

model = keras.models.Sequential()

model.add(keras.layers.Input(shape=10))
model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [67]:
# Compile model

#Optimizer:
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

In [68]:
# Fit the model
X_test=X_test[:len(y_test)]
history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test), 
                    epochs=20, batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [69]:
# evaluate the model
X_test=X_test[:len(y_test)]
scores = model.evaluate(X_test, y_test, verbose=0)
scores

# In results, first is loss, second is accuracy

[3.773873805999756, 0.6000000238418579]

In [70]:
# extract the accuracy from model.evaluate

print("%s: %.2f" % (model.metrics_names[0], scores[0]))
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

loss: 3.77
accuracy: 60.00%


## RandomGridSearch

In [71]:
%%time

# If you don't have the following installed, from command line '!pip install scikeras'
from scikeras.wrappers import KerasClassifier
from keras.initializers import GlorotNormal

score_measure = "recall"
kfolds = 5

def build_clf(hidden_layer_sizes, dropout):
    ann = t2.keras.models.Sequential()
    ann.add(keras.layers.Input(shape=10)),
    for hidden_layer_size in hidden_layer_sizes:
        model.add(keras.layers.Dense(hidden_layer_size, kernel_initializer= t2.keras.initializers.GlorotNormal(), 
                                     bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), activation="relu"))
        model.add(keras.layers.Dropout(dropout))
    ann.add(t2.keras.layers.Dense(1, activation='sigmoid'))
    ann.compile(loss = 'binary_crossentropy', metrics = ['accuracy'])
    return ann


CPU times: total: 0 ns
Wall time: 998 µs


For more information on dense layers and initializers, see the following:
* https://keras.io/api/layers/core_layers/dense/
* https://keras.io/api/layers/initializers/

In [72]:
from scikeras.wrappers import KerasClassifier

keras_clf = KerasClassifier(
    model=build_clf,
    hidden_layer_sizes=40,
    dropout = 0.0
)


In [73]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV

params = {
    'optimizer__learning_rate': [0.0005, 0.001, 0.005],
    'model__hidden_layer_sizes': [(70,),(90, ), (100,), (100, 90)],
    'model__dropout': [0, 0.1],
    'batch_size':[20, 60, 100],
    'epochs':[10, 50, 100],
    'optimizer':["adam",'sgd']
}
keras_clf.get_params().keys()



dict_keys(['model', 'build_fn', 'warm_start', 'random_state', 'optimizer', 'loss', 'metrics', 'batch_size', 'validation_batch_size', 'verbose', 'callbacks', 'validation_split', 'shuffle', 'run_eagerly', 'epochs', 'hidden_layer_sizes', 'dropout', 'class_weight'])

In [74]:
rnd_search_cv = RandomizedSearchCV(estimator=keras_clf, param_distributions=params, scoring='accuracy', n_iter=50, cv=5)

import sys
sys.setrecursionlimit(10000) # note: the default is 3000 (python 3.9)

earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
callback = [earlystop]

_ = rnd_search_cv.fit(X_train, y_train, callbacks=callback, verbose=0)




In [75]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.0005,
 'optimizer': 'adam',
 'model__hidden_layer_sizes': (100,),
 'model__dropout': 0,
 'epochs': 50,
 'batch_size': 60}

In [76]:
best_net = rnd_search_cv.best_estimator_
print(rnd_search_cv.best_params_)

{'optimizer__learning_rate': 0.0005, 'optimizer': 'adam', 'model__hidden_layer_sizes': (100,), 'model__dropout': 0, 'epochs': 50, 'batch_size': 60}


In [77]:
%%time
X_test=X_test[:len(y_test)]
y_pred = best_net.predict(X_test)
print(classification_report(y_test, y_pred))
c_matrix = confusion_matrix(y_test, rnd_search_cv.predict(X_test[:len(y_test)]))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"DNN", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

              precision    recall  f1-score   support

           0       0.38      0.43      0.40        14
           1       0.67      0.62      0.64        26

    accuracy                           0.55        40
   macro avg       0.52      0.52      0.52        40
weighted avg       0.56      0.55      0.56        40

CPU times: total: 250 ms
Wall time: 386 ms


## 5.0 Summary

Sorted by recall, the best models are:

DNN performance using recall as the score metric

In [78]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,DNN,0.55,0.666667,0.615385,0.64


Performance of the remaing models

In [102]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Dtree_random,0.55,0.681818,0.576923,0.625
0,Dtree_grid,0.65,0.8,0.615385,0.695652
0,NN_GRID,0.65,0.714286,0.769231,0.740741
0,Logistic Regression rand search,0.65,0.7,0.807692,0.75
0,Logistic Regression rand search,0.65,0.7,0.807692,0.75
0,Logistic Regression grid search,0.65,0.7,0.807692,0.75
0,SVM Random search,0.675,0.69697,0.884615,0.779661
0,SVM grid search,0.675,0.69697,0.884615,0.779661
0,NN_Rand,0.65,0.65,1.0,0.787879


# Analysis

The dataset is connected to predicting the purchase of the Apple M1 Mac book. The purpose of this data set is to predict whether or not the customer will buy the M1 Macbook. As a result, the goal of this assignment is to establish which features you will employ to analyze purchase behaviors and how these features impact the sales of the apple mac book.

I chose the recall performance indicator for this dataset because recall is used to quantify the proportion of true positives out of all possible outcomes.

According to the above results, the neural network model with random search outperforms all four models with 100 percent accuracy, followed by SVM with random and grid search with 88.461 percent accuracy, logistic regression with random and grid search with 80.76 percent accuracy, deep neural network with approximately 62 percent, and decision tree with grid search with 61.53 percent accuracy.
As a result, when recall is used as the performance metric, the Neural Network model surpasses the other models. 

As a result, when compared to neural networks with MLP classifiers and DNN with keras, we can conclude that the MLP classifier performs well, and the performance is also affected by various factors such as loss function, activation, and number of layers.

The decision tree utilizing grid search model is shown to be the least performing model in terms of recall.
When accuracy is used as a performance indicator, the decision tree model with grid search is deemed the best model, followed by logistic regression with random search and grid search, and SVM.



