In [1]:
#https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
#https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.names

In [2]:
# first neural network with keras tutorial
import numpy as np
import pandas as pd

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [3]:
data = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv', 
                   header=None, 
                   names=['times_pregnant', 'plasma_glucose', 'diastolic_bp', 'triceps_thickness', 
                          '2_hour_serum', 'bmi', 'pedigree_fn', 'age', 'target']
                  )

In [4]:
data.head()

Unnamed: 0,times_pregnant,plasma_glucose,diastolic_bp,triceps_thickness,2_hour_serum,bmi,pedigree_fn,age,target
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
data.shape

(768, 9)

In [6]:
data.isnull().sum()

times_pregnant       0
plasma_glucose       0
diastolic_bp         0
triceps_thickness    0
2_hour_serum         0
bmi                  0
pedigree_fn          0
age                  0
target               0
dtype: int64

In [7]:
X = data.drop(columns='target')
y = data['target']

X.shape, y.shape


((768, 8), (768,))

In [8]:
y.value_counts(normalize=True)

0    0.651042
1    0.348958
Name: target, dtype: float64

In [9]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(8,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

2022-07-20 10:09:42.431014: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
# fit the keras model on the dataset
history = model.fit(X, y, epochs=500, batch_size=10, verbose=0)

2022-07-20 10:09:42.688714: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
# evaluate the keras model
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 79.82


In [13]:
# make class predictions with the model
y_pred = (model.predict(X) > 0.5).astype(int)

In [14]:
for i in range(15):
    print('%s => %d (expected %d)' % (X.iloc[i].tolist(), y_pred[i], y[i]))


# print(y.shape)
# print(y_pred.shape)
# for y1, y2 in zip(y, y_pred):
#     print(f'y1/y2: {y1}/{y2[0]}')

[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0] => 1 (expected 1)
[1.0, 85.0, 66.0, 29.0, 0.0, 26.6, 0.351, 31.0] => 0 (expected 0)
[8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0] => 1 (expected 1)
[1.0, 89.0, 66.0, 23.0, 94.0, 28.1, 0.167, 21.0] => 0 (expected 0)
[0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0] => 1 (expected 1)
[5.0, 116.0, 74.0, 0.0, 0.0, 25.6, 0.201, 30.0] => 0 (expected 0)
[3.0, 78.0, 50.0, 32.0, 88.0, 31.0, 0.248, 26.0] => 0 (expected 1)
[10.0, 115.0, 0.0, 0.0, 0.0, 35.3, 0.134, 29.0] => 1 (expected 0)
[2.0, 197.0, 70.0, 45.0, 543.0, 30.5, 0.158, 53.0] => 1 (expected 1)
[8.0, 125.0, 96.0, 0.0, 0.0, 0.0, 0.232, 54.0] => 0 (expected 1)
[4.0, 110.0, 92.0, 0.0, 0.0, 37.6, 0.191, 30.0] => 0 (expected 0)
[10.0, 168.0, 74.0, 0.0, 0.0, 38.0, 0.537, 34.0] => 1 (expected 1)
[10.0, 139.0, 80.0, 0.0, 0.0, 27.1, 1.441, 57.0] => 0 (expected 0)
[1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.398, 59.0] => 1 (expected 1)
[5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.587, 51.0] => 1 (expect

# Post tutorial experimentation (tuning the model)

In [None]:
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)

model2 = Sequential()
model2.add(Dense(12, input_shape=(8,), activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model2.fit(X_scaled, y, epochs=500, batch_size=10, verbose=0)

# evaluate the keras model
_, accuracy = model2.evaluate(X_scaled, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 86.46


In [None]:
pd.DataFrame(X_scaled).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,-0.020496


In [None]:
pd.DataFrame(X).head()

Unnamed: 0,times_pregnant,plasma_glucose,diastolic_bp,triceps_thickness,2_hour_serum,bmi,pedigree_fn,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [35]:
from sklearn.preprocessing import RobustScaler

X_scaled = RobustScaler().fit_transform(X)

model2 = Sequential()
model2.add(Dense(12, input_shape=(8,), activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model2.fit(X_scaled, y, epochs=500, batch_size=10, verbose=0)

# evaluate the keras model
_, accuracy = model2.evaluate(X_scaled, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 87.76


# Try dropping zero/0 values

In [19]:
X2 = X.copy()

In [20]:
for column in X2.columns:
    print(f'{column}: {(X2[column] == 0).sum()}')

times_pregnant: 111
plasma_glucose: 5
diastolic_bp: 35
triceps_thickness: 227
2_hour_serum: 374
bmi: 11
pedigree_fn: 0
age: 0


In [21]:
#Drop rows with zero for: plasma_glucose, diastolic_bp, triceps_thickness, 2_hour_serum, bmi

In [22]:
drop_list = []
for column in ['plasma_glucose', 'diastolic_bp', 'triceps_thickness', '2_hour_serum', 'bmi']:
    drop_list = drop_list + X2.index[X2[column] == 0].tolist()
    print(f'{column}:{len(drop_list)}')

plasma_glucose:5
diastolic_bp:40
triceps_thickness:267
2_hour_serum:641
bmi:652


In [23]:
X3 = X2.drop(X2.index[drop_list])
y3 = y.drop(X2.index[drop_list])

X3.shape, y3.shape
             

((392, 8), (392,))

In [24]:
# define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(8,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset

history = model.fit(X3, y3, epochs=500, batch_size=10, verbose=0)

# evaluate the keras model
_, accuracy = model.evaluate(X3, y3)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 80.10


# Impute (mean) zero/0 values with column mean

In [25]:
X4 = X.copy()
X4.head(20)

Unnamed: 0,times_pregnant,plasma_glucose,diastolic_bp,triceps_thickness,2_hour_serum,bmi,pedigree_fn,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
5,5,116,74,0,0,25.6,0.201,30
6,3,78,50,32,88,31.0,0.248,26
7,10,115,0,0,0,35.3,0.134,29
8,2,197,70,45,543,30.5,0.158,53
9,8,125,96,0,0,0.0,0.232,54


In [26]:
#Impute rows with zero: plasma_glucose, diastolic_bp, triceps_thickness, 2_hour_serum, bmi
from sklearn.impute import SimpleImputer


In [27]:

for column in ['plasma_glucose', 'diastolic_bp', 'triceps_thickness', '2_hour_serum', 'bmi']:
    mean_imputer = SimpleImputer(missing_values=0, strategy='mean')
    X4[column] = mean_imputer.fit_transform(X[column].to_numpy().reshape(-1,1))
    
X4.head(20)

Unnamed: 0,times_pregnant,plasma_glucose,diastolic_bp,triceps_thickness,2_hour_serum,bmi,pedigree_fn,age
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54


In [32]:
from sklearn.preprocessing import RobustScaler

model4 = Sequential()
model4.add(Dense(12, input_shape=(8,), activation='relu'))
model4.add(Dense(8, activation='relu'))
model4.add(Dense(1, activation='sigmoid'))

model4.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model4.fit(X, y, epochs=500, batch_size=10, verbose=0)

# evaluate the keras model
_, accuracy = model4.evaluate(X4, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 74.74


In [38]:
X5 = RobustScaler().fit_transform(X4)

# define the keras model
model = Sequential()
model.add(Dense(12, input_shape=(8,), activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset

history = model.fit(X5, y, epochs=500, batch_size=10, verbose=0)

# evaluate the keras model
_, accuracy = model.evaluate(X5, y)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 89.19


# Try a KNN Classifier with the above datasets

```
X = unaltered dataset - Accuracy: 79.82
X_scaled = scaled, zeros NOT removed - Accuracy: 89.32
X3/y3 = Drop zeros - Accuracy: 80.10
X4 = Mean impute zeros - Accuracy: 74.74
X5 = Mean impute zeros then robust scaler - Accuracy: 89.19
```

In [39]:
from sklearn.neighbors import KNeighborsClassifier

In [40]:
knn_1 = KNeighborsClassifier()
knn_1.fit(X,y)
knn_1.score(X,y)

0.8033854166666666

In [41]:
knn_2 = KNeighborsClassifier()
knn_2.fit(X_scaled,y)
knn_2.score(X_scaled,y)

0.8072916666666666

In [42]:
knn_2 = KNeighborsClassifier()
knn_2.fit(X3,y3)
knn_2.score(X3,y3)

0.8112244897959183

In [43]:
knn_1 = KNeighborsClassifier()
knn_1.fit(X4,y)
knn_1.score(X4,y)

0.8072916666666666

In [44]:
knn_1 = KNeighborsClassifier()
knn_1.fit(X5,y)
knn_1.score(X5,y)

0.8216145833333334

In [45]:
knn_1 = KNeighborsClassifier(n_neighbors=2)
knn_1.fit(X5,y)
knn_1.score(X5,y)

0.82421875

In [46]:
knn_1 = KNeighborsClassifier(n_neighbors=10)
knn_1.fit(X5,y)
knn_1.score(X5,y)

0.7825520833333334

# Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

In [52]:
def do_log_reg(X, y):
    lr = LogisticRegression(max_iter=500, n_jobs=-1).fit(X,y)
    print(lr.score(X,y))

In [53]:
do_log_reg(X,y)

0.78125


In [54]:
do_log_reg(X_scaled,y)

0.7838541666666666


In [55]:
do_log_reg(X3,y3)

0.7780612244897959


In [56]:
do_log_reg(X4,y)

0.7747395833333334


In [57]:
do_log_reg(X5,y)

0.7734375
