# Data Preparation and Model

In [2]:
import pandas as pd
import numpy as np

In [3]:
# set seed for reproducibility
# Seed value for the notebook; later cells pass the same value (20) as random_state.
SEED = 20
# Seeds NumPy's legacy global random generator with that value.
np.random.seed(SEED)

In [4]:
# Loading Data
# The path is relative, so 'diabetes.csv' must sit in the notebook's working directory.
df = pd.read_csv('diabetes.csv')
# Preview the first rows to confirm the columns parsed as expected.
df.head()

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.28287,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [5]:
# Replacing all 0 values with Null values
def replace_zero(df):
    """Return a cleaned copy of ``df`` with zero readings marked as missing.

    The ``PatientID`` column is removed (if present) and every 0 in the
    measurement columns listed in ``cols`` is replaced by ``NaN``. The input
    frame is left untouched, so the cell calling this function can be re-run
    without raising ``KeyError`` on an already-dropped column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the five measurement columns below.

    Returns
    -------
    pandas.DataFrame
        New frame without ``PatientID`` and with zeros replaced by ``NaN``.
    """
    cols = ["PlasmaGlucose","DiastolicBloodPressure","TricepsThickness","SerumInsulin","BMI"]
    # drop() without inplace returns a new frame; errors="ignore" tolerates a
    # frame whose id column has already been removed.
    df_nan = df.drop(columns="PatientID", errors="ignore")
    df_nan[cols] = df_nan[cols].replace({0: np.nan})
    return df_nan
# df_nan is the working frame used by the imputation and modelling cells below.
df_nan=replace_zero(df)

In [6]:
def find_median(frame,var):
    """Return the median of ``var`` for each value of the ``Diabetic`` column.

    Missing values need no explicit filtering: pandas' ``median`` skips NaN
    by default. (The original pre-filtered frame was computed and then
    immediately overwritten, so it never had any effect.)

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``var`` and a ``Diabetic`` column.
    var : str
        Name of the numeric column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per ``Diabetic`` value, with columns ``Diabetic`` and ``var``.
    """
    return frame[[var,'Diabetic']].groupby('Diabetic')[[var]].median().reset_index()

In [7]:
def replace_null(frame,var):
    """Fill missing values of ``var`` in place with the median of the row's ``Diabetic`` class.

    The medians are aligned to rows by class label rather than read by
    position, so a frame containing a single class (or classes other than
    0/1) is handled without an ``IndexError`` or a mis-assigned median.

    NOTE(review): the fill value depends on the target column, and this
    notebook calls the function on the full data set before the train/test
    split, so imputed features carry label and test-set information.
    Confirm this is acceptable for the reported scores.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``var`` and a ``Diabetic`` column; modified in place.
    var : str
        Name of the column to impute.

    Returns
    -------
    int
        Number of nulls left in ``var`` after imputation.
    """
    missing = frame[var].isnull()
    if missing.any():
        # Per-row median of the row's own class; NaNs are skipped by median.
        group_medians = frame.groupby('Diabetic')[var].transform('median')
        # to_numpy() assigns by position, which avoids index-alignment
        # surprises if the frame's index holds duplicates.
        frame.loc[missing, var] = group_medians[missing].to_numpy()
    return frame[var].isnull().sum()

In [8]:
# Impute each zero-cleaned column in turn and report how many nulls remain.
# Each pair is (column name in df_nan, label used in the printed message).
for column, label in [
    ('PlasmaGlucose', 'Glucose'),
    ('TricepsThickness', 'SkinThickness'),
    ('SerumInsulin', 'Insulin'),
    ('BMI', 'BMI'),
    ('DiastolicBloodPressure', 'BloodPressure'),
]:
    remaining = replace_null(df_nan, column)
    print(str(remaining) + ' Nulls for ' + label)
# We have successfully handled Nulls

0 Nulls for Glucose
0 Nulls for SkinThickness
0 Nulls for Insulin
0 Nulls for BMI
0 Nulls for BloodPressure


In [9]:
# Sanity check: per-column count of remaining nulls in the imputed frame.
# Every entry should be 0 after the imputation cell above.
df_nan.isnull().sum()

Pregnancies               0
PlasmaGlucose             0
DiastolicBloodPressure    0
TricepsThickness          0
SerumInsulin              0
BMI                       0
DiabetesPedigree          0
Age                       0
Diabetic                  0
dtype: int64

In [10]:
# Inspect the columns of the cleaned frame, since df_nan (not df) feeds the model.
df_nan.columns

Index(['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],
      dtype='object')

In [11]:
# Separate the target from the predictors; both come from the imputed frame.
target_name = "Diabetic"
Y = df_nan[target_name]
X = df_nan.drop(columns=[target_name])

In [13]:
# Quick look at the first target labels.
Y.head()

0    0
1    0
2    0
3    1
4    0
Name: Diabetic, dtype: int64

In [14]:
#Keeping train  size as 0.8
from sklearn.model_selection import train_test_split
# random_state reuses SEED instead of repeating the literal, so changing the
# seed in the config cell changes the split too.
# stratify=Y is passed so the split is stratified on the target.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=SEED, stratify=Y)


In [15]:
# We are good to go with baseline model
# Let's first implement KNN
from sklearn.neighbors import KNeighborsClassifier

# Fit one classifier per k in 5..14 and record its accuracy on both splits.
# Position j in each list corresponds to k = j + 5.
# NOTE(review): k is later chosen by test-set accuracy, so the reported KNN
# score is optimistic; a validation split would avoid this.
train_scores, test_scores = [], []
for k in range(5, 15):
    neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train, Y_train)
    train_scores.append(neigh.score(X_train, Y_train))
    test_scores.append(neigh.score(X_test, Y_test))

In [43]:
# Best training accuracy and the k that produced it.
best_train = max(train_scores)
# +5 because the sweep started at k=5, so list position 0 is k=5.
best_train_k = train_scores.index(best_train) + 5
print('Max train_scores is ' + str(best_train*100) + ' for k = ' + str(best_train_k))

Max train_scores is 88.84166666666667 for k = 5


In [44]:
# Best test accuracy and the k that produced it.
best_test = max(test_scores)
# +5 because the sweep started at k=5, so list position 0 is k=5.
best_test_k = test_scores.index(best_test) + 5
print('Max test_scores is ' + str(best_test*100) + ' for k = ' + str(best_test_k))
# K=13 has generalized well for our data.

# KNN entry (as a percentage) for the model comparison table.
knn = best_test*100

Max test_scores is 84.93333333333334 for k = 13


In [45]:
# Lets try Logistic regression now
from sklearn.linear_model import LogisticRegression
# random_state reuses SEED so the seed is configured in one place only.
log_model = LogisticRegression(random_state=SEED, penalty='l2',max_iter=1000).fit(X_train, Y_train)
log_pred=log_model.predict(X_test)

# Score once: the earlier bare score() call was mid-cell, so its result was
# neither displayed nor stored.
# Logistic regression entry (as a percentage) for the model comparison table.
lr = log_model.score(X_test, Y_test) * 100

In [48]:
# Support Vector Machines
# Import the class directly: the name `svm` is used below for the accuracy
# value, so it must not also be bound to the sklearn module.
from sklearn.svm import SVC
svm_model = SVC().fit(X_train, Y_train)
svm_pred=svm_model.predict(X_test)
# Almost 82% Accuracy

# SVM entry (as a percentage) for the model comparison table.
svm = svm_model.score(X_test, Y_test) * 100

In [49]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
# random_state reuses SEED so the seed is configured in one place only.
rf_model = RandomForestClassifier(max_depth=2, random_state=SEED).fit(X_train, Y_train)
rf_pred=rf_model.predict(X_test)
# Almost 90% Accuracy

# Score once: the earlier bare score() call was mid-cell, so its result was
# neither displayed nor stored.
# Random forest entry (as a percentage) for the model comparison table.
rf = rf_model.score(X_test, Y_test) * 100

In [24]:
import tensorflow as tf
def build_model(n_features=None):
    """Build and compile a small fully connected binary classifier.

    The network is Dense(8) -> Dense(4) -> Dense(2) with ReLU activations,
    followed by a single sigmoid output unit. It is compiled with Adam
    (learning rate 0.01), binary cross-entropy loss and an accuracy metric.

    Parameters
    ----------
    n_features : int, optional
        Number of input features. When omitted, it is taken from the global
        ``X_train`` frame, which keeps existing ``build_model()`` calls working.

    Returns
    -------
    tf.keras.Sequential
        The compiled, untrained model.
    """
    if n_features is None:
        # Backward-compatible default: size the input from the training frame.
        n_features = len(X_train.keys())

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=[n_features]),
        tf.keras.layers.Dense(4, activation='relu'),
        tf.keras.layers.Dense(2, activation='relu'),
        tf.keras.layers.Dense(1,activation='sigmoid')
    ])

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Seed TensorFlow's global random generator before the layers are created.
# np.random.seed alone does not make the Keras weight initialisation and
# training shuffling repeatable across Restart & Run All.
tf.random.set_seed(SEED)
neural_model = build_model()

In [25]:
# Print the layer-by-layer output shapes and parameter counts.
neural_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 72        
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 10        
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 3         
Total params: 121
Trainable params: 121
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Keeping EPOCHs high as dataset is small.
EPOCHS = 100
# validation_split=0.1 holds out part of the training rows; the val_loss and
# val_accuracy figures in the log are computed on that held-out part.
# NOTE(review): despite its name, `neural_pred` receives the return value of
# fit() (the training history per the Keras docs), not predictions.
neural_pred = neural_model.fit(X_train, Y_train,epochs=EPOCHS, validation_split=0.1, verbose=2)

Epoch 1/100
338/338 - 1s - loss: 0.6667 - accuracy: 0.6470 - val_loss: 0.6198 - val_accuracy: 0.6733
Epoch 2/100
338/338 - 0s - loss: 0.6187 - accuracy: 0.6659 - val_loss: 0.5681 - val_accuracy: 0.6733
Epoch 3/100
338/338 - 0s - loss: 0.5825 - accuracy: 0.6659 - val_loss: 0.5712 - val_accuracy: 0.6733
Epoch 4/100
338/338 - 0s - loss: 0.5703 - accuracy: 0.6659 - val_loss: 0.5605 - val_accuracy: 0.6733
Epoch 5/100
338/338 - 0s - loss: 0.5676 - accuracy: 0.6659 - val_loss: 0.5586 - val_accuracy: 0.6733
Epoch 6/100
338/338 - 0s - loss: 0.5233 - accuracy: 0.6823 - val_loss: 0.4475 - val_accuracy: 0.7583
Epoch 7/100
338/338 - 0s - loss: 0.4667 - accuracy: 0.7454 - val_loss: 0.4555 - val_accuracy: 0.7600
Epoch 8/100
338/338 - 0s - loss: 0.4686 - accuracy: 0.7422 - val_loss: 0.4324 - val_accuracy: 0.7542
Epoch 9/100
338/338 - 0s - loss: 0.4555 - accuracy: 0.7518 - val_loss: 0.4176 - val_accuracy: 0.7758
Epoch 10/100
338/338 - 0s - loss: 0.4527 - accuracy: 0.7519 - val_loss: 0.4186 - val_accura

In [54]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here).
# The loss gets a real name rather than `_`, which IPython uses for the last
# cell output.
test_loss, test_acc = neural_model.evaluate(X_test, Y_test,
                                            batch_size=64)
print('Test accuracy:', test_acc)

# Keras entry for the model comparison table, converted to a percentage like
# the knn / lr / svm / rf scores. `acc` is only ever bound to the percentage.
acc = test_acc * 100

Test accuracy: 0.7833333611488342


# ¿Cuál es el mejor modelo?

In [62]:
# Collect the test accuracy (in percent) of every model into one table.
# The two lists are parallel: position i of each describes the same model.
model_names = ['KNN', 'Regresión Logística', 'Random Forest', 'SVM', 'Keras']
model_scores = [knn, lr, rf, svm, acc]
data = {'Modelo': model_names, 'acc': model_scores}

df_models = pd.DataFrame(data)

In [69]:
# Rank the models from best to worst test accuracy.
df_models.sort_values(by="acc", ascending=False)

Unnamed: 0,Modelo,acc
2,Random Forest,90.0
0,KNN,84.933333
3,SVM,82.266667
1,Regresión Logística,79.266667
4,Keras,78.333336


# Serializando el mejor modelo

El módulo pickle implementa protocolos binarios para serializar y deserializar una estructura de objetos Python. «Pickling» es el proceso mediante el cual una jerarquía de objetos de Python se convierte en una secuencia de bytes, y «unpickling» es la operación inversa, mediante la cual una secuencia de bytes de un archivo binario (binary file) o de un objeto tipo binario (bytes-like object) se convierte nuevamente en una jerarquía de objetos. Pickling (y unpickling) también se conocen como «serialización», «ensamblaje» o «aplanamiento».

In [32]:
import pickle
# Lets dump our rf_model
# The context manager closes (and therefore flushes) the file even if
# pickling fails; the bare open() call never closed its handle.
with open('rf_model.pkl','wb') as model_file:
    pickle.dump(rf_model, model_file)

In [33]:
# Third row of the test set, kept as a one-row DataFrame (positional slice).
X_test.iloc[2:3]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age
13724,0,93,80,33,40,20.875334,0.1067,21


In [34]:
# Class probabilities from the random forest for the single sample row.
sample_row = X_test.iloc[2:3]
pred = rf_model.predict_proba(sample_row)

In [35]:
# Show the probability row computed in the previous cell.
pred

array([[0.9483709, 0.0516291]])

In [36]:
# Take the argmax per row (axis=1) and read the single row's result.
# np.argmax(pred) without an axis works on the flattened array, which is a
# class index only when pred has exactly one row.
# NOTE(review): this is a column index into pred; it equals the class label
# only if rf_model.classes_ is [0, 1] — confirm.
np.argmax(pred, axis=1)[0]

0