In [4]:
# Import required libraries
# source: https://www.pluralsight.com/guides/machine-learning-neural-networks-scikit-learn

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn import datasets
# Import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report,confusion_matrix
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Load the Pima Indians Diabetes dataset.
# https://www.kaggle.com/uciml/pima-indians-diabetes-database/data?select=diabetes.csv
df = pd.read_csv('datasets/diabetes.csv')
print("Data shape: ", df.shape, "\n")
print(df.describe().transpose(), "\n")
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.transpose.html

target_column = ['Outcome']
# Keep the predictors in the DataFrame's own column order. Building this list
# from a set difference yields an order that depends on string hashing, which
# can change between kernel sessions and silently permute the feature columns
# of X from run to run.
predictors = [column for column in df.columns if column not in target_column]

# Divide every predictor by its column maximum so each column's max becomes 1.
# NOTE(review): the maxima are taken over the full dataset, i.e. before the
# train/test split, so test rows influence the scaling. df is still updated in
# place because other cells of this notebook display the scaled frame.
df[predictors] = df[predictors] / df[predictors].max()
# A bare expression in the middle of a cell is discarded, so print the
# post-scaling summary explicitly.
print(df.describe().transpose(), "\n")
#https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.describe.html

X = df[predictors].values
# Selecting the single target column by name gives a 1-D (n_samples,) array
# directly, so no reshape of an (n_samples, 1) array is needed.
y = df[target_column[0]].values

# One seed shared by the split and the network so a fresh "Run All" reproduces
# the same metrics.
RANDOM_STATE = 40

#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)
print("Train data shape: ", X_train.shape, "\n")
print("Test data shape: ", X_test.shape, "\n")

# Supported activations are ['identity', 'logistic', 'relu', 'softmax', 'tanh']
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html
# Alternative configurations that were tried:
#mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='relu', solver='adam', max_iter=500)
#mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), activation='logistic', solver='adam', max_iter=2000)
# random_state fixes the weight initialisation and the SGD batch shuffling;
# without it every execution trains a different model.
mlp = MLPClassifier(
    hidden_layer_sizes=(8, 8, 8),
    activation='logistic',
    solver='sgd',
    max_iter=2000,
    random_state=RANDOM_STATE,
)

mlp.fit(X_train, y_train)

predict_train = mlp.predict(X_train)
predict_test = mlp.predict(X_test)

#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
print("Confusion matrix for train data: \n", confusion_matrix(y_train, predict_train), "\n")
print("Classification report for train data: \n", classification_report(y_train, predict_train), "\n")

print("Confusion matrix for test data: \n", confusion_matrix(y_test, predict_test), "\n")
print("Classification report for test data: \n", classification_report(y_test, predict_test))

Data shape:  (768, 9) 

                          count        mean         std     min       25%  \
Pregnancies               768.0    3.845052    3.369578   0.000   1.00000   
Glucose                   768.0  120.894531   31.972618   0.000  99.00000   
BloodPressure             768.0   69.105469   19.355807   0.000  62.00000   
SkinThickness             768.0   20.536458   15.952218   0.000   0.00000   
Insulin                   768.0   79.799479  115.244002   0.000   0.00000   
BMI                       768.0   31.992578    7.884160   0.000  27.30000   
DiabetesPedigreeFunction  768.0    0.471876    0.331329   0.078   0.24375   
Age                       768.0   33.240885   11.760232  21.000  24.00000   
Outcome                   768.0    0.348958    0.476951   0.000   0.00000   

                               50%        75%     max  
Pregnancies                 3.0000    6.00000   17.00  
Glucose                   117.0000  140.25000  199.00  
BloodPressure              72.0000   

In [2]:
# Display the column labels of df (rendered as a pandas Index).
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [2]:
# Column labels as a plain Python list, in DataFrame order.
df.columns.tolist()

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [5]:
# Column labels as a set; the Index is iterable, so no intermediate list is needed.
set(df.columns)

{'Age',
 'BMI',
 'BloodPressure',
 'DiabetesPedigreeFunction',
 'Glucose',
 'Insulin',
 'Outcome',
 'Pregnancies',
 'SkinThickness'}

In [6]:
# The target column name(s) as a set.
{*target_column}

{'Outcome'}

In [7]:
# Every column label except the target: set difference of the two sets.
set(df.columns).difference(set(target_column))

{'Age',
 'BMI',
 'BloodPressure',
 'DiabetesPedigreeFunction',
 'Glucose',
 'Insulin',
 'Pregnancies',
 'SkinThickness'}

In [8]:
# Predictor names as a list built from the set difference.
# Sets are unordered, so the order of this list need not match df.columns.
list(set(list(df.columns))-set(target_column))

['DiabetesPedigreeFunction',
 'Insulin',
 'Pregnancies',
 'BMI',
 'BloodPressure',
 'Age',
 'SkinThickness',
 'Glucose']

In [3]:
# Summary statistics with one row per column (.T is the transpose accessor).
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,0.22618,0.19821,0.0,0.058824,0.176471,0.352941,1.0
Glucose,768.0,0.60751,0.160666,0.0,0.497487,0.58794,0.704774,1.0
BloodPressure,768.0,0.566438,0.158654,0.0,0.508197,0.590164,0.655738,1.0
SkinThickness,768.0,0.207439,0.161134,0.0,0.0,0.232323,0.323232,1.0
Insulin,768.0,0.094326,0.136222,0.0,0.0,0.036052,0.150414,1.0
BMI,768.0,0.47679,0.117499,0.0,0.406855,0.4769,0.545455,1.0
DiabetesPedigreeFunction,768.0,0.19499,0.136913,0.032231,0.100723,0.153926,0.258781,1.0
Age,768.0,0.410381,0.145188,0.259259,0.296296,0.358025,0.506173,1.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0
