Import libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_auc_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [None]:
import tensorflow as tf

Import data

In [None]:
# Load the dataset into a DataFrame. The path is relative to the notebook's
# working directory, so the CSV must exist at ../input/diabetes.csv.
diabetes_data = pd.read_csv('../input/diabetes.csv')

In [None]:
# List the column names of the loaded frame.
diabetes_data.columns

In [None]:
# Preview the first rows to sanity-check the load.
diabetes_data.head()

Visualize data with pairplot

In [None]:
# Pairwise plots of the columns, coloured by the 'Outcome' label, to eyeball
# which features separate the classes.
sns.pairplot(data=diabetes_data, hue='Outcome')

In [None]:
# Count rows per 'Outcome' value to check class balance before modelling.
print(diabetes_data['Outcome'].value_counts())

In [None]:
# Column dtypes and non-null counts. Zeros are not reported as missing here;
# the zero-replacement step further down treats them as missing for some columns.
diabetes_data.info()

Scale data

In [None]:
# Standardiser used by the fit/transform cells that follow.
scaler = StandardScaler()

In [None]:
# Fit on every column except the last one (the last column is used as the
# label further down).
# NOTE(review): this fits on all rows before the train/test split, so test-set
# statistics influence the scaling — consider fitting on the training rows only.
scaler.fit(diabetes_data.iloc[:,:-1])

In [None]:
# Apply the fitted scaling to the same feature columns; the result is wrapped
# back into a DataFrame in the next cell.
scaled_features = scaler.transform(diabetes_data.iloc[:,:-1])

In [None]:
# Rebuild a DataFrame from the scaled values, reusing the original feature
# column names (all columns but the last).
df_feat = pd.DataFrame(scaled_features,columns=diabetes_data.columns[:-1])

In [None]:
# Preview the scaled features.
df_feat.head()

In [None]:
# Feature matrix: an independent copy of the scaled frame, so that later
# in-place column edits on X do not silently alter `df_feat` (whose preview
# was already displayed above).
X = df_feat.copy()
# Target vector: the last column of the raw data.
# NOTE(review): assumes the label ('Outcome') is the last column — confirm.
y = diabetes_data.iloc[:, -1]


Replace zeroes with mean

In [None]:
# Columns in which a raw value of 0 is treated as "missing" rather than as a
# real measurement.
replace_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

# Impute on the RAW values, then scale. After StandardScaler a raw 0 is no
# longer 0, so searching the scaled frame for zeros finds nothing meaningful to
# replace (and the integer-truncated mean of a standardised column is 0 anyway).
features_imputed = diabetes_data.iloc[:, :-1].copy()
zero_as_nan = features_imputed[replace_zero].replace(0, np.nan)
# Column means skip NaN by default, so they are taken over the real
# measurements only; the exact mean is used rather than a truncated integer.
features_imputed[replace_zero] = zero_as_nan.fillna(zero_as_nan.mean())

# Refit the scaler on the imputed data so that `scaler` and `X` agree, and
# rebuild X as a fresh frame instead of mutating `df_feat` in place.
# NOTE(review): the scaler is still fitted on all rows before the train/test
# split; fit on the training rows only if leakage matters here.
X = pd.DataFrame(
    scaler.fit_transform(features_imputed),
    columns=features_imputed.columns,
    index=features_imputed.index,
)

Train test split with 25% test size

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

Use TensorFlow DNNClassifier to predict

In [None]:
# One numeric feature column per predictor in X, in column order.
feat_cols = [tf.feature_column.numeric_column(name) for name in X.columns]

In [None]:
# Training input function: feeds (X_train, y_train) in shuffled batches of 10
# for up to 5 passes over the data.
# NOTE(review): `tf.estimator.inputs.pandas_input_fn` looks like the
# TensorFlow 1.x location of this helper; under TF 2.x it is presumably only
# reachable via `tf.compat.v1.estimator.inputs.pandas_input_fn` — confirm
# against the TF version this notebook targets.
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train,batch_size=10,num_epochs=5,shuffle=True)

In [None]:
# Two-class DNN with three hidden layers of 10, 20 and 10 units, fed by the
# numeric feature columns in `feat_cols`.
classifier = tf.estimator.DNNClassifier(hidden_units = [10,20,10],n_classes=2,feature_columns=feat_cols)

In [None]:
# Run at most 50 training steps. With batches of 10 (see `input_func`) that is
# at most 500 examples, so training may stop well before the input function's
# 5 epochs are exhausted.
# NOTE(review): confirm this cap is intentional.
classifier.train(input_fn=input_func, steps=50)

In [None]:
# Prediction input function: the whole test set as a single batch, unshuffled
# so predictions stay aligned row-for-row with y_test.
pred_fn = tf.estimator.inputs.pandas_input_fn(x=X_test,batch_size=len(X_test),shuffle=False)

In [None]:
# Materialise the classifier's predictions into a list (one record per test row;
# each record is indexed by 'class_ids' in the next cell).
note_predictions = list(classifier.predict(input_fn=pred_fn))

In [None]:
# Extract the predicted class id from each prediction record.
final_preds = [record['class_ids'][0] for record in note_predictions]

In [None]:
# Classification report followed by the confusion matrix for the DNN predictions.
print(
    classification_report(y_test, final_preds),
    confusion_matrix(y_test, final_preds),
    sep='\n',
)

Use Scikit-learn RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# 200-tree random forest. The seed is fixed (same value as the train/test
# split) so the fitted forest, and therefore the reported metrics, are
# reproducible across kernel restarts.
rfc = RandomForestClassifier(n_estimators=200, random_state=101)

In [None]:
# Fit the forest on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows.
# NOTE(review): the name `predictions` is reused for the logistic-regression
# output further down, so this cell must be re-run before re-evaluating the forest.
predictions = rfc.predict(X_test)

In [None]:
# Classification report followed by the confusion matrix for the current
# contents of `predictions` (the random-forest output at this point).
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep='\n',
)

Use Scikit-learn LogisticRegression

In [None]:
# Logistic regression with the library's default settings.
logmodel = LogisticRegression()

In [None]:
# Fit the logistic-regression model on the training split.
logmodel.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows; this overwrites the `predictions`
# produced by the random forest above.
predictions = logmodel.predict(X_test)

In [None]:
# Classification report followed by the confusion matrix for the current
# contents of `predictions` (the logistic-regression output at this point).
print(
    classification_report(y_test, predictions),
    confusion_matrix(y_test, predictions),
    sep='\n',
)