In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Load the diabetes CSV into a DataFrame, then show its first five rows.
df = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv',
)
df.head(n=5)

In [None]:
# Count the null (NaN/None) entries in every column.
# NOTE(review): this only detects real nulls; if the dataset encodes missing
# measurements with a sentinel such as 0, they will not show up here - confirm.
df.isna().sum()

In [None]:
# Class balance: number of rows per Outcome value, drawn as horizontal bars.
sns.countplot(data=df, y='Outcome')

In [None]:
# Pairwise relationships between all columns, coloured by the Outcome class.
sns.pairplot(data=df, hue='Outcome')

Glucose levels seem to be a strong indicator of the `Outcome` class.

In [None]:
# Pairwise correlation between all columns.
# Correlations are continuous values in [-1, 1], so use a diverging colormap
# centred on 0 with a fixed range: colour then encodes both sign and magnitude.
# (A qualitative palette such as 'Accent' has unordered colours and cannot
# convey how strong a correlation is.)
sns.heatmap(df.corr(), cmap='coolwarm', vmin=-1, vmax=1, center=0)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [None]:
# Separate the predictors from the target column.
X = df[df.columns[:-1]] # Everything except Outcome
y = df['Outcome']

# Split BEFORE scaling: the scaler must only learn its mean/std from the
# training rows, otherwise statistics of the test rows leak into preprocessing
# and the test score is no longer an honest estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # total: 768 values

# Fit the scaler on the training split only, then apply the same
# transformation to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Keep X as a scaled array (as before), now using the train-only statistics.
X = scaler.transform(X)

print('Train :', len(X_train))
print('Test  :', len(X_test))

In [None]:
# Build the logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so it can be chained).
log = LogisticRegression(random_state=2).fit(X_train, y_train)

# Predicted labels for the test split are kept in y_pred for later evaluation;
# the cell output is the classifier's score on the test split.
y_pred = log.predict(X_test)
log.score(X_test, y_test)

In [None]:
# Row-normalised confusion matrix for the test split.
# confusion_matrix(y_true, y_pred) puts TRUE labels on rows and PREDICTED
# labels on columns, so every row (not column) must be divided by the number
# of test samples in that true class. labels=[0, 1] pins the matrix to 2x2
# even if one class happens to be absent from the split or the predictions.
cfn_mat = np.array(confusion_matrix(y_test, y_pred, labels=[0, 1]), dtype='float')

# Each row sums to the number of samples of that true class; guard against an
# empty class so an all-zero row stays zero instead of becoming NaN.
row_totals = cfn_mat.sum(axis=1, keepdims=True)
cfn_mat /= np.maximum(row_totals, 1)

# Each row now shows the fraction of that true class predicted as 0 / 1.
ax = sns.heatmap(cfn_mat, annot=True, fmt='.2f', vmin=0, vmax=1)
ax.set(xlabel='Predicted Outcome', ylabel='True Outcome');