In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv'
)

In [None]:
# Peek at the first five rows to sanity-check the load.
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

In [None]:
# Column dtypes, non-null counts and memory usage; a quick first check for missing values.
data.info()

In [None]:
# Use seaborn's white background with grid lines for every figure that follows.
sns.set_style('whitegrid')

In [None]:
# Visual null check: draw the boolean missing-value mask as a heatmap,
# so any missing cell stands out against the uniform background.
sns.heatmap(
    data.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Class balance: number of rows per value of the 'Outcome' label.
sns.countplot(data=data, x='Outcome')

In [None]:
# Distribution of blood pressure: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated and slated for removal; `histplot` with
# kde=True / stat='density' is its documented replacement. Default binning may
# differ slightly from what distplot drew.
sns.histplot(data['BloodPressure'], kde=True, stat='density', color='indigo')

In [None]:
# Joint KDE of blood pressure against age, with the marginal densities on the axes.
sns.jointplot(
    data=data,
    x='BloodPressure',
    y='Age',
    kind='kde',
    color='red',
)

In [None]:
# Number of pregnancies, split by diabetes outcome.
# `saturation` is a proportion of the original colour saturation (0-1). The
# previous value of 10.75 was out of range; seaborn only desaturates when the
# value is below 1, so 1 renders the same full-saturation colours with a valid
# argument. NOTE(review): if the library default of 0.75 was intended, use that.
sns.countplot(
    data=data,
    x='Pregnancies',
    hue='Outcome',
    palette='plasma',
    saturation=1,
)

In [None]:
# Pairwise relationships between all numeric columns, coloured by outcome.
sns.pairplot(data=data, hue='Outcome')

In [None]:
# Importing preprocessing libraries

In [None]:
from sklearn.preprocessing import StandardScaler

# Predictors are every column except the 'Outcome' label.
features = data.drop('Outcome', axis=1)

# Keep the fitted scaler under its own name so it can be reused to transform
# new data; previously the name `scaled` was bound to the scaler and then
# overwritten by the array, discarding the fitted object.
scaler = StandardScaler()

# `scaled` is the standardized NumPy array (columns in the same order as
# `features`) that the following cells index by column position.
# NOTE(review): the scaler is fit on all rows before the train/test split done
# later in the notebook, so test-set statistics leak into the training
# features; consider fitting on the training split only.
scaled = scaler.fit_transform(features)

In [None]:
import statsmodels.api as sm

# Backward elimination, step 1: start from all eight standardized predictors.
X = scaled[:, [0, 1, 2, 3, 4, 5, 6, 7]]

# sm.OLS does not add an intercept by itself. The predictors are standardized
# (zero mean) while the 0/1 target is not, so omitting the constant forces the
# fit through the origin and distorts the standard errors and p-values used for
# feature selection. The constant is added only to the design matrix passed to
# OLS; `X` itself is left unchanged.
regressor_OLS = sm.OLS(endog=data['Outcome'], exog=sm.add_constant(X)).fit()
regressor_OLS.summary()

In [None]:
# Backward elimination, step 2: refit with column 3 removed from the full set
# (presumably the least significant term in the previous summary - verify).
# `sm` is already imported by the first OLS cell.
X = scaled[:, [0, 1, 2, 4, 5, 6, 7]]

# sm.OLS does not add an intercept by itself; without one the fit on
# zero-mean predictors and a 0/1 target is forced through the origin, which
# distorts the p-values. The constant is added only to the matrix given to OLS.
regressor_OLS = sm.OLS(endog=data['Outcome'], exog=sm.add_constant(X)).fit()
regressor_OLS.summary()

In [None]:
# Backward elimination, step 3: columns 3 and 4 are now both excluded
# (presumably chosen from the previous summary's p-values - verify).
# `sm` is already imported by the first OLS cell.
X = scaled[:, [0, 1, 2, 5, 6, 7]]

# sm.OLS does not add an intercept by itself; without one the fit on
# zero-mean predictors and a 0/1 target is forced through the origin, which
# distorts the p-values. The constant is added only to the matrix given to OLS.
regressor_OLS = sm.OLS(endog=data['Outcome'], exog=sm.add_constant(X)).fit()
regressor_OLS.summary()

In [None]:
# Backward elimination, step 4: columns 3, 4 and 7 are excluded, leaving
# columns 0, 1, 2, 5 and 6 as the retained predictors.
# `sm` is already imported by the first OLS cell.
X = scaled[:, [0, 1, 2, 5, 6]]

# sm.OLS does not add an intercept by itself; without one the fit on
# zero-mean predictors and a 0/1 target is forced through the origin, which
# distorts the p-values. The constant is added only to the matrix given to
# OLS, so `X` keeps exactly the five feature columns for the cells below.
regressor_OLS = sm.OLS(endog=data['Outcome'], exog=sm.add_constant(X)).fit()
regressor_OLS.summary()

In [None]:
# So the suitable columns to be used for regression are Pregnancies, Glucose, BloodPressure, BMI and DiabetesPedigreeFunction.

In [None]:
# Build the model inputs explicitly instead of relying on whatever `X` the last
# OLS cell left behind (the previous `X = X` was a no-op).
# Columns 0, 1, 2, 5, 6 of the standardized array are the retained predictors;
# per the note above these are Pregnancies, Glucose, BloodPressure, BMI and
# DiabetesPedigreeFunction - verify against data.columns.
X = scaled[:, [0, 1, 2, 5, 6]]

# Binary target label.
y = data['Outcome']

In [None]:
# Importing train test split
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=101,
    test_size=0.3,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings on the training split.
# `fit` returns the estimator itself, so construction and fitting can be chained.
logreg = LogisticRegression().fit(X_train, y_train)
logreg

In [None]:
# Predicted class labels for the held-out rows.
pred = logreg.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix, a blank gap, then the per-class precision/recall/F1 report
# for the held-out predictions.
print(
    confusion_matrix(y_test, pred),
    '\n',
    classification_report(y_test, pred),
    sep='\n',
)