## **Predicting Diabetes using Logistic Regression**


### **1. Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Use the ggplot look for every matplotlib/seaborn figure in this notebook
plt.style.use('ggplot') 
# Render figures inline, directly below the cell that creates them
%matplotlib inline

import warnings
# NOTE(review): this silences *all* warnings for the rest of the notebook,
# including scikit-learn ConvergenceWarning and library deprecation notices.
warnings.filterwarnings('ignore')

In [None]:
import os

# Print the entries found under the relative ../input directory
# (presumably where the dataset is mounted — confirm for your environment).
print(os.listdir(path="../input"))

### **2. Load Dataset**

#### **Dataset information**:

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

##### **Columns**:
1. Pregnancies: Number of times pregnant
2. Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
3. BloodPressure: Diastolic blood pressure (mm Hg)
4. SkinThickness: Triceps skin fold thickness (mm)
5. Insulin: 2-Hour serum insulin (mu U/ml)
6. BMI: Body mass index (Weight (kg)/(Height (m)^2))
7. DiabetesPedigreeFunction: Diabetes pedigree function
8. Age: Age (years)
9. Outcome: Class variable (0 or 1)

##### Data Source: 

[pima-indians-diabetes-database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)




In [None]:
# Read the diabetes CSV into a DataFrame; every later cell works from `data`.
data = pd.read_csv(
    filepath_or_buffer='../input/pima-indians-diabetes-database/diabetes.csv'
)

### **3. Data Exploration and Visualization**

In [None]:
# Peek at the first ten rows to sanity-check the load
data.head(n=10)

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
data.describe()

In [None]:
# Pairwise Pearson correlations between all columns of the dataset
data.corr(method='pearson')

In [None]:
# Visualise the correlation matrix as a colour-coded heatmap
sns.heatmap(data=data.corr(), cmap="YlGnBu")

In [None]:
# Class balance: number of rows per Outcome value
sns.countplot(data=data, x='Outcome')
plt.title("Count plot for Target Variable")

In [None]:
# Pairwise scatter/density grid of all columns, coloured by Outcome
sns.pairplot(data, hue="Outcome")

In [None]:
# Violin plot of the Pregnancies distribution for each Outcome class
sns.catplot(
    data=data,
    x="Outcome",
    y="Pregnancies",
    kind="violin",
    split=True,
    inner="stick",
    palette="ch:r=-.5,l=.75",
)

In [None]:
# Age against number of pregnancies, with points coloured by Outcome
sns.scatterplot(
    data=data,
    x='Age',
    y='Pregnancies',
    hue='Outcome',
    palette="ch:r=-.7,l=.87",
    legend='full',
)

In [None]:
# Histogram of every column to inspect the individual feature distributions
data.hist(figsize=(15,10))
# plt.show() renders the histogram grid and suppresses the array-of-Axes repr.
# (Calling plt.figure() here would only open a second, empty figure.)
plt.show()

In [None]:
# Box plots of Age and BMI side by side to spot outliers.
plt.boxplot([data['Age'], data['BMI']])
# boxplot places the boxes at x = 1, 2, ...; name them so the figure stands alone.
plt.xticks([1, 2], ['Age', 'BMI'])
plt.ylabel("Value")
plt.title("Box plots of Age and BMI")
plt.show()

### **4. Data Modeling**

In [None]:
def evaluation(y, y_pred):
    """Print classification metrics and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    y : array-like
        True class labels.
    y_pred : array-like
        Predicted class labels, same length as ``y``.

    Returns
    -------
    None
        Metrics are printed and the heatmap is shown as a side effect.
    """
    # For 0/1 labels the MSE is simply the misclassification rate (1 - accuracy).
    print("MSE: {}".format(mean_squared_error(y, y_pred)))
    print("Accuracy Score:", accuracy_score(y, y_pred))
    print("Precision:", precision_score(y, y_pred))
    print("Recall:", recall_score(y, y_pred))
    print("F1 Score:", f1_score(y, y_pred))

    # Build the matrix once over every label seen in either vector, so a class
    # that is never predicted still gets its (all-zero) column in the plot.
    labels = np.unique(np.concatenate((np.ravel(y), np.ravel(y_pred))))
    cm = confusion_matrix(y, y_pred, labels=labels)
    print("Confusion Matrix:\n", cm)

    # Draw on a fresh figure so repeated calls never paint over each other.
    fig, ax = plt.subplots()
    # fmt="d" keeps counts as integers (the default format shows 100 as "1e+02").
    sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu",
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")
    plt.show()

In [None]:
# Feature matrix: every column except the label.
X = np.array(data.drop(columns='Outcome'))
# Target vector: the Outcome label column.
y = np.array(data.loc[:, 'Outcome'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# W/O Scaling or Normalization
# The raw features span very different ranges, so the default solver typically
# needs more than its default 100 iterations to converge on them. Raise the cap;
# otherwise the fit stops early and the ConvergenceWarning is hidden by the
# notebook-wide warnings filter.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Baseline score on the untouched test split
y_pred = lr.predict(X_test)
print("Logistic regressin Report:")
evaluation(y_test, y_pred)

In [None]:
# With Standard Scaling
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
SS = StandardScaler()
SS.fit(X_train)
X_train_scaled = SS.transform(X_train)
X_test_scaled = SS.transform(X_test)

# Fit on the standardised training data and score the standardised test data
lr = LogisticRegression(max_iter=150).fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

print("Logistic regressin Report (Standard Scaling):")
evaluation(y_test, y_pred)

In [None]:
# Normalize the data (z-score: subtract the mean, divide by the standard deviation)
# The statistics come from the training split ONLY. Computing them on the full X
# would let information about the test rows leak into the training features.
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
# Guard against a constant column, which would otherwise cause division by zero.
stds = np.where(stds == 0, 1.0, stds)

X_train_norm = (X_train - means) / stds
X_test_norm = (X_test - means) / stds

lr = LogisticRegression()
lr.fit(X_train_norm, y_train)

y_pred = lr.predict(X_test_norm)
print("Logistic regressin Report (Normalized Data):")
evaluation(y_test, y_pred)

In [None]:
# With MinMax Scaling
# Fit the min/max range on the training split and reuse it for the test split.
# Note: this overwrites X_train_scaled / X_test_scaled from the StandardScaler cell.
MMS = MinMaxScaler()
MMS.fit(X_train)
X_train_scaled = MMS.transform(X_train)
X_test_scaled = MMS.transform(X_test)

# Fit and score on the min-max scaled splits
lr = LogisticRegression().fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

print("Logistic regressin Report (MinMax Scaling):")
evaluation(y_test, y_pred)

**Data scaling and normalization perform about the same, so I will try hyperparameter tuning to see whether it improves performance.**


In [None]:
# Features: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age
# X was built from `data` minus the Outcome column, so these names line up
# with the coefficient order.
feature_names = data.columns.drop('Outcome')

# Coefficients of the most recently fitted `lr`. In top-to-bottom run order that
# is the model trained on the MinMax-scaled features, not the z-score normalized one.
importance = lr.coef_[0]

# summarize the coefficient of each feature
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))

# plot the coefficients, labelling every bar with its feature name
positions = range(len(importance))
plt.bar(positions, importance)
plt.xticks(positions, feature_names, rotation=45, ha='right')
plt.ylabel("Coefficient")
plt.title("Logistic regression coefficients per feature")
plt.show()

#### **From the above figure, we can draw the following:**

*   Glucose, BMI, age, diabetes pedigree function and pregnancies have a significant positive influence on the model, especially glucose level and BMI.

*   Blood pressure and insulin level have a negative influence on the prediction.

### **5. Hyperparameter Tuning**

In [None]:
# Search space: regularisation type, inverse regularisation strength, and solver
# (both listed solvers accept the l1 and l2 penalties).
penalty = ['l1', 'l2']
C = [0.0001,0.001,0.01,0.1, 1, 10, 100, 1000]
solver = ['liblinear', 'saga']

param_grid = dict(penalty=penalty, C=C, solver=solver)

# Search over a fresh, explicitly configured estimator instead of whatever `lr`
# the previous cells happened to leave behind. The higher max_iter gives saga
# room to converge, and random_state makes the liblinear/saga data shuffling
# reproducible.
base_estimator = LogisticRegression(max_iter=5000, random_state=42)

grid = GridSearchCV(estimator=base_estimator, param_grid=param_grid,
                    verbose=1, cv=10)

# NOTE(review): X_train_norm was scaled before cross-validation, so each fold's
# validation rows contributed to the scaling statistics; wrapping the scaler and
# model in a Pipeline would remove that optimism.
best_model = grid.fit(X_train_norm, y_train)

In [None]:
# Predict on the normalized test split with the tuned model and report its metrics
y_pred = best_model.predict(X=X_test_norm)
evaluation(y=y_test, y_pred=y_pred)