> **Predicting whether a person will suffer from diabetes or not**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing the Dataset

In [None]:

# Location of the diabetes CSV relative to the notebook's working directory.
DATA_PATH = '../input/diabetes-dataset/diabetes2.csv'
df = pd.read_csv(DATA_PATH)
# Preview only the first few rows instead of rendering the whole frame.
df.head()

Check for Null Values

In [None]:
# Number of null entries in each column (isna is the same check as isnull).
df.isna().sum()

Checking for any missing values that `isnull()` would not catch, using the `value_counts()` function on each column

In [None]:
# Print the frequency table of every column, each followed by a blank line.
for col in df.columns:
  counts = df[col].value_counts()
  print(counts, end='\n\n')

Checking the dependence of 'Outcome' variable on other columns with 'countplot( )'

In [None]:
import seaborn as sns
def countplt_fn(x,y,series,xsize,ysize,xtick):
  """Draw a count plot of column ``x`` with the bars split by column ``y``.

  Parameters
  ----------
  x : column label in ``series`` whose values are counted along the x-axis.
  y : column label in ``series`` used as the hue (bar colour) grouping.
  series : table-like object that is indexed with ``x`` and ``y``.
  xsize, ysize : width and height passed to ``plt.figure(figsize=...)``.
  xtick : rotation applied to the x-axis tick labels.
  """
  plt.figure(figsize=(xsize,ysize))
  # Pass the columns by keyword: the first positional argument of
  # sns.countplot is not reliably interpreted as the x variable.
  sns.countplot(x=series[x], hue=series[y])
  plt.xticks(rotation=xtick)
  # Actually call show(); the bare attribute ``plt.show`` did nothing.
  plt.show()


In [None]:
# Outcome counts for each Age value.
countplt_fn(x='Age', y='Outcome', series=df, xsize=30, ysize=6, xtick=0)

In [None]:
# Outcome counts for each BloodPressure value.
countplt_fn(x='BloodPressure', y='Outcome', series=df, xsize=30, ysize=6, xtick=0)

In [None]:
# Outcome counts for each BMI value; wide figure with vertical tick labels.
countplt_fn(x='BMI', y='Outcome', series=df, xsize=70, ysize=20, xtick=90)

In [None]:
# Outcome counts for each Glucose value; wide figure with vertical tick labels.
countplt_fn(x='Glucose', y='Outcome', series=df, xsize=60, ysize=20, xtick=90)

In [None]:
# Outcome counts for each Pregnancies value.
countplt_fn(x='Pregnancies', y='Outcome', series=df, xsize=30, ysize=6, xtick=0)

In [None]:
# Outcome counts for each SkinThickness value.
countplt_fn(x='SkinThickness', y='Outcome', series=df, xsize=30, ysize=6, xtick=0)

From the above plots, it is clear that the 'Outcome' does not depend much on the 'SkinThickness' variable

Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

In [None]:
# Feature matrix: every column except the target.
x = df.drop('Outcome', axis=1)
# Target vector: the 'Outcome' column on its own.
y = df.loc[:, 'Outcome']

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

Scaling the dataset with the help of StandardScaler()

In [None]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
# Learn the scaling statistics from the training split only ...
x_train = st.fit_transform(x_train)
# ... and apply those same statistics to the test split. Refitting on the
# test data would scale the two splits differently and leak test information.
x_test = st.transform(x_test)

In [None]:
# Wrap the scaled training features in a DataFrame so columns can be selected by label.
x_train = pd.DataFrame(data=x_train)

In [None]:
# Wrap the scaled test features in a DataFrame so columns can be selected by label.
x_test = pd.DataFrame(data=x_test)

Fitting the model

In [None]:
# Base estimator whose coefficients RFE uses to rank the features.
lgr = LogisticRegression()
# Keep 7 features. The count is passed by keyword because current
# scikit-learn does not accept it as a positional argument.
rfe = RFE(lgr, n_features_to_select=7)
rfe.fit(x_train, y_train)
# rfe.support_ is a boolean mask over x_train's columns; keep the selected labels.
rfe_features = list(x_train.columns[rfe.support_])
rfe_x_train = x_train[rfe_features]

Out of 8 variables, 7 are sufficient enough to predict the Outcome as it gives the best F1 Score

Building the Logistic Regression model using the features selected by RFE

In [None]:
# Train a fresh logistic regression on the RFE-selected training columns
# (fit returns the estimator itself), then display it.
lgr_1 = LogisticRegression().fit(rfe_x_train, y_train)
lgr_1

Predicting the values

In [None]:
#F1 scores==> {[8 : 0.80938416, 0.61077844], [7: 0.80938416, 0.61077844], [6: 0.80351906, 0.5988024] }
# Predict on the test rows restricted to the RFE-selected columns.
rfe_x_test = x_test[rfe_features]
y_pred = lgr_1.predict(rfe_x_test)
# average=None yields one F1 score per class rather than a single aggregate.
f1_score_array = f1_score(y_test, y_pred, average=None)
dict_rfe = {
    "Features": list(rfe_features),
    "F1 Score": f1_score_array,
}
dict_rfe

Implementing the model

In [None]:
# rfe_features holds column positions of the scaled feature matrix, which was
# built from `x` (df without 'Outcome'). Look the names up in x.columns so the
# mapping stays correct regardless of where 'Outcome' sits in df.
features = [x.columns[position] for position in rfe_features]
X = df[features]
Y = df['Outcome']

In [None]:
#features selected by the model
features

Logistic Regression Model Fitting

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)
# Display the shape of each of the four splits.
tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

In [None]:
#scaling
# Fit the scaler on the training split only, then reuse the learned statistics
# for the test split. Refitting on the test data would scale the two splits
# differently and leak test information.
X_train = st.fit_transform(X_train)
X_test = st.transform(X_test)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [None]:
# Train the final logistic regression on the scaled training split
# (fit returns the estimator itself), then display it.
LR = LogisticRegression().fit(X_train, Y_train)
LR

Predicting the result and checking the accuracy

In [None]:
# Class predictions for the held-out rows.
Y_pred = LR.predict(X_test)
# Mean accuracy of the model on the test split.
accuracy = LR.score(X_test, Y_test)
print("Accuracy of Logistic Regression Model is {:.2f}".format(accuracy))

Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns are the predicted classes.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)

Incorrect predictions

In [None]:
# Count the test rows where the prediction differs from the true label.
mismatches = Y_test.ne(Y_pred)
inc_pred = mismatches.sum()
inc_pred

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the test predictions.
report = classification_report(Y_test, Y_pred)
print(report)

In [None]:
#reshaping
# Turn the targets into column vectors. np.asarray accepts both the original
# Series and an already-converted array, so this cell can be re-run safely
# (calling `.values` on an ndarray would raise AttributeError).
Y_train = np.asarray(Y_train).reshape(-1,1)
Y_test = np.asarray(Y_test).reshape(-1,1)

ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
# Probability of the positive class for each test row. Both the AUC and the
# curve are computed from these scores so the legend matches the plotted line
# (hard 0/1 predictions would give the area of a single-threshold curve).
Y_score = LR.predict_proba(X_test)[:,1]
roc_score = roc_auc_score(Y_test, Y_score)
fpr, tpr, thr = roc_curve(Y_test, Y_score)
plt.figure()
plt.plot(fpr,tpr, label='Logistic Regression Area = {:.2f}'.format(roc_score))
# Diagonal reference line: performance of a random classifier.
plt.plot([0,1],[0,1],'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC : Receiver Operating Characteristics')
plt.legend()
plt.show()

**Conclusion:** Thus, we can predict whether a person has diabetes or not with the help of Logistic Regression. The model achieves about 75% accuracy.