In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default plotting theme to the figures drawn below.
sns.set()

# Done by Logistic Regression

In [None]:
# Load the Pima Indians Diabetes dataset from the Kaggle input directory.
csv_path = '../input/pima-indians-diabetes-database/diabetes.csv'
diabetes_data = pd.read_csv(csv_path)

# Preview the first five rows to sanity-check the load.
diabetes_data.head(5)

# Preprocessing

**Exploring the descriptive statistics of the variables**

In [None]:
# Summary statistics for every column; include='all' asks describe() to cover all dtypes.
# The per-column minimum is what exposes implausible zero readings.
diabetes_data.describe(include='all')

**Examining the minimum values reveals missing data: for several variables a value of zero does not make sense.**

Following columns or variables have an invalid zero value:

1. Glucose
2. BloodPressure
3. SkinThickness
4. Insulin
5. BMI

**It is better to replace these zeros with NaN: missing values are then easier to count, and they can later be filled with suitable values.**

In [None]:
# Work on a deep copy so the raw frame stays untouched for before/after comparisons.
diabetes_data_copy = diabetes_data.copy(deep=True)

# In these columns a value of 0 is treated as a missing measurement, so mark it as NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data_copy[zero_as_missing_cols] = diabetes_data_copy[zero_as_missing_cols].replace(0, np.nan)

**Dealing with missing values**

In [None]:
# isna() flags each cell as missing (True) or present (False);
# summing the booleans column-wise gives the number of missing values per feature.
diabetes_data_copy.isna().sum()

**To fill the null values, we need to understand the distribution of the data**

In [None]:
# Histograms of every column in the raw frame (zeros not yet replaced);
# presumably used to judge how each column's missing values should be filled.
p = diabetes_data.hist(figsize = (20,20))

In [None]:
# Fill each column's missing values with that column's own central value:
# the mean for Glucose and BloodPressure, the median for the other three.
# The fill is done in one frame-level fillna() and assigned back, because
# `df[col].fillna(..., inplace=True)` is chained assignment: it emits a
# FutureWarning on pandas 2.x and leaves the frame unchanged under copy-on-write.
fill_values = {
    'Glucose': diabetes_data_copy['Glucose'].mean(),
    'BloodPressure': diabetes_data_copy['BloodPressure'].mean(),
    'SkinThickness': diabetes_data_copy['SkinThickness'].median(),
    'Insulin': diabetes_data_copy['Insulin'].median(),
    'BMI': diabetes_data_copy['BMI'].median(),
}
diabetes_data_copy = diabetes_data_copy.fillna(value=fill_values)

In [None]:
# Histograms of every column after the missing values have been filled in.
p = diabetes_data_copy.hist(figsize = (20,20))


# Screening of Association between Variables to Study Bivariate Relationships

**We will use a pairplot to study the association between variables – from individual scatter plots**

**Then we will compute the Pearson correlation coefficient**

**Then we will summarize the same as a heatmap**

In [None]:
# Pairwise scatter plots (with per-variable distributions on the diagonal),
# coloured by Outcome.
pair_grid = sns.pairplot(
    diabetes_data_copy,
    vars=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
          "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"],
    hue="Outcome",
)
# plt.title() would only label the last-created subplot of the grid;
# a figure-level suptitle labels the whole pairplot. y>1 keeps it clear of the top row.
pair_grid.figure.suptitle("Pairplot of Variables by Outcome", y=1.02)

****The pairs plot builds on two basic figures, the histogram and the scatter plot. The histogram on the diagonal allows us to see the distribution of a single variable while the scatter plots on the upper and lower triangles show the relationship (or lack thereof) between two variables.****


# Inference from Pair Plots

*     From the scatter plots, only BMI & SkinThickness and Pregnancies & Age seem to me to have positive linear relationships. Another likely suspect is Glucose & Insulin.
*     There are no apparent non-linear relationships
*     Let's check this with the Pearson correlation and plot heat maps

# Pearson's Correlation Coefficient: 
**It helps you find out the relationship between two quantities by measuring the strength of the linear association between two variables. The value of Pearson's Correlation Coefficient lies between -1 and +1: +1 means a perfect positive linear correlation, -1 means a perfect negative linear correlation, and 0 means no linear correlation.**

**Before cleaning the data**

In [None]:
# Pearson correlation heatmap of the raw data, drawn on a 12 x 10 inch figure
# with each cell annotated with its coefficient.
fig, ax = plt.subplots(figsize=(12, 10))
p = sns.heatmap(diabetes_data.corr(), annot=True, cmap='YlGnBu', ax=ax)

**After cleaning the data**

In [None]:
# Pearson correlation heatmap of the cleaned data (zeros replaced and filled),
# drawn on a 12 x 10 inch figure with each cell annotated with its coefficient.
fig, ax = plt.subplots(figsize=(12, 10))
p = sns.heatmap(diabetes_data_copy.corr(), annot=True, cmap='RdYlGn', ax=ax)

# Logistic Regression


*     Logistic regression is used when the dependent variable is binary, ordinal or nominal and the independent variables are either continuous or discrete
*     In this scenario, a Logit model has been used to fit the data
*     In this case an event is defined as the occurrence of ‘1’ in Outcome
*     Basically, logistic regression models the odds of the event (on a log scale) as a function of the independent variables

****Declare Dependent variable and Independent variables****

In [None]:
# Predictors (independent variables): the eight measurement columns.
feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
x = diabetes_data_copy[feature_columns]

# Target (dependent variable): the Outcome column.
y = diabetes_data_copy.loc[:, 'Outcome']

In [None]:
## Importing statsmodels for running logistic regression
import statsmodels.api as sm
## statsmodels does not add an intercept automatically; without one the model is
## forced through the origin, which distorts the coefficients and p-values.
## The constant is added to a separate design matrix so `x` itself is unchanged.
x_with_const = sm.add_constant(x)
## Defining the model and assigning Y (dependent) and X (independent variables + intercept)
logit_model = sm.Logit(y, x_with_const)
## Fitting the model and publishing the results
result = logit_model.fit()
print(result.summary())

# Inference from the Logistic Regression
*     The pseudo R-squared value reported for the model is 58%. This is a likelihood-based measure of fit for logistic regression, not literally the share of variation explained as in linear regression
*     To identify which variables influence the outcome, we will look at the p-value of each variable. We expect the p-value to be less than 0.05 (alpha risk)
*     When p-value<0.05, we can say the variable has a statistically significant association with the outcome
*     Hence we will eliminate Diabetes Pedigree Function, Age, Insulin and SkinThickness and run the model again

# 2nd iteration of the Logistic Regression with fewer variables

In [None]:
# Reduced predictor set for the second Logit run.
X1 = diabetes_data_copy[['Pregnancies','Glucose','BloodPressure']]
# statsmodels does not add an intercept automatically, so add one to the design matrix.
# X1 itself is left without the constant because later cells reuse it as the feature frame.
logit_model2 = sm.Logit(y, sm.add_constant(X1))
result2 = logit_model2.fit()
print(result2.summary2())


# Inference from 2nd Run
*    Now the model is clear. We have 3 variables that influence the Outcome, and they are Pregnancies, Glucose and BloodPressure
*     Luckily, none of these 3 variables are strongly correlated with each other. Hence we can safely assume that the model is not inflated by multicollinearity

# Scaling the data

In [None]:
# Fit a StandardScaler on the three retained predictors.
# NOTE(review): the scaler is fitted on the full X1, i.e. before the train/test split
# further down, and X1_scaled is not used by the model cells visible in this notebook
# — confirm whether the scaled features were meant to feed the classifier.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X1)

In [None]:
# Apply the fitted scaler and wrap the resulting array back into a labelled DataFrame.
scaled_values = scaler.transform(X1)
X1_scaled = pd.DataFrame(
    scaled_values,
    columns=['Pregnancies', 'Glucose', 'BloodPressure'],
)
X1_scaled.head()

# Test Train Split and Cross Validation methods

**Train Test Split** : To have unknown datapoints to test the data rather than testing with the same points with which the model was trained. This helps capture the model performance much better.

For Reference : https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6

In [None]:
# checking the balance of the data: first list the distinct class labels in Outcome
diabetes_data_copy['Outcome'].unique()

In [None]:
# Count how many rows fall into each Outcome class.
diabetes_data_copy['Outcome'].value_counts()

**The above result shows that the data is biased towards datapoints having outcome value as 0 where it means that diabetes was not present actually. The number of non-diabetics is almost twice the number of diabetic patients**

In [None]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split
X1 = diabetes_data_copy[['Pregnancies','Glucose','BloodPressure']]
split_parts = train_test_split(X1, y, test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Row counts of the four split parts, in the order X_train, X_test, y_train, y_test.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

In [None]:
# Fit a scikit-learn logistic regression classifier on the training split.
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
Lreg = LogisticRegression(solver = 'lbfgs')
# y_train is already a 1-D pandas Series, which scikit-learn accepts directly;
# Series.ravel() is deprecated in pandas >= 2.2, so it is no longer called here.
Lreg.fit(X_train, y_train)

In [None]:
# Hard class predictions from the fitted classifier for the held-out test set.
y_predict = Lreg.predict(X_test)
y_predict

In [None]:
# Hard class predictions from the fitted classifier for the training set.
y_predict_train = Lreg.predict(X_train)
y_predict_train

In [None]:
# Predicted probability from column 1 of predict_proba (the second class) for each training row.
y_prob_train = Lreg.predict_proba(X_train)[:, 1]
# Display only: the reshaped row vector is not assigned, so y_prob_train stays 1-D.
y_prob_train.reshape(1,-1)

In [None]:
# Predicted probability from column 1 of predict_proba (the second class) for each test row.
# The former `y_prob.reshape(-1,1)` statement was removed: its result was never
# assigned or displayed, so it had no effect.
y_prob = Lreg.predict_proba(X_test)[:, 1]
y_prob

In [None]:
# Share of test rows whose predicted class matches the true class.
from sklearn.metrics import accuracy_score
score = accuracy_score(y_true=y_test, y_pred=y_predict)
score

# Calculating accuracy score using confusion matrix

**A confusion matrix is a matrix (table) that can be used to measure the performance of a machine learning algorithm, usually a supervised learning one. Each row of the confusion matrix represents the instances of an actual class and each column represents the instances of a predicted class. This is the convention used here, but it can be the other way around as well, i.e. rows for predicted classes and columns for actual classes. The name confusion matrix reflects the fact that it makes it easy for us to see what kind of confusions occur in our classification algorithms. For example, the algorithm should have predicted a sample as class $c_i$ because the actual class is $c_i$, but the algorithm came out with $c_j$. In this case of mislabelling, the element cm[i, j] is incremented by one when the confusion matrix is constructed.**

**For Reference:** https://www.python-course.eu/confusion_matrix.php

In [None]:
from sklearn.metrics import confusion_matrix
# Raw confusion matrix (rows = true class, columns = predicted class).
# It is printed explicitly because a notebook only auto-displays a cell's last expression.
print(confusion_matrix(y_test, y_predict))
# The same counts as a labelled table with row/column totals (margins=True).
# to_numpy() replaces the deprecated Series.ravel(); y_predict is already a 1-D array.
pd.crosstab(y_test.to_numpy(), y_predict, rownames=['True'], colnames=['Predicted'], margins=True)

In [None]:
# Flatten the 2x2 confusion matrix row by row into its four cells.
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()

# Report each cell with its label.
labelled_counts = (
    ('true negatives', tn),
    ('false positive', fp),
    ('false negative', fn),
    ('true positive', tp),
)
for label, count in labelled_counts:
    print(label, count)

#  Classification Report

1. Prevalence – how often in our sample do we find a yes? = (True Positives + False Negatives) / Total of all 4
2. Accuracy – how often is the classifier correct? = (True Positives + True Negatives) / Total of all 4
3. False positive rate – when it is actually no, how often does it predict yes? = False Positives / (False Positives + True Negatives)
4. True Positive rate or Recall – when it is actually yes, how often does it predict yes? = True Positives / (True Positives + False Negatives)
5. Precision – when it predicts yes, how often is it correct? = True Positives / (True Positives + False Positives)

In [None]:
# Text report of the main per-class classification metrics for the test-set predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_predict))

In [None]:
# Accuracy: correct predictions (true positives + true negatives) over all predictions.
correct = tp + tn
total = tp + tn + fp + fn
Accuracy = correct / total
print(f'Accuracy {Accuracy:0.2f}')

In [None]:
# Specificity: share of actual negatives that were predicted negative.
actual_negatives = tn + fp
Specificity = tn / actual_negatives
print(f'Specificity {Specificity:0.2f}')

In [None]:
# Sensitivity (recall): share of actual positives that were predicted positive.
actual_positives = tp + fn
Sensitivity = tp / actual_positives
print(f'Sensitivity {Sensitivity:0.2f}')

# ROC - AUC

ROC (Receiver Operating Characteristic) Curve tells us about how good the model can distinguish between two things (e.g If a patient has a disease or no). Better models can accurately distinguish between the two. Whereas, a poor model will have difficulties in distinguishing between the two.

For Reference: https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
# Training-set ROC AUC. AUC ranks continuous scores, so it must be computed from the
# predicted probabilities; feeding it hard 0/1 predictions collapses the curve to a
# single operating point and disagrees with roc_auc1 below.
log_ROC_AUC1 = roc_auc_score(y_train, y_prob_train)
# ROC curve points for the training set and the area under that curve.
fpr1, tpr1, thresholds1 = roc_curve(y_train, y_prob_train)
roc_auc1 = auc(fpr1, tpr1)

In [None]:
# Training-set ROC curve, with the diagonal of a no-skill classifier for reference.
fig, ax = plt.subplots()
ax.plot(fpr1, tpr1, color='blue', label='ROC curve (area = %0.2f)' % roc_auc1)
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('false positive rate')
ax.set_ylabel('true positive rate')

ax.legend(loc='lower right')
plt.show()

In [None]:
# ROC curve points and area under the curve for the test set, from the predicted probabilities.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

In [None]:
# Report the test-set AUC with six decimal places.
print(f'Area under the roc curve : {roc_auc:f}')

# Find optimal cutoff point (threshold value)

In [None]:
import numpy as np
# One row per ROC threshold. 'tf' = tpr - (1 - fpr) is zero where sensitivity
# equals specificity, which is the cut-off this cell looks for.
i = np.arange(len(tpr))  # shared integer index for every column
roc = pd.DataFrame(
    {
        'fpr': pd.Series(fpr, index=i),
        'tpr': pd.Series(tpr, index=i),
        '1-fpr': pd.Series(1 - fpr, index=i),
        'tf': pd.Series(tpr - (1 - fpr), index=i),
        'thresholds': pd.Series(thresholds, index=i),
    }
)
# Show the single row whose |tf| is smallest.
closest_to_zero = roc['tf'].abs().argsort()[:1]
roc.iloc[closest_to_zero]

In [None]:
# Plot sensitivity (tpr) and specificity (1 - fpr) against the row index of the ROC
# table; the two lines cross at the cut-off where sensitivity equals specificity.
# The x-axis is the threshold's position in the table, not 1 - fpr, and this is not a
# ROC curve, so the labels and title say what is actually drawn.
fig, ax = plt.subplots()
ax.plot(roc['tpr'], label='true positive rate (sensitivity)')
ax.plot(roc['1-fpr'], color='red', label='1 - false positive rate (specificity)')
ax.set_xlabel('threshold index')
ax.set_ylabel('rate')
ax.set_title('sensitivity and specificity across ROC thresholds')
ax.set_xticklabels([])  # the index values themselves carry no meaning
ax.legend(loc='best')
plt.show()

In [None]:
from sklearn.preprocessing import binarize
# Probability cut-off used to re-classify the test set.
# NOTE(review): this value is hard-coded, presumably copied from the ROC table of an
# earlier run — confirm it still matches the current table.
optimal_threshold = 0.341694
# binarize() expects a 2-D array, hence the reshape to a single row and the [0] to get
# it back. `threshold` is keyword-only in current scikit-learn, so passing it
# positionally raises a TypeError. Values strictly above the threshold map to 1.
y_predict_class1 = binarize(y_prob.reshape(1, -1), threshold=optimal_threshold)[0]
y_predict_class1

In [None]:
# Confusion matrix for the test set using the predictions made at the custom cut-off.
confusion_matrix_1 = confusion_matrix(y_test, y_predict_class1)
print(confusion_matrix_1)

In [None]:
# Text report of the main per-class classification metrics for the custom cut-off predictions.
from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict_class1))

**I am a beginner and this is my first project. Please share your suggestions if you find any mistakes! Thanks.**