# **Objective**: To predict diabetes from diagnostic measurements using logistic regression

**Data Source**: [Pima Indians Diabetes Database](https://www.kaggle.com/uciml/pima-indians-diabetes-database)

In [None]:
# import libraries for the diabetes model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
%matplotlib inline
#from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Load the CSV under shorter, easier-to-type feature names.
# The file's first line is skipped (presumably its original header row) and
# `column_names` is used as the header instead.
column_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
data_set = pd.read_csv(
    "../input/pima-indians-diabetes-database/diabetes.csv",
    header=None,
    names=column_names,
    skiprows=[0],
)

In [None]:
# Preview the first ten rows of the loaded frame.
data_set.head(n=10)

In [None]:
# Print a concise summary of the frame: column dtypes, non-null counts and memory usage.
data_set.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
data_set.describe()

In [None]:
# Same summary statistics, transposed so that each feature is one row.
data_set.describe().T

**Observations from the above :**
* The values for pregnancies need to be checked: the mean is 3.84 (close to 4), which seems a little on the high side.
* Some columns contain 0 in places where that is clearly incorrect data, i.e. a missing value.
* Independent/Predictor Variables : Pregnancies, Glucose, BloodPressure, Pedigree, Insulin, BMI, Age.
* Dependent/Target Variable : Label.

In [None]:
# Make sure every listed feature column holds numeric values
# (pd.to_numeric raises if a value cannot be parsed).
convert_col = ['pregnant', 'insulin', 'bmi',  'age', 'glucose', 'bp', 'pedigree']
data_set[convert_col] = data_set[convert_col].apply(pd.to_numeric)

**Data Cleaning:**

In [None]:
# Class balance of the target column.
# countplot draws the bars in label order (0, then 1), so the counts are sorted
# by label as well; sorting by frequency (the value_counts default) would pair
# a count and a class name with the wrong bar.
class_labels = {0: 'Non-Diabetic', 1: 'Diabetic'}  # the dataset encodes 1 = diabetes
label_counts = data_set['label'].value_counts().sort_index()

ax = sns.countplot(x="label", data=data_set)
# Fix the tick positions before renaming them so matplotlib does not warn
# about labels being set on a non-fixed locator.
ax.set_xticks(range(len(label_counts)))
ax.set_xticklabels(
    [
        class_labels.get(label, str(label)) + ':' + str(count)
        for label, count in label_counts.items()
    ]
);

In [None]:
# Mark zeros in these measurement columns as missing (NaN) so they can be
# counted and imputed in the following cells.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing = ['glucose', 'bp', 'skin', 'insulin']
data_set[zero_as_missing] = data_set[zero_as_missing].replace(0, np.nan)

In [None]:
# Report, for each column with zero-as-missing values, how many entries are
# null and what share of the column that represents.
for col in ['glucose', 'bp', 'skin', 'insulin']:
    null_count = data_set[col].isnull().sum()
    null_share = null_count / len(data_set[col]) * 100
    print(f"{col}:{null_count}\npercentage : {null_share}\n")

In [None]:
# Impute every remaining NaN with the median of its own column.
# NOTE(review): the medians are computed on the full data set, before the
# train/test split further down — confirm this is acceptable for the evaluation.
column_medians = data_set.median()
data_set = data_set.fillna(column_medians)

# Sanity check: per-column count of nulls still present after imputation.
data_set.isna().sum()

In [None]:
# Non-null count of every column, split by outcome class.
data_set.groupby('label').count()

**Features:**

In [None]:
# Split the frame into the predictor matrix X and the target vector y.
# 'skin' is not part of the predictor list.
feature_columns = ['pregnant', 'insulin', 'bmi',  'age', 'glucose', 'bp', 'pedigree']
X = data_set.loc[:, feature_columns]
y = data_set['label']

In [None]:
# Preview the predictor matrix instead of dumping the whole frame; a bare
# last-line expression gets the same rich rendering without relying on the
# IPython-injected `display` name.
X.head(10)

In [None]:
# Pairwise correlation between all columns (features and label), drawn as a heat map.
corr = data_set.corr()
plt.figure(figsize=(40,30))

# Keep only coefficients >= 0.3 or <= -0.1; every other cell becomes NaN and
# is therefore left blank in the heat map.
coor_range = corr[(corr >= 0.3) | (corr <= -0.1)]

sns.heatmap(
    coor_range,
    vmax=.8,            # upper end of the colour scale
    linewidths=0.01,    # width of the lines separating the cells
    square=True,        # draw every cell as a square
    annot=True,         # write the coefficient inside each cell
    cmap='GnBu',        # colour map used for the value range
    linecolor="white",  # colour of the separating lines
    cbar_kws={'label': 'Features Correlation color'},  # colour-bar caption
)

# Title and axis captions; each caption now names the axis it is attached to
# (the x and y texts were previously swapped).
plt.title('Correlation between features of PIMA Datasets')
plt.xlabel('Features Values on X axis')
plt.ylabel('Features Values on Y axis');


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on the label — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Create the (still unfitted) logistic-regression classifier, allowing the
# solver up to 1000 iterations.
logistic_function = LogisticRegression(max_iter=1000)

In [None]:
# Train on the training split, then predict the class of every test row.
# (fit() returns the fitted estimator, so the two steps can be chained.)
y_prediction = logistic_function.fit(X_train, y_train).predict(X_test)

In [None]:
# Confusion matrix: cross-tabulates the true test labels (rows) against the
# predicted labels (columns) to show where the classifier is right and wrong.
cnf_matrix_evaluation = metrics.confusion_matrix(y_true=y_test, y_pred=y_prediction)
cnf_matrix_evaluation

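true negatives (83), false positives (16), false negatives (21), true positives (34) — scikit-learn lays the matrix out as `[[TN, FP], [FN, TP]]` (rows = actual class, columns = predicted class).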

In [None]:
# Heat map of the confusion matrix, drawn on an explicit Axes.
class_names = [0,1] # class labels shown on both axes (assumes a binary 2x2 matrix)
fig, ax = plt.subplots()

# Labelling the frame with the class names lets seaborn produce the tick
# labels itself; ticks set through plt.xticks/plt.yticks beforehand would be
# overwritten by heatmap().
sns.heatmap(
    pd.DataFrame(cnf_matrix_evaluation, index=class_names, columns=class_names),
    annot=True,
    cmap="YlGnBu",
    fmt="g",
    ax=ax,
)
ax.xaxis.set_label_position("bottom")
ax.set_title('Confusion Matrix: Diabetes Patient', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay out the figure last so the title and axis labels are taken into account.
fig.tight_layout();

In [None]:
# Headline metrics derived from the test-set predictions.
#   Accuracy  - correctly predicted observations / all observations
#   Precision - correctly predicted positives / all predicted positives
#   Recall    - correctly predicted positives / all actual positives
for metric_title, metric_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(metric_title, metric_fn(y_test, y_prediction))

In [None]:
# Accuracy is the share of test rows whose predicted class matches the true
# class; it is not the probability that a person has diabetes, so the message
# states what the number actually measures.
accuracy_pct = metrics.accuracy_score(y_test, y_prediction) * 100
print(accuracy_pct, "% of the test-set predictions are correct")

# Raw predicted class for every test row.
print(y_prediction)

In [None]:
# Performance evaluation with the Receiver Operating Characteristic (ROC) curve:
# a plot of the true-positive rate against the false-positive rate over all
# decision thresholds, i.e. the trade-off between sensitivity and specificity.

# Second column of predict_proba = predicted probability of the second class
# in the model's classes_ (label 1 here).
y_prediction_probability = logistic_function.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_prediction_probability)
auc = metrics.roc_auc_score(y_test, y_prediction_probability)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Diagonal reference: a classifier that guesses at random (AUC = 0.5).
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="chance, auc=0.5")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve: logistic regression on the test set")
ax.legend(loc=4)
plt.show()
# Note: the AUC for this run is shown in the legend (0.81 was recorded when the
# notebook was written). An AUC of 1 is a perfect classifier, 0.5 a worthless one.

In [None]:
# F1 is the harmonic mean of precision and recall; average=None returns one
# score per class instead of a single aggregated value.
f1_per_class = metrics.f1_score(y_true=y_test, y_pred=y_prediction, average=None)
f1_per_class