In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Read the Pima Indians diabetes CSV into a DataFrame.
# The path is relative to the notebook's working directory.
data = pd.read_csv(
    filepath_or_buffer="../input/pima-indians-diabetes-database/diabetes.csv"
)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it must not
# be passed to display() (that would render a stray "None" in the output).
data.info()

# Rich preview of the first ten rows.
display(data.head(10))

In [None]:
# Summary statistics, transposed so that each row describes one column of `data`.
data.describe().T

**Observations from the above :**
* The values for Pregnancies need to be checked: the mean is 3.84 (close to 4), which seems a little on the high side.
* Some columns contain 0 in places where that is clearly incorrect; these entries are effectively missing values.
* Independent/Predictor Variables : Pregnancies,Glucose,BloodPressure etc.
* Dependent/Target Variable : Outcome.

In [None]:
# Bar chart of the class balance, with the class size written in each tick label.
ax = sns.countplot(x=data["Outcome"], data=data)

# value_counts() is ordered by frequency, not by class label, so positional
# indexing does not line up with the bar order (Outcome 0, then Outcome 1).
# Look the counts up by label instead.
outcome_counts = data['Outcome'].value_counts()
valcount = outcome_counts.values.tolist()  # kept as before in case a later cell reads it

# Outcome 0 is the non-diabetic class and Outcome 1 the diabetic class
# (same convention as the confusion-matrix labels used later in the notebook).
ax.set_xticklabels(
    [
        'Non-Diabetic' + ':' + str(outcome_counts.get(0, 0)),
        'Diabetic' + ':' + str(outcome_counts.get(1, 0)),
    ]
);

**Missing Values**

In [None]:
# A 0 in these measurement columns is treated as a missing reading, so recode
# it as NaN. `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_means_missing] = data[zero_means_missing].replace(0, np.nan)

In [None]:
# Report, for each column that had zeros recoded, how many values are null
# and what percentage of the rows that represents.
columns_to_check = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for name in columns_to_check:
    n_missing = data[name].isnull().sum()              # number of null values
    pct_missing = n_missing / len(data[name]) * 100    # share of rows that are null
    print(
        "{}:{}\npercentage : {}\n".format(name, str(n_missing), str(pct_missing))
    )

In [None]:
# One horizontal box plot per column for the first eight columns of `data`,
# laid out on a 4x2 grid.
plt.figure(figsize=(25, 20))
for position, column in enumerate(data.columns[:8], start=1):
    plt.subplot(4, 2, position)
    sns.boxplot(x=column, data=data, color='lightblue');

In [None]:
# Replace every NaN with the median of its column.
# Note: the medians are computed over the full data set, i.e. before the
# train/test split that happens further down.
column_medians = data.median()
data = data.fillna(column_medians)

# Check whether any NaN values are still present (all counts should be 0).
data.isna().sum()

In [None]:
# Non-null row count of every column, per Outcome class.
data.groupby(by=['Outcome']).count()

As seen earlier, the diabetic class (the "non-normal" class) is under-represented.

In [None]:
# Pairwise scatter plots coloured by Outcome, with per-class KDE curves on the diagonal.
sns.pairplot(data=data, hue="Outcome", diag_kind="kde");

The problem we are dealing with is a classification problem, so let's start by looking at the plots on the diagonal.

* We see that the density curves plotted for the diabetic and non-diabetic cases overlap. Variables whose curves overlap less separate the two classes better and are likely to be the more useful predictors; variables whose curves overlap mostly or completely will be poor predictors.


In [None]:
# Split the data into features and target, then into train and test sets.
array = data.values
x = array[:, :8]    # every row, columns at index 0-7 (the first eight columns)
y = array[:, 8]     # every row, column at index 8 (the ninth column)

test_size = 0.30    # 70:30 training/test split
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=test_size,
    random_state=42,
)

# A fixed random_state makes the shuffle use the same sequence of random
# numbers on every run. Unless some other source of randomness is present,
# the results are then identical from run to run, which makes the output
# easy to verify.

Fit the model on the original data

In [None]:
# Baseline: logistic regression trained on the original data, i.e. before upsampling.
model = LogisticRegression(solver='liblinear').fit(x_train, y_train)

# Evaluate on the held-out test set.
y_predict = model.predict(x_test)
model_score = model.score(x_test, y_test)

# Print the score, the confusion matrix and the per-class report, in that order.
for report in (
    model_score,
    metrics.confusion_matrix(y_test, y_predict),
    metrics.classification_report(y_test, y_predict),
):
    print(report)

Reading the confusion matrix above (rows are actual classes, columns are predicted classes): true negatives = 127, false negatives = 33, true positives = 47, false positives = 24.

In [None]:
# Draw the test-set confusion matrix as an annotated heat map.
cm = metrics.confusion_matrix(y_test, y_predict)
plt.clf()
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Wistia)
classNames = ['NonDiabetic', 'Diabetic']
plt.title('Confusion Matrix - Test Data')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
tick_marks = np.arange(len(classNames))
plt.xticks(tick_marks, classNames, rotation=45)
plt.yticks(tick_marks, classNames)

# confusion_matrix puts actual classes on the rows and predicted classes on
# the columns, with labels in sorted order, so for a binary problem:
#   cm[0][0] = true negatives,  cm[0][1] = false positives,
#   cm[1][0] = false negatives, cm[1][1] = true positives.
cell_labels = [['TN', 'FP'], ['FN', 'TP']]

for i in range(2):
    for j in range(2):
        # imshow places column j on the x axis and row i on the y axis.
        plt.text(
            j, i,
            str(cell_labels[i][j]) + " = " + str(cm[i][j]),
            ha='center', va='center',
        )
plt.show()