In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# List every file that is available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# Notebook-wide display and plotting configuration.
pd.set_option("display.precision", 2)   # show floats with two decimals
warnings.filterwarnings("ignore")       # silence all library warnings
plt.style.use('ggplot')

In [None]:
# Read the raw dataset into df1 and preview five randomly chosen rows.
DATA_PATH = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"
df1 = pd.read_csv(DATA_PATH)
df1.sample(n=5)

### EXPLORATORY DATA ANALYSIS 

In [None]:
# Summarise the raw frame: column names, dtypes and non-null counts.
df1.info()

In [None]:
# Descriptive statistics with one row per column (transposed for readability).
summary_stats = df1.describe()
summary_stats.T

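Several columns have a minimum value of 0, which is not a plausible reading for measurements such as Glucose, BloodPressure, SkinThickness, Insulin or BMI. These zeros most likely stand for missing values and would distort the analysis, so we need to handle them one way or another.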

In [None]:
# Work on a deep copy so the raw frame (df1) stays untouched.
df2 = df1.copy(deep=True)

# In these measurement columns the notebook treats a value of 0 as a missing
# reading, so it is replaced by NaN.  `np.nan` is used because the `np.NaN`
# alias no longer exists in NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df2[zero_as_missing] = df2[zero_as_missing].replace(0, np.nan)

# Number of missing values per column after the replacement.
df2.isnull().sum()

In [None]:
# (rows, columns) of the working copy.
df2.shape

<strong> BMI and DiabetesPedigreeFunction vs Outcome </strong>

In [None]:
# Summary statistics for the two continuous features.
print('Continuous Variables')
print(df2[['BMI','DiabetesPedigreeFunction']].describe().transpose())
print('--'*40)

sns.set_style('darkgrid')
fig = plt.figure(figsize = (20,16))
fig.subplots_adjust(hspace = .30)

# Top-left: distribution of BMI.
ax1 = fig.add_subplot(221)
ax1.hist(df2['BMI'], bins = 20, alpha = .50, edgecolor = 'black', color = 'teal')
ax1.set_xlabel('BMI', fontsize = 15)
ax1.set_ylabel('Individuals', fontsize = 15)
ax1.set_title('BMI Spread', fontsize = 15)

# Bottom-left: distribution of DiabetesPedigreeFunction.
ax2 = fig.add_subplot(223)
ax2.hist(df2['DiabetesPedigreeFunction'], bins = 20, alpha = .50, edgecolor = 'black', color = 'teal')
ax2.set_xlabel('DiabetesPedigreeFunction', fontsize = 15)
ax2.set_ylabel('Individuals', fontsize = 15)
ax2.set_title('DiabetesPedigreeFunction', fontsize = 15)

# Right: BMI against DiabetesPedigreeFunction, one series per outcome.
# Both series are drawn from df2 so the two groups are treated the same way
# (the raw df1 still contains BMI == 0 placeholder rows).
ax3 = fig.add_subplot(122)
outcome_styles = (
    (0, 'grey', 'No Diabetes'),
    (1, 'lightgreen', 'Diabetes'),
)
for outcome_value, colour, legend_label in outcome_styles:
    group = df2[df2['Outcome'] == outcome_value]
    ax3.scatter(x = group['BMI'], y = group['DiabetesPedigreeFunction'],
                alpha = .50, edgecolors = 'black', c = colour, s = 75,
                label = legend_label)
ax3.set_xlabel('BMI')
ax3.set_ylabel('DiabetesPedigreeFunction')
ax3.set_title('BMI vs DiabetesPedigreeFunction')
ax3.legend()

Integer Variables

In [None]:
# Summary statistics for the integer-valued features, one row per feature.
integer_features = ['Pregnancies', 'Glucose', 'BloodPressure',
                    'SkinThickness', 'Insulin', 'Age']
print('Integer Variables')
print(df2[integer_features].describe().T)

Pregnancies

In [None]:
# Contingency table: number of individuals per (Pregnancies, Outcome) pair.
print(pd.crosstab(df2['Pregnancies'],df2['Outcome']))
print("---"*40)

# Same counts reshaped to one column per outcome, ready for a stacked bar chart.
plot = df2.groupby(['Pregnancies','Outcome']).Pregnancies.count().unstack()

p1 = plot.plot(kind='bar', stacked=True, title='Pregnancies and Outcome',
               color=['#2F9C95', '#A3F7B5'])

p1.set_xlabel('Pregnancies')
p1.set_ylabel("Individuals")
# legend() takes ONE sequence of labels; the columns are ordered Outcome 0, 1.
p1.legend(["No Diabetes", "Diabetes"])
plt.show()

In [None]:
# One colour per row: red for Outcome == 1, green otherwise.
color_list = ['red' if outcome == 1 else 'green' for outcome in df1['Outcome']]

# Pairwise scatter plots of every feature of the raw data (target excluded).
feature_frame = df1.loc[:, df1.columns != 'Outcome']
scatter_options = dict(
    c=color_list,
    figsize=[15, 15],
    diagonal='hist',
    alpha=0.5,
    s=200,
    edgecolor="black",
)
pd.plotting.scatter_matrix(feature_frame, **scatter_options)
plt.show()

#### ^Original Data, with 0 values

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the working copy (zeros replaced by NaN).
p = scatter_matrix(frame=df2, figsize=(20, 15))

The two scatter matrices are more or less similar; I coded them in two different ways (one on the original data, one on the data with zeros replaced) for comparison.

In [None]:
# Bar chart of the class balance, followed by the exact counts per class.
sns.countplot(data=df1, x="Outcome")
df1['Outcome'].value_counts()

In [None]:
# Outcome counts broken down by number of pregnancies.
sns.set_style('whitegrid')
sns.countplot(data=df1, x='Outcome', hue="Pregnancies", palette='RdBu_r')

# Place the legend outside the axes, anchored by its upper-left corner.
plt.legend(loc='upper left', bbox_to_anchor=(1.05, 1), borderaxespad=0.)

In [None]:
# Feature columns only (everything except the 'Outcome' target).
cols = [name for name in df1.columns if name != 'Outcome']
fig = plt.figure(figsize=(10,10))

# Each pie splits the rows by whether the feature value exceeds that feature's
# mean, so the slices are labelled accordingly (they do not represent the
# diabetic / non-diabetic split).
labels = ["At or below average", "Above average"]

for position, feature in enumerate(cols):
    row, col = divmod(position, 3)          # lay the pies out on a 3x3 grid
    ax = plt.subplot2grid((3, 3), (row, col))

    # Count explicitly rather than indexing value_counts() by 0/1, whose
    # result depends on frequency ordering and fails if one class is absent.
    above_avg = int((df1[feature] > df1[feature].mean()).sum())
    at_or_below_avg = len(df1) - above_avg

    ax.pie([at_or_below_avg, above_avg], labels=labels)
    ax.set_title(feature)

In [None]:
# Column means computed separately for each outcome group.
positive_means = df1.loc[df1['Outcome'] == 1].mean()
negative_means = df1.loc[df1['Outcome'] == 0].mean()

print("DIABETES POSITIVE")
print(positive_means, '\n')
print("DIABETES NEGATIVE")
print(negative_means)

Conclusions - 
    <li>No signs of an imbalanced dataset.</li>
    <li>768 rows, with 9 columns, one being the dependent (target) variable </li>
    <li>There are some columns with high correlation, we need to take care of it in the "Data Cleaning" step</li>
    <li>No Null Values, shall see further proof in the next section</li>
    <li>Target variable is of int64 datatype</li>
    <li>Features are majorly of int64 datatype too, with 2 being decimals, overall, all numeric values</li>
    <li>The patients with diabetes have higher data values, except BMI, suggesting there <b>MAY</b> not be a link between diabetics and weight</li>

### Now, let us ask some questions.

#### <H5>Q1) What is the Glucose Level of Diabetics, who have high (above average) insulin?</H5>

In [None]:
# Q1: restrict to diabetics (Outcome == 1) and to rows whose insulin is above
# the cohort average, then report their mean glucose.
# NOTE(review): the average is taken over the raw df1, which still contains
# Insulin == 0 placeholder rows; switch to df2 if those should be excluded.
diabetics = df1[df1['Outcome'] == 1]
insulin_avg = df1['Insulin'].mean()
high_insulin_diabetics = diabetics[diabetics['Insulin'] > insulin_avg]
print("The avg Glucose level of diabetics who have above-average insulin is",
      high_insulin_diabetics['Glucose'].mean())

<h5>Q2)What is the average BMI for Diabetics, who have an average Glucose and Blood Pressure?</h5>

In [None]:
# Q2: diabetics (Outcome == 1) whose glucose lies strictly between 100 and 120
# and whose blood pressure lies strictly between 60 and 70 -- the bands this
# notebook uses as "average" -- and their mean BMI.
is_diabetic = df1['Outcome'] == 1
average_glucose = (df1['Glucose'] > 100) & (df1['Glucose'] < 120)
average_pressure = (df1['BloodPressure'] > 60) & (df1['BloodPressure'] < 70)
selected = df1[is_diabetic & average_glucose & average_pressure]
print("Average BMI for diabetics, who have average Glucose and Blood pressure have an average BMI of",
      selected['BMI'].mean())

### Univariate Analysis

In [None]:
# Split the working copy by outcome.
filt = df2['Outcome'] == 0
filt2 = df2['Outcome'] == 1

df_noDiabetes = df2.loc[filt]
df_hasDiabetes = df2.loc[filt2]

# Derive the feature list here instead of relying on a `cols` variable left
# behind by another cell; the previous `range(len(cols)-1)` skipped the last
# feature whenever `cols` already excluded 'Outcome'.
feature_names = [name for name in df2.columns if name != 'Outcome']

for feature in feature_names:
    fig = plt.figure()
    # Create the axes once and draw both groups on it, so that a single
    # legend describes both series.
    ax = fig.add_subplot(2, 1, 1)

    # All points sit on y = 0: this is a one-dimensional strip plot.
    ax.plot(df_hasDiabetes[feature], np.zeros(len(df_hasDiabetes)), 'o',
            label="Has Diabetes")
    ax.plot(df_noDiabetes[feature], np.zeros(len(df_noDiabetes)), 'o',
            label="Does not have diabetes", alpha=0.4)

    ax.set_title(feature)
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

These graphs are useful in telling us, where the diabetic patients reside in terms of these factors, and how far off or how in range they are when compared to the non-diabetic patients

### Bivariate Analysis

In [None]:
# Scatter of BMI against Insulin, coloured by outcome.
grid = sns.FacetGrid(df2, hue='Outcome', height=5)
grid.map(plt.scatter, 'BMI', 'Insulin')
grid.add_legend()

There isn't much to infer from this graph, except that there are non-diabetic people with higher BMI, suggesting that there <b>may not</b> be a relation between high weight and diabetes. Of course, these individuals may be of a taller height, therefore, it shall merely be speculation. Another point to note is that this dataset is talking about Type 2 diabetes  

#### Multivariate Analysis

In [None]:
# Pairwise correlation between all columns of the working copy, drawn as a heatmap.
corrmat = df2.corr()
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data=corrmat, linewidths=0.1, ax=ax)

The features are not <b>highly</b> correlated with each other, hence there is no such need to remove any such features. We might reduce the features ahead, but not on the basis of correlation

## DATA CLEANING

#### Null Values

In [None]:
# Missing values per column before imputation.
df2.isna().sum()

Let us replace the lesser null values with mean, and the higher null values with the median

In [None]:
# Columns with few missing values are filled with their mean ...
for column in ('Glucose', 'BloodPressure'):
    df2[column] = df2[column].fillna(df2[column].mean())

# ... and columns with many missing values with their median.
for column in ('SkinThickness', 'Insulin', 'BMI'):
    df2[column] = df2[column].fillna(df2[column].median())

### Another way of replacing is using inplace = True

# Confirm that no missing values remain.
df2.isna().sum()

#### Categorical Variables

In [None]:
# All column labels of the raw frame.
cols = df1.columns

# Numeric columns via the public select_dtypes API (instead of the private
# DataFrame._get_numeric_data helper).
num_cols = df1.select_dtypes(include='number').columns

# Whatever is left over is non-numeric, i.e. a candidate categorical column.
list(set(cols) - set(num_cols))

Since there are no Categorical Variables in this dataset, there is no need for encoding, hence we can again move forward

#### Outlier Detection

In [None]:
# Histogram of every column; the returned axes array is kept only to suppress
# its repr in the cell output.
hist_axes = df2.hist(figsize=(15, 10), bins=30)

In [None]:
# One box plot per column, laid out three per row.
columns_array = list(df2.columns)
plots_per_row = 3
# Ceiling division: round() would allocate too few rows for some column
# counts (e.g. 10 columns -> round(3.33) == 3 rows -> IndexError).
n_rows = -(-len(columns_array) // plots_per_row)

# squeeze=False keeps `axes` two-dimensional even when there is a single row.
f, axes = plt.subplots(n_rows, plots_per_row, figsize=(12,5), squeeze=False)
flat_axes = axes.ravel()

for ax, name in zip(flat_axes, columns_array):
    sns.boxplot(x=df2[name], ax=ax)

# Hide any grid cells that did not receive a plot.
for unused_ax in flat_axes[len(columns_array):]:
    unused_ax.set_visible(False)

f.tight_layout()
plt.show()

In [None]:
def grubbs_test(x, alpha=0.05):
    """Two-sided Grubbs' test for a single outlier.

    Parameters
    ----------
    x : array-like of numbers
        Sample to test. NaN entries are ignored.
    alpha : float, default 0.05
        Significance level of the test.

    Returns
    -------
    str
        "Outliers\n" when the most extreme value is rejected at level
        ``alpha``, otherwise "No outliers\n".  Samples with fewer than three
        usable values, or with zero spread, are reported as "No outliers\n"
        because the statistic is undefined for them.
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n < 3:
        # The critical value needs n - 2 >= 1 degrees of freedom.
        return("No outliers\n")

    # Grubbs' statistic is defined with the SAMPLE standard deviation (ddof=1).
    sd_x = np.std(values, ddof=1)
    if sd_x == 0:
        # All values identical: nothing can be an outlier (and avoids 0/0).
        return("No outliers\n")

    g_calculated = np.max(np.abs(values - np.mean(values))) / sd_x

    # Critical value from the t distribution with n - 2 degrees of freedom.
    t_value = stats.t.ppf(1 - alpha / (2 * n), n - 2)
    t_squared = np.square(t_value)
    g_critical = ((n - 1) / np.sqrt(n)) * np.sqrt(t_squared / (n - 2 + t_squared))

    if g_critical > g_calculated:
        return("No outliers\n")
    return("Outliers\n")
    
# Run the test on every column of df2.  The label is taken from the loop
# variable itself, so this cell no longer relies on `columns_array` having
# been defined (and kept in sync) by another cell.
for count, column_name in enumerate(df2.columns, start=1):
    print(count, column_name)
    print(grubbs_test(df2[column_name]))

<ol> <h6>We can draw the following conclusions</h6>
    <li> Insulin and Diabetes Pedigree Function have a very high number of Outliers </li>
    <li> We shall leave the outliers alone for now, but if we get a low accuracy score, we shall do some cleaning </li>
</ol>

### Modelling of Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Use the cleaned frame (df2): scaling the raw df1 would discard the
# zero-to-NaN replacement and imputation applied to df2, and would not match
# the target, which is also read from df2.
feature_frame = df2.drop(columns=["Outcome"])

standard_scaler_X = StandardScaler()
# Column names come from the frame itself rather than a hard-coded list.
X = pd.DataFrame(standard_scaler_X.fit_transform(feature_frame),
                 columns=feature_frame.columns)
# NOTE(review): the scaler is fitted on every row, so any train/test split
# made afterwards has already seen test-set statistics.  Consider fitting it
# on the training rows only (e.g. inside a sklearn Pipeline).

X.head()

In [None]:
from sklearn.model_selection import train_test_split

# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# is safe (`X = X.values` fails the second time, once X is already an array).
X = np.asarray(X)
# Select the target by name rather than by position.
y = df2['Outcome'].to_numpy()

# Hold out 25% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

In [None]:
# Baseline model: fit a logistic regression on the training split, then
# predict the held-out rows.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score

# Confusion matrix and precision of the baseline on the test split.
cm = confusion_matrix(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)

print("The Confusion Matrix is ","\n",cm)
print("Precision score: {}".format(test_precision))

In [None]:
# Annotated heatmap of the confusion matrix; the trailing semicolon hides the
# Axes repr in the cell output.
cm_frame = pd.DataFrame(cm)
sns.heatmap(cm_frame, annot=True, fmt='g');

We got a Precision Score of 75%, using Logistic Regression, and without using any advanced techniques. Let us see if we can do better.

In [None]:
from sklearn.model_selection import cross_val_score

# Candidate models, each paired with the label used in the printed report.
# The tree-based models get a fixed seed so the scores are reproducible.
candidate_models = [
    ("Logistic Regression", LogisticRegression()),
    ("Decision Tree", DecisionTreeClassifier(random_state=0)),
    ("Support Vector", SVC()),
    ("Random Forest", RandomForestClassifier(n_estimators=40, random_state=0)),
    ("K-Nearest Neighbor", KNeighborsClassifier()),
]

cv_scores = {}
for position, (model_label, estimator) in enumerate(candidate_models):
    if position:
        print("---"*40)   # separator between consecutive reports

    scores = cross_val_score(estimator, X, y)
    cv_scores[model_label] = scores

    # The label comes from the model list, so every report names the model
    # that was actually evaluated (Random Forest used to be printed as
    # "Support Vector").
    print(f'{model_label} Scores, using Cross Validation are {scores}')
    print(f"The average of those scores is {np.average(scores)}")

# Keep the original per-model variable names available.
l_scores = cv_scores["Logistic Regression"]
d_scores = cv_scores["Decision Tree"]
s_scores = cv_scores["Support Vector"]
r_scores = cv_scores["Random Forest"]
k_scores = cv_scores["K-Nearest Neighbor"]

The highest score is that of SVC (Support Vector Classifier), hence we will use some GridSearchCV to find the best parameter.

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: a linear kernel over C, and an RBF kernel over C x gamma.
c_values = [1, 10, 100, 1000]
gamma_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]

# 10-fold cross-validated grid search on the training split, using all cores.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
grid_search = grid_search.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameters that achieved it.
accuracy = grid_search.best_score_
best_params = grid_search.best_params_

print(f"The accuracy is {accuracy}")
print(f'The best parameters are {best_params}')

This is one of my first notebooks, hence I would greatly appreciate any form of feedback.