In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
for path in (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pima people - Wikipedia

The Pima are a group of Native Americans living in an area consisting of what is now central and southern Arizona, as well as northwestern Mexico in the states of Sonora and Chihuahua.

https://en.wikipedia.org/wiki/Pima_people

In [None]:
# Read the diabetes CSV from the Kaggle input directory into the working DataFrame.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [None]:
# Preview the first five rows (5 is the default row count of head()).
df.head(n=5)

In [None]:
# Per-column count of missing values (isna is the alias of isnull).
df.isna().sum()

* There are no missing values

In [None]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

* All of the available variables are numeric variables

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

* The target (dependent) variable is **Outcome**, which is binary: 0 for people who don't have diabetes and 1 for those who do.
* Looking at the summary statistics of all the features, the maximum values of some features are significantly higher than their medians, and for those features there is also a significant difference between the median and the mean. This indicates the presence of outliers.

In [None]:
# (rows, columns) of the raw dataset.
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Histogram of the Pregnancies column, drawn on an explicit Axes object.
fig, ax = plt.subplots()
ax.hist(df["Pregnancies"], bins=50)
plt.show()

In [None]:
# Number of records per Pregnancies value.
# The column is passed by keyword: newer seaborn releases bind the first
# positional argument to `data`, not `x`, so a positional Series is no longer
# counted per value.
sns.countplot(x="Pregnancies", data=df)
plt.show()

In [None]:
# Box plot of Pregnancies.
# Naming `x` explicitly keeps the variable on the x-axis on every seaborn
# version; newer releases bind the first positional argument to `data`.
sns.boxplot(x="Pregnancies", data=df)
plt.show()

* **Pregnancies** column has a long tail on the right side i.e., right-skewed.
* There are a few rare cases in the dataset where more than 10 **Pregnancies** have been reported. We might need to look at these outliers in more detail and decide how to treat them so that the model generalizes.

In [None]:
# Pregnancies against the binary Outcome label.
sns.scatterplot(data=df, x="Pregnancies", y="Outcome")
plt.show()

In [None]:
# Records per Pregnancies value, split by Outcome.
# `x` and `hue` are passed by keyword as column names: a positional Series is
# bound to `data` in newer seaborn releases, which leaves `x` undefined.
sns.countplot(x="Pregnancies", hue="Outcome", data=df)
plt.show()

* While we see that Diabetes has been reported at all levels for the number of Pregnancies,  the number of Diabetes reported seems to be increasing when the number of Pregnancies is more than 6.

In [None]:
# Distribution of the Glucose column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="Glucose", bins=25, kde=True, stat="density")
plt.show()

#### After going through some study on glucose levels - 

* Normal: < 110 mg/dL
* Pre-diabetes: 110–125 mg/dL
* Diabetes: ≥ 126 mg/dL

* It is also important to note that if the glucose level in the body drops below 60 mg/dL, a person can lose consciousness. This state is called diabetic coma, here meaning a coma caused by too low a glucose level. This condition is considered serious.

In [None]:
# Glucose distribution per Outcome class.
sns.boxplot(x="Outcome", y="Glucose", data=df)
plt.show()

* Looking at the above box-plot we do get the confirmation that the Pre-diabetes levels have been marked as 0.
* Again, there are clearly visible outliers here which would need to be dealt with before creating an ML model.
* There is a small yet significant overlap in the Glucose levels between Outcome 0 and Outcome 1.

In [None]:
# Distribution of the BloodPressure column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="BloodPressure", bins=25, kde=True, stat="density")
plt.show()

* The normal Blood Pressure level is between 60-80
* There are outliers on the lower sides

In [None]:
# BloodPressure distribution per Outcome class.
sns.boxplot(x="Outcome", y="BloodPressure", data=df)
plt.show()

* The cases of Diabetes have slightly higher blood pressure than those who don't have diabetes.

In [None]:
# Distribution of the SkinThickness column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="SkinThickness", bins=25, kde=True, stat="density")
plt.show()

In [None]:
# SkinThickness distribution per Outcome class.
sns.boxplot(x="Outcome", y="SkinThickness", data=df)
plt.show()

* A lot of the cases have a Skin Thickness as 0 and there are cases as well where it has been reported with higher values 
* Comparing Outcome and Skin Thickness, it seems the diabetes cases have a slightly higher skin thickness

In [None]:
# Distribution of the Insulin column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="Insulin", bins=25, kde=True, stat="density")
plt.show()

In [None]:
# Insulin distribution per Outcome class.
sns.boxplot(x="Outcome", y="Insulin", data=df)
plt.show()

* As expected, the cases which don't have diabetes have higher insulin levels.
* There are still a lot of cases which have diabetes even though a high level of insulin is present. This needs to be investigated further.

In [None]:
# Distribution of the BMI column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="BMI", bins=25, kde=True, stat="density")
plt.show()

In [None]:
# BMI distribution per Outcome class.
sns.boxplot(x="Outcome", y="BMI", data=df)
plt.show()

* Majority of the data is centered with the BMI around 30-35 with few outliers in both the sides
* The diabetes cases have slightly higher BMI

In [None]:
# Distribution of the DiabetesPedigreeFunction column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="DiabetesPedigreeFunction", bins=25, kde=True, stat="density")
plt.show()

In [None]:
# DiabetesPedigreeFunction distribution per Outcome class.
sns.boxplot(x="Outcome", y="DiabetesPedigreeFunction", data=df)
plt.show()

In [None]:
# Distribution of the Age column: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is its
# documented replacement for this hist + KDE view.
sns.histplot(data=df, x="Age", bins=25, kde=True, stat="density")
plt.show()

In [None]:
# Age distribution per Outcome class.
sns.boxplot(x="Outcome", y="Age", data=df)
plt.show()

* It is mostly older people who have been reported with diabetes.

In [None]:
# Pairwise scatter plots of every column of the raw dataset.
pd.plotting.scatter_matrix(frame=df, figsize=(15, 15))
plt.show()

In [None]:
# Keep only rows whose values fall inside the chosen bounds (all bounds inclusive).
keep_rows = (
    df["Pregnancies"].le(10)
    & df["Glucose"].ge(60)
    & df["BloodPressure"].between(40, 120)
    & df["SkinThickness"].le(60)
    & df["Insulin"].le(400)
    & df["BMI"].between(15, 50)
)
df1 = df[keep_rows]
# Shape after vs. before filtering.
df1.shape, df.shape

In [None]:
# Preview the first five rows of the filtered data.
df1.head(n=5)

In [None]:
# Annotated correlation matrix of the filtered data on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
corr_matrix = df1.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
# Columns carried forward to modelling (four features plus the target).
best_feats = ["Pregnancies", "Glucose", "BMI", "Age", "Outcome"]

df2 = df1.loc[:, best_feats]
df2.head(n=5)

In [None]:
# Pairwise scatter plots restricted to the selected columns.
pd.plotting.scatter_matrix(frame=df2, figsize=(15, 15))
plt.show()

In [None]:
# Independent (deep) copy so later column transforms do not touch df2.
df3 = df2.copy(deep=True)

In [None]:
# Log-transform Age and Pregnancies.
# Both columns are computed from the untouched `df2` values (df3 is a copy of
# df2), so re-running this cell does not apply the logarithm a second time.
df3["Age"] = np.log(df2["Age"])
# log1p(x) == log(x + 1); the shift keeps rows with zero pregnancies finite.
df3["Pregnancies"] = np.log1p(df2["Pregnancies"])

In [None]:
# Feature matrix: every column except the target. Target vector: Outcome.
X = df3.drop(columns="Outcome").copy()
y = df3.loc[:, "Outcome"].copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column and rebuild a DataFrame with the original
# column names (the new frame gets a fresh default index).
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any
# train/test split -- confirm this is intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
X.head(n=5)

In [None]:
# Number of rows per Outcome class.
df3["Outcome"].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X was standardised with statistics computed over ALL rows, so information
# from the test rows leaked into the features. Standardisation is an affine
# map, so re-fitting a scaler on the training rows alone and applying it to
# both splits yields exactly what a train-only scaler would have produced
# from the unscaled data.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn defaults, fitted on the random split.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [None]:
# Hard class predictions of the logistic regression on the held-out rows.
pred = log_reg.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.metrics import precision_score, recall_score

# Precision of the current predictions on the test split.
precision_score(y_true=y_test, y_pred=pred)

In [None]:
# Recall of the current predictions on the test split.
recall_score(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.metrics import f1_score

# F1 score of the current predictions on the test split.
f1_score(y_true=y_test, y_pred=pred)

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC is a ranking metric and needs continuous scores; computed from hard
# 0/1 predictions it reflects a single threshold only.
# Column 1 of predict_proba is the probability of the positive class (Outcome == 1).
roc_auc_score(y_test, log_reg.predict_proba(X_test)[:, 1])

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with scikit-learn defaults, fitted on the random split.
svm_clf = SVC().fit(X_train, y_train)
pred = svm_clf.predict(X_test)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test, pred))
print("Recall Score:", recall_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))
# ROC AUC needs continuous scores, not 0/1 labels; for the SVC the signed
# distance to the decision boundary is used.
print("ROC AUC Score", roc_auc_score(y_test, svm_clf.decision_function(X_test)))

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with scikit-learn defaults, fitted on the random split.
knn_clf = KNeighborsClassifier().fit(X_train, y_train)
pred = knn_clf.predict(X_test)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=pred)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test, pred))
print("Recall Score:", recall_score(y_test, pred))
print("F1 Score:", f1_score(y_test, pred))
# ROC AUC needs continuous scores, not 0/1 labels; column 1 of predict_proba
# is the estimated probability of the positive class.
print("ROC AUC Score", roc_auc_score(y_test, knn_clf.predict_proba(X_test)[:, 1]))

### Stratified Sampling

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 80/20 split on Outcome with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(sss.split(df3, df3["Outcome"]))
train_set, test_set = df3.iloc[train_index], df3.iloc[test_index]

In [None]:
# Share of positive cases in the train and test splits.
tuple(part["Outcome"].mean() for part in (train_set, test_set))

In [None]:
# Copy each split, then pop the target column out of the copy: the remaining
# frame holds the features and the popped Series is the target.
X_train1 = train_set.copy()
y_train1 = X_train1.pop("Outcome")

X_test1 = test_set.copy()
y_test1 = X_test1.pop("Outcome")

In [None]:
# Fit a fresh scaler on the training features and standardise them; the result
# is rebuilt as a DataFrame with the original column names.
scaler = StandardScaler()
scaled_train = scaler.fit_transform(X_train1)
X_train1 = pd.DataFrame(scaled_train, columns=X_train1.columns)
X_train1.head(n=5)

In [None]:
# Standardise the test features with the scaler already fitted on the training
# split. Calling fit_transform here would re-estimate the mean and variance
# from the test rows, so train and test would be scaled with different
# parameters and the evaluation would use test-set statistics.
X_test1 = pd.DataFrame(scaler.transform(X_test1), columns=X_test1.columns)
X_test1.head()

In [None]:
# Logistic regression with scikit-learn defaults on the stratified split.
log_reg1 = LogisticRegression().fit(X_train1, y_train1)
pred1 = log_reg1.predict(X_test1)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test1, y_pred=pred1)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test1, pred1))
print("Recall Score:", recall_score(y_test1, pred1))
print("F1 Score:", f1_score(y_test1, pred1))
# ROC AUC needs continuous scores, not 0/1 labels; column 1 of predict_proba
# is the probability of the positive class.
print("ROC AUC Score", roc_auc_score(y_test1, log_reg1.predict_proba(X_test1)[:, 1]))

In [None]:
# Support-vector classifier with scikit-learn defaults on the stratified split.
svm_clf1 = SVC().fit(X_train1, y_train1)
pred1 = svm_clf1.predict(X_test1)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test1, y_pred=pred1)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test1, pred1))
print("Recall Score:", recall_score(y_test1, pred1))
print("F1 Score:", f1_score(y_test1, pred1))
# ROC AUC needs continuous scores, not 0/1 labels; for the SVC the signed
# distance to the decision boundary is used.
print("ROC AUC Score", roc_auc_score(y_test1, svm_clf1.decision_function(X_test1)))

In [None]:
# k-nearest-neighbours classifier with scikit-learn defaults on the stratified split.
knn_clf1 = KNeighborsClassifier().fit(X_train1, y_train1)
pred1 = knn_clf1.predict(X_test1)

In [None]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test1, y_pred=pred1)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test1, pred1))
print("Recall Score:", recall_score(y_test1, pred1))
print("F1 Score:", f1_score(y_test1, pred1))
# ROC AUC needs continuous scores, not 0/1 labels; column 1 of predict_proba
# is the estimated probability of the positive class.
print("ROC AUC Score", roc_auc_score(y_test1, knn_clf1.predict_proba(X_test1)[:, 1]))

### With Stratified Shuffle split, the performance of Logistic Regression improved significantly

## Tree Based Algorithms

In [None]:
# Rebuild the unscaled feature matrix and target from df3.
X, y = df3.drop(columns=["Outcome"]).copy(), df3["Outcome"].copy()

In [None]:
# Single stratified 80/20 split on Outcome with a fixed seed.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(sss.split(df3, df3["Outcome"]))
train_set, test_set = df3.iloc[train_index], df3.iloc[test_index]

# Features: everything except the target column (no scaling applied here).
X_train2 = train_set.drop(columns="Outcome").copy()
X_test2 = test_set.drop(columns="Outcome").copy()

# Targets.
y_train2 = train_set.loc[:, "Outcome"].copy()
y_test2 = test_set.loc[:, "Outcome"].copy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed (same value
# as the splits above) because the forest is randomised; without it the
# reported metrics change on every run.
rf_clf = RandomForestClassifier(random_state=42)
rf_clf.fit(X_train2, y_train2)

In [None]:
# Hard class predictions of the random forest on the stratified test split.
pred2 = rf_clf.predict(X=X_test2)

# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test2, y_pred=pred2)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test2, pred2))
print("Recall Score:", recall_score(y_test2, pred2))
print("F1 Score:", f1_score(y_test2, pred2))
# ROC AUC needs continuous scores, not 0/1 labels; column 1 of predict_proba
# is the forest's probability for the positive class.
print("ROC AUC Score", roc_auc_score(y_test2, rf_clf.predict_proba(X_test2)[:, 1]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Base estimators for the ensemble, all with default hyper-parameters.
log_clf = LogisticRegression()
# The forest is randomised, so its seed is fixed to keep the ensemble's
# results reproducible across runs.
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC()

# Hard voting: the ensemble predicts the class chosen by the majority of the
# three base estimators.
voting_clf = VotingClassifier(estimators=[('lr', log_clf),('rf', rnd_clf), ('svc', svm_clf)],voting='hard')

voting_clf.fit(X_train2, y_train2)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each estimator on the training split and print its class name together
# with its accuracy on the test split.
for model in (log_clf, rnd_clf, svm_clf, voting_clf):
    model.fit(X_train2, y_train2)
    print(type(model).__name__, accuracy_score(y_test2, model.predict(X_test2)))

In [None]:
# Refit the logistic regression and predict the test split (fit returns the estimator).
pred3 = log_clf.fit(X_train2, y_train2).predict(X_test2)
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test2, y_pred=pred3)

In [None]:
# Threshold-based metrics use the hard predictions.
print("Precision Score:", precision_score(y_test2, pred3))
print("Recall Score:", recall_score(y_test2, pred3))
print("F1 Score:", f1_score(y_test2, pred3))
# ROC AUC needs continuous scores, not 0/1 labels; column 1 of predict_proba
# is the probability of the positive class.
print("ROC AUC Score", roc_auc_score(y_test2, log_clf.predict_proba(X_test2)[:, 1]))

In [None]:
# Majority-vote predictions of the ensemble on the test split.
pred4 = voting_clf.predict(X=X_test2)
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test2, y_pred=pred4)

In [None]:
# Print each metric for the ensemble's predictions, in a fixed order.
# NOTE(review): the ROC AUC here is computed from hard 0/1 votes; the ensemble
# is built with voting='hard', so it offers no probability scores to use instead.
for label, metric in (
    ("Precision Score:", precision_score),
    ("Recall Score:", recall_score),
    ("F1 Score:", f1_score),
    ("ROC AUC Score", roc_auc_score),
):
    print(label, metric(y_test2, pred4))

# Conclusion

* We started with importing the dataset and exploring it to understand the data more closely.
* As there were some outliers in the dataset, we decided to remove them to create a more generic model.
* Because the data was slightly imbalanced, we tried both Random Sampling and Stratified Sampling.
* Models created - Logistic Regression, K-Nearest Neighbors, SVC, Random Forest and we also tried a custom ensemble model. Of all these models, we observed that it was Logistic Regression which performed the best.