In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the dataset from a CSV file in the working directory and preview the first rows.
df = pd.read_csv("diabetes.csv")
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# Bar chart of how many records fall into each Outcome class
sns.countplot(data=df, x="Outcome")

- Approximately 500 records of non-diabetic patients and around 250 records of diabetic patients

In [None]:
# One boxplot per feature, laid out on a 2x4 grid of axes.
f, axes = plt.subplots(2, 4, figsize=(15, 6))

# Listed in the order the panels are filled: first row left-to-right, then second row.
feature_columns = [
    "Glucose", "Pregnancies", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

# axes.flat walks the 2x4 array row by row, matching the list order above.
for column, panel in zip(feature_columns, axes.flat):
    sns.boxplot(x=column, data=df, palette="PRGn", ax=panel)

plt.show()

- The boxplots clearly show that features like **"Insulin"**, **"DiabetesPedigreeFunction"**, **"BMI"** and **"BloodPressure"** have many outliers, but we are not going to remove them because every patient's data is required for model training and prediction.

In [None]:
# Joint plot over a four-column subset of the data.
# No x/y variables are named, so seaborn receives the sub-frame as-is.
joint_columns = ['Age', 'BloodPressure', 'Glucose', 'BMI']
sns.jointplot(
    data=df[joint_columns],
    height=10,
    ratio=5,
    color="r",
)

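- Here, we can say that **"Glucose"** has a **higher variance** than the other features, whereas **"BMI"** shows a **lower variance**, as its data points are more tightly clustered. (Note that "bias" describes a model's predictions, not the spread of a feature, so it is not the right term here.)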

In [None]:
# Annotated grayscale heatmap of the pairwise column correlations.
f, ax = plt.subplots(figsize=(15, 15))
correlations = df.corr()
sns.heatmap(
    correlations,
    annot=True,       # write the coefficient inside every cell
    fmt='.1f',        # one decimal place
    linewidths=0.5,
    linecolor="black",
    cmap='gray_r',
    ax=ax,
)
plt.show()

In [None]:
# Target vector: the 'Outcome' column; feature matrix: every other column.
y = df['Outcome']
X = df.drop(columns='Outcome')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
# (train_test_split is imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 3-fold cross-validation, run on the training split only.
kfold = model_selection.KFold(n_splits = 3)

# initialize the base classifier
base_cls = DecisionTreeClassifier()

# no. of base classifier
num_trees = 400

# bagging classifier
# random_state fixes the bootstrap sampling and the seeds handed to the trees,
# so the reported accuracy is reproducible across runs.
model = BaggingClassifier(estimator = base_cls,
                          n_estimators = num_trees,
                          random_state = 42)

# cross_val_score fits *clones* of `model` on each fold; `model` itself is left unfitted.
results = model_selection.cross_val_score(model, X_train, y_train, cv = kfold)

# Fit the estimator itself on the full training split so that later cells
# can call model.predict() without raising NotFittedError.
model.fit(X_train, y_train)

print("accuracy :")
print(results.mean())

In [None]:
# First row of the dataset as a Series (this name is assigned again in the next cell).
input_row = df.iloc[0]

In [None]:
# Pick one row of the already-loaded dataset to use as a prediction example.
# `df` comes from the data-loading cell, so the CSV is not read a second time.
# NOTE: the row is taken from the full dataset, so it may be one of the rows
# the model was trained on.
input_row_index = 0
input_row = df.iloc[input_row_index]

print("Input Row:")
print(input_row)

# Keep the features as a one-row DataFrame (double brackets preserve 2-D shape)
# so the column names match the ones the model was fitted with.
input_features = df.drop(columns='Outcome').iloc[[input_row_index]]


In [None]:
# Predict the class label for the single input row; `model` must be fitted before this call.
prediction = model.predict(input_features)

In [None]:
# Show the predicted label(s) returned by model.predict
print("Prediction:", prediction)