In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.offline as py
import plotly.express as px
import plotly.io as pio
import plotly.graph_objs as go
import math
from scipy.stats import norm, skew

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the Pima Indians diabetes CSV into a DataFrame (path is relative to the notebook's
# working directory) and print its column dtypes / non-null counts.
df_diab=pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
df_diab.info()

In [None]:
# Preview the first five rows of the raw data.
df_diab.head()

In [None]:
# Distribution of each predictor on a 4x2 grid: one panel per feature, each drawn with
# its own colour and rug marks along the axis.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (histplot/displot replace it);
# it is kept here so the rendered figures stay exactly as before.
dist_panels = [
    ('Pregnancies', '#38b000'),
    ('Glucose', '#FF9933'),
    ('BloodPressure', '#522500'),
    ('SkinThickness', '#66b3ff'),
    ('Insulin', '#FF6699'),
    ('BMI', '#e76f51'),
    ('DiabetesPedigreeFunction', '#03045e'),
    ('Age', '#333533'),
]
fig, axs = plt.subplots(4, 2, figsize=(15,12))
axs = axs.flatten()
for panel, (feature, shade) in zip(axs, dist_panels):
    sns.distplot(df_diab[feature], rug=True, color=shade, ax=panel)
plt.show()

In [None]:
# Box plot of each predictor on a 4x2 grid (one horizontal box per panel) to expose
# outliers and spread.
# The series is passed explicitly as `x=`: a bare positional argument is bound to `x`
# by seaborn < 0.12 but to `data` by newer releases, which changes the box orientation
# (and triggers a FutureWarning on 0.11). The keyword form renders the same on all versions.
box_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]
fig, axs = plt.subplots(4, 2, figsize=(15,12))
axs = axs.flatten()
for panel, feature in zip(axs, box_features):
    sns.boxplot(x=df_diab[feature], ax=panel)
plt.show()

In [None]:
# Hold back 20% of the rows as "unseen" data for a final check; the remaining 80%
# (sampled with a fixed seed) is what the modeling steps work on.
modeling_rows = df_diab.sample(frac=0.8, random_state=42)

# Rows that were not sampled form the unseen set; both frames get a fresh 0..n-1 index.
data_unseen = df_diab.drop(modeling_rows.index).reset_index(drop=True)
data = modeling_rows.reset_index(drop=True)

modeling_shape = str(data.shape)
print('Data for Modeling: ' + modeling_shape)

In [None]:
# Report the (rows, columns) shape of the held-back split.
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

In [None]:
!pip install pycaret

In [None]:
from pycaret.classification import *

In [None]:
# Every column of the modeling frame except the target is declared as a numeric feature.
numeric_features = list(data.columns)
numeric_features.remove('Outcome')  # raises ValueError if the target column is missing
numeric_features

In [None]:
# Initialise the PyCaret classification experiment on the modeling split.
# NOTE(review): several of these arguments exist only in the PyCaret 2.x API — confirm
# the installed version matches.
diabetes = setup(
    # --- data, target and reproducibility ---
    data=data,
    target='Outcome',
    session_id=42,
    train_size=0.8,
    numeric_features=numeric_features,
    log_experiment=True,
    # --- scaling / transformation ---
    normalize=True,
    normalize_method='robust',
    transformation=True,
    # --- missing values ---
    numeric_imputation='median',
    categorical_imputation='mode',
    # --- categorical handling ---
    handle_unknown_categorical=True,
    unknown_categorical_method='most_frequent',
    # levels rarer than the rare_level_threshold param are merged into a single level
    combine_rare_levels=True,
    # categorical features with statistically insignificant variance are removed
    ignore_low_variance=True,
    # --- feature pruning / class balance ---
    # drop one of every two features that are highly correlated with each other
    remove_multicollinearity=True,
    fix_imbalance=True,
)

In [None]:
# Train PyCaret's random-forest classifier ('rf') with the experiment's cross-validation
# settings; the call also displays the per-fold score grid.
rf = create_model('rf')

In [None]:
# Show the fitted estimator and the hyperparameters it was created with.
print(rf)

In [None]:
# Hyperparameter search over the random forest; the candidate with the best
# cross-validated Accuracy is returned.
tuned_rf = tune_model(rf,optimize = 'Accuracy')

In [None]:
# ROC curve / AUC of the tuned model.
plot_model(tuned_rf, plot = 'auc')

In [None]:
# Precision-recall curve of the tuned model.
plot_model(tuned_rf, plot = 'pr')

In [None]:
# Feature-importance chart of the tuned model.
plot_model(tuned_rf, plot='feature')

In [None]:
# Confusion matrix of the tuned model.
plot_model(tuned_rf, plot = 'confusion_matrix')

In [None]:
# Open PyCaret's interactive widget for browsing the diagnostic plots of the tuned model
# (needs a notebook front end that renders ipywidgets).
evaluate_model(tuned_rf)

In [None]:
# No `data` argument is given, so PyCaret scores the hold-out portion that setup()
# split off (train_size=0.8 leaves 20% of `data`) and displays the resulting metrics.
predict_model(tuned_rf)

In [None]:
# Score the rows that were held back before setup(); the returned frame is the input
# plus the prediction columns appended by PyCaret.
unseen_predictions = predict_model(tuned_rf, data=data_unseen)
unseen_predictions.head()

In [None]:
# Mean of the per-row `Score` column on the unseen split, rounded to two decimals.
mean_score = round(unseen_predictions.Score.mean(), 2)
print("Confidence Score :   {}".format(mean_score))