In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from matplotlib.pyplot import rcParams
import seaborn as sn
%matplotlib inline
import seaborn as sns

In [None]:
# Load the diabetes CSV (path relative to the notebook) into a DataFrame.
diabete = pd.read_csv('../input/diabete/diabetes.csv')

In [None]:
# Preview the first rows (DataFrame.head() returns 5 rows by default).
diabete.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
diabete.shape

In [None]:
# Per-column count of missing (NaN) entries.
# NOTE(review): this only detects NaN; placeholder values standing in for
# missing measurements would not be counted here - confirm against the data.
diabete.isna().sum()

In [None]:
# Print column names, non-null counts and dtypes for the frame.
diabete.info()
# Column reference (units in parentheses).
# NOTE(review): only the columns listed below are described; any other
# column reported by info() is not documented in this notebook.
# BloodPressure: Diastolic blood pressure (mm Hg)
# SkinThickness: Triceps skin fold thickness (mm)
# Insulin: 2-Hour serum insulin (mu U/ml)
# BMI: Body mass index (weight in kg/(height in m)^2)
# DiabetesPedigreeFunction: Diabetes pedigree function
# Age: Age (years)
# Outcome: Class variable (0 or 1)

In [None]:
# Draw one histogram per numeric column on a 10x10 inch figure, then show
# the summary statistics as the cell's output.
diabete.hist(figsize=(10,10))
diabete.describe()

In [None]:
# Same overview (histograms + summary statistics), restricted to the rows
# whose Outcome equals 1.
is_diabete = diabete.loc[diabete['Outcome'].eq(1)]
is_diabete.hist(figsize=(10, 10))
is_diabete.describe()

In [None]:
# Pairwise relationships between columns, coloured by Outcome.
# corner=True keeps only the lower triangle of the grid.
sns.pairplot(
    data=diabete,
    hue="Outcome",
    corner=True,
)
plt.show()

In [None]:
# Annotated correlation matrix of all columns; the colour scale is pinned
# to [-1, 1] so that colours are comparable across cells.
plt.figure(figsize=(15, 7))
sns.heatmap(
    diabete.corr(),
    annot=True,
    cmap="YlGnBu",
    linewidths=.5,
    vmin=-1,
    vmax=1,
)
plt.show()

In [None]:
# Parallel-coordinates plot: one line per row, coloured by its Outcome class.
# NOTE(review): all columns share one y-axis in their raw units, so columns
# with a wide value range may dominate the picture - consider scaling first.
plt.figure(figsize=(15,5))
pd.plotting.parallel_coordinates(diabete,'Outcome',color=('gold','red'))
plt.xticks(rotation=45)
# Render explicitly (as the other plotting cells do) so the cell shows the
# figure instead of echoing the return value of plt.xticks().
plt.show()

In [None]:
# Class balance of the target: bar chart first, raw counts as cell output.
diabete['Outcome'].value_counts().plot(kind='bar')
diabete['Outcome'].value_counts()

In [None]:
# Random Forest
from sklearn.model_selection import train_test_split

# Target is the Outcome column; the features are every other column.
y_diabete = diabete['Outcome']
x_diabete = diabete.drop(columns=['Outcome'])

featurename = x_diabete.columns
outcome_value = ["0","1"]

# 80/20 split, seeded for repeatability and stratified on the target so both
# parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    x_diabete,
    y_diabete,
    test_size=0.2,
    random_state=1234,
    stratify=y_diabete,
)
print('training set = {} records, test set= {} records'.format(len(X_train), len(X_test)))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so that a Restart-and-Run-All reproduces the same model,
# metrics and feature importances (the bootstrap samples and per-split
# feature subsets are otherwise drawn differently on every run).
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [None]:
# Hard class labels and per-class probabilities for both splits.
rf_y_pred_train, rf_y_predprob_train = rf.predict(X_train), rf.predict_proba(X_train)
rf_y_pred_test, rf_y_predprob_test = rf.predict(X_test), rf.predict_proba(X_test)

# Show the prediction for the test row at position 1 as an example.
print('Test set record')
print('Class prediction : {}'.format(rf_y_pred_test[1]))
print('Probability prediction : {}'.format(rf_y_predprob_test[1]))

In [None]:
# Rank the features by the fitted forest's importance score (rounded to three
# decimals), highest first.
importances = list(rf.feature_importances_)
feature_importances = sorted(
    ((feature, round(importance, 3)) for feature, importance in zip(featurename, importances)),
    key=lambda pair: pair[1],
    reverse=True,
)
# Plain loop: printing is a side effect, so no throwaway list of None is
# built (and no trailing ';' is needed to hide it).
for feature, importance in feature_importances:
    print('Variable: {:20} Importance: {}'.format(feature, importance))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, balanced_accuracy_score, classification_report, precision_score, recall_score

def cm_plot(cm):
    """Draw a 2x2 binary confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : 2x2 array-like, indexable as ``cm[i][j]``
        Row ``i`` is the true class and column ``j`` the predicted class,
        with index 0 = 'Not Diabete' and index 1 = 'Diabete', so the cells
        are read as ``[[TN, FP], [FN, TP]]``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.

    Notes
    -----
    The current pyplot figure is cleared before drawing. The large bold font
    is applied through ``plt.rc_context`` so that it covers the whole figure
    (title, axis labels, ticks and annotations) but does not leak into the
    global ``rcParams`` and restyle every later plot in the notebook.
    """
    class_names = ['Not Diabete', 'Diabete']
    cell_tags = [['TN', 'FP'], ['FN', 'TP']]
    tick_marks = np.arange(len(class_names))

    with plt.rc_context({'font.size': 16, 'font.weight': 'bold'}):
        plt.clf()
        plt.imshow(cm, interpolation='nearest', cmap=plt.cm.coolwarm)
        plt.title('Confusion Matrix')
        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        plt.xticks(tick_marks, class_names, rotation=45)
        plt.yticks(tick_marks, class_names)
        # grid(False) switches the grid off unconditionally; grid(None)
        # would *toggle* it, turning it on whenever it was off before.
        plt.grid(False)
        for i in range(2):
            for j in range(2):
                # Centre each annotation on its cell so the text does not
                # run off the right-hand edge of the image.
                plt.text(j, i, str(cell_tags[i][j]) + " = " + str(cm[i][j]),
                         ha='center', va='center')
        # Render while the scoped font settings are still active.
        plt.show()

In [None]:
# Accuracy (in percent) of the fitted forest on each split.
print("Training set | Accuracy is", 100 * accuracy_score(y_true=y_train, y_pred=rf_y_pred_train))
print("Test set     | Accuracy is", 100 * accuracy_score(y_true=y_test, y_pred=rf_y_pred_test))

In [None]:
# Model Evaluation
# Accuracy, balanced accuracy (both in percent), confusion matrix and the
# per-class precision/recall/F1 report are computed for each split.

# --- Test set ---------------------------------------------------------------
test_acc = accuracy_score(y_test, rf_y_pred_test) * 100
test_bl_acc = balanced_accuracy_score(y_test, rf_y_pred_test) * 100
test_cm = confusion_matrix(y_test, rf_y_pred_test)
test_report = classification_report(y_test, rf_y_pred_test)
# The accuracy and balanced accuracy of the test set
print("Test set | Accuracy is", test_acc)
print("Test set | Balance Accuracy is", test_bl_acc)
cm_plot(test_cm)
print(test_report)

# --- Training set -----------------------------------------------------------
# Stored under train_* names so the test_* results above are not overwritten
# with training-set values for any later cell that reads them.
train_acc = accuracy_score(y_train, rf_y_pred_train) * 100
train_bl_acc = balanced_accuracy_score(y_train, rf_y_pred_train) * 100
train_cm = confusion_matrix(y_train, rf_y_pred_train)
train_report = classification_report(y_train, rf_y_pred_train)
# The accuracy and balanced accuracy of the training set
print("Training set | Accuracy is", train_acc)
print("Training | Balance Accuracy is", train_bl_acc)
cm_plot(train_cm)
print(train_report)