# Diabetes Prediction: Random Forest Classification Algorithm

### 1) Importing required libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,roc_curve,make_scorer
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV
%matplotlib inline

### 2) Reading the dataset 

In [None]:
# Read the diabetes CSV (path is relative to the notebook's working directory) into a DataFrame.
DATA_PATH = "../input/pima-indians-diabetes-database/diabetes.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows (five is the default for head()).

data.head(n = 5)

Here, Outcome is the target (dependent) variable and all other columns are predictor (independent) variables.

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns).
data.shape

In [None]:
# Column labels as a plain Python list.
list(data.columns)

In [None]:
# Data type of each column.
data.dtypes

In [None]:
# Number of NaN entries per column (isna is the same check as isnull).
data.isna().sum()

### Summary of dataframe

In [None]:
# Print a concise summary of the frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

Some of the variables (Glucose, BloodPressure, SkinThickness, Insulin, BMI) have a minimum value of 0, which is not physiologically possible.

In this dataset, missing values in these variables are recorded as 0, so we mark them as missing by replacing 0 with NaN.

In [None]:
# Columns in which a recorded 0 is treated as a missing measurement.
zero_as_missing = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']

# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
data[zero_as_missing] = data[zero_as_missing].replace(0,np.nan)

In [None]:
# Per-column NaN counts, re-checked after zeros were marked as missing.
data.isna().sum()

There are 5 variables with missing values.
These are now filled with a per-column statistic: the median for Glucose, BloodPressure, SkinThickness and Insulin, and the mean for BMI.

In [None]:
# Fill value per column: median for four of the columns, mean for BMI.
# Each statistic is computed on that column's non-missing values.
# NOTE(review): the statistics use the full dataset, i.e. before the train/test
# split, so test rows influence the imputation values.
fill_values = {
    'Glucose': data['Glucose'].median(),
    'BloodPressure': data['BloodPressure'].median(),
    'SkinThickness': data['SkinThickness'].median(),
    'Insulin': data['Insulin'].median(),
    'BMI': data['BMI'].mean(),
}

# Assign the filled column back instead of calling fillna(inplace = True) on
# data[col]: that chained in-place form acts on a temporary object, emits a
# FutureWarning on pandas 2.x and leaves `data` unchanged under Copy-on-Write.
for column, value in fill_values.items():
    data[column] = data[column].fillna(value)

In [None]:
# Per-column NaN counts, re-checked after imputation.
data.isna().sum()

The missing values have now been filled using the methods specified above.

### Getting correlations between the features in the dataframe

In [None]:
# Pairwise correlation matrix of all columns (corr() defaults to Pearson).
corrmat = data.corr()
top_corr_features = corrmat.index

# Plotting heat map
# corrmat is passed directly: top_corr_features is the full column index, so
# data[top_corr_features].corr() would only recompute the same matrix.
fig, ax = plt.subplots(figsize = (15,20))
g = sns.heatmap(corrmat,annot = True,cmap = "RdYlGn",ax = ax)

In [None]:
# Pairwise correlation between every pair of columns (Pearson, the corr() default).

data.corr(method = "pearson")

In [None]:
# Distinct values observed in the Pregnancies column, in order of first appearance.

pregnancies = data.loc[:, 'Pregnancies']
pregnancies.unique()

In [None]:
# Frequency of each Pregnancies value, ordered from least to most frequent.

pregnancy_counts = data['Pregnancies'].value_counts()
pregnancy_counts.sort_values(ascending = True)

### Grouping predictor variables by target variable 

In [None]:
# Max, min and mean of the first three predictors within each Outcome class.
summary_stats = ['max','min','mean']
first_predictors = ["Pregnancies","Glucose","BloodPressure"]
data.groupby("Outcome")[first_predictors].agg(summary_stats)

In [None]:
# Max, min and mean of four further predictors within each Outcome class.
summary_stats = ['max','min','mean']
further_predictors = ["SkinThickness","Insulin","BMI","Age"]
data.groupby("Outcome")[further_predictors].agg(summary_stats)

In [None]:
# Number of rows per Outcome value (shows the class balance of the target).

outcome_counts = data['Outcome'].value_counts()
outcome_counts

In [None]:
# One histogram per column, drawn on a 15x20 inch figure.

hist_figsize = (15,20)
p = data.hist(figsize = hist_figsize)

### Creating Predictor Matrix

In [None]:
# Predictor matrix: every column except the target.
X = data.drop(columns = 'Outcome')

In [None]:
# First five rows of the predictor matrix.
X.head(n = 5)

### Target variable

In [None]:
# Target vector: the Outcome column as a Series.
y = data.loc[:, 'Outcome']

In [None]:
# First five values of the target variable.

y.head(n = 5)

In [None]:
# Hold out 25% of the rows as the test set; the remaining 75% are used for training.
# random_state pins the shuffle so the same split is produced on every run.
# NOTE(review): stratify is not set, so class proportions may differ between the two subsets.

split_parts = train_test_split(X,y,test_size = 0.25,random_state = 200)
X_train,X_test,y_train,y_test = split_parts

In [None]:
# Shapes of the four subsets, in the order X_train, X_test, y_train, y_test.

tuple(part.shape for part in (X_train,X_test,y_train,y_test))

### Instantiating random forest classifier

In [None]:
# Baseline forest: out-of-bag scoring enabled, all CPU cores used (n_jobs = -1),
# and a fixed seed so the trees are reproducible.
forest_settings = {"oob_score": True,"n_jobs": -1,"random_state": 100}
clf = RandomForestClassifier(**forest_settings)
clf

Cross-validation scores lie between 0 and 1; higher is better.
Here 5-fold cross-validation is used to estimate how well the model performs in terms of macro-averaged F1 score.

In [None]:
# 5-fold cross-validation of the baseline forest on the training data.
# NOTE(review): "f1_macro" averages F1 over both classes, whereas the later
# f1_score(...) calls report F1 for the positive class only, so the numbers
# are not directly comparable.

scores = cross_val_score(clf,X_train,y_train,scoring = "f1_macro",cv = 5)
scores.mean()

In [None]:
# Train the baseline forest on the training subset.

clf.fit(X = X_train,y = y_train)

In [None]:
# Class predictions from the fitted baseline forest for both subsets.

train_pred,test_pred = [clf.predict(features) for features in (X_train,X_test)]

In [None]:
# Finding F1 score of training and testing sets
# f1_score expects (y_true, y_pred); the arguments were previously reversed.
# Binary F1 is symmetric in its two arguments, so the printed values do not
# change, but the documented order keeps this consistent with the other metric calls.

print("The training F1 score is: ",f1_score(y_train,train_pred))
print("The testing F1 score is :",f1_score(y_test,test_pred))

In [None]:
# Hyperparameter search space: candidate values for each forest setting.

parameters = dict(
    max_depth = [2,3,4],
    n_estimators = [100,104,106],
    min_samples_split = [3,4,5],
    min_samples_leaf = [4,8,9],
)

# Wrap f1_score so the search can use it as its scoring function.
scorer = make_scorer(f1_score)

In [None]:
# Using Randomized Search CV to find the hyperparameter combination that scores best

# random_state fixes which parameter combinations are sampled; without it a
# different set of candidates (and possibly a different best estimator) is
# drawn on every run, so the results below are not reproducible.
clf1 = RandomizedSearchCV(clf,parameters,scoring = scorer,random_state = 100)

# Fitting the model

clf1.fit(X_train,y_train)

# Getting the estimator with the highest mean cross-validated score

best_clf_random = clf1.best_estimator_
best_clf_random

In [None]:
# 5-fold cross-validation (macro-averaged F1) of the tuned forest on the training data.

scores = cross_val_score(best_clf_random,X_train,y_train,scoring = "f1_macro",cv = 5)
scores.mean()

In [None]:
# Train the tuned forest on the training subset.
# NOTE(review): with the search's default refit behaviour, best_estimator_ is
# presumably already fitted on this data — confirm whether this refit is needed.

best_clf_random.fit(X = X_train,y = y_train)

In [None]:
# First fitted tree of the tuned forest.

first_tree = best_clf_random.estimators_[0]
first_tree

In the same way, you can access any of the individual estimators (decision trees) that together make up the random forest classifier.

In [None]:
# Class predictions from the tuned forest for both subsets.

train_pred,test_pred = [best_clf_random.predict(features) for features in (X_train,X_test)]

In [None]:
# Finding the F1 score of training & testing sets
# f1_score expects (y_true, y_pred); the arguments were previously reversed.
# Binary F1 is symmetric in its two arguments, so the printed values do not
# change, but the documented order keeps this consistent with the other metric calls.

print("The training F1 score is: ",f1_score(y_train,train_pred))
print("The testing F1 score is :",f1_score(y_test,test_pred))

In [None]:
# Fraction of test rows whose predicted class matches the true class.

accuracy_score(y_true = y_test,y_pred = test_pred)

In [None]:
# Area under the ROC curve, computed from predicted probabilities rather than hard labels.
# Column 1 of predict_proba is the probability of the second class
# (presumably Outcome == 1 — confirm against best_clf_random.classes_).

positive_proba = best_clf_random.predict_proba(X_test)[:,1]
roc_auc_score(y_test,positive_proba)

In [None]:
# ROC curve of the tuned forest on the test set.

positive_proba = best_clf_random.predict_proba(X_test)[:,1]
fpr,tpr,thresholds = roc_curve(y_test,positive_proba)

fig,ax = plt.subplots()
# Dashed black diagonal from (0,0) to (1,1) as a reference line.
ax.plot([0,1],[0,1],'k--')
ax.plot(fpr,tpr)
ax.set_xlabel('fpr')
ax.set_ylabel('tpr')
plt.show()

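An ROC curve that rises well above the dashed diagonal towards the top-left corner (i.e. an AUC close to 1) indicates that the model separates the two classes well.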

In [None]:
# Confusion matrix as a cross-tabulation of true vs. predicted labels,
# with row and column totals (margins).

pd.crosstab(
    index = y_test,
    columns = test_pred,
    rownames = ['True'],
    colnames = ['Predicted'],
    margins = True,
)

In [None]:
# Confusion matrix of the test predictions, drawn as an annotated heat map.

cnf_matrix = confusion_matrix(y_true = y_test,y_pred = test_pred)
cnf_frame = pd.DataFrame(cnf_matrix)
# fmt = 'g' prints the cell counts in general number format.
p = sns.heatmap(cnf_frame,annot = True,cmap = "YlGnBu",fmt = 'g')
plt.title("Confusion Matrix",y = 1.1)
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')

In [None]:
# Precision on the test set: the share of predicted positives that are true positives.

precision_score(y_true = y_test,y_pred = test_pred)

In [None]:
# Recall on the test set: the share of actual positives that were predicted positive.

recall_score(y_true = y_test,y_pred = test_pred)

In [None]:
# Feature importances of the tuned forest, labelled by predictor name and
# listed from most to least important.

importances = best_clf_random.feature_importances_
imp_features = pd.Series(data = importances,index = X.columns)
imp_features.sort_values(ascending = False)

In [None]:
# Bar chart of the feature importances, most important first.

ranked_importances = imp_features.sort_values(ascending = False)
ranked_importances.plot.bar()

## Thank you
## If you find this notebook useful, **upvote** it
## Feel free to ask any queries and any suggestions for improving my kernel are welcome!