In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# in the order os.walk visits them, then print one path per line.
input_file_paths = [
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Content
- [Importing the required libraries](#Importing-the-required-libraries)
- [Reading the dataset into a dataframe](#Reading-the-dataset-into-a-dataframe)
- [Data Preparation](#Data-Preparation)
    - [Encoding the categorical features](#Encoding-the-categorical-features)
    - [Visualizing the correlation matrix in the form of heatmap](#Visualizing-the-correlation-matrix-in-the-form-of-heatmap)
    - [Separating our independent and dependent variables](#Separating-our-independent-and-dependent-variables)
    - [Selecting features with chi2 test](#Selecting-features-with-chi2-test)
    - [Splitting the dataframe into training and testing](#Splitting-the-dataframe-into-training-and-testing)
- [Model Selection](#Model-Selection)
    - [Evaluating the model using k-fold cross validation](#Evaluating-the-model-using-k-fold-cross-validation)
    - [Finding the best parameters using GridSearchCV](#Finding-the-best-parameters-using-GridSearchCV)
    - [Printing the results](#Printing-the-results)
- [Confusion matrix and Analysis](#Confusion-matrix-and-Analysis)

## Importing the required libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix , classification_report
from sklearn.model_selection import KFold , cross_val_score
from numpy import mean
from numpy import std
from sklearn.model_selection import GridSearchCV

## Reading the dataset into a dataframe

In [None]:
# Location of the diabetes CSV inside the Kaggle input directory.
DATASET_PATH = '../input/diabetes-uci-dataset/diabetes.csv'

# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

## Data Preparation

### Encoding the categorical features

In [None]:
# Map every categorical string value to a binary code. The replacement is
# applied to the whole frame, so any cell equal to one of these keys is
# converted regardless of which column it is in.
binary_codes = {
    'Male': 1,
    'Female': 0,
    'Yes': 1,
    'No': 0,
    'Positive': 1,
    'Negative': 0,
}
df = df.replace(binary_codes)
df.head()

### Visualizing the correlation matrix in the form of heatmap

In [None]:
# Pairwise correlation between all columns of the encoded dataframe.
cor_mat = df.corr()
plt.figure(figsize=(15,10))
# Labels of every column that takes part in the correlation matrix.
top_corr_features = cor_mat.index
# Plot the correlation matrix itself. The previous version called .corr()
# on cor_mat a second time, which drew the correlation *of the correlation
# coefficients* instead of the correlation between the features.
sns.heatmap(cor_mat[top_corr_features],annot=True,cmap="RdYlGn")
plt.title("Correlation between features and target");

### Separating our independent and dependent variables

In [None]:
# 'class' is the dependent variable; every remaining column is an
# independent variable.
y = df['class']
X = df.drop(columns=['class'])

### Selecting features with chi2 test

In [None]:
# Score every feature against the target with the chi-squared statistic.
# Only the fitted scores are used here; the selector is never asked to
# transform X, so the value of k does not influence the printed table.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# One row per feature: its name next to its chi-squared score.
featureScores = pd.DataFrame({
    'features': X.columns,
    'Score': fit.scores_,
})

# Show up to 16 features, highest score first.
print(featureScores.nlargest(16, 'Score'))

### Splitting the dataframe into training and testing

In [None]:
# Features retained for modelling.
selected_features = [
    'Polydipsia',
    'Polyuria',
    'sudden weight loss',
    'partial paresis',
    'Gender',
    'Irritability',
    'Polyphagia',
    'Alopecia',
]
X = df[selected_features]

# Hold out 20% of the rows for testing; the fixed seed keeps the shuffled
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
    shuffle=True,
)

## Model Selection

### Evaluating the model using k-fold cross validation

In [None]:
# Seed the forest so that the cross-validation scores (and any later search
# that reuses `rf`) are reproducible from a fresh kernel.
rf = RandomForestClassifier(random_state=1)
# 5-fold split of the training data, shuffled with a fixed seed.
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# One F1 score per fold, computed on the training data only.
scores = cross_val_score(rf, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1)
# The metric is F1 (see `scoring` above), so label it as such rather than
# as accuracy.
print('F1 score : '+ str(mean(scores))+' Std Deviation :'+str(std(scores)))
print(scores)

### Finding the best parameters using GridSearchCV

In [None]:
# Hyper-parameter grid for the random forest.
# 'auto' is intentionally not listed under max_features: for classifiers it
# was only an alias of 'sqrt', it was deprecated in scikit-learn 1.1 and it
# is rejected as an invalid value from 1.3 onwards, so keeping it would at
# best duplicate the 'sqrt' candidates and at worst make those fits fail.
parameters = [{'n_estimators':[10,50,100,150,200,250],'criterion':['gini','entropy'],'max_features':['sqrt','log2']}]

# Exhaustive search over the grid, ranking candidates by F1 score.
clf = GridSearchCV(rf,parameters,scoring='f1')
clf.fit(X_train,y_train)

### Printing the results

In [None]:
# Predict the held-out test rows with the fitted grid search and print the
# per-class metrics for those predictions.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## Confusion matrix and Analysis

In [None]:
# False negatives must be kept low: predicting a positive patient as
# negative is a risk we cannot take. False positives are more acceptable
# because they can be corrected afterwards.
test_predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, test_predictions)
cm

# The End
`If you liked this notebook, don't forget to upvote. Suggestions are always welcome.`
`Follow me on LinkedIn:` __[Atharva_Dumbre](https://www.linkedin.com/in/atharva-dumbre-208b5716b)__