### Module Import

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

### Data Import and Basic Exploration

In [None]:
# Column labels come from the file's header row (read_csv default), so no
# explicit `names=` list is passed.
data = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
print(data.shape)

In [None]:
# Show the first rows of the loaded frame.
data.head()

In [None]:
# Show the last rows of the loaded frame.
data.tail()

In [None]:
# List the column labels.
data.columns

### Data Description
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

### columns:
* Pregnancies: Number of times pregnant (Integer)
* Glucose: 
 - Plasma glucose concentration at 2 hours in an oral glucose tolerance test (mg/dL) 
 - Normal Glucose level for Non-diabetes person is between 70-130 mg/dL (Google)
* BloodPressure :
 - Diastolic blood pressure (mm Hg)
 - Normal lower than 80

* SkinThickness :
 - Triceps skin fold thickness (mm)
 - According to some research, skin thickness is related to diabetes
 
* Insulin
 - 2-Hour serum insulin (mu U/ml)
 - low insulin : may indicate type 1 diabetes (the pancreas produces little or no insulin)
 - high insulin : may indicate insulin resistance, which is associated with type 2 diabetes
* BMI
 - Body mass index (weight in kg/(height in m)^2)
 - Normal 18.5 - 24.9
 - As BMI increases, females have a higher chance of diabetes than males
* DiabetesPedigreeFunction 
 - Diabetes pedigree function
 - According to http://www.personal.kent.edu/~mshanker/personal/Zip_files/sar_2000.pdf, the diabetes pedigree function provides “a synthesis of the diabetes mellitus history in relatives and the genetic relationship of those relatives to the subject.”
* Age
 - Age (years)
 - Normally it is seen that as the age increases chance of diabetes increase
* Outcome
 - Whether a person has diabetes or not (0/1)
 - This is our output variable

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

> * The maximum value of Pregnancies is 17, which looks unusually high, and 25% of the samples have more than 6 pregnancies, which is a lot.
> * What does a value of 0 mean in Glucose, BloodPressure, SkinThickness, Insulin and BMI?
 - It doesn't make sense for these measurements to be 0.
> * SkinThickness and Insulin even have more than 25% of their values equal to 0. We will have to look into this later.

In [None]:
# Column dtypes and non-null counts.
data.info()

## Univariate Analysis

In [None]:
# Keep the column labels in a notebook-level name; the plotting loops and the
# feature/target selection in later cells index into it.
columns=data.columns
columns

In [None]:
def distplot(column):
  """Plot the distribution (histogram with a KDE overlay) of one column of `data`.

  Parameters
  ----------
  column : str
      Label of the column in the notebook-level `data` frame to plot.
  """
  values = data[column]
  # sns.distplot is deprecated in favour of histplot/displot. Use histplot
  # (density-normalised, with a KDE curve) when the installed seaborn has it
  # and fall back to the legacy call on older versions.
  if hasattr(sns, 'histplot'):
    sns.histplot(values, kde=True, stat='density')
  else:
    sns.distplot(values)
  plt.show()


In [None]:
# Switch seaborn to the dark-grid theme for the figures drawn from here on.
sns.set_style('darkgrid')

# One distribution plot per column (features and the Outcome label alike).
for col_name in columns:
  distplot(col_name)

In [None]:
# Rows where at least one of these measurements is recorded as 0.
zero_check_cols = ['Glucose', 'BloodPressure', 'Insulin', 'SkinThickness', 'BMI']
has_zero = data[zero_check_cols].eq(0).any(axis=1)
df = data[has_zero]

In [None]:
# Summary statistics of the frame currently bound to `df`.
df.describe()

## Bivariate Analysis

In [None]:
def box(column):
  """Draw a box plot of `column` from `data`, one box per Outcome value, and show it."""
  plot_args = dict(x='Outcome', y=column, data=data)
  sns.boxplot(**plot_args)
  plt.show()

In [None]:
# Box plot of every feature against Outcome. The last column is the Outcome
# label itself (it is what the train/test split uses as the target), and
# plotting it against itself carries no information, so it is skipped.
for column in columns[:-1]:
  box(column)

In [None]:
def violin(column):
  """Draw a violin plot of `column` from `data`, one violin per Outcome value, and show it."""
  plot_args = dict(x='Outcome', y=column, data=data)
  sns.violinplot(**plot_args)
  plt.show()

In [None]:
# Violin plot of every feature against Outcome. The last column is the Outcome
# label itself; a violin of Outcome grouped by Outcome is degenerate, so it is
# left out.
for column in columns[:-1]:
  violin(column)

In [None]:
def scatter(x, y):
  """Scatter two columns of `data` against each other, coloured by Outcome, and show the figure.

  Parameters
  ----------
  x, y : str
      Labels of the columns placed on the horizontal and vertical axes.
  """
  sns.scatterplot(data=data, x=x, y=y, hue='Outcome', marker='x')
  plt.show()

In [None]:
# Scatter the first column against each of the columns at positions 1..7.
first_col = columns[0]
for other_col in columns[1:8]:
  scatter(first_col, other_col)

### Count the number of 0's in each column

In [None]:
# Count the zero entries of every column in one vectorised pass. Filling a
# frame cell by cell through .loc is slow and leaves the counts as floats;
# summing the boolean mask keeps them as integers.
df = (data[columns] == 0).sum().to_frame('count')
df

>> * Pregnancies can legitimately be 0, so we don't need to change it.
>> * The other columns (except Outcome) shouldn't contain 0.
>> * So we are going to replace each 0 with the mean of that column within the same class.

In [None]:
update_column = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# The replacement values are fractional means, so store these columns as floats
# up front instead of writing floats into integer columns.
data[update_column] = data[update_column].astype(float)

for column in update_column:
  missing = data[column] == 0
  # Per-class mean of the valid readings only. Averaging over the whole class
  # would include the 0 placeholders and pull the imputed value towards 0.
  class_means = data.loc[~missing].groupby('Outcome')[column].mean()
  # Each placeholder gets the mean of the class its row belongs to.
  data.loc[missing, column] = data.loc[missing, 'Outcome'].map(class_means)

# NOTE(review): the fill value depends on Outcome and is computed on the full
# frame before any train/test split, so label information leaks into the
# features. Confirm this is acceptable for the intended evaluation.
data

In [None]:
# Summary statistics again, this time after the zero replacement in the previous cell.
data.describe()

In [None]:
# Redraw the violin plots for every feature. The last column is the Outcome
# label itself, which is uninformative when grouped by Outcome, so it is skipped.
for column in columns[:-1]:
  violin(column)

### Model Fitting

> #### First, let's try to fit a Decision Tree without any parameter tuning and see how it performs

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
feature_frame = data[columns[:-1]]
target_series = data.iloc[:, -1]

# Fixed seed so the same rows land in train and test on every run.
x_train, x_test, y_train, y_test = train_test_split(
    feature_frame, target_series, random_state=77)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: an untuned decision tree. The tree builder permutes features
# randomly at each split, so the seed is fixed to make the reported accuracies
# reproducible across runs.
dec_model = DecisionTreeClassifier(random_state=77)
dec_model.fit(x_train, y_train)
print("Train Accuracy: ", dec_model.score(x_train,y_train))
print("Test Accuracy: ", dec_model.score(x_test, y_test))

>> * It surely overfits the data
>> * But still 0.895 test accuracy is nice


> #### Decision Tree with parameter tuning

> * Let's try to fit a Decision Tree with parameter tuning; it should perform better than the untuned tree

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

params = {'criterion':['gini', 'entropy'],
          'max_depth': [5, 10, 20, 25, 30],
          'max_features': [3, 5, 7, 9],
          'max_leaf_nodes': [2,5,6, 9, 10, 15],
          'splitter': ['best', 'random']}
grid = GridSearchCV(DecisionTreeClassifier(), params, cv=10)
grid.fit(x_train, y_train)
print(grid.best_params_)
print("Score: ", grid.best_score_)
#grid

In [None]:
# Score of the search's refit best estimator on the held-out test split.
grid.score(x_test, y_test)

>> #### Visualize Tree

In [None]:
from sklearn import tree

plt.figure(figsize=(18, 10))
# Label the nodes with the real column names and the No/Yes class names used
# elsewhere in this notebook; without them the plot shows anonymous x[i]
# indices. The trailing semicolon hides the list of annotation objects that
# plot_tree returns.
tree.plot_tree(grid.best_estimator_,
               feature_names=list(x_train.columns),
               class_names=['No', 'Yes'],
               filled=True);

In [None]:
# Display the fitted search object.
grid

In [None]:
# Display the estimator refit with the best parameter combination found.
grid.best_estimator_

>> ##### Feature Importances

In [None]:
# Pair each importance with its column label (the first eight columns), then
# draw them as a horizontal bar chart ordered by nlargest.
tree_importances = pd.Series(grid.best_estimator_.feature_importances_, index=columns[:8])
tree_importances.nlargest(8).plot(kind='barh')


In [None]:
# Re-run the decision-tree search on three selected features only.
selected_features = ['Glucose', 'Age', 'Insulin']
x_train_updated = x_train[selected_features]
x_test_updated = x_test[selected_features]

# Only three columns are available here, so max_features candidates above that
# count are dropped rather than searched as invalid settings.
n_features = x_train_updated.shape[1]
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [5, 10, 20, 25, 30],
          'max_features': [m for m in (3, 5, 7, 9) if m <= n_features],
          'max_leaf_nodes': [2, 5, 6, 9, 10, 15],
          'splitter': ['best', 'random']}

# Seeded estimator so the search result is reproducible.
grid = GridSearchCV(DecisionTreeClassifier(random_state=77), params, cv=10)
grid.fit(x_train_updated, y_train)
print(grid.best_params_)
print("Score: ", grid.best_score_)

In [None]:
# Score of the refit best estimator on the test split restricted to the same three features.
grid.score(x_test_updated, y_test)

> #### Random Forest with parameter tuning

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

params = params = {'criterion':['gini', 'entropy'],
          'max_depth': [5, 10, 20, 25, 30],
          'max_features': [3, 5, 7, 9],
          'max_leaf_nodes': [2,5,6, 9, 10, 15],
          #'splitter': ['best', 'random'],
          'n_estimators':[1,3,5,10]}
random_grid = GridSearchCV(RandomForestClassifier(), params, cv=10)
random_grid.fit(x_train, y_train)
print(random_grid.best_params_)
print(random_grid.best_score_)

In [None]:
# Score of the refit best random forest on the held-out test split.
random_grid.score(x_test, y_test)

In [None]:
# Keep a handle on the refit best random forest; the confusion-matrix cell predicts with it.
clf = random_grid.best_estimator_

In [None]:
# Pair each importance of the best forest with its column label (the first
# eight columns) and draw them as a horizontal bar chart.
forest_importances = pd.Series(random_grid.best_estimator_.feature_importances_, columns[:8])
forest_importances.nlargest(8).plot(kind='barh')

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the actual classes and columns the predicted ones, in label order 0, 1.
con_matrix = confusion_matrix(y_test, clf.predict(x_test))

# Let the heatmap place the class labels itself instead of hard-coding tick
# positions, and fix the misspelled figure title.
class_labels = ['No', 'Yes']
ax = sns.heatmap(con_matrix, annot=True, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix');

* It performs nicely, but let's see if we can make it better

In [None]:
# Number of rows per Outcome value, i.e. how balanced the two classes are.
data['Outcome'].value_counts()

* Let's try setting the ```class_weight``` parameter to ```'balanced'```

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

params = params = {'criterion':['gini', 'entropy'],
          'max_depth': [5, 10, 20, 25, 30],
          'max_features': [3, 5, 7, 9],
          'max_leaf_nodes': [2,5,6, 9, 10, 15],
          #'splitter': ['best', 'random'],
          'n_estimators':[1,3,5,10],
          'class_weight':['balanced']}
random_grid = GridSearchCV(RandomForestClassifier(), params, cv=10)
random_grid.fit(x_train, y_train)
print(random_grid.best_params_)
print(random_grid.best_score_)

In [None]:
# Score of the refit best class-weighted forest on the held-out test split.
random_grid.score(x_test, y_test)

In [None]:
clf = random_grid.best_estimator_

# Rows are the actual classes and columns the predicted ones, in label order 0, 1.
cmatrix = confusion_matrix(y_test, clf.predict(x_test))

# Let the heatmap place the class labels itself instead of hard-coding tick
# positions, and fix the misspelled figure title.
class_labels = ['No', 'Yes']
ax = sns.heatmap(cmatrix, annot=True, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix');

* It didn't improve much

>> #### Let's first balance the data using resampling and fit the model again 

In [None]:
from sklearn.utils import resample, shuffle
data_pos = data[data.Outcome == 1]
data_neg = data[data.Outcome == 0]
data_pos = resample(data_pos, n_samples=500, random_state=34)
data1 = pd.concat([data_pos, data_neg], axis=0)
new_data = shuffle(data1, random_state=34)

In [None]:
new_data.describe()

In [None]:
x_train, x_test,y_train, y_test = train_test_split(new_data[columns[:-1]], new_data.iloc[:,-1], random_state=77)

> #### Random Forest on Balanced Dataset

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier

params = params = {'criterion':['gini', 'entropy'],
          'max_depth': [5, 10, 20, 25, 30],
          'max_features': [3, 5, 7, 9],
          'max_leaf_nodes': [2,5,6, 9, 10, 15],
          #'splitter': ['best', 'random'],
          'n_estimators':[1,3,5,10]}
random_grid = GridSearchCV(RandomForestClassifier(), params, cv=10)
random_grid.fit(x_train, y_train)
print(random_grid.best_params_)
print(random_grid.best_score_)

In [None]:
# Score of the refit best forest on the current test split.
random_grid.score(x_test, y_test)

In [None]:
clf = random_grid.best_estimator_

# Rows are the actual classes and columns the predicted ones, in label order 0, 1.
cmatrix = confusion_matrix(y_test, clf.predict(x_test))

# Let the heatmap place the class labels itself instead of hard-coding tick
# positions, and fix the misspelled figure title.
class_labels = ['No', 'Yes']
ax = sns.heatmap(cmatrix, annot=True, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix');

* It performs better than without balancing
* Let's also try to fit one more model and see how it performs

> #### SVM on balanced dataset

In [None]:
%%time
from sklearn.svm import SVC
params = {'C':[0.5,1,10,100],
          'gamma':['scale', 1, 0.1,0.01, 0.001, 0.0001],
          'kernel':['rbf']}

grid = GridSearchCV(SVC(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Accuracy (the search's scoring metric) of the refit best SVM on the test split.
grid.score(x_test,y_test)

In [None]:
# Rows are the actual classes and columns the predicted ones, in label order 0, 1.
cmatrix = confusion_matrix(y_test, grid.predict(x_test))

# Let the heatmap place the class labels itself instead of hard-coding tick
# positions, and fix the misspelled figure title.
class_labels = ['No', 'Yes']
ax = sns.heatmap(cmatrix, annot=True, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix');

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix 

In [None]:
%%time
print("wwe")
from sklearn.svm import SVC
params = {'C':[0.5,1,10,100],
          'gamma':['scale', 1, 0.1,0.01, 0.001, 0.0001],
          'kernel':['rbf'],
          'class_weight':['balanced']}

grid = GridSearchCV(SVC(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Accuracy (the search's scoring metric) of the refit best class-weighted SVM on the test split.
grid.score(x_test,y_test)

In [None]:
# Rows are the actual classes and columns the predicted ones, in label order 0, 1.
cmatrix = confusion_matrix(y_test, grid.predict(x_test))

# Let the heatmap place the class labels itself instead of hard-coding tick
# positions, and fix the misspelled figure title.
class_labels = ['No', 'Yes']
ax = sns.heatmap(cmatrix, annot=True, fmt='g',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix');

In [None]:
from sklearn.svm import SVC

# Widen the search to compare the RBF and sigmoid kernels. The leftover debug
# print has been removed.
params = {'C': [0.5, 1, 10, 100],
          'gamma': ['scale', 1, 0.1, 0.01, 0.001, 0.0001],
          'kernel': ['rbf', 'sigmoid']}

grid = GridSearchCV(SVC(), params, cv=10, scoring='accuracy')
grid.fit(x_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

In [None]:
# Accuracy (the search's scoring metric) of the refit best SVM from the kernel comparison on the test split.
grid.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the random-forest search on the test split.
rf_predictions = random_grid.predict(x_test)
rf_report = classification_report(y_test, rf_predictions, target_names=['No', 'Yes'])
print(rf_report)

In [None]:
# Same report for whichever search is currently bound to `grid`
# (the last SVM search when the cells are run top to bottom).
print(classification_report(y_test, grid.predict(x_test), target_names=['No', 'Yes']))

#### Conclusion:
>> * Random Forest performs better on the balanced dataset than on the unbalanced one
>> * SVM doesn't perform better than Random Forest on the balanced dataset
>> * Final results (Random Forest on the balanced dataset)
  - Train Accuracy : 0.9173
  - Test Accuracy : 0.912
  - Precision : 0.87
  - Recall : 0.94