<h3 style="color:#991100">Load Libraries </h3>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import mean_squared_log_error,mean_squared_error,\
                            plot_confusion_matrix,classification_report,roc_curve,roc_auc_score
                        

<h3 style="color:#991100">Load Dataset</h3>

In [None]:
# Read the Pima Indians diabetes CSV from the Kaggle input directory.
df = pd.read_csv(
    '/kaggle/input/pima-indians-diabetes-database/diabetes.csv'
)
print("diabetes dataset shape is: ", df.shape)
# Show the first five rows as the cell output.
df.head(5)

<h3 style="color:#991100">Check for Null Values </h3>

In [None]:
# Count the missing entries in each column.
df.isnull().sum()

In [None]:
# Count the rows that exactly repeat an earlier row.
df.duplicated(keep='first').sum()

<h3 style="color:#991100">Statistics Information </h3>

In [None]:
# Summary statistics, transposed so each dataset column becomes one row.
df.describe().transpose()

In [None]:
# Print pandas' concise summary of the DataFrame.
df.info()

<h3 style="color:#991100">Data Visualizations </h3>

In [None]:
# Pairwise feature plots with regression fits off the diagonal and KDEs on it,
# coloured by the Outcome column.
sns.pairplot(data=df, hue='Outcome', kind='reg', diag_kind='kde')
plt.show()

In [None]:
# One 30-bin histogram per column. The default canvas is too small for a grid
# of subplots, so enlarge it and let tight_layout space the panels so that
# titles and tick labels do not overlap.
df.hist(bins=30, figsize=(14, 10))
plt.tight_layout()
plt.show()

<h3 style="color:#991100"> Heat Map </h3>

In [None]:
# Pairwise correlations between all columns.
corr = df.corr()
# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is shown only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
f, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, mask=mask, annot=True, ax=ax)
plt.show()

In [None]:
# Number of rows in each Outcome class.
df.Outcome.value_counts()

<h2 style="color:#994400"> Modeling</h2>

<h3 style="color:#991100">Split Data </h3>

In [None]:
# Hold out 15% of the rows for validation; the fixed seed keeps the split repeatable.
train, val = train_test_split(df, test_size=0.15, random_state=0)
for label, part in (("Train", train), ("Validation", val)):
    print(label + " Shape is: ", part.shape)

In [None]:
# Separate the feature columns from the Outcome target for both partitions.
X_train = train.drop(columns=['Outcome'])
X_val = val.drop(columns=['Outcome'])

y_train = train['Outcome']
y_val = val['Outcome']

<h3 style="color:#991100"> Scaling </h3>

In [None]:
# Fit the scaler on the training features only, then apply the same
# transformation to both partitions.
scaler = MinMaxScaler()  # alternative tried: StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train))
X_val = pd.DataFrame(scaler.transform(X_val))

<h3 style="color:#991100"> Logistic Regression </h3>

In [None]:
# Fit a logistic regression on the training features and evaluate it.
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_val)
print("LogisticRegression Accuracy for training is: ", model.score(X_train, y_train))
print("LogisticRegression Accuracy for Validation is: ", model.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred). The second line used to be labelled
# "log error" although it reports the plain mean squared error.
print("Mean square log error is: ", mean_squared_log_error(y_val, preds))
print("Mean square error is: ", mean_squared_error(y_val, preds))
plot_confusion_matrix(model, X_val, y_val)
plt.title('Confusion Matrix for Logistic Regression')
plt.show()

<h4 style="color:#991122"> Classification Report for Logistic Regression</h4>

In [None]:
# Per-class precision / recall / f1 for the predictions made above.
print(classification_report(y_true=y_val, y_pred=preds))

<h3 style="color:#991100"> Random Forest Classifier</h3>

In [None]:
# Fit a random forest on the training features and evaluate it.
# A fixed seed makes the forest (and the scores below) repeatable across runs.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
preds = model.predict(X_val)
# The training accuracy must be scored on the training data; it was previously
# computed on the validation set, which made both lines print the same number.
print("RandomForest Accuracy for training is: ", model.score(X_train, y_train))
print("RandomForest Accuracy for validation is: ", model.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred).
print("Mean square log error is: ", mean_squared_log_error(y_val, preds))
print("Mean square error is: ", mean_squared_error(y_val, preds))
plot_confusion_matrix(model, X_val, y_val)
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

<h4 style="color:#991122"> Classification Report for Random Forest</h4>

In [None]:
# Per-class precision / recall / f1 for the predictions made above.
print(classification_report(y_true=y_val, y_pred=preds))

<h3 style="color:#991100"> Support Vector Machine Classifier </h3>

In [None]:
# Fit a support vector classifier with default settings and evaluate it.
model = SVC()
model.fit(X_train, y_train)
preds = model.predict(X_val)
# The training accuracy must be scored on the training data; it was previously
# computed on the validation set, which made both lines print the same number.
print("SVC Accuracy for training is: ", model.score(X_train, y_train))
print("SVC Accuracy for validation is: ", model.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred).
print("Mean square log error is: ", mean_squared_log_error(y_val, preds))
print("Mean square error is: ", mean_squared_error(y_val, preds))
plot_confusion_matrix(model, X_val, y_val)
plt.title('Confusion Matrix for SVC')
plt.show()

<h4 style="color:#991122"> Classification Report for SVC</h4>

In [None]:
# Per-class precision / recall / f1 for the predictions made above.
print(classification_report(y_true=y_val, y_pred=preds))

<h4 style="color:#991122"> The models also do well without scaling</h4>

In [None]:
# Rebuild the feature matrices straight from the split frames, i.e. without
# the scaling applied earlier.
X_train = train.drop(columns=['Outcome'])
X_val = val.drop(columns=['Outcome'])

y_train = train['Outcome']
y_val = val['Outcome']

<h3 style="color:#991100"> Random Forest Classifier</h3>

In [None]:
# Fit a random forest on the training features and evaluate it.
# A fixed seed makes the forest (and the scores below) repeatable across runs.
model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)
preds = model.predict(X_val)
# The training accuracy must be scored on the training data; it was previously
# computed on the validation set, which made both lines print the same number.
print("RandomForest Accuracy for training is: ", model.score(X_train, y_train))
print("RandomForest Accuracy for validation is: ", model.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred).
print("Mean square log error is: ", mean_squared_log_error(y_val, preds))
print("Mean square error is: ", mean_squared_error(y_val, preds))
plot_confusion_matrix(model, X_val, y_val)
plt.title('Confusion Matrix for Random Forest Classifier')
plt.show()

<h4 style="color:#991122"> Classification Report for Random Forest</h4>

In [None]:
# Per-class precision / recall / f1 for the predictions made above.
print(classification_report(y_true=y_val, y_pred=preds))

<h3 style="color:#991100"> Support Vector Machine Classifier </h3>

In [None]:
# Fit a support vector classifier with default settings and evaluate it.
model = SVC()
model.fit(X_train, y_train)
preds = model.predict(X_val)
# The training accuracy must be scored on the training data; it was previously
# computed on the validation set, which made both lines print the same number.
print("SVC Accuracy for training is: ", model.score(X_train, y_train))
print("SVC Accuracy for validation is: ", model.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred).
print("Mean square log error is: ", mean_squared_log_error(y_val, preds))
print("Mean square error is: ", mean_squared_error(y_val, preds))
plot_confusion_matrix(model, X_val, y_val)
plt.title('Confusion Matrix for SVC')
plt.show()

<h4 style="color:#991122"> Classification Report for SVC</h4>

In [None]:
# Per-class precision / recall / f1 for the predictions made above.
print(classification_report(y_true=y_val, y_pred=preds))

<h3 style="color:#990022"> First Report </h3>

<p style="color:#995520">
    <strong>
    <ul style="color:#884422">
        <li>The data doesn't have any missing values or duplicates.
        <li>We note that StandardScaler didn't work well.
        <li>So we used MinMaxScaler, which worked well.
        <li>The models also do well on the data without scaling.
    </ul>
    </strong>
</p>
<p style="color:#991122"><strong>Logistic Regression didn't do well on this data, as the data isn't linearly separable.
    <ul>
        <li>It gave a score of 76% in training and 83% in validation.
    </ul>
    </strong>
</p>
<p style="color:#991122"><strong>We can't say that Random Forest does better than Logistic Regression.
    <ul>
        <li>It gave a score of 79% in both training and validation.
        <li>This is better than Logistic Regression's training score.
        <li>Logistic Regression has a higher f1_score than Random Forest.
    </ul>
    </strong>
</p>
<p style="color:#991122"><strong>It's clear that SVC does better than both Logistic Regression and Random Forest.
    <ul>
        <li>It gave a score of 81% in both training and validation.
        <li>This is better than both Logistic Regression and Random Forest.
        <li>It also does well on f1_score.
    </ul>
    </strong>
</p>

<p style="color:#991122"><strong>So we will focus on Support vector Classifier
    <ul>
        <li> We are going to tune it with different parameters.
    </ul>
    </strong></p>

<h3 style="color:#991100">SVC with GridSearchCV</h3>

In [None]:
# Disabled experiment: this grid search (which includes the "poly" kernel) is
# wrapped in a string literal, so none of it is executed.
'''
param_grid = {"kernel":["poly","rbf"], 
              "C":[1, 0.5],
              "gamma": ["scale", "auto"]
             }
search = GridSearchCV(model, param_grid=param_grid, cv=2)
search.fit(X_train, y_train)
x = search.best_estimator_
y_hat = x.predict(X_val)
'''

In [None]:
# Hyper-parameter grid for the SVC. probability=True is part of every
# candidate so that the selected estimator exposes predict_proba.
param_grid = {"kernel": ["linear", "rbf"],
              "C": [1, 0.8, 0.5, 0.2],
              "gamma": ["scale", "auto"],
              "probability": [True]
             }
# Search over a fresh SVC instead of whatever `model` happens to hold from an
# earlier cell, so this cell does not depend on leftover notebook state.
search = GridSearchCV(SVC(), param_grid=param_grid, cv=2)
search.fit(X_train, y_train)
# Best estimator found by the search, and its predictions on the validation set.
x = search.best_estimator_
y_hat = x.predict(X_val)

<h3> Best Estimator </h3>

In [None]:
# Display the estimator selected by the grid search.
search.best_estimator_

In [None]:
# Evaluate the estimator selected by the grid search.
# The training accuracy must be scored on the training data; it was previously
# computed on the validation set, which made both lines print the same number.
print("SVC Accuracy for training is: ", x.score(X_train, y_train))
print("SVC Accuracy for validation is: ", x.score(X_val, y_val))
# sklearn metrics take (y_true, y_pred).
print("Mean square log error is: ", mean_squared_log_error(y_val, y_hat))
print("Mean square error is: ", mean_squared_error(y_val, y_hat))
plot_confusion_matrix(x, X_val, y_val)
plt.title('Confusion Matrix for SVC with GridSearchCV')
plt.show()

<h4 style="color:#991122"> Classification Report for SVC with GridSearchCV</h4>

In [None]:
# Per-class precision / recall / f1 for the tuned SVC's validation predictions.
print(classification_report(y_true=y_val, y_pred=y_hat))

<h4 style="color:#991122">ROC Curve for SVC with GridSearchCV</h4>

In [None]:
# Second column of predict_proba, used as the score for class 1.
y_pred_prob = x.predict_proba(X_val)[:, 1]
fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
auc_score = roc_auc_score(y_val, y_pred_prob)
plt.plot(fpr, tpr)
plt.title('ROC curve for SVC', fontsize = 15)
plt.xlabel('False positive rate (1-Specificity)', fontsize = 15)
plt.ylabel('True positive rate (Sensitivity)', fontsize = 15)
# Pass a formatted string: a tuple here would be drawn as its repr,
# e.g. "('AUC Score:', 0.85)".
plt.text(x = 0.02, y = 0.9, s = f"AUC Score: {auc_score:.4f}")
plt.grid(True)
plt.show()

<h3 style="color:#990022"> Last Report </h3>

<p style="color:#961020">
    <strong>
    <ul style="color:#961020">
        <li>The poly kernel took too long to train, so I didn't use it here.
        <li>It ran for more than 6 hours without finishing.
    </ul>
    </strong>
</p>
<p style="color:#961020">
    <strong>
    <ul style="color:#961020">
        <li><strong>Recall</strong> for (Class [ 0 ]) wasn't good for any of the models. 
        <li>The reason is that the distribution of classes [ 0, 1 ] is unbalanced.
        <li>The highest value of Recall was 71%, from Random Forest.
        <li>This means that ( False Negative ) values are high.
        <li><strong>Precision</strong> for (Class [ 0 ]) is better, as ( False Positive ) values are low.
        <li><strong style="font-size:100%">Recall and Precision</strong> for (Class [ 1 ]) are good enough.
    </ul>
    </strong>
</p>