In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib as mpl
import seaborn as sns

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC

from xgboost import XGBClassifier


In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and later
# releases dropped the old names, so use whichever spelling this installation provides.
brightStyle = "seaborn-v0_8-bright" if "seaborn-v0_8-bright" in plt.style.available else "seaborn-bright"
plt.style.use(brightStyle)
# Default size (inches) for figures that do not pass their own figsize.
mpl.rcParams["figure.figsize"] = (10, 10)
sns.set_theme(style="whitegrid")

In [None]:
# Load the dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/diabetes-data-set/diabetes.csv')
# Print the column index, then show summary statistics as the cell's output.
print(data.columns)
data.describe()

# Splitting the data into training and test sets, to avoid double dipping

In [None]:
# Report every column in which isna() finds missing values; nothing is printed when there are none.
# NOTE(review): the original note here read "No outliers in this dataset", which conflicts with
# the outlier handling further down - presumably "no missing values" was meant; confirm.
for col in data.columns:
    if data[col].isna().any():
        print(f"Missing values in col: {col}")

# One stratified 80/20 split, stratified on "Outcome" so both parts keep its class balance.
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_ind, test_ind = next(splitter.split(data, data["Outcome"]))

# .copy() makes both frames independent of `data`, so the in-place edits done on them
# later in the notebook are plain assignments rather than writes to a slice of `data`.
train_data = data.iloc[train_ind].copy()
test_data = data.iloc[test_ind].copy()

print(train_data.shape)
print(test_data.shape)
# The printed value is sum / row count, i.e. a fraction (assuming 0/1 labels), not a percentage.
print(f"Precentage of class 1 in train: {train_data['Outcome'].sum(axis=0)/train_data.shape[0]}")
print(f"Precentage of class 1 in test: {test_data['Outcome'].sum(axis=0)/test_data.shape[0]}")



# Data visualization

In [None]:
def visualizeDistribution(data, columnNames):
    """Plot a density-normalised histogram with a KDE overlay for each named column.

    Panels are laid out three per row; the number of rows follows the number of
    columns requested, so the function is not limited to exactly nine columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to plot.
    columnNames : sequence of str
        Names of the columns to plot, one panel each, in the given order.
    """
    panelsPerRow = 3
    # Ceiling division without importing math; always keep at least one row.
    rowCount = max(1, (len(columnNames) + panelsPerRow - 1) // panelsPerRow)
    fig, axes = plt.subplots(rowCount, panelsPerRow, squeeze=False)
    panels = axes.ravel()

    for panel, name in zip(panels, columnNames):
        data[name].hist(ax=panel, density=True)
        data[name].plot.kde(ax=panel)
        # Density values are not informative here; only the shape matters.
        panel.set_yticks([])
        panel.set_xlabel(name)

    # Hide the panels left over when the column count is not a multiple of three.
    for panel in panels[len(columnNames):]:
        panel.set_visible(False)

    # Keep the x-labels from colliding with the row of panels below them.
    fig.tight_layout()
    plt.show()
    
# Every column of the dataset, in DataFrame order ("Outcome" included).
columnNames = data.columns.tolist()
visualizeDistribution(train_data, columnNames)

In [None]:
# One box per column on a shared axis, to compare value ranges and spot extreme values.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=train_data, ax=ax)
# A title lets the figure stand on its own when the notebook is skimmed.
ax.set_title("Training data: per-column box plots")
plt.show()

It is evident from the above two figures that the following columns contain outliers: Insulin, DiabetesPedigreeFunction, Age and Pregnancies.

In [None]:
# Columns whose extreme values are to be replaced by the training-set mean.
columnsOutliers = ['Pregnancies', 'Insulin', 'DiabetesPedigreeFunction', 'Age']

# Per-column mean of the training data: the replacement value.
means = {name: train_data[name].mean(axis=0) for name in columnsOutliers}
print(means)

# Per-column 98th percentile of the training data: the outlier threshold.
outliers = {name: train_data[name].quantile(0.98) for name in columnsOutliers}

    
def replacingOutliers(data, means, outliers, columnsOutliers, dataType):
    """Replace, in place, every value above its column's threshold with a fixed value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to modify. It is changed in place; nothing is returned.
    means : dict
        Column name -> replacement value.
    outliers : dict
        Column name -> threshold; values strictly greater than it are replaced.
    columnsOutliers : iterable of str
        Names of the columns to process.
    dataType : str
        Label used only in the printed header (e.g. "train" or "test").
    """
    print("Outliers tranformation for " + dataType)
    for col in columnsOutliers:
        isOutlier = data[col] > outliers[col]
        print(f"Replacing in {col}: {np.sum(isOutlier)}")
        # Assign the whole column back. The previous chained form
        # data.loc[rows][col] = value wrote to a temporary copy, so nothing was replaced.
        # Series.mask also upcasts an integer column when the replacement is fractional.
        data[col] = data[col].mask(isOutlier, means[col])

# Both splits use the means and thresholds computed from the training data.
replacingOutliers(train_data, means, outliers, columnsOutliers, "train")
replacingOutliers(test_data, means, outliers, columnsOutliers, "test")

# Re-draw the training-set distributions and box plots after the replacement step.
visualizeDistribution(train_data, columnNames)
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(data=train_data, ax=ax)
# The title distinguishes this figure from the box plot drawn before the replacement step.
ax.set_title("Training data after outlier replacement: per-column box plots")
plt.show()

In [None]:
# Outcome encoding: 1 -> "Diabetic", 0 -> "Non-Diabetic".
# The readable labels live in a separate plotting frame. The previous chained assignment
# (train_data.loc[rows]["Outcome"] = ...) only wrote to a temporary copy, and relabelling
# train_data itself would put strings into the target column used for model training below.
outcomeLabels = {0: "Non-Diabetic", 1: "Diabetic"}
pairplotData = train_data.assign(Outcome=train_data["Outcome"].map(outcomeLabels))

# Menu index -> column name; the last column (presumably "Outcome") is left out.
columnNamesNoOutput = dict(enumerate(columnNames[:-1]))
print(columnNamesNoOutput)

while True:
    try:
        continueOrNot = int(input("Do you want to continue(1:cont, 0:break)"))
        if not continueOrNot:
            break
        col1 = int(input("Please input first column you want to see pair plot with(0 - 7):"))
        col2 = int(input("Please input second column you want to see pair plot with(0 - 7):"))
    except ValueError:
        # Non-numeric answer: ask again instead of aborting the cell.
        print("Please enter a whole number.")
        continue
    except (EOFError, NotImplementedError):
        # No interactive stdin (e.g. a batch "Run All"; IPython signals this with a
        # NotImplementedError subclass): skip the explorer instead of failing the run.
        break

    if col1 in columnNamesNoOutput and col2 in columnNamesNoOutput:
        # dict.fromkeys drops a repeated name, so choosing the same column twice
        # does not produce a frame with duplicate columns.
        chosen = list(dict.fromkeys([columnNamesNoOutput[col1], columnNamesNoOutput[col2], "Outcome"]))
        sns.pairplot(pairplotData[chosen], hue="Outcome")
        plt.show()
    else:
        print(f"Unknown column index; valid choices are {sorted(columnNamesNoOutput)}")

In [None]:
# Pairwise correlation of the feature columns listed in columnNamesNoOutput.
# Observed linear dependencies: Glucose-Insulin, Age-Pregnancies and BMI-Insulin.
corr = train_data[list(columnNamesNoOutput.values())].corr()
corr

# Data transformation

In [None]:
# Feature matrices (every column except "Outcome") and label vectors as NumPy arrays.
X_train, y_train = np.array(train_data.drop(columns=["Outcome"])), np.array(train_data["Outcome"])
X_test, y_test = np.array(test_data.drop(columns=["Outcome"])), np.array(test_data["Outcome"])

# StandardScaler.transform returns a new array and leaves its input untouched, so the
# result has to be assigned back; previously it was discarded and the models below were
# trained on unscaled features. The scaler is fitted on the training split only.
standardScalar = StandardScaler()
X_train = standardScalar.fit_transform(X_train)
X_test = standardScalar.transform(X_test)


# Logistic Regression

In [None]:
# Candidate values for C: 10 evenly spaced points from 0.0001 to 2.
param_grid = [{"C": list(np.linspace(0.0001, 2, 10))}]

logReg = LogisticRegression(random_state=42, penalty="l2", max_iter=1000)

# 5-fold grid search over C, scored by negative log-loss.
grid_search = GridSearchCV(
    logReg,
    param_grid,
    cv=5,
    scoring="neg_log_loss",
    return_train_score=True,
)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)

# Keep the estimator found with the best parameters for evaluation.
model = grid_search.best_estimator_

In [None]:
def results(model):
    """Print a confusion matrix and a classification report for train, then test.

    Reads the notebook-level X_train/y_train and X_test/y_test arrays.

    Parameters
    ----------
    model : estimator exposing ``predict``
        Fitted classifier to evaluate.
    """
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        predicted = model.predict(features)
        print(confusion_matrix(labels, predicted))
        print(classification_report(labels, predicted))

# Evaluate the best logistic-regression estimator from the grid search on both splits.
results(model)#Accuracy: 0.79 but bad recall score for diabetic class.

# SVM for Classification

In [None]:
# RBF-kernel search space: 10 log-spaced values from 1e-3 to 1e2 for both C and gamma.
param_grid = [
    {
        "kernel": ["rbf"],
        "C": list(np.logspace(-3, 2, 10)),
        "gamma": list(np.logspace(-3, 2, 10)),
    }
]

svc = SVC(kernel="rbf", random_state=42)

# 5-fold grid search; no scoring argument is passed, so the estimator's default is used.
grid_search = GridSearchCV(svc, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)


In [None]:
# Take the best estimator from the most recent grid search and report its
# confusion matrices and classification reports on both splits.
model = grid_search.best_estimator_
results(model)

# Using XGBoost

In [None]:
# The number of boosting rounds is set with n_estimators (plural) and an integer.
# The previous n_estimator=[10] was a misspelt name carrying a list, so it was not
# the recognised parameter and the intended 10 rounds were not applied.
model = XGBClassifier(
    booster="gbtree",
    n_estimators=10,
    max_depth=3,
    objective='binary:logistic',
    use_label_encoder=False,
)
model.fit(X_train, y_train)
results(model)