<a href="https://colab.research.google.com/github/simsekahmet/predict_having_diabetes/blob/main/diabetes_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Apply seaborn's default theme to every figure drawn below.
sns.set_theme()

**Import the Data, First Look, and Preprocessing**

In [None]:
# Load the raw dataset; the relative path is resolved against the working directory.
DATA_PATH = "diabetes_prediction_dataset.csv"
df = pd.read_csv(DATA_PATH)
def check_df(df):
    """Print a first-look report of a dataframe and return a copy of it.

    The report is written to stdout and contains, in order: the ``info()``
    summary, the column dtypes, the shape, the first rows, the last rows,
    descriptive statistics at selected percentiles, and the per-column
    count of missing values (or a notice that none are missing).
    -------------------
    Args:
        df (pandas.core.frame.DataFrame): data to inspect; it is not modified.
    -------------------
    Returns:
        pandas.core.frame.DataFrame: an independent copy of ``df``.

    """
    print("#################### info ####################")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    df.info()
    print("\n#################### dtype ####################")
    print(df.dtypes)
    print("\n#################### shape ####################")
    print(df.shape)
    print("\n#################### head ####################")
    print(df.head())
    print("\n#################### tail ####################")
    print(df.tail())
    print("\n#################### statistical evaluation ####################")
    # Transposed so that each column of df becomes one row of the table.
    print(df.describe([0, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1]).T)
    print("\n#################### Loss Value (NA) ####################")
    # Count the missing values once and reuse the result for test and report.
    null_counts = df.isnull().sum()
    if null_counts.any():
        print("Sum of Loss Value:")
        print(null_counts)
    else:
        print("**There is no loss data**")
    return df.copy()

# Display settings: show every column and render floats with two decimals.
def _two_decimals(value):
    """Format a number with two digits after the decimal point."""
    return "%.2f" % value


pd.options.display.max_columns = None
pd.options.display.float_format = _two_decimals

df.head()


In [None]:
# Print the first-look report for the loaded data; check_df returns a copy
# of df, which is kept here as df_copy.
df_copy = check_df(df)

*Encode gender as a number: Female → 0, Male → 1 (rows with gender "Other" are dropped)*

In [None]:
# Drop the rows whose gender is "Other". The explicit .copy() makes the
# result an independent frame, so later column assignments on df do not
# raise pandas' SettingWithCopyWarning.
df = df[df["gender"] != "Other"].copy()
print(df["gender"].unique())

In [None]:
# Replace the gender labels with integer codes.
gender_codes = {"Female": 0, "Male": 1}
encoded_gender = df["gender"].map(gender_codes)
df["gender"] = encoded_gender
df.head()

*Convert the age column from float to integer*

In [None]:
# Cast age to int; the cast drops the fractional part of each value.
age_as_int = df["age"].astype(int)
df["age"] = age_as_int

print(df["age"].unique())


*Drop rows with no smoking-history information and encode the remaining categories as integers*

In [None]:
# Drop the rows with no recorded smoking history. The explicit .copy() makes
# the result an independent frame, so the column assignment below does not
# raise pandas' SettingWithCopyWarning.
df = df[df["smoking_history"] != "No Info"].copy()

# Replace each remaining smoking-history label with an integer code.
smoking_codes = {"never": 0, "current": 1, "former": 2, "ever": 3, "not current": 4}
df["smoking_history"] = df["smoking_history"].map(smoking_codes)

print(df["smoking_history"].head())


**First Checkpoint**

In [None]:
# Snapshot the preprocessed data. .copy() makes the checkpoint independent of
# df; a plain assignment would only create a second name for the same object,
# so any later in-place change to df would also change the checkpoint.
df_checkpoint_1 = df.copy()

**Deep Learning**

*Split into inputs and target*

In [None]:
# Features: the first eight columns. Target: the last column, reshaped to a
# two-dimensional array with a single column.
feature_frame = df.iloc[:, :8]
target_series = df.iloc[:, -1]

inputs = feature_frame.to_numpy()
target = target_series.to_numpy().reshape(-1, 1)

*Split into training, validation, and test sets, then standardize the features*

In [None]:
# Hold out 20% of the rows as the test set, then hold out 20% of the
# remainder as the validation set; both splits use the same seed.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(inputs, target, **split_options)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, **split_options)

# Fit the scaler on the training features only, then apply that same
# transformation to the training, validation, and test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_val, X_test)
)

*Creating Deep Learning Model*

In [None]:
# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable from one run to the next.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Take the input width from the data instead of hard-coding it, so the model
# stays consistent with the feature matrix it is trained on.
input_size = X_train_scaled.shape[1]
output_size = 1
hidden_layer_1 = 64
hidden_layer_2 = 32
NUM_EPOCHS = 20

# Two ReLU hidden layers followed by a single sigmoid unit. The input shape is
# declared with an explicit Input object rather than the `input_shape`
# argument of the first Dense layer, which newer Keras versions warn about.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_size,)),
    tf.keras.layers.Dense(units=hidden_layer_1, activation="relu"),
    tf.keras.layers.Dense(units=hidden_layer_2, activation="relu"),
    tf.keras.layers.Dense(units=output_size, activation="sigmoid"),
])

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

# Stop training when the monitored value has not improved for one epoch, and
# restore the weights of the best epoch seen.
earlystopping = tf.keras.callbacks.EarlyStopping(patience=1, restore_best_weights=True)

# `callbacks` is documented as a list of callback instances, so the single
# callback is wrapped in a list. The returned History object is kept in
# `model_` for later use.
model_ = model.fit(X_train_scaled,
                   y_train,
                   epochs=NUM_EPOCHS,
                   verbose=1,
                   batch_size=32,
                   callbacks=[earlystopping],
                   validation_data=(X_val_scaled, y_val))

*Evaluate loss and accuracy on the test set*

In [None]:
# Score the trained model on the held-out test set; the result unpacks into
# the loss followed by the accuracy metric.
test_scores = model.evaluate(X_test_scaled, y_test)
loss, accuracy = test_scores
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

*Plot Train and Validation Loss*

In [None]:
# Draw the training and validation loss recorded during fitting on one axes.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Training Loss'),
                                 ('val_loss', 'Validation Loss')):
    ax.plot(model_.history[history_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

*Save the model and predict on new data*

In [None]:
# Write the trained model to disk, then read it back from the same file.
MODEL_PATH = 'diabetes_model.h5'
model.save(MODEL_PATH)
loaded_model = tf.keras.models.load_model(MODEL_PATH)

In [None]:
# A single sample with eight feature values.
# NOTE(review): presumably in the same column order as the first eight
# dataframe columns used for training - confirm against the dataset.
sample_features = [0, 24, 0, 0, 0, 29.32, 5.52, 138.05]
new_data = np.array([sample_features])

# Scale the sample with the already-fitted scaler before predicting.
new_data_scaled = scaler.transform(new_data)
predictions = loaded_model.predict(new_data_scaled)
first_prediction = predictions[0][0]
print("The probability of having diabetes:", 100*first_prediction)