In [25]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)

In [26]:
from google.colab import drive

drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [27]:
df = pd.read_csv('/content/gdrive/MyDrive/DiabetesDetection/diabetes_012_health_indicators_BRFSS2015.csv')

In [28]:
df = df.drop(columns=['AnyHealthcare'])
df = df.drop(columns=['NoDocbcCost'])
df = df.drop(columns=['Education'])
df.head()

duplicates = df[df.duplicated()]
print("Duplicate records found: ",len(duplicates))
duplicates.head()

df.drop_duplicates(inplace = True)

Duplicate records found:  37295


In [29]:
df = df.rename(columns={'Diabetes_012': 'IsDiabetes'})
df = df.rename(columns={'HighChol': 'HighCholestrol'})
df = df.rename(columns={'CholCheck': 'CholestrolCheck'})
df = df.rename(columns={'BMI': 'BodyMassIndex'})
df = df.rename(columns={'HeartDiseaseorAttack': 'HeartDisease'})
df = df.rename(columns={'PhysActivity': 'PhysicalActivity'})
df = df.rename(columns={'HvyAlcoholConsump': 'HighAlcohol'})
df = df.rename(columns={'GenHlth': 'GeneralHealth'})
df = df.rename(columns={'MentHlth': 'MentalHealth'})
df = df.rename(columns={'PhysHlth': 'PhysicalHealth'})
df = df.rename(columns={'DiffWalk': 'DifficultyWalking'})

In [30]:
print("Before: ", df.shape)
df = df[df['CholestrolCheck'] != 0]
print("After: ", df.shape)

# as now all the CholestrolCheck columns contains 1 only we don't need this column anymore
df = df.drop(columns=['CholestrolCheck'])

# null values in Income
income_na = df['Income'].isna().sum()
print("Income null count: ", income_na)
print(df['Income'].value_counts())

# filling missing values for Education, Income Columns
df['Income'] = df['Income'].fillna(round(df['Income'].mean()))
print(df.head())

df = df.dropna()
df.shape

Q25 = df['BodyMassIndex'].quantile(0.25)
Q75 = df['BodyMassIndex'].quantile(0.75)
print("Inter quartile range: ", Q75-Q25)

IQR = Q75 - Q25
df = df[~( (df['BodyMassIndex'] < (Q25 - 1.5 * IQR)) | (df['BodyMassIndex'] > (Q75 + 1.5 * IQR)) )]

Before:  (216385, 19)
After:  (207247, 19)
Income null count:  0
8.0    62121
7.0    35816
6.0    31577
5.0    23434
4.0    18686
3.0    15066
2.0    11248
1.0     9299
Name: Income, dtype: int64
   IsDiabetes  HighBP  HighCholestrol  BodyMassIndex  Smoker  Stroke  \
0         0.0     1.0             1.0           40.0     1.0     0.0   
2         0.0     1.0             1.0           28.0     0.0     0.0   
3         0.0     1.0             0.0           27.0     0.0     0.0   
4         0.0     1.0             1.0           24.0     0.0     0.0   
5         0.0     1.0             1.0           25.0     1.0     0.0   

   HeartDisease  PhysicalActivity  Fruits  Veggies  HighAlcohol  \
0           0.0               0.0     0.0      1.0          0.0   
2           0.0               0.0     1.0      0.0          0.0   
3           0.0               1.0     1.0      1.0          0.0   
4           0.0               1.0     1.0      1.0          0.0   
5           0.0               1.0   

In [31]:
df = df.astype(int)
df.dtypes
df['Fruits_and_Veggies'] = df['Fruits'] | df['Veggies']

In [32]:
df = df.drop(columns=['Fruits'])
df = df.drop(columns=['Veggies'])

In [33]:
dfML = df.copy()

In [34]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Assuming you have already loaded and preprocessed your dataset into the DataFrame 'dfML'

# Splitting the data into features (X) and target (y)
X = dfML.drop('IsDiabetes', axis=1)
y = dfML['IsDiabetes']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [35]:
# Creating a TensorFlow model for multi-class classification
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
])

# Compiling the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Training the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x77feff7e0610>

In [36]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Calculating accuracy, precision, and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')



  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
# Calculating the confusion matrix
confusion_mat = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(confusion_mat, columns=['Predicted No Diabetes', 'Predicted Prediabetes', 'Predicted Diabetes'],
                     index=['Actual No Diabetes', 'Actual Prediabetes', 'Actual Diabetes'])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("\nConfusion Matrix:")
print(cm_df)

Accuracy: 0.8215409779187629
Precision: 0.4504219911741578
Recall: 0.3931291415665264

Confusion Matrix:
                    Predicted No Diabetes  Predicted Prediabetes  \
Actual No Diabetes                  31735                      0   
Actual Prediabetes                    749                      0   
Actual Diabetes                      5072                      0   

                    Predicted Diabetes  
Actual No Diabetes                1279  
Actual Prediabetes                 101  
Actual Diabetes                   1415  


In [38]:
print(X_test)

        HighBP  HighCholestrol  BodyMassIndex  Smoker  Stroke  HeartDisease  \
244538       0               1             35       0       0             0   
10940        0               1             23       1       0             0   
39654        1               1             29       1       0             0   
62904        0               1             27       1       0             0   
37564        0               1             38       1       0             0   
...        ...             ...            ...     ...     ...           ...   
55394        0               0             17       1       0             0   
18481        0               0             29       1       0             0   
86274        1               0             34       0       0             0   
136148       0               1             27       0       0             0   
68768        1               0             24       0       0             0   

        PhysicalActivity  HighAlcohol  GeneralHealt

In [39]:
correctly_classified_indices = (y_pred == y_test)
correctly_classified_samples = X_test[correctly_classified_indices]
correctly_classified_labels = y_test[correctly_classified_indices]

In [40]:
# Print the correctly classified samples and their outcome classification
for i in range(1000):
    sample = correctly_classified_samples.iloc[i]
    outcome = correctly_classified_labels.iloc[i]

    if outcome == 0:
        outcome_label = "No Diabetes"
    elif outcome == 1:
        outcome_label = "Pre-Diabetes"
    else:
        outcome_label = "Diabetes"

    print(f"Sample {i+1}: {sample.values.tolist()}, Outcome: {outcome_label}")

Sample 1: [0, 1, 35, 0, 0, 0, 0, 0, 3, 5, 15, 1, 0, 7, 3, 1], Outcome: No Diabetes
Sample 2: [0, 1, 23, 1, 0, 0, 1, 0, 4, 20, 10, 0, 0, 9, 5, 1], Outcome: No Diabetes
Sample 3: [1, 1, 29, 1, 0, 0, 1, 0, 2, 1, 5, 0, 0, 12, 8, 1], Outcome: No Diabetes
Sample 4: [0, 1, 27, 1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 11, 6, 1], Outcome: No Diabetes
Sample 5: [0, 1, 38, 1, 0, 0, 1, 0, 2, 0, 0, 0, 0, 5, 8, 1], Outcome: No Diabetes
Sample 6: [0, 0, 34, 1, 0, 0, 1, 0, 2, 1, 0, 0, 1, 4, 8, 1], Outcome: No Diabetes
Sample 7: [1, 0, 24, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 13, 7, 1], Outcome: No Diabetes
Sample 8: [0, 0, 27, 0, 0, 0, 1, 0, 2, 9, 3, 0, 1, 4, 6, 1], Outcome: No Diabetes
Sample 9: [0, 0, 23, 1, 0, 0, 1, 0, 2, 2, 2, 0, 1, 3, 6, 1], Outcome: No Diabetes
Sample 10: [1, 1, 24, 0, 0, 0, 0, 0, 3, 0, 2, 0, 1, 9, 5, 0], Outcome: No Diabetes
Sample 11: [1, 1, 29, 0, 0, 1, 0, 0, 2, 0, 0, 0, 1, 12, 5, 0], Outcome: No Diabetes
Sample 12: [0, 1, 28, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 8, 8, 1], Outcome: No Diabetes
Sample

In [41]:
tf.saved_model.save(model, "/content/gdrive/MyDrive/DiabetesDetection/model3")



In [42]:
# Convert the SavedModel to TFLite format
converter = tf.lite.TFLiteConverter.from_saved_model("/content/gdrive/MyDrive/DiabetesDetection/model3")
tflite_model = converter.convert()
with open("/content/gdrive/MyDrive/DiabetesDetection/DiabetesModel.tflite", "wb") as f:
    f.write(tflite_model)

In [44]:
import tensorflow as tf
import numpy as np

# Load the TFLite model and allocate tensors.
interpreter = tf.lite.Interpreter(model_path="/content/gdrive/MyDrive/DiabetesDetection/DiabetesModel.tflite")
interpreter.allocate_tensors()

# Get input and output details.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Prepare input data (replace with your actual input data).
input_data = [
    -0.94758694970811, -0.9227696834932192, -0.4086446091737978, 1.0532291718555198,
    -0.22691891091653169, -0.3559921046478112, -1.6259068464326134, 0.8036443042558418,
    0.5202495980934378, -0.25953834265044434, 0.3385385571843806, 0.175213993547309,
    -0.5342166966016999, -0.4874143476574142, 1.1303293777235701, -1.3558461080578057
]

# Convert input data to FLOAT32.
input_data = np.array(input_data, dtype=np.float32)

# Reshape input data to match the expected input shape of the model.
input_data = np.reshape(input_data, input_details[0]['shape'])

# Set the input tensor.
interpreter.set_tensor(input_details[0]['index'], input_data)

# Run inference.
interpreter.invoke()

# Get the output tensor.
output_data = interpreter.get_tensor(output_details[0]['index'])

# Process the output (replace with your actual output processing logic).
# For example, if it's a classification model, you can get the predicted class index.
predicted_class_index = output_data.argmax()

print("Predicted class index:", predicted_class_index)


Predicted class index: 0
