<a href="https://colab.research.google.com/github/priscilla1812/T2DSure-Smart-Prediction-for-Type-2-Diabetes/blob/main/T2D2_Gradio_GUI_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the diabetes prediction CSV from its Colab Drive path into a DataFrame.
dataset_path = "/content/drive/My Drive/Colab Notebooks/diabetes_prediction_dataset.csv"
dft2d = pd.read_csv(dataset_path)

# Preview the first five records.
dft2d.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Missing-value check: evaluates to True when at least one cell in the frame is null.
# (dft2d.isna().sum().sum() would instead give the total number of missing cells.)
has_missing_values = dft2d.isna().to_numpy().any()
has_missing_values

False

In [None]:
# Report how many fully duplicated rows exist, then drop them.
# Only the last expression of a cell is displayed, so the count is printed
# explicitly; otherwise it would be computed and silently discarded.
print("Duplicate rows found:", dft2d.duplicated().sum())
dft2d = dft2d.drop_duplicates()

In [None]:
# Report, for each numeric column, how many rows hold a value of exactly 0.
zero_checks = [
    ("No. of zero values in Blood Glucose level column : ", 'blood_glucose_level'),
    ("No. of zero values in HbA1c level column : ", 'HbA1c_level'),
    ("No. of zero values in BMI column : ", 'bmi'),
    ("No. of zero values in Age column : ", 'age'),
]
for zero_msg, zero_col in zero_checks:
    print(zero_msg, len(dft2d[dft2d[zero_col] == 0]))

No. of zero values in Blood Glucose level column :  0
No. of zero values in HbA1c level column :  0
No. of zero values in BMI column :  0
No. of zero values in Age column :  0


In [None]:
# List the column labels of the de-duplicated frame
dft2d.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history',
       'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes'],
      dtype='object')

In [None]:
# View the dimensions of dft2d as (rows, columns), after duplicate rows were dropped
dft2d.shape

(96146, 9)

In [None]:
# Encode categorical features as integer codes.
# A separate encoder is fitted for each column so that both fitted mappings
# (available as `.classes_`) are retained; refitting one shared encoder would
# overwrite the gender mapping with the smoking-history one.
gender_encoder = LabelEncoder()
smoking_encoder = LabelEncoder()
# perform label encoding on the 'gender' and 'smoking_history' columns
dft2d["gender"] = gender_encoder.fit_transform(dft2d["gender"])
dft2d["smoking_history"] = smoking_encoder.fit_transform(dft2d["smoking_history"])
# Keep the original name bound as before: `lab` ends up fitted on 'smoking_history'.
lab = smoking_encoder

In [None]:
# Peek at the HbA1c column as a NumPy array
dft2d['HbA1c_level'].values

array([6.6, 6.6, 5.7, ..., 5.7, 4. , 6.6])

In [None]:
# Separate the target column ('diabetes') from the predictor columns
y = dft2d['diabetes']
x = dft2d.drop(columns=['diabetes'])

In [None]:
# SMOTE comes from the imbalanced-learn package
# (run `pip install imblearn` first if it is not available in your environment)
from imblearn.over_sampling import SMOTE

# Oversample the minority class so both labels end up with the same count
sm = SMOTE(random_state=2)
x_res, y_res = sm.fit_resample(x, y)

# Class counts before resampling
positives_before = (y == 1).sum()
negatives_before = (y == 0).sum()
print("Before OverSampling, counts of label '1' in y: {}".format(positives_before))
print("Before OverSampling, counts of label '0' in y: {} \n".format(negatives_before))

# Shapes of the resampled features and labels
print('After OverSampling, the shape of x: {}'.format(x_res.shape))
print('After OverSampling, the shape of y: {} \n'.format(y_res.shape))

# Class counts after resampling
positives_after = (y_res == 1).sum()
negatives_after = (y_res == 0).sum()
print("After OverSampling, counts of label '1' in y: {}".format(positives_after))
print("After OverSampling, counts of label '0' in y: {}".format(negatives_after))

Before OverSampling, counts of label '1' in y: 8482
Before OverSampling, counts of label '0' in y: 87664 

After OverSampling, the shape of x: (175328, 8)
After OverSampling, the shape of y: (175328,) 

After OverSampling, counts of label '1' in y: 87664
After OverSampling, counts of label '0' in y: 87664


In [None]:
# Split data into training and testing sets (70% train / 30% test)
from imblearn.over_sampling import SMOTE

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# Balance the classes on the training fold only.
# Previously the SMOTE output was never used, so the model was trained on the
# imbalanced data. Oversampling *after* the split keeps synthetic samples out of
# the test set, so the test metrics are still measured on real records only.
x_train, y_train = SMOTE(random_state=2).fit_resample(x_train, y_train)

In [None]:
# Import XGBoost
from xgboost import XGBClassifier

# Create the classifier with default hyperparameters and fit it on the training split
# (fit() returns the fitted estimator itself)
xgb = XGBClassifier().fit(x_train, y_train)

# Evaluate on the held-out split: mean accuracy and the raw class predictions
XGBscore = xgb.score(x_test, y_test)
y_pred = xgb.predict(x_test)

In [None]:
# Report the fraction of test rows whose predicted label matches the true label
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, y_pred)
print("XGBoost Accuracy:", test_accuracy)

XGBoost Accuracy: 0.9708084870336985


Smoking history categories: not current, former, No Info, current, never, ever.
Gender categories: Male, Female, Other.

In [None]:
# Install Gradio into the kernel's environment.
# NOTE: the version is not pinned; the recorded run below resolved to gradio 4.42.0.
%pip install gradio

Collecting gradio
  Using cached gradio-4.42.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Using cached aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi (from gradio)
  Using cached fastapi-0.112.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Using cached ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Using cached gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Using cached httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting orjson~=3.0 (from gradio)
  Using cached orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
Collecting pydub (from gradio)
  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.9 (from gradio)
  Using cached python_multipart-0.0.9-py3-none-any.whl.metadata (2.5 kB)
Collecting ruff>=0.2.2 (from gradio)

In [None]:
import gradio as gr

In [None]:
import numpy as np

In [None]:
def predict_diabetes(gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level):
  """Predict Type 2 Diabetes for one person from the Gradio form values.

  The form values are encoded like the training data, assembled into a single-row
  DataFrame and passed to the already-trained module-level ``xgb`` model. The
  previous version retrained a new model on every call and always predicted the
  first row of the training data, ignoring what the user entered.

  Args:
    gender: 'Female', 'Male' or 'Other'.
    age: age in years.
    hypertension: 0 (no) or 1 (yes).
    heart_disease: 0 (no) or 1 (yes).
    smoking_history: one of 'No Info', 'current', 'ever', 'former', 'never',
      'not current'.
    bmi: body mass index.
    HbA1c_level: HbA1c level.
    blood_glucose_level: blood glucose level.

  Returns:
    A ``(prediction message, HbA1c risk message)`` tuple of strings. If any form
    field is empty (``None``), a prompt to complete the form and an empty string.

  Raises:
    KeyError: if ``gender`` or ``smoking_history`` is not a known category.
  """
  # Integer codes must match the training-time LabelEncoder, which numbers the
  # class labels in sorted order (uppercase sorts before lowercase).
  gender_codes = {'Female': 0, 'Male': 1, 'Other': 2}
  smoking_codes = {'No Info': 0, 'current': 1, 'ever': 2, 'former': 3, 'never': 4, 'not current': 5}
  # Same column names and order as the training features.
  feature_order = ['gender', 'age', 'hypertension', 'heart_disease',
                   'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level']

  # Gradio passes None for a field the user left empty; ask for it instead of crashing.
  form_values = (gender, age, hypertension, heart_disease, smoking_history,
                 bmi, HbA1c_level, blood_glucose_level)
  if any(value is None for value in form_values):
    return "Please fill in every field before submitting.", ""

  def risk(HbA1c_level):
    # Thresholds: below 5.7 -> low risk, 6.5 and above -> high risk, otherwise prediabetes.
    if HbA1c_level < 5.7:
      return "Low risk of Diabetes"
    elif HbA1c_level >= 6.5:
      return "High risk of Type 2 Diabetes (Consult a doctor)"
    else:
      return "Risk of Prediabetes"

  def prediction(features):
    # Use the model trained earlier in the notebook; no refitting per request.
    predicted_label = xgb.predict(features)[0]
    return "You Have Type 2 Diabetes" if predicted_label == 1 else "No, You Don't Have Type 2 Diabetes"

  features = pd.DataFrame(
      [[gender_codes[gender], age, hypertension, heart_disease,
        smoking_codes[smoking_history], bmi, HbA1c_level, blood_glucose_level]],
      columns=feature_order,
  )

  return prediction(features), risk(HbA1c_level)


In [None]:
# Define Gradio interface
# The order of the input components must match the parameter order of predict_diabetes.
inputs = [
    gr.Radio(['Male','Female','Other'],label="Select your gender:"),
    gr.Number(label="Enter your age:",minimum=0, maximum=80),
    # Label fixed: it previously read "1 = 1 Yes".
    gr.Radio([0,1],label='Do you have hypertension?  [0 = No  , 1 = Yes]'),
    gr.Radio([0,1],label='Do you have heart disease?  [0 = No  , 1 = Yes]'),
    gr.Radio(['No Info','never','former','not current','current','ever'],label='What is your smoking history?'),
    gr.Number(label="Enter Your BMI:",minimum=10, maximum=96),
    gr.Slider(label='Enter Your HbA1c level:',minimum=3, maximum=10),
    gr.Number(label="Blood Glucose Level",minimum=80, maximum=300)
]

# Two text outputs, matching the (prediction, risk) tuple returned by predict_diabetes.
outputs = [
    gr.Textbox(label="Prediction"),
    gr.Textbox(label="Risk")
]

interface = gr.Interface(fn=predict_diabetes, inputs=inputs, outputs=outputs, title="Prediction of Type 2 Diabetes",
                         description="The prediction is not 100% accurate. Please consult a doctor to get proper diagnosis and treatment. ")

In [None]:
# Start the Gradio app. With debug=True the cell keeps running so errors and logs
# show up here; interrupt the cell to close the server.
interface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://4a2b9cd39b7cbca99c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://4a2b9cd39b7cbca99c.gradio.live


