<a href="https://colab.research.google.com/github/saraalharthi357/multiclass_diabetes_analysis/blob/main/multiclass_diabetes_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Initialization

In [1]:
# Install if needed
!pip install kagglehub[pandas-datasets]



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# 2. Loading the Dataset

## Load the dataset from Kaggle

In [5]:
# Load the dataset straight from Kaggle into a pandas DataFrame.
# Requires the pandas extras: pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Path of the file to load, relative to the root of the Kaggle dataset
file_path = "Multiclass Diabetes Dataset/Multiclass Diabetes Dataset.csv"

# Load the latest version of the dataset.
# dataset_load() is the replacement for load_dataset(), which kagglehub has
# deprecated; it takes the same adapter, dataset handle, and file path arguments.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "yasserhessein/multiclass-diabetes-dataset",
  file_path
)

  df = kagglehub.load_dataset(


First 5 records: 
    Gender  AGE  Urea  Cr  HbA1c  Chol   TG  HDL  LDL  VLDL   BMI  Class
0       0   50   4.7  46    4.9   4.2  0.9  2.4  1.4   0.5  24.0      0
1       1   26   4.5  62    4.9   3.7  1.4  1.1  2.1   0.6  23.0      0
2       1   33   7.1  46    4.9   4.9  1.0  0.8  2.0   0.4  21.0      0
3       0   45   2.3  24    4.0   2.9  1.0  1.0  1.5   0.4  21.0      0
4       0   50   2.0  50    4.0   3.6  1.3  0.9  2.1   0.6  24.0      0


# 3. Exploring the Dataset

In [7]:
# Size of the loaded frame as a (rows, columns) tuple: records x features
df.shape

(264, 12)

In [13]:
# Summarize the frame: column names, non-null counts, dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264 entries, 0 to 263
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  264 non-null    int64  
 1   AGE     264 non-null    int64  
 2   Urea    264 non-null    float64
 3   Cr      264 non-null    int64  
 4   HbA1c   264 non-null    float64
 5   Chol    264 non-null    float64
 6   TG      264 non-null    float64
 7   HDL     264 non-null    float64
 8   LDL     264 non-null    float64
 9   VLDL    264 non-null    float64
 10  BMI     264 non-null    float64
 11  Class   264 non-null    int64  
dtypes: float64(8), int64(4)
memory usage: 24.9 KB


In [15]:
# Display summary statistics for the dataset with a precision of 2 decimal places to reduce clutter.
# Note: pd.set_option changes the display precision for the rest of the session, not just this cell.
pd.set_option("display.precision", 2)
df.describe()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
count,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0,264.0
mean,0.55,49.52,5.67,85.81,6.86,4.59,2.15,1.18,2.53,1.48,26.63,1.12
std,0.5,10.13,4.0,99.4,2.54,1.29,1.27,0.46,1.0,3.1,5.09,0.91
min,0.0,25.0,1.1,6.0,0.9,0.0,0.6,0.4,0.3,0.2,19.0,0.0
25%,0.0,43.0,3.6,46.0,5.0,3.88,1.3,0.9,1.8,0.67,23.0,0.0
50%,1.0,50.0,4.7,61.0,6.1,4.5,1.8,1.1,2.5,0.9,25.0,1.0
75%,1.0,55.25,6.1,82.25,8.2,5.3,2.73,1.32,3.2,1.3,30.0,2.0
max,1.0,77.0,26.4,800.0,14.6,9.5,8.7,4.0,5.6,31.8,43.25,2.0


In [24]:
# Map each numeric code in the "Class" column to a readable label
class_labels = {
  0: "non-diabetic",
  1: "predicted-diabetic",
  2: "diabetic",
}
# Number of records per class code
class_counts = df["Class"].value_counts()

# Report the counts in the order the labels are declared above
for key, class_name in class_labels.items():
  print(f"Class: {class_name} - Count: {class_counts[key]}")

Class: non-diabetic - Count: 96
Class: predicted-diabetic - Count: 40
Class: diabetic - Count: 128


In [25]:
# Map each numeric code in the "Gender" column to a readable label
gender_labels = {
  0: "Male",
  1: "Female",
}
# Number of records per gender code
gender_counts = df["Gender"].value_counts()

# Report the counts in the order the labels are declared above
for key, gender_name in gender_labels.items():
  print(f"Gender: {gender_name} - Count: {gender_counts[key]}")

Gender: Male - Count: 120
Gender: Female - Count: 144


In [12]:
# Preview the first rows of the dataset
df.head()

Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
0,0,50,4.7,46,4.9,4.2,0.9,2.4,1.4,0.5,24.0,0
1,1,26,4.5,62,4.9,3.7,1.4,1.1,2.1,0.6,23.0,0
2,1,33,7.1,46,4.9,4.9,1.0,0.8,2.0,0.4,21.0,0
3,0,45,2.3,24,4.0,2.9,1.0,1.0,1.5,0.4,21.0,0
4,0,50,2.0,50,4.0,3.6,1.3,0.9,2.1,0.6,24.0,0


# 4. Visualization of the Data