## Data quality checks

We check:
1) class balance (targets),
2) missing values,
3) duplicate rows,
4) basic sanity checks for ranges and inconsistent labels.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load data
# The CSV is read through a relative path (one directory up, inside data/),
# so the notebook must be run from its own folder for the path to resolve.
df = pd.read_csv(filepath_or_buffer="../data/diabetes_dataset.csv")

# Report (rows, columns) before previewing the first five records.
print("Shape:", df.shape)
df.head(n=5)

Shape: (100000, 31)


Unnamed: 0,age,gender,ethnicity,education_level,income_level,employment_status,smoking_status,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,...,hdl_cholesterol,ldl_cholesterol,triglycerides,glucose_fasting,glucose_postprandial,insulin_level,hba1c,diabetes_risk_score,diabetes_stage,diagnosed_diabetes
0,58,Male,Asian,Highschool,Lower-Middle,Employed,Never,0,215,5.7,...,41,160,145,136,236,6.36,8.18,29.6,Type 2,1
1,48,Female,White,Highschool,Middle,Employed,Former,1,143,6.7,...,55,50,30,93,150,2.0,5.63,23.0,No Diabetes,0
2,60,Male,Hispanic,Highschool,Middle,Unemployed,Never,1,57,6.4,...,66,99,36,118,195,5.07,7.51,44.7,Type 2,1
3,74,Female,Black,Highschool,Low,Retired,Never,0,49,3.4,...,50,79,140,139,253,5.28,9.03,38.2,Type 2,1
4,46,Male,White,Graduate,Middle,Retired,Never,1,109,7.2,...,52,125,160,137,184,12.74,7.2,23.5,Type 2,1


In [4]:
# 1) Class balance (imbalance)
# Both target columns get the same report: absolute counts first, then
# proportions. dropna=False keeps NaN as its own category, so a missing label
# would appear in the table instead of being silently dropped.
for heading, target in (
    ("diagnosed_diabetes distribution:", "diagnosed_diabetes"),
    ("\ndiabetes_stage distribution:", "diabetes_stage"),
):
    labels = df[target]
    print(heading)
    print(labels.value_counts(dropna=False))
    print("\nProportions:")
    print(labels.value_counts(normalize=True, dropna=False))


diagnosed_diabetes distribution:
diagnosed_diabetes
1    59998
0    40002
Name: count, dtype: int64

Proportions:
diagnosed_diabetes
1    0.59998
0    0.40002
Name: proportion, dtype: float64

diabetes_stage distribution:
diabetes_stage
Type 2          59774
Pre-Diabetes    31845
No Diabetes      7981
Gestational       278
Type 1            122
Name: count, dtype: int64

Proportions:
diabetes_stage
Type 2          0.59774
Pre-Diabetes    0.31845
No Diabetes     0.07981
Gestational     0.00278
Type 1          0.00122
Name: proportion, dtype: float64


In [16]:
# 2) Missing values
# Count the NaN/None cells in each column, then add the per-column counts into
# a single total for the whole frame (cast to a plain Python int for printing).
na_per_column = df.isna().sum()
missing = int(na_per_column.sum())
print("Missing values:", missing)

Missing values: 0


In [13]:
# 3) Duplicate rows

# Columns treated as prediction targets; every other column counts as a feature.
possible_targets = ["diagnosed_diabetes", "diabetes_stage"]
feature_cols = [name for name in df.columns if name not in possible_targets]

# Rows that repeat an earlier row in every single column.
dup_all = df.duplicated().sum()
print("Exact duplicate rows (all columns):", dup_all)

# Rows whose feature values repeat an earlier row, whatever their target values
# are. A non-zero count here with dup_all == 0 would mean identical inputs
# carrying different labels.
is_repeat_on_features = df.duplicated(subset=feature_cols)
dup_features = is_repeat_on_features.sum()
print("Duplicate rows based on features (ignoring targets):", dup_features)

Exact duplicate rows (all columns): 0
Duplicate rows based on features (ignoring targets): 0


In [20]:
# 4) Sanity checks on numeric columns
# numpy and pandas are already imported in the notebook's first code cell, so
# they are not imported again here.

# Thresholds used below, named so they can be tuned in one place.
BINARY_MAX_UNIQUE = 3        # <= this many distinct values (NaN counted) -> "binary-like"
CONTINUOUS_MIN_UNIQUE = 20   # >  this many distinct values (NaN counted) -> "continuous"
IQR_FENCE = 1.5              # fence multiplier: outside [Q1 - k*IQR, Q3 + k*IQR] is an outlier
TOP_N = 15                   # maximum rows shown in each outlier table

num_cols = df.select_dtypes(include="number").columns

# Distinct-value count per numeric column, computed once and reused for both
# the binary-like and the continuous selection. NaN counts as a value
# (dropna=False), which is why a 0/1 column with gaps still fits under the
# binary-like threshold of 3.
n_unique = df[num_cols].nunique(dropna=False)

# Binary:
binary_like = [c for c in num_cols if n_unique[c] <= BINARY_MAX_UNIQUE]

print("Binary columns:", binary_like)

# Show the actual values so anything other than the expected codes stands out.
for c in binary_like:
    vals = np.sort(df[c].dropna().unique())
    print(f"{c}: unique values = {vals}")

# Continuous numeric columns:
continuous_cols = [c for c in num_cols if n_unique[c] > CONTINUOUS_MIN_UNIQUE]

cont = df[continuous_cols]

# Per-column quartiles and interquartile range.
q1 = cont.quantile(0.25)
q3 = cont.quantile(0.75)
iqr = q3 - q1

# True where a value lies outside the per-column IQR fences. The comparison
# aligns the quartile Series with the frame's columns.
outlier_mask = (cont < (q1 - IQR_FENCE * iqr)) | (cont > (q3 + IQR_FENCE * iqr))
outlier_counts = outlier_mask.sum().sort_values(ascending=False)
outlier_rates = (outlier_mask.mean() * 100).sort_values(ascending=False)

# Only columns with at least one flagged value are listed.
print("Top outlier counts (IQR, continuous only):")
display(outlier_counts[outlier_counts > 0].head(TOP_N))

print("Top outlier rates % (IQR, continuous only):")
display(outlier_rates[outlier_rates > 0].head(TOP_N))


Binary columns: ['family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'diagnosed_diabetes']
family_history_diabetes: unique values = [0 1]
hypertension_history: unique values = [0 1]
cardiovascular_history: unique values = [0 1]
diagnosed_diabetes: unique values = [0 1]
Top outlier counts (IQR, continuous only):


physical_activity_minutes_per_week    3199
diabetes_risk_score                    914
sleep_hours_per_day                    900
heart_rate                             855
glucose_fasting                        745
bmi                                    744
diastolic_bp                           731
glucose_postprandial                   634
hba1c                                  618
hdl_cholesterol                        565
systolic_bp                            530
ldl_cholesterol                        349
diet_score                             337
insulin_level                          326
cholesterol_total                      309
dtype: int64

Top outlier rates % (IQR, continuous only):


physical_activity_minutes_per_week    3.199
diabetes_risk_score                   0.914
sleep_hours_per_day                   0.900
heart_rate                            0.855
glucose_fasting                       0.745
bmi                                   0.744
diastolic_bp                          0.731
glucose_postprandial                  0.634
hba1c                                 0.618
hdl_cholesterol                       0.565
systolic_bp                           0.530
ldl_cholesterol                       0.349
diet_score                            0.337
insulin_level                         0.326
cholesterol_total                     0.309
dtype: float64