In [2]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


# Load the data

In [28]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Download dataset id 891 through ucimlrepo (requires network access)
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# The fetched object exposes the features and the targets as separate frames
dataset = cdc_diabetes_health_indicators.data
X, y = dataset.features, dataset.targets

# Place the target column(s) next to the features in one DataFrame
df = pd.concat((X, y), axis="columns")
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,45,0,0,0,0,1,1,...,0,3,0,5,0,1,5,6,7,0
253676,1,1,1,18,0,0,0,0,0,0,...,0,4,0,0,1,0,11,2,4,1
253677,0,0,1,28,0,0,0,1,1,0,...,0,1,0,0,0,0,2,5,2,0
253678,1,0,1,23,0,0,0,0,1,1,...,0,3,0,0,0,1,7,5,1,0


# General feature statistics

In [29]:
# describe() reports count, mean, std, min, the quartiles (50% is the median) and max
summary = df.describe()

# df.mode() can return several rows when values tie; keep only the first row
mode = df.mode().iloc[0]

# Append the mode as an extra row so all statistics live in a single table
summary.loc['mode'] = mode

# Show the combined table
summary

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
count,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,...,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0,253680.0
mean,0.429001,0.424121,0.96267,28.382364,0.443169,0.040571,0.094186,0.756544,0.634256,0.81142,...,0.084177,2.511392,3.184772,4.242081,0.168224,0.440342,8.032119,5.050434,6.053875,0.139333
std,0.494934,0.49421,0.189571,6.608694,0.496761,0.197294,0.292087,0.429169,0.481639,0.391175,...,0.277654,1.068477,7.412847,8.717951,0.374066,0.496429,3.05422,0.985774,2.071148,0.346294
min,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,1.0,24.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,6.0,4.0,5.0,0.0
50%,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,7.0,0.0
75%,1.0,1.0,1.0,31.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,3.0,2.0,3.0,0.0,1.0,10.0,6.0,8.0,0.0
max,1.0,1.0,1.0,98.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,5.0,30.0,30.0,1.0,1.0,13.0,6.0,8.0,1.0
mode,0.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,8.0,0.0


# Separating binary/ordinal from continuous features

In [30]:
# Count the distinct values of every column once, then classify by cardinality
unique_counts = df.nunique()

# A column with exactly two distinct values is treated as binary
binary_features = [col for col in df.columns if unique_counts[col] == 2]

print("Binary Features:", binary_features)

# A non-binary column with fewer than 10 distinct values is assumed discrete
discrete_features = [
    col
    for col in df.columns
    if unique_counts[col] < 10 and col not in binary_features
]
print("\nDiscrete Features:", discrete_features)

Binary Features: ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex', 'Diabetes_binary']

Discrete Features: ['GenHlth', 'Education', 'Income']


# Checking if scaling is needed for ordinal features

In [31]:
# Print the cardinality and the per-category frequencies of each ordinal feature
for feature in ('GenHlth', 'Education', 'Income'):
    distribution = df[feature].value_counts()
    print(f"Feature: {feature}")
    print(f"Number of unique values: {df[feature].nunique()}")
    print(f"Value distribution:\n{distribution}\n")

Feature: GenHlth
Number of unique values: 5
Value distribution:
GenHlth
2    89084
3    75646
1    45299
4    31570
5    12081
Name: count, dtype: int64

Feature: Education
Number of unique values: 6
Value distribution:
Education
6    107325
5     69910
4     62750
3      9478
2      4043
1       174
Name: count, dtype: int64

Feature: Income
Number of unique values: 8
Value distribution:
Income
8    90385
7    43219
6    36470
5    25883
4    20135
3    15994
2    11783
1     9811
Name: count, dtype: int64



# Scaling continuous features

In [32]:
from sklearn.preprocessing import MinMaxScaler

# Columns that must stay unscaled: the binary columns (which include the
# Diabetes_binary target) and the low-cardinality discrete columns found by the
# classification cell. Deriving the exclusion set from those lists, instead of
# repeating every column name by hand, keeps the two cells from drifting apart.
unscaled_features = set(binary_features) | set(discrete_features)
features_to_scale = [col for col in df.columns if col not in unscaled_features]

# MinMaxScaler maps each selected column onto the [0, 1] range
scaler = MinMaxScaler()

# NOTE(review): the scaler is fit on the full dataset; if a train/test split
# happens later, fit on the training portion only to avoid leaking test
# statistics into the features.
# NOTE(review): Age has more than 10 distinct values, so it is scaled here like
# a continuous column — confirm that is intended.
df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

# Display the updated DataFrame
df

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,0.325581,1,0,0,0,0,1,...,0,5,0.6,0.500000,1,0,0.666667,4,3,0
1,0,0,0,0.151163,1,0,0,1,0,0,...,1,3,0.0,0.000000,0,0,0.500000,6,1,0
2,1,1,1,0.186047,0,0,0,0,1,0,...,1,5,1.0,1.000000,1,0,0.666667,4,8,0
3,1,0,1,0.174419,0,0,0,1,1,1,...,0,2,0.0,0.000000,0,0,0.833333,3,6,0
4,1,1,1,0.139535,0,0,0,1,1,1,...,0,2,0.1,0.000000,0,0,0.833333,5,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,1,1,1,0.383721,0,0,0,0,1,1,...,0,3,0.0,0.166667,0,1,0.333333,6,7,0
253676,1,1,1,0.069767,0,0,0,0,0,0,...,0,4,0.0,0.000000,1,0,0.833333,2,4,1
253677,0,0,1,0.186047,0,0,0,1,1,0,...,0,1,0.0,0.000000,0,0,0.083333,5,2,0
253678,1,0,1,0.127907,0,0,0,0,1,1,...,0,3,0.0,0.000000,0,1,0.500000,5,1,0


# Checking for missing values

In [34]:
# Number of missing entries in each column
missing_values = df.isnull().sum()

# The same counts expressed as a percentage of the total number of rows
missing_values_percentage = (missing_values / len(df)) * 100
missing_values_percentage

Unnamed: 0,0
HighBP,0.0
HighChol,0.0
CholCheck,0.0
BMI,0.0
Smoker,0.0
Stroke,0.0
HeartDiseaseorAttack,0.0
PhysActivity,0.0
Fruits,0.0
Veggies,0.0
