In [None]:
import pandas as pd

# Build a one-hot encoded, purely categorical version of nafld1.csv and
# write it to nafld_timbertrek.csv with the label in a final 'disease' column.

# Load nafld1.csv and keep only the columns used below.
# .copy() gives an independent frame so the later column assignments do not
# write into a slice of the raw data (avoids chained-assignment warnings).
df = pd.read_csv("nafld1.csv")
df = df[['age', 'male', 'weight', 'height', 'bmi', 'status']].copy()

# Handle missing values: impute ONLY the continuous measurements.
# 'status' is the label and must not be imputed (a mean-filled label is
# fractional and would defeat the missing-status filter below); 'male' is a
# 0/1 code whose mean maps to no gender, so imputing it has no effect.
continuous_cols = ['age', 'weight', 'height', 'bmi']
df[continuous_cols] = df[continuous_cols].fillna(
    df[continuous_cols].mean(numeric_only=True)
)

# Convert male (0 = Female, 1 = Male) to string for categorical.
# Rows with a missing/unknown code stay NaN and get all-zero gender dummies.
df['gender'] = df['male'].map({0: 'female', 1: 'male'})

# Bin continuous values into categorical labels.
# The last bin is open-ended so that extreme values fall into the top
# category instead of becoming NaN (which would be encoded as all zeros).
# NOTE(review): units are presumably years / cm / kg - confirm against the
# dataset documentation.
top = float('inf')
df['age_cat'] = pd.cut(
    df['age'],
    bins=[0, 40, 60, 80, top],
    labels=['young', 'middle-aged', 'senior', 'elderly'],
)
# right=False makes the intervals [lo, hi), so a BMI of exactly 18.5, 25 or 30
# starts the higher category, matching the conventional BMI cut-offs.
df['bmi_cat'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, top],
    labels=['underweight', 'normal', 'overweight', 'obese'],
    right=False,
)
df['height_cat'] = pd.cut(
    df['height'],
    bins=[0, 150, 165, 180, top],
    labels=['short', 'avg', 'tall', 'very tall'],
)
df['weight_cat'] = pd.cut(
    df['weight'],
    bins=[0, 60, 80, 100, top],
    labels=['light', 'med', 'heavy', 'very heavy'],
)

# Drop rows with missing status (effective now that the label is not imputed).
df = df[df['status'].notna()]

# Select only categorical columns
df_cat = df[['gender', 'age_cat', 'bmi_cat', 'height_cat', 'weight_cat', 'status']]

# One-hot encode the features directly as 0/1 integers.
df_encoded = pd.get_dummies(df_cat.drop(columns=['status']), dtype=int)

# Add the label column at the end (aligned on the shared row index).
df_encoded['disease'] = df_cat['status']

# Save as CSV
df_encoded.to_csv("nafld_timbertrek.csv", index=False)

print("Saved: nafld_timbertrek.csv (categorical & one-hot encoded, using 0/1)")
print("Missing status values:", df['status'].isna().sum())  # Should be 0


Saved: nafld_timbertrek.csv (categorical & one-hot encoded, using 0/1)
Missing status values: 0
