# Feture Engineering & Processing


### Imports


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE

### Load data

In [20]:
df = pd.read_csv('../data/diabetes_binary_health_indicators_BRFSS2015.csv')
print(df.head())

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

### Encode binary, ordinal, and nominal features

In [32]:
binary_cols = ['Smoker', 'HighBP', 'Stroke', 'HighChol', 'BMI', 'HeartDiseaseorAttack', 'PhysActivity']
for col in binary_cols:
    if df[col].dtype == 'object':
        df[col] = df[col].map({'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0})


In [33]:
print(df['GenHlth'].unique())

[5. 3. 2. 4. 1.]


In [34]:
# GenHlth is already numeric, nothing else needed
print(df['GenHlth'].head())

0    5.0
1    3.0
2    5.0
3    2.0
4    2.0
Name: GenHlth, dtype: float64


In [35]:
# Nominal columns
nominal_cols = ['Age', 'Education', 'Income']
df = pd.get_dummies(df, columns=nominal_cols)

KeyError: "None of [Index(['Age', 'Education', 'Income'], dtype='object')] are in the [columns]"

In [None]:
# Create new features (BMI categories)
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif bmi < 25:
        return 'Normal'
    elif bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'
df['BMI_Category'] = df['BMI'].apply(bmi_category)
df = pd.get_dummies(df, columns=['BMI_Category'])

#### Handle data imbalance using SMOTE

In [36]:
target_col = 'Diabetes_binary' 
X = df.drop(target_col, axis=1)
y = df[target_col]

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

#### Normalize numerical features

In [37]:
num_cols = X_res.select_dtypes(include='number').columns
scaler = StandardScaler()
X_res[num_cols] = scaler.fit_transform(X_res[num_cols])

#### Split data using stratified train/validation/test sets

In [38]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X_res, y_res, test_size=0.3, stratify=y_res, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

#### Print results

In [39]:
print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)

Train shape: (305667, 49)
Validation shape: (65500, 49)
Test shape: (65501, 49)
