# Feature Engineering & Processing


### Imports


In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import mlflow

### Load data

In [67]:
# Read the BRFSS 2015 diabetes health-indicators CSV and preview its first rows.
data_path = '../data/diabetes_binary_health_indicators_BRFSS2015.csv'
df = pd.read_csv(data_path)
print(df.head())

   Diabetes_binary  HighBP  HighChol  CholCheck   BMI  Smoker  Stroke  \
0              0.0     1.0       1.0        1.0  40.0     1.0     0.0   
1              0.0     0.0       0.0        0.0  25.0     1.0     0.0   
2              0.0     1.0       1.0        1.0  28.0     0.0     0.0   
3              0.0     1.0       0.0        1.0  27.0     0.0     0.0   
4              0.0     1.0       1.0        1.0  24.0     0.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  ...  AnyHealthcare  \
0                   0.0           0.0     0.0  ...            1.0   
1                   0.0           1.0     0.0  ...            0.0   
2                   0.0           0.0     1.0  ...            1.0   
3                   0.0           1.0     1.0  ...            1.0   
4                   0.0           1.0     1.0  ...            1.0   

   NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex   Age  Education  \
0          0.0      5.0      18.0      15.0       1.0  0.0   9.0   

### Encode binary, ordinal, and nominal features

In [68]:
# Yes/No-style indicator columns that may arrive as text labels.
# BMI is deliberately NOT listed: it is a continuous measurement, and running it
# through the label mapping below would turn every value into NaN if the column
# were ever read as text. 'Sex' is listed so the Male/Female entries of the
# mapping are actually reachable.
binary_cols = ['Smoker', 'HighBP', 'Stroke', 'HighChol', 'HeartDiseaseorAttack', 'PhysActivity', 'Sex']
binary_label_map = {'Yes': 1, 'No': 0, 'Male': 1, 'Female': 0}
for col in binary_cols:
    # Columns that are already numeric are left untouched; only text columns are mapped.
    if df[col].dtype == 'object':
        df[col] = df[col].map(binary_label_map)


In [69]:
# Show the distinct values present in the GenHlth column.
genhlth_levels = df['GenHlth'].unique()
print(genhlth_levels)

[5. 3. 2. 4. 1.]


In [70]:
# GenHlth is stored as numbers already, so no encoding step is applied to it;
# preview the first rows as a sanity check.
genhlth_preview = df['GenHlth'].head()
print(genhlth_preview)

0    5.0
1    3.0
2    5.0
3    2.0
4    2.0
Name: GenHlth, dtype: float64


In [71]:
# Columns treated as nominal: each one is replaced by one indicator column per
# distinct value. NOTE: this cell rebinds `df`, so re-running it without first
# re-running the load cell fails because the source columns no longer exist.
nominal_cols = ['Age', 'Education', 'Income']
df = pd.get_dummies(data=df, columns=nominal_cols)

In [72]:
# Create new features (BMI categories)
def bmi_category(bmi):
    """Map a BMI value to a weight-status label.

    Bands (lower bound inclusive, upper bound exclusive):
        bmi < 18.5        -> 'Underweight'
        18.5 <= bmi < 25  -> 'Normal'
        25 <= bmi < 30    -> 'Overweight'
        bmi >= 30         -> 'Obese'

    Parameters
    ----------
    bmi : float or None
        Body-mass index of one respondent.

    Returns
    -------
    str or float
        One of the four labels above, or NaN when `bmi` is missing.
    """
    # A missing value fails every `<` comparison and would otherwise fall
    # through to the final branch, silently labelling it 'Obese'.
    if pd.isna(bmi):
        return float('nan')
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Normal'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'
# Label every row with its BMI band, then expand that label column into
# indicator columns (the label column itself is consumed by get_dummies).
bmi_labels = df['BMI'].apply(bmi_category)
df['BMI_Category'] = bmi_labels
df = pd.get_dummies(data=df, columns=['BMI_Category'])

#### Handle data imbalance using SMOTE

In [73]:
target_col = 'Diabetes_binary' 
X = df.drop(target_col, axis=1)
y = df[target_col]

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

#### Normalize numerical features

In [74]:
num_cols = X_res.select_dtypes(include='number').columns
scaler = StandardScaler()
X_res[num_cols] = scaler.fit_transform(X_res[num_cols])

#### Split data using stratified train/validation/test sets

In [75]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X_res, y_res, test_size=0.3, stratify=y_res, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

#### Print results

In [76]:
# Report the (rows, columns) shape of each split.
split_shapes = (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
)
for caption, frame in split_shapes:
    print(caption, frame.shape)

Train shape: (305667, 49)
Validation shape: (65500, 49)
Test shape: (65501, 49)


# Week 3: Model Development & Experimentation

In [77]:
# Baseline classifiers: each one is fitted on the training split, scored on the
# validation split, and logged as its own MLflow run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": GaussianNB()
}

# Per-model validation scores plus confusion matrix, keyed by model name.
results = {}

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        # Scalar metrics, kept in the order in which they are logged.
        scores = {
            "accuracy": accuracy_score(y_val, y_pred),
            "precision": precision_score(y_val, y_pred),
            "recall": recall_score(y_val, y_pred),
            "f1_score": f1_score(y_val, y_pred),
        }
        cm = confusion_matrix(y_val, y_pred)

        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        results[name] = {**scores, "confusion_matrix": cm}

        print(f"\n{name} Results:")
        print(classification_report(y_val, y_pred))
        print("Confusion Matrix:\n", cm)


Logistic Regression Results:
              precision    recall  f1-score   support

         0.0       0.81      0.90      0.85     32750
         1.0       0.88      0.80      0.84     32750

    accuracy                           0.85     65500
   macro avg       0.85      0.85      0.85     65500
weighted avg       0.85      0.85      0.85     65500

Confusion Matrix:
 [[29364  3386]
 [ 6705 26045]]

Decision Tree Results:
              precision    recall  f1-score   support

         0.0       0.88      0.87      0.88     32750
         1.0       0.87      0.88      0.88     32750

    accuracy                           0.88     65500
   macro avg       0.88      0.88      0.88     65500
weighted avg       0.88      0.88      0.88     65500

Confusion Matrix:
 [[28581  4169]
 [ 3809 28941]]

Naive Bayes Results:
              precision    recall  f1-score   support

         0.0       0.79      0.64      0.71     32750
         1.0       0.70      0.83      0.76     32750

    ac