In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Display setting for the whole notebook: with "all", every top-level expression
# in a cell is echoed, not just the last one. Later cells place several bare
# expressions (e.g. two `.head()` calls) in one cell and depend on this.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import pandas as pd

# Load the 2024 SAS transport (XPORT) file into a DataFrame. The format is
# stated explicitly; pandas would otherwise infer it from the .XPT suffix.
df_2024 = pd.read_sas('LLCP2024.XPT', format='xport')
# Other survey years can be loaded the same way when needed:
# df_2023 = pd.read_sas('LLCP2023.XPT')
# df_2022 = pd.read_sas('LLCP2022.XPT')


In [None]:
# Preview the first 25 rows of the raw frame.
display(df_2024.iloc[:25])

In [None]:
# List every column of df_2024, preceded by the column count.
cols = list(df_2024.columns)
print(f"{len(cols)} columns:")
print(cols)

In [None]:
# Keep only the rows whose DISPCODE equals 1100; `df` is an independent copy.
if 'DISPCODE' not in df_2024.columns:
    # Fail early with a clear message instead of a bare lookup error below.
    raise KeyError("DISPCODE column not found in df_2024")

is_dispcode_1100 = df_2024['DISPCODE'].eq(1100)
df = df_2024.loc[is_dispcode_1100].copy()

print("df.shape =", df.shape)
display(df.head())

In [None]:
# Columns carried forward into the analysis frame.
selected_columns = [
    '_MICHD', '_ASTHMS1', '_DRDXAR2', 'DIABETE4',
    '_PHYS14D', '_TOTINDA',
    '_SEX', '_AGE_G', '_BMI5CAT', '_EDUCAG', '_INCOMG1',
    '_RFSMOK3', 'DRNKANY6', 'SSBSUGR2',
]

# reindex (instead of plain selection) tolerates absent columns: any requested
# column that does not exist comes back as an all-NaN column rather than raising.
df_2024_new = df.reindex(columns=selected_columns).copy()

# Report the resulting shape and name any requested column the source lacks.
available = set(df_2024.columns)
missing_cols = [c for c in selected_columns if c not in available]
print("df_2024_new.shape =", df_2024_new.shape)
if missing_cols:
    print("Missing columns in df_2024:", missing_cols)

# Quick look at the selected data.
display(df_2024_new.head())

In [None]:
# Inspect dtypes and non-null counts of the selected columns before cleaning.
df_2024_new.info()

In [None]:
# Strip leading underscores from every string column label of df_2024_new.
# The frame is modified in place; non-string labels are left untouched.
df_2024_new.columns = [
    label.lstrip('_') if isinstance(label, str) else label
    for label in df_2024_new.columns
]
# Sanity check of the new labels.
print(df_2024_new.columns.tolist())
df_2024_new.head()

In [None]:
import numpy as np

# Per-column response codes that are treated as missing and turned into NaN.
# NOTE(review): DIABETE4 nulls code 2 in addition to 7 and 9 -- confirm that
# excluding that response is intended.
missing_values_map = {
    'MICHD': [7, 9],
    'PHYS14D': [9],
    'TOTINDA': [9],
    'ASTHMS1': [9],
    'DRDXAR2': [7, 9],
    'EDUCAG': [9],
    'INCOMG1': [9],
    'RFSMOK3': [9],
    'DRNKANY6': [7, 9],
    'SSBSUGR2': [777, 999],
    'DIABETE4': [9,7,2]
}

# Blank out the listed codes column by column; columns named in the map but
# absent from the frame are skipped. np.nan is pandas' marker for a missing value.
for column, codes in missing_values_map.items():
    if column not in df_2024_new.columns:
        continue
    is_special_code = df_2024_new[column].isin(codes)
    df_2024_new[column] = df_2024_new[column].mask(is_special_code, np.nan)

In [None]:
# Re-check non-null counts now that the special codes have been replaced by NaN.
df_2024_new.info()

In [None]:
# Keep complete cases only (drop every row that has at least one NaN).
# The explicit copy makes df_cleaned an independent frame, so adding the
# SSBSUGR2_CAT column to it in a later cell does not trigger pandas'
# SettingWithCopyWarning.
df_cleaned = df_2024_new.dropna().copy()

In [None]:
# After dropna every remaining column should report the same non-null count.
df_cleaned.info()

In [None]:
# --- Step 2: Define and Apply the Categorization Logic ---

# Bucket the raw SSBSUGR2 code into three labels:
#   101-199          -> 'High'
#   201-299          -> 'Medium'
#   301-399 or 888   -> 'Low'
# NOTE(review): the ranges presumably correspond to per-day / per-week /
# per-month answers, with 888 meaning "never" -- confirm against the codebook.
ssb_code = df_cleaned['SSBSUGR2']
conditions = [
    ssb_code.between(101, 199),                     # bounds are inclusive
    ssb_code.between(201, 299),
    ssb_code.between(301, 399) | ssb_code.eq(888),
]

# Labels, in the same order as `conditions`.
categories = ['High', 'Medium', 'Low']

# Any code matching none of the conditions is labelled 'Uncategorized'.
# .assign returns a new frame instead of writing into df_cleaned, which may be
# flagged as a slice of df_2024_new (it comes from dropna) and would then emit
# SettingWithCopyWarning on direct column assignment.
df_cleaned = df_cleaned.assign(
    SSBSUGR2_CAT=np.select(conditions, categories, default='Uncategorized')
)


In [None]:
# Distribution of the derived labels; any 'Uncategorized' rows show up here.
df_cleaned["SSBSUGR2_CAT"].value_counts()

In [None]:
# Save df_cleaned to 'cleaned_data.csv' without the index column.
df_cleaned.to_csv('cleaned_data.csv', index=False)

In [None]:
# Reload the cleaned dataset from disk.
df_loaded = pd.read_csv('cleaned_data.csv')

# Columns removed from both modelling frames.
shared_drop = ['SSBSUGR2', 'MICHD', 'ASTHMS1']

# df_dib: keeps DIABETE4, drops DRDXAR2.
df_dib = df_loaded.drop(columns=shared_drop + ['DRDXAR2'])
df_dib.shape
display(df_dib.head())

# df_heart: keeps DRDXAR2, drops DIABETE4.
# NOTE(review): MICHD is dropped here as well, so despite its name df_heart
# carries DRDXAR2 as its only condition column -- confirm this is intended.
df_heart = df_loaded.drop(columns=shared_drop + ['DIABETE4'])
df_heart.shape
display(df_heart.head())

In [None]:
# Columns of df_dib that hold integer codes and should use the nullable Int64 dtype.
numerical_col = ['PHYS14D', 'DIABETE4', 'TOTINDA', 'SEX', 'AGE_G', 'BMI5CAT', 'EDUCAG', 'INCOMG1', 'RFSMOK3', 'DRNKANY6']

# The loop variable gets its own name: reusing `numerical_col` would overwrite
# the list above and make the loop depend on whatever is being iterated instead.
for col in numerical_col:
    if col not in df_dib.columns:
        continue  # tolerate a column that is not present in this frame
    # Coerce to numeric; anything unparseable becomes NaN.
    coerced = pd.to_numeric(df_dib[col], errors='coerce')
    # Leave the column untouched when nothing in it is numeric.
    if not coerced.isna().all():
        df_dib[col] = coerced.astype("Int64")


In [None]:
# Columns of df_heart that hold integer codes and should use the nullable Int64 dtype.
numerical_col_heart = ['PHYS14D', 'DRDXAR2', 'TOTINDA', 'SEX', 'AGE_G', 'BMI5CAT', 'EDUCAG', 'INCOMG1', 'RFSMOK3', 'DRNKANY6']

# The loop variable gets its own name: reusing `numerical_col_heart` would
# overwrite the list above and make the loop depend on whatever is being iterated.
for col in numerical_col_heart:
    if col not in df_heart.columns:
        continue  # tolerate a column that is not present in this frame
    # Coerce to numeric; anything unparseable becomes NaN.
    coerced = pd.to_numeric(df_heart[col], errors='coerce')
    # Leave the column untouched when nothing in it is numeric.
    if not coerced.isna().all():
        df_heart[col] = coerced.astype("Int64")

In [None]:
# Preview both modelling frames after the integer conversion.
df_dib.head()
df_heart.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

# Encode SSBSUGR2_CAT as an ordered integer: Low=0, Medium=1, High=2.
# With the encoder's default settings a label outside this list (for example
# 'Uncategorized') makes fit_transform raise instead of being encoded silently.
ordinal_cols = ['SSBSUGR2_CAT']
ordinal_encoder = OrdinalEncoder(categories=[['Low', 'Medium', 'High']])

# fit_transform returns a 2-D float array, flattened here to one value per row.
# The Series is built on the frame's own index: column assignment aligns on
# index labels, so a default 0..n-1 index would misplace values (or produce
# NaN) whenever the frame's index is not exactly 0..n-1.
df_dib["SSBSUGR2_en"] = pd.Series(
    ordinal_encoder.fit_transform(df_dib[ordinal_cols]).ravel(),
    index=df_dib.index,
).astype("Int64")
df_heart["SSBSUGR2_en"] = pd.Series(
    ordinal_encoder.fit_transform(df_heart[ordinal_cols]).ravel(),
    index=df_heart.index,
).astype("Int64")
df_dib.head()
df_heart.head()

In [None]:
# The text label is no longer needed once its integer encoding exists.
df_dib = df_dib.drop('SSBSUGR2_CAT', axis='columns')
df_heart = df_heart.drop('SSBSUGR2_CAT', axis='columns')

In [None]:
# Confirm SSBSUGR2_CAT is gone and SSBSUGR2_en remains in both frames.
df_dib.head()
df_heart.head()

In [None]:
# Value distribution of DIABETE4 (in df_dib) and DRDXAR2 (in df_heart).
df_dib['DIABETE4'].value_counts()
df_heart['DRDXAR2'].value_counts()

In [None]:
# Persist both modelling frames. The next cell reloads 'df_dib.csv'; writing it
# here makes a fresh Restart & Run All work and guarantees the reload sees the
# data produced by this run rather than a file left over from an earlier session.
df_dib.to_csv('df_dib.csv', index=False)
df_heart.to_csv('df_heart.csv', index=False)

In [None]:
# Reload df_dib from 'df_dib.csv'. This replaces the in-memory frame, so the
# column dtypes are whatever read_csv infers from the file.
df_dib = pd.read_csv('df_dib.csv')

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# DIABETE4 is the target; every other column of df_dib is a feature.
X = df_dib.drop(columns='DIABETE4')
y = df_dib['DIABETE4']

# 90/10 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

# Standardise the features: the scaler is fitted on the training part only and
# the same fitted scaler is then applied to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, roc_auc_score

# Baseline candidates as a list of (label, estimator) pairs, all seeded with 42.
models = list({
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1),
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
}.items())

# Fit every candidate on the scaled training data and report its accuracy on
# the training and test parts.
for name, model in models:
    model.fit(X_train_scaled, y_train)
    train_score, test_score = (
        model.score(X_train_scaled, y_train),
        model.score(X_test_scaled, y_test),
    )
    print(f"{name} - Train Accuracy: {train_score:.4f}, Test Accuracy: {test_score:.4f}")

In [None]:
# Detailed evaluation of every candidate: accuracy, weighted F1 and weighted
# one-vs-rest ROC AUC on the test part. `results` keeps what is needed to plot
# ROC curves later, or None for a model without probability estimates.
results = {}
for name, model in models:
    # (Re)fit so this cell does not depend on the previous one having run.
    model.fit(X_train_scaled, y_train)

    # Accuracy on both parts of the split.
    train_score = model.score(X_train_scaled, y_train)

    # Hard predictions for the test part.
    y_pred_test = model.predict(X_test_scaled)

    # Probability estimates, one column per class. Estimators that do not
    # provide predict_proba are reported and left out of the ROC AUC.
    try:
        y_proba_test = model.predict_proba(X_test_scaled)
    except AttributeError:
        print(f"\nCould not get probabilities for {name}. Skipping ROC AUC.")
        y_proba_test = None

    test_score = model.score(X_test_scaled, y_test)
    test_f1 = f1_score(y_test, y_pred_test, average='weighted')

    if y_proba_test is None:
        test_roc_auc = "N/A"
        results[name] = None  # nothing to plot for this model
    else:
        test_roc_auc = roc_auc_score(y_test, y_proba_test, multi_class='ovr', average='weighted')

        # Plain lists are stored so the dictionary can be serialised to JSON.
        results[name] = {
            'y_true': y_test.tolist(),
            'y_proba': y_proba_test.tolist(),
            'roc_auc': test_roc_auc
        }

    # Summary for this model.
    print(f"\nModel: {name}")
    print(f"  Train Accuracy: {train_score:.4f}")
    print(f"  Test Accuracy:  {test_score:.4f}")
    print(f"  Test F1 Score:  {test_f1:.4f}")
    if y_proba_test is not None:
        print(f"  Test ROC AUC:   {test_roc_auc:.4f}")

In [None]:
# Hand-configured HistGradientBoosting model.
# random_state is fixed like for every other estimator in this notebook. With
# the default early_stopping='auto', scikit-learn holds out a random validation
# subset on large training sets, so an unseeded model may differ between runs.
model = HistGradientBoostingClassifier(
    max_leaf_nodes=70,
    max_iter=300,
    learning_rate=0.01,
    random_state=42,
)

# Fit the model on the scaled training data
model.fit(X_train_scaled, y_train)

# Accuracy on the training and test parts
train_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)

print(f"Train Accuracy: {train_score:.4f}, Test Accuracy: {test_score:.4f}")

In [None]:
# Hard class predictions and per-class probabilities for the test part.
y_pred, y_pred_proba = (
    model.predict(X_test_scaled),
    model.predict_proba(X_test_scaled),
)


In [None]:
# accuracy_score is imported here because this cell is its first use; without
# the import a fresh Restart & Run All stops with a NameError.
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Header label taken from the estimator being reported. The loop variable
# `name` left over from an earlier cell refers to a different model.
model_label = type(model).__name__

test_accuracy = accuracy_score(y_test, y_pred)

# Multi-class AUC: one-vs-rest, unweighted (macro) mean over the classes.
test_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='macro')

# Precision, recall and F1 per class; zero_division=0 reports 0 instead of
# warning when a class receives no predictions.
report = classification_report(y_test, y_pred, zero_division=0)

print(f"--- {model_label} ---")
print(f"  - Test Accuracy: {test_accuracy:.4f}")
print(f"  - Test AUC (OvR): {test_auc:.4f}")
print("\n  --- Classification Report ---")
# Indent the report for better readability
report_lines = report.split('\n')
for line in report_lines:
    print(f"    {line}")
print("-" * (len(model_label) + 8) + "\n") # Separator

In [None]:
# Inspect the probability matrix returned by predict_proba for the test part.
y_pred_proba

In [None]:
# AUC of the fitted model on the test part: averaged multi-class scores first,
# then one binary score per class.
proba = model.predict_proba(X_test_scaled)  # one row per sample, one column per class

# One-vs-rest AUC, averaged over classes without (macro) and with (weighted)
# class-frequency weights.
auc_macro = roc_auc_score(y_test, proba, multi_class='ovr', average='macro')
auc_weighted = roc_auc_score(y_test, proba, multi_class='ovr', average='weighted')

print(f"Multi-class AUC (macro)   : {auc_macro:.4f}")
print(f"Multi-class AUC (weighted): {auc_weighted:.4f}")

# Binary "this class vs. everything else" AUC; the columns of `proba` follow
# the order of model.classes_.
for cls, cls_scores in zip(model.classes_, proba.T):
    is_cls = (y_test == cls).astype(int)
    auc_cls = roc_auc_score(is_cls, cls_scores)
    print(f"AUC for class {cls} vs rest: {auc_cls:.4f}")

In [None]:
import joblib

# Persist the fitted scaler and the fitted model as two separate files so they
# can be loaded independently.
scaler_path = "scaler_hist_dib.pkl"
model_path = "hist_model_dib.pkl"
joblib.dump(scaler, scaler_path)
joblib.dump(model, model_path)
print("✅ Scaler and model saved successfully!")

In [None]:
# Feature columns, in the order the scaler was fitted on (X_train).
print(X_train.columns)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Estimators to tune, keyed by label. Only HistGradientBoosting is active.
# Disabled candidates, kept for reference together with their search values:
#   "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
#       'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']
#   "RandomForest": RandomForestClassifier(random_state=42, n_jobs=-1)
#       'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30],
#       'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]
models = {
    "HistGradientBoosting": HistGradientBoostingClassifier(random_state=42),
}

# Candidate values per hyperparameter. Plain lists are used for simplicity;
# RandomizedSearchCV samples combinations from them.
param_grids = {
    "HistGradientBoosting": {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_iter': [100, 200, 300],
        'max_leaf_nodes': [31, 50, 70],
    },
}

# --- 3. Run Hyperparameter Tuning ---

# Settings shared by every search: 10 sampled combinations, 3-fold
# cross-validation, fixed seed, all CPU cores.
search_settings = dict(n_iter=10, cv=3, verbose=1, random_state=42, n_jobs=-1)

# Best estimator found for each label.
best_models = {}

for name in models:
    print(f"--- Tuning {name} ---")

    random_search = RandomizedSearchCV(
        estimator=models[name],
        param_distributions=param_grids[name],
        **search_settings,
    )
    random_search.fit(X_train_scaled, y_train)

    # Keep the winner, then report it.
    best_models[name] = random_search.best_estimator_

    print(f"Best Parameters: {random_search.best_params_}")
    print(f"Best CV Score (Accuracy): {random_search.best_score_:.4f}")
    print("-" * (len(name) + 12) + "\n")


# --- 4. Evaluate Best Models on Test Set ---
print("\n--- Evaluating Best Models on Test Data ---\n")

for name, model in best_models.items():
    # Accuracy of the tuned estimator on the held-out test part.
    y_pred = model.predict(X_test_scaled)
    test_accuracy = accuracy_score(y_test, y_pred)

    print(f"{name}:")
    print(f"  - Test Accuracy: {test_accuracy:.4f}")