

```
# author: puru panta (purupanta@uky.edu)
# date: 11/30/2024
# filename: PredictMLDL_Separate
```



**STEP1: Install and import the required libraries**

In [1]:
!pip install --upgrade tf-keras
!pip install --upgrade tensorflow
!pip install --upgrade scikit-learn imbalanced-learn

Collecting tf-keras
  Downloading tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.


In [2]:
import json

In [3]:
# Mounting google drive if it is already not mounted
def LoadGoogleDrive(googleDriveDir):
  """Mount Google Drive at ``googleDriveDir`` unless it is already mounted.

  The drive counts as mounted when ``googleDriveDir + "/MyDrive"`` exists.

  Args:
    googleDriveDir: Mount point to check and, if needed, mount to.
  """
  import os

  my_drive_path = googleDriveDir + "/MyDrive"

  # Nothing to do when the MyDrive folder is already visible.
  if os.path.exists(my_drive_path):
    print("Google Drive is already mounted!")
    return

  print("Mounting Google Drive...")
  # Imported lazily: google.colab is only needed on the mounting path.
  from google.colab import drive
  drive.mount(googleDriveDir)

# Loading the AppConfig file
def LoadAppConfig(json_file_path):
  """Read the application configuration JSON file.

  Args:
    json_file_path: Path of the JSON configuration file.

  Returns:
    The object produced by ``json.load`` for the file content.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If the content is not valid JSON.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's default locale encoding.
  with open(json_file_path, "r", encoding="utf-8") as json_file:
    config_data = json.load(json_file)

  print("AppConfig JSON File Loaded Successfully!")
  return config_data

# Loading the AppLib file
def AppLib_reload_script(script_path):
  """Execute a Python script inside this module's global namespace.

  Every name the script defines becomes (or replaces) a global here, so
  calling this again after editing the script reloads its definitions.

  NOTE: this runs arbitrary code from ``script_path``; only use it on
  trusted files.

  Args:
    script_path: Path of the Python source file to execute.

  Raises:
    OSError: If the file cannot be opened.
    SyntaxError: If the script does not compile.
  """
  with open(script_path, "r", encoding="utf-8") as file:
    script_code = file.read()

  # Compiling with the real path makes tracebacks point at the script file
  # instead of the anonymous "<string>" that a bare exec(str) reports.
  exec(compile(script_code, script_path, "exec"), globals())

**STEP2: Load Google Drive, libraries, config file and application library file**

In [4]:
# Make Google Drive available at the Colab mount point
googleDriveFolder = "/content/drive"
LoadGoogleDrive(googleDriveFolder)

# Read the application settings from AppConfig.json in the project folder
json_file_path = f"{googleDriveFolder}/MyDrive/Colab Notebooks/HINTS6Finale/AppConfig.json"
config_data = LoadAppConfig(json_file_path)

# Assemble the AppLib script path from its three config entries, then execute it
lib_script_path = "".join(
    (
        config_data["project_location"],
        config_data["app_lib_path"],
        config_data["app_lib_filename"],
    )
)
print(f"Loading AppLib from: {lib_script_path}")
AppLib_reload_script(lib_script_path)


Mounting Google Drive...
Mounted at /content/drive
AppConfig JSON File Loaded Successfully!
Loading AppLib from: drive/MyDrive/Colab Notebooks/HINTS6Finale/lib/AppLib.py


In [5]:
# Path of the requirements file, built from two config entries.
# AppLib.LoadRequirements is defined in the externally loaded AppLib script;
# judging by its output it installs missing packages -- verify in AppLib.py.
requirements_path = "".join(
    (config_data["project_location"], config_data["requirements_file_name"])
)
AppLib.LoadRequirements(requirements_path)

Installing missing packages: ['tf-keras', 'scikit-learn', 'imbalanced-learn', 'xlsxwriter']
Installation completed successfully!
Please restart the runtime for changes to take effect.


**STEP3: Load the original data with filtered columns**

In [6]:
# Define the output file path
ip_file_path = config_data["project_location"] + config_data["ip_data_dir"] + config_data["ip_file_name"];
ip_sheet_name = config_data["ip_file_sheet_name"]
ip_load_data_cols = config_data["study_cols"]["target_cols"] + config_data["study_cols"]["predictor_cols"] + config_data["study_cols"]["predictor_cols0"]

# Loading cleaded excel data
df_orig = AppLib.load_data(ip_file_path, ip_sheet_name, "")


Data Size: 2982204, Data Shape: (6252, 477), (Flag: Loaded, original data)


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import mutual_info_classif, RFE, RFECV, SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Define target variable
# targets = ['MedConditions_HeartCondition']
target = "MedConditions_HeartCondition"

# Drop rows with missing target values
df = df_orig.dropna(subset=[target])

# Drop rows with missing target values
df = df.dropna(subset=[target])

In [None]:
# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Identify and drop datetime columns
datetime_cols = X.select_dtypes(include=["datetime64"]).columns
if len(datetime_cols) > 0:
    print(f"Dropping datetime columns: {list(datetime_cols)}")
    X = X.drop(columns=datetime_cols)

# Encode categorical variables
label_encoders = {}  # Store encoders for reference
for col in X.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le  # Save encoder if needed later

# Fill missing values (mean for numerical, mode for categorical)
for col in X.columns:
    if X[col].isnull().sum() > 0:
        if X[col].dtype == "object":
            X[col] = X[col].fillna(X[col].mode()[0])  # Fix: Avoid inplace warning
        else:
            X[col] = X[col].fillna(X[col].mean())  # Fix: Avoid inplace warning

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ---- FEATURE SELECTION ----
# Mutual Information (Feature Importance)
mi_scores = mutual_info_classif(X_train, y_train)
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# Recursive Feature Elimination (RFE) with Logistic Regression
model = LogisticRegression(max_iter=500)
rfe = RFE(model, n_features_to_select=18)  # Selecting top 10 features
rfe.fit(X_train, y_train)

# Get selected features
selected_features = X.columns[rfe.support_]

# Display Results
print("Top Features from Mutual Information:\n", mi_series.head(10))
print("\nSelected Features from RFE:\n", selected_features.tolist())

Dropping datetime columns: ['updatedate']
Top Features from Mutual Information:
 MedConditions_HighBP         0.193772
MedConditions_LungDisease    0.183926
MedConditions_Diabetes       0.182365
GeneralHealth                0.163411
MedConditions_Depression     0.162156
TalkHealthFriends            0.150666
OwnAbilityTakeCareHealth     0.149303
Height_Feet                  0.148320
Deaf                         0.147500
Hopeless                     0.143810
dtype: float64

Selected Features from RFE:
 ['OfferedTelehealthOption', 'OfferedAccessInsurer3', 'Caregiving_LongTerm', 'Caregiving_Other', 'UndGenTest3_Spouse', 'UndGenTest3_Other', 'BehavChg_Other_OS', 'GeneralHealth', 'MedConditions_HighBP', 'MedConditions_LungDisease', 'Weight', 'TimesSunburned', 'HCPAlcohol_Alcoholism', 'HCPAlcohol_Heart', 'CaBone', 'CaMelanoma', 'BirthGender', 'FULLTIMEOCC_CAT']


In [None]:
# Separate features and target
X = df.drop(columns=[target])
y = df[target]

# Drop datetime columns
datetime_cols = X.select_dtypes(include=["datetime64"]).columns
X = X.drop(columns=datetime_cols)

# Encode categorical variables
label_encoders = {}
for col in X.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Handle missing values
for col in X.columns:
    if X[col].isnull().sum() > 0:
        if X[col].dtype == "object":
            X[col] = X[col].fillna(X[col].mode()[0])
        else:
            X[col] = X[col].fillna(X[col].mean())

# Scale numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# ---- BETTER FEATURE SELECTION ----

# **Mutual Information**
mi_scores = mutual_info_classif(X_train, y_train)
mi_series = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)

# **Recursive Feature Elimination with Cross-Validation (RFECV)**
log_reg = LogisticRegression(max_iter=500)
rfecv = RFECV(estimator=log_reg, step=1, cv=5, scoring='accuracy')
rfecv.fit(X_train, y_train)
rfe_selected_features = X.columns[rfecv.support_]

# **Lasso Regression for Feature Selection**
lasso = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
lasso_selected_features = X.columns[lasso.coef_ != 0]

# **Random Forest Feature Importance**
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
rf_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
rf_selected_features = rf_importance.head(10).index.tolist()  # Top 10 features

# ---- FINAL SELECTED FEATURES ----
selected_features = list(set(rfe_selected_features) & set(lasso_selected_features) & set(rf_selected_features))

# Display Results
print(" **Top Features from Mutual Information**:\n", mi_series.head(10))
print("\n **Selected Features from RFECV**:\n", rfe_selected_features.tolist())
print("\n **Selected Features from Lasso Regression**:\n", lasso_selected_features.tolist())
print("\n **Selected Features from Random Forest**:\n", rf_selected_features)
print("\n **Final Selected Features (Common across Methods)**:\n", selected_features)


 **Top Features from Mutual Information**:
 MedConditions_HighBP         0.195754
MedConditions_LungDisease    0.176136
MedConditions_Diabetes       0.173190
MedConditions_Depression     0.167419
GeneralHealth                0.151208
UndMedicalStats              0.149512
TalkHealthFriends            0.148701
ClearSenseDir                0.148513
PROMIS_Meaning_t             0.146628
Deaf                         0.146019
dtype: float64

 **Selected Features from RFECV**:
 ['OfferedTelehealthOption', 'OfferedAccessInsurer3', 'Caregiving_LongTerm', 'Caregiving_Other', 'BehavChg_Other_OS', 'GeneralHealth', 'MedConditions_HighBP', 'MedConditions_LungDisease', 'HCPAlcohol_Alcoholism', 'HCPAlcohol_Heart', 'BirthGender', 'FULLTIMEOCC_CAT']

 **Selected Features from Lasso Regression**:
 ['CancerTrustFamily', 'CancerTrustScientists', 'Internet_Cell', 'WillingShareData_Fam', 'FreqGoProvider', 'HelpUncertainty', 'Telehealth_GoodCare', 'OfferedAccessInsurer3', 'HCPEncourageOnlineRec2', 'RecordsOnl