In [1]:
!pip install --upgrade tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow)
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (615.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m615.3/615.3 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorboard-2.18.0-py3-none-any.whl (5.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard, tensorflow
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.17.1
    Uninstalling tensorboard-2.17.1:
      Successfully uninstalled tensorboard-2.17.1
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.17.1
    Uninstalling tensorflo

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
def load_data():
  data_file = 'drive/MyDrive/Colab Notebooks/HINTS/hints6_public.xlsx'
  # prompt: link google drive
  from google.colab import drive
  drive.mount('/content/drive')
  # Load the dataset
  data = pd.read_excel(data_file)
  print('Data Size: ' + str(data.size) + ' Data Shape: ' + str(data.shape))
  return data

In [None]:
orig_data = load_data()

Mounted at /content/drive
Data Size: 2982204 Data Shape: (6252, 477)


**Boruta and LASSO**

In [None]:
data = orig_data.copy()

In [None]:
# Target column
target_column = 'MedConditions_HeartCondition'

# Filter out invalid target values (-7 and -9)
data_cleaned = data[data[target_column].isin([1, 2])]

# Map target values to binary classification: {2: 0 (No), 1: 1 (Yes)}
data_cleaned[target_column] = data_cleaned[target_column].map({2: 0, 1: 1})

# Checking the cleaned target variable
print(data_cleaned[target_column].value_counts())


MedConditions_HeartCondition
0    5407
1     607
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cleaned[target_column] = data_cleaned[target_column].map({2: 0, 1: 1})


In [None]:
# Splitting features and target variable
X = data_cleaned.drop(columns=[target_column])  # Features
y = data_cleaned[target_column]  # Target

# Separate numeric and non-numeric columns
X_NumericCol = X.select_dtypes(include=['float64', 'int64']).columns
X_NonNumericCol = X.select_dtypes(exclude=['float64', 'int64']).columns

print(f"Numeric columns: {len(X_NumericCol)}")
print(f"Non-numeric columns: {len(X_NonNumericCol)}")

# Option 1: Drop non-numeric columns (if irrelevant or challenging to encode)
X_prepared = X[X_NumericCol]

# Option 2: One-hot encode non-numeric columns (if relevant for modeling)
# Uncomment the following if you want to encode instead of dropping:
# X_non_numeric_encoded = pd.get_dummies(X[non_numeric_columns], drop_first=True)
# X_prepared = pd.concat([X[numeric_columns], X_non_numeric_encoded], axis=1)

# Display the final prepared dataset structure
print(X_prepared.info())


Numeric columns: 463
Non-numeric columns: 13
<class 'pandas.core.frame.DataFrame'>
Index: 6014 entries, 0 to 6251
Columns: 463 entries, HHID to IncomeRanges_IMP
dtypes: float64(58), int64(405)
memory usage: 21.3 MB
None


In [None]:
# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_NumericCol, y, test_size=0.3, random_state=42)

# Step 4: Handle missing values (Impute missing values with the column mean)
imputer = SimpleImputer(strategy='mean')
X_train_cleaned = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test_cleaned = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)


ValueError: Found input variables with inconsistent numbers of samples: [463, 6014]

In [None]:
!pip install boruta



In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Define the Random Forest model (rf_model)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Initialize Boruta with modified parameters
boruta = BorutaPy(
    estimator=rf_model,
    n_estimators='auto',
    random_state=42,
    max_iter=200,  # Increase the number of iterations
    alpha=0.05  # Be less strict in feature selection
)

# Fit Boruta to the cleaned training data
boruta.fit(X_train_cleaned.values, y_train.values)

# Get the top 10 features from Boruta
selected_features = X_train_cleaned.columns[boruta.support_].tolist()
if len(selected_features) < 10:
    # Add tentative features until we reach 10
    tentative_features = X_train_cleaned.columns[boruta.support_weak_].tolist()
    selected_features += tentative_features[:10 - len(selected_features)]

print("Top 10 features by Boruta:")
print(selected_features[:10])


Top 10 features by Boruta:
['Age']


In [None]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
import numpy as np

# Define the Lasso model with cross-validated alpha for regularization
lasso = Lasso(alpha=0.01, random_state=42, max_iter=10000)  # Adjust alpha for stricter/looser selection

# Fit Lasso on the training data
lasso.fit(X_train_cleaned, y_train)

# Use SelectFromModel to automatically select features with non-zero coefficients
model = SelectFromModel(lasso, prefit=True)

# Get the selected features
selected_features = X_train_cleaned.columns[model.get_support()].tolist()

# If fewer than 10 features are selected, take the top by absolute coefficient magnitude
if len(selected_features) < 10:
    lasso_coefficients = np.abs(lasso.coef_)
    feature_ranking = np.argsort(lasso_coefficients)[::-1]  # Sort by descending coefficient magnitude
    additional_features = X_train_cleaned.columns[feature_ranking].tolist()
    selected_features = list(set(selected_features + additional_features[:10 - len(selected_features)]))

print("Top 10 features by Lasso:")
print(selected_features[:10])


Top 10 features by Lasso:
['APP_REGION', 'VAR_CLUSTER', 'MAILHHADULTS', 'CancerTrustFamily', 'CancerTrustCharities', 'HAVEDEVICE_CAT', 'UsedHealthWellnessApps2', 'SocMed_Visited', 'SocMed_WatchedVid', 'SocMed_MakeDecisions']


  model = cd_fast.enet_coordinate_descent(
