In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset
def load_data(data_file='drive/MyDrive/Colab Notebooks/HINTS/hints6_public.xlsx'):
  """Mount Google Drive (when running in Colab) and read an Excel file.

  Args:
    data_file: Path of the Excel file to read. The default is a relative
      path, so it resolves under the '/content/drive' mount only when the
      working directory is '/content'.

  Returns:
    The pandas DataFrame produced by ``pd.read_excel(data_file)``.
  """
  try:
    from google.colab import drive
  except ImportError:
    # Not running in Colab: there is nothing to mount, so read `data_file`
    # exactly as given.
    drive = None

  if drive is not None:
    drive.mount('/content/drive')

  data = pd.read_excel(data_file)
  # Same text as before: total cell count followed by (rows, columns).
  print(f'Data Size: {data.size} Data Shape: {data.shape}')
  return data

In [None]:
# Read the raw dataset once; `orig_data` is kept as the untouched reference.
orig_data = load_data()
# Work on an independent (deep) copy so later edits never alter `orig_data`.
data = orig_data.copy(deep=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data Size: 2982204 Data Shape: (6252, 477)


In [None]:
# Start again from a fresh copy of the raw data.
data = orig_data.copy(deep=True)

# Step 1: Data Preparation
# Target variable: MedConditions_HeartCondition (1 = Yes, 2 = No, recoded to binary 1 and 0)
target_recode = {1: 1, 2: 0}
data['MedConditions_HeartCondition'] = data['MedConditions_HeartCondition'].replace(target_recode)

# Candidate predictor columns, grouped so different sets can be combined.
predictors_base = [
  'Age',
  'BirthGender',
  'MedConditions_HighBP',
  'MedConditions_Diabetes',
  'MedConditions_LungDisease',
  'PHQ4',
  'AvgDrinksPerWeek',
  'WeeklyMinutesModerateExercise',
  'EverHadCancer',
  'AverageTimeSitting',
  'FreqGoProvider',
]

predictors_more1 = [
  'TimesModerateExercise',
  'DrinksPerDay',
  'Smoke100',
  'SmokeNow',
]

predictors_more2 = [
  'TimesModerateExercise',
  'MedConditions_Depression',
  'AverageSleepNight',
  'BMI',
  'smokeStat',
  'Deaf',
]

# Active set: base columns plus the second extension.
# (The alternative, base + predictors_more1, is currently not used.)
predictors = [*predictors_base, *predictors_more2]

In [None]:
# Recode selected survey columns to small integer codes.
# Each mapping is applied to the raw column in `orig_data`, not to the column
# already stored in `data`, so re-running this cell always gives the same
# result. This matters for SmokeNow: its mapping swaps codes 1 and 2, so
# applying it twice to the same column would silently undo the recoding.
recode_maps = {
  'SmokeNow': {3: 0, 1: 2, 2: 1},
  'smokeStat': {'never': 0, 'former': 1, 'current': 2},
  'Smoke100': {2: 0, 1: 1},
  'EverHadCancer': {2: 0, 1: 1},
  'MedConditions_LungDisease': {2: 0, 1: 1},
  'MedConditions_Diabetes': {2: 0, 1: 1},
  'MedConditions_HighBP': {2: 0, 1: 1},
}
for column, mapping in recode_maps.items():
  # Column assignment aligns on the index, so this also works when `data`
  # already has fewer rows than `orig_data` (e.g. after a previous dropna).
  data[column] = orig_data[column].replace(mapping)

# Drop rows with missing values in relevant columns.
# Only NaN is removed here; rows holding negative values are filtered in a
# later cell.
data = data.dropna(subset=['MedConditions_HeartCondition'] + predictors)

In [None]:
# Display the (rows, columns) of `data` at this point in the notebook.
data.shape

(6252, 477)

In [None]:
# Plain alias, not a copy: `data_cleaned` refers to the same DataFrame object as `data`.
data_cleaned = data

In [None]:
# Keep only rows where the target and every predictor are >= 0.
# (NaN compares as False, so rows with NaN in these columns are removed too.)
required_columns = ['MedConditions_HeartCondition'] + predictors
non_negative_rows = data[required_columns].ge(0).all(axis=1)
data_cleaned = data[non_negative_rows]

In [None]:
# Display the (rows, columns) of `data_cleaned` at this point in the notebook.
data_cleaned.shape

(6252, 477)

In [None]:
# Rank predictors by random-forest feature importance and plot the top 10.
# RandomForestClassifier and seaborn (sns) come from the notebook's import cell.
# NOTE: this cell reads X, X_resampled and y_resampled, which are created by
# the preprocessing cell further down the notebook; run that cell first.
model = RandomForestClassifier(random_state=42)
model.fit(X_resampled, y_resampled)

# Get feature importance (one score per predictor column, labelled by name)
feature_importance = pd.Series(model.feature_importances_, index=X.columns)
top_features = feature_importance.nlargest(10)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_features.values, y=top_features.index, ax=ax)
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Features")
ax.set_title("Top 10 Important Features")
plt.show()


NameError: name 'RandomForestClassifier' is not defined

In [None]:
# Restrict the cleaned frame to the predictor columns plus the outcome column.
# `filter(items=...)` keeps only the listed columns that exist and ignores the rest.
target = ['MedConditions_HeartCondition']
columns_filter = [*predictors, *target]
data_cleaned = data_cleaned.filter(items=columns_filter, axis='columns')

In [None]:
def _is_blank(value):
  """Return True for None or for a value whose string form is empty/whitespace-only."""
  return value is None or str(value).strip() == ''

# Flag every row that contains at least one blank cell, then keep the others.
rows_with_blank = data_cleaned.apply(lambda row: any(_is_blank(cell) for cell in row), axis=1)
data_cleaned = data_cleaned[~rows_with_blank]
data_cleaned.shape

(4892, 18)

In [None]:
# Guard: the steps below rely on DataFrame methods.
if not isinstance(data_cleaned, pd.DataFrame):
  raise TypeError("Input must be a pandas DataFrame.")

# Only numeric columns can be compared against zero.
numeric_cols = data_cleaned.select_dtypes(include=[np.number])

# Index labels of rows with a negative value in any numeric column.
has_negative = (numeric_cols < 0).any(axis=1)
rows_to_drop = data_cleaned.index[has_negative]
data_cleaned = data_cleaned.drop(index=rows_to_drop)

data_cleaned.shape

(4892, 18)

In [None]:
# Define the target and predictors
X = data_cleaned[predictors]
y = data_cleaned['MedConditions_HeartCondition']

# Step 2: Filter out rows where y has values other than 0 or 1
valid_indices = y.isin([0, 1])
X = X[valid_indices]
y = y[valid_indices]

# Check for unique values in the target
print(f"Unique values in the target variable after filtering: {y.unique()}")

# Step 3: Train-test split on the real observations, BEFORE scaling and
# oversampling. Splitting afterwards leaks information: the scaler would see
# test rows, SMOTE would build synthetic training points from test rows, and
# the test set itself would contain synthetic samples.
# `stratify=y` keeps the class ratio the same in both parts.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
  X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 4: Standardize predictors; statistics are learned from the training part only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled version of the full predictor matrix, kept under its original name.
X_scaled = scaler.transform(X)

# Step 5: Handle class imbalance using SMOTE on the training part only
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train_scaled, y_train_raw)

# Models train on the balanced training data; X_test / y_test hold only real,
# untouched observations.
X_train, y_train = X_resampled, y_resampled

Unique values in the target variable after filtering: [1 0]


In [None]:
# LASSO regression: fit with 5-fold cross-validation and keep the predictors
# whose coefficient is not exactly zero.

from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=5)
lasso.fit(X_resampled, y_resampled)

nonzero_coef = lasso.coef_ != 0
selected_features = X.columns[nonzero_coef]
print(f"Selected Features: {selected_features}")



Selected Features: Index(['Age', 'BirthGender', 'MedConditions_HighBP', 'MedConditions_Diabetes',
       'MedConditions_LungDisease', 'PHQ4', 'AvgDrinksPerWeek',
       'WeeklyMinutesModerateExercise', 'EverHadCancer', 'AverageTimeSitting',
       'FreqGoProvider', 'TimesModerateExercise', 'MedConditions_Depression',
       'AverageSleepNight', 'BMI', 'smokeStat', 'Deaf'],
      dtype='object')
