# Predicting Life Satisfaction with NHS Data: Which Lifestyle factors are most important for LS?

Website: https://www.cdc.gov/nchs/nhis/documentation/2023-nhis.html

Dataset: C:\Users\sacar\OneDrive\Documents\Projects\Predicting MH with NHS Data\adult23.csv

Description: 



In [57]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold

In [10]:
#Import data
df = pd.read_csv(r'C:\Users\sacar\OneDrive\Documents\Projects\Predicting MH with NHS Data\adult23.csv')

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29522 entries, 0 to 29521
Columns: 647 entries, URBRRL to POVRATTC_A
dtypes: float64(442), int64(204), object(1)
memory usage: 145.7+ MB


In [12]:
df.head(2)

Unnamed: 0,URBRRL,RATCAT_A,INCTCFLG_A,IMPINCFLG_A,LANGSPECR_A,LANGSOC_A,LANGDOC_A,LANGMED_A,LANGHM_A,PPSU,...,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,IMPNUM_A,WTFA_A,HHX,POVRATTC_A
0,3,4,0,0,,,,,,2,...,,,1,1,1,10,1,7371.139,H029691,1.01
1,4,8,0,0,,,,,,2,...,,,1,1,1,10,1,3146.794,H028812,2.49


In [13]:
df.describe()

Unnamed: 0,URBRRL,RATCAT_A,INCTCFLG_A,IMPINCFLG_A,LANGSPECR_A,LANGSOC_A,LANGDOC_A,LANGMED_A,LANGHM_A,PPSU,...,PHSTAT_A,PROXYREL_A,PROXY_A,AVAIL_A,HHSTAT_A,INTV_MON,RECTYPE,IMPNUM_A,WTFA_A,POVRATTC_A
count,29522.0,29522.0,29522.0,29522.0,4049.0,3973.0,4049.0,4049.0,22104.0,29522.0,...,29522.0,537.0,553.0,29522.0,29522.0,29522.0,29522.0,29522.0,29522.0,29522.0
mean,2.317119,9.666757,0.041664,0.373721,1.453445,1.480997,1.301309,1.392937,2.063563,31.375246,...,2.460369,1.286778,1.039783,1.205542,1.0,6.472089,10.0,1.0,8747.291918,4.10634
std,1.061522,4.048065,0.199823,0.712244,0.801351,0.790779,0.631127,0.73416,1.28779,29.253976,...,1.074983,0.668921,0.338166,1.087867,0.0,3.444791,0.0,0.0,5690.650182,2.961649
min,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,10.0,1.0,1792.441,0.0
25%,1.0,7.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,8.0,...,2.0,1.0,1.0,1.0,1.0,3.0,10.0,1.0,4643.53175,1.8
50%,2.0,10.0,0.0,0.0,1.0,1.0,1.0,1.0,2.0,24.0,...,2.0,1.0,1.0,1.0,1.0,7.0,10.0,1.0,7374.546,3.31
75%,3.0,14.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,48.0,...,3.0,1.0,1.0,1.0,1.0,9.0,10.0,1.0,10994.8095,5.65
max,4.0,14.0,1.0,2.0,9.0,9.0,9.0,9.0,9.0,153.0,...,9.0,4.0,8.0,8.0,1.0,12.0,10.0,1.0,39925.6,11.0


In [14]:
df_copy = df.copy()

# Pre-Processing

In [15]:
print(df_copy["LSATIS4_A"].info())  # Check column type and non-null values

print(df_copy["LSATIS4_A"].unique())  # Show unique values


<class 'pandas.core.series.Series'>
RangeIndex: 29522 entries, 0 to 29521
Series name: LSATIS4_A
Non-Null Count  Dtype
--------------  -----
29522 non-null  int64
dtypes: int64(1)
memory usage: 230.8 KB
None
[2 1 3 9 4 7]


# Missing Values - Dropping Columns and Rows

In [16]:
# Calculate missing value percentages
missing_percent = df_copy.isnull().sum() / len(df_copy) * 100

# Print summary
print(missing_percent.describe())

count    647.000000
mean      52.059685
std       42.190768
min        0.000000
25%        0.000000
50%       62.421245
75%       93.630174
max      100.000000
dtype: float64


In [17]:
# Check missing values for all columns and display the count and percentage
missing_values = df_copy.isnull().sum()
missing_percentage = (missing_values / len(df_copy)) * 100

# Count columns with missing values
missing_columns_count = (df_copy.isnull().sum() > 0).sum()
print(f"Number of columns with missing values: {missing_columns_count}")


# Filter only columns with missing values
missing_data = missing_values[missing_values > 0]

# Print missing values and percentages
print("Columns with Missing Values:")
for col in missing_data.index:
    print(f"{col}: {missing_data[col]} missing ({missing_percentage[col]:.2f}%)")


Number of columns with missing values: 440
Columns with Missing Values:
LANGSPECR_A: 25473 missing (86.28%)
LANGSOC_A: 25549 missing (86.54%)
LANGDOC_A: 25473 missing (86.28%)
LANGMED_A: 25473 missing (86.28%)
LANGHM_A: 7418 missing (25.13%)
SCHDYMSSTC_A: 27881 missing (94.44%)
AFNOW: 8099 missing (27.43%)
REPWRKDYTC_A: 28193 missing (95.50%)
YRSINUS_A: 24820 missing (84.07%)
PRTNREDUCP_A: 27652 missing (93.67%)
SPOUSEDUCP_A: 17305 missing (58.62%)
SASPPRACE_A: 15434 missing (52.28%)
SASPPHISP_A: 15434 missing (52.28%)
PRTNRAGETC_A: 27651 missing (93.66%)
SPOUSAGETC_A: 17305 missing (58.62%)
PRTNRWKFT_A: 28118 missing (95.24%)
PRTNRWRK_A: 27660 missing (93.69%)
SPOUSWKFT_A: 21989 missing (74.48%)
SPOUSWRK_A: 17356 missing (58.79%)
SPOUSESEX_A: 17306 missing (58.62%)
PRTNRSEX_A: 27651 missing (93.66%)
INJWRKDYTC_A: 27562 missing (93.36%)
NUMINJTC_A: 27562 missing (93.36%)
SHINGYEARP_A: 22275 missing (75.45%)
HHRESPSA_FLG: 8637 missing (29.26%)
EPINUMSEZP_A: 28954 missing (98.08%)
EMPDYS

In [18]:
# Recalculate missing percentages based on df_copy
missing_percent = (df_copy.isnull().sum() / len(df_copy)) * 100  

# Select only columns where missing percentage is less than 50%
df_copy = df_copy.loc[:, missing_percent < 50]

print(f"Remaining columns after dropping: {df_copy.shape[1]}")

Remaining columns after dropping: 294


In [19]:
# Check how many missing values remain
total_missing = df_copy.isnull().sum().sum()
print(f"Total missing values after dropping columns: {total_missing}")

Total missing values after dropping columns: 733072


In [20]:
# Calculate the threshold (50% of total columns)
valid_data_threshold = int(df_copy.shape[1] * 0.8)

# Count rows that have at least 50% valid (non-missing) data
rows_with_80_valid = (df_copy.notnull().sum(axis=1) >= valid_data_threshold).sum()

# Print results
print(f"Total rows: {df_copy.shape[0]}")
print(f"Rows with at least 80% valid data: {rows_with_80_valid}")
print(f"Percentage of rows with ≥80% valid data: {(rows_with_80_valid / df_copy.shape[0]) * 100:.2f}%")


Total rows: 29522
Rows with at least 80% valid data: 29225
Percentage of rows with ≥80% valid data: 98.99%


In [21]:
df_copy = df_copy.dropna(thresh=int(df_copy.shape[1] * 0.8))
print(f"New total rows after dropping low-validity rows: {df_copy.shape[0]}")


New total rows after dropping low-validity rows: 29225


In [22]:
print(f"Total missing values after row filtering: {df_copy.isnull().sum().sum()}")


Total missing values after row filtering: 713714


Drop single categorical column for now.

In [23]:
df_copy = df_copy.select_dtypes(exclude=['object'])

# Train/Test Split

In [24]:
# Separate target variable (Life Satisfaction)
y = df_copy["LSATIS4_A"]  # Target variable
X = df_copy.drop(columns=["LSATIS4_A"])  # Features (everything except target)

# Confirm separation
print(f"Feature Set (X): {X.shape}")
print(f"Target (y): {y.shape}")


Feature Set (X): (29225, 292)
Target (y): (29225,)


In [25]:
#Train/Test Split
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3, 
                                                  random_state = 0, 
                                                  stratify = y, 
                                                  shuffle = True)

print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")
print(f"y_train shape: {y_train.shape}, y_val shape: {y_val.shape}")

X_train shape: (20457, 292), X_val shape: (8768, 292)
y_train shape: (20457,), y_val shape: (8768,)


In [26]:
# Keep only valid target labels (1, 2, 3, 4)
valid_classes = [1, 2, 3, 4]
mask = y_train.isin(valid_classes)
X_train = X_train[mask]
y_train = y_train[mask]

mask_val = y_val.isin(valid_classes)
X_val = X_val[mask_val]
y_val = y_val[mask_val]

print(f"🔹 New training set size: {X_train.shape[0]}")
print(f"🔹 New validation set size: {X_val.shape[0]}")


🔹 New training set size: 20408
🔹 New validation set size: 8747


# Imputation

In [27]:
# Replace NaNs with median values

# Fill missing values in X_train using median
X_train.fillna(X_train.median(), inplace=True)

# Apply same imputation to X_val using X_train's median values
X_val.fillna(X_train.median(), inplace=True)

# Confirm no missing values remain
print(f"Missing values in X_train after imputation: {X_train.isnull().sum().sum()}")
print(f"Missing values in X_val after imputation: {X_val.isnull().sum().sum()}")


Missing values in X_train after imputation: 0
Missing values in X_val after imputation: 0


# Feature Selection

# Spearman Correlation
-Randomly select which feature to drop from two highly-correlated features.

In [28]:
# Compute correlation matrix
corr_matrix = X_train.corr(method='spearman').abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find columns with correlation > 0.9
high_corr_features = [column for column in upper.columns if any(upper[column] > 0.8)]

# Drop highly correlated features from both train and validation sets
X_train = X_train.drop(columns=high_corr_features)
X_val = X_val.drop(columns=high_corr_features)

print(f"Dropped {len(high_corr_features)} highly correlated features.")
print(f"New X_train shape: {X_train.shape}, New X_val shape: {X_val.shape}")


Dropped 32 highly correlated features.
New X_train shape: (20408, 260), New X_val shape: (8747, 260)


# Random Forest RFE

# RFE that selects the top 30 features

In [None]:

'''# Train RFE with Random Forest
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=0), n_features_to_select=30)
rfe_selector.fit(X_train, y_train)

# Store RFE feature rankings
rfe_ranking = pd.DataFrame({
    "Feature": X_train.columns,
    "RFE_Rank": rfe_selector.ranking_  # Lower rank = more important
})

# Sort by rank (lower is better)
rfe_ranking = rfe_ranking.sort_values(by="RFE_Rank", ascending=True)

# Store the top 30 RFE features
selected_features_rfe = rfe_ranking["Feature"].head(30).tolist()

# Print the top 30 RFE features
print("\n🔹 Top 30 Most Important Features (RFE):")
print(rfe_ranking.head(30))

print(f"\n✅ Selected {len(selected_features_rfe)} features using RFE.")



🔹 Top 30 Most Important Features (RFE):
           Feature  RFE_Rank
0           URBRRL         1
27      HEIGHTTC_A         1
47         EDUCP_A         1
51          REGION         1
52        INTV_QRT         1
55          AGEP_A         1
62     HOUYRSLIV_A         1
91    HRTESTLAST_A         1
96      AHEARST1_A         1
99      AVISEXAM_A         1
116     PAIFRQ3M_A         1
125     DISCRIM1_A         1
128        PHQ42_A         1
134      DEPFREQ_A         1
137      ANXFREQ_A         1
150    CVDVAC1M1_A         1
151  SHTCVD19NM1_A         1
154      SHTFLUM_A         1
188      DENPREV_A         1
254       PHSTAT_A         1
26    WEIGHTLBTC_A         1
22    EMDINDSTN2_A         1
129        PHQ41_A         1
259         WTFA_A         1
6           PSTRAT         1
21    EMDOCCUPN2_A         1
5             PPSU         1
13       MARSTAT_A         1
20     EMPWKHRS3_A         1
1         RATCAT_A         1

✅ Selected 30 features using RFE.


# RFE with rankings for all features

In [60]:
# Train RFE but keep rankings for ALL features
rfe_selector = RFE(estimator=RandomForestClassifier(n_estimators=100, random_state=0), n_features_to_select=1, step=1)
rfe_selector.fit(X_train, y_train)

# Get rankings for all features
rfe_ranking = pd.DataFrame({
    "Feature": X_train.columns,
    "RFE_Rank": rfe_selector.ranking_  # Lower rank = more important
})

# Sort features by ranking
rfe_ranking = rfe_ranking.sort_values(by="RFE_Rank", ascending=True)

# Print rankings (top 30)
print("\n🔹 Full RFE Feature Rankings:")
print(rfe_ranking.head(30))


KeyboardInterrupt: 

# RFECV to determine optimal number of features 

In [None]:
# 🔹 Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=0, class_weight="balanced")

# 🔹 Use RFECV to determine the optimal number of features
rfecv = RFECV(estimator=rf_classifier, step=1, cv=StratifiedKFold(5), scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train, y_train)

# 🔹 Get the optimal number of features
optimal_features = X_train.columns[rfecv.support_]  # Boolean mask of selected features
num_selected_features = sum(rfecv.support_)  # Count selected features

# 🔹 Print results
print(f"✅ Optimal number of features selected by RFECV: {num_selected_features}")
print(f"🔹 Selected Features: {list(optimal_features)}")

# 🔹 Filter dataset to keep only selected features
X_train_selected = X_train[optimal_features]
X_val_selected = X_val[optimal_features]


In [35]:
print(selected_features_rfe)

Index(['RATCAT_A', 'PPSU', 'PSTRAT', 'MARSTAT_A', 'EMDOCCUPN2_A',
       'EMDINDSTN2_A', 'WEIGHTLBTC_A', 'HEIGHTTC_A', 'EDUCP_A', 'AGEP_A',
       'AVISEXAM_A', 'DISCRIM2_A', 'PHQ42_A', 'DEPFREQ_A', 'ANXFREQ_A',
       'CVDVAC1M1_A', 'SHTCVD19NM1_A', 'SHTFLUM_A', 'PHSTAT_A', 'WTFA_A'],
      dtype='object')


# Lasso

# Stores top 30 most important lasso features.

In [47]:
# Standardize Features (LASSO is sensitive to scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 🔹 Train LASSO with Cross-Validation
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_train_scaled, y_train)

# Store LASSO feature coefficients
lasso_ranking = pd.DataFrame({
    "Feature": X_train.columns,
    "Lasso_Coeff": lasso.coef_
})

# Convert Lasso Coefficients to absolute values
lasso_ranking["Abs_Lasso_Coeff"] = np.abs(lasso_ranking["Lasso_Coeff"])

# Sort by absolute LASSO coefficient (higher is better)
lasso_ranking = lasso_ranking.sort_values(by="Abs_Lasso_Coeff", ascending=False)

# Store the top 30 LASSO features
selected_features_lasso = lasso_ranking["Feature"].head(30).tolist()

# 🔹 Print the top 30 LASSO features
print("\n🔹 Top 30 Most Important Features (LASSO):")
print(lasso_ranking.head(30))

print(f"\n✅ Selected {len(selected_features_lasso)} features using LASSO.")



🔹 Top 30 Most Important Features (LASSO):
          Feature  Lasso_Coeff  Abs_Lasso_Coeff
254      PHSTAT_A     0.151777         0.151777
134     DEPFREQ_A    -0.088054         0.088054
128       PHQ42_A     0.054759         0.054759
137     ANXFREQ_A    -0.046543         0.046543
13      MARSTAT_A     0.046137         0.046137
129       PHQ41_A     0.039336         0.039336
73     CEVOLUN1_A     0.032701         0.032701
32   PHQ2SCREEN_A    -0.032232         0.032232
1        RATCAT_A    -0.025629         0.025629
191    PAYWORRY_A    -0.024567         0.024567
30     PCNT18UPTC    -0.022252         0.022252
63   FDSBALANCE_A    -0.019823         0.019823
96     AHEARST1_A     0.018884         0.018884
54          SEX_A    -0.018648         0.018648
125    DISCRIM1_A    -0.017858         0.017858
19   EMPWRKLSW1_A    -0.017215         0.017215
209   SOCSCLPAR_A     0.016341         0.016341
59     TRANSPOR_A    -0.016050         0.016050
180    MEDDL12M_A    -0.015133         0.0151

# Lasso that ranks all important features.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import pandas as pd
import numpy as np

# 🔹 Standardize Features (LASSO is sensitive to scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 🔹 Train LASSO with Cross-Validation
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_train_scaled, y_train)

# 🔹 Store LASSO feature coefficients
lasso_ranking = pd.DataFrame({
    "Feature": X_train.columns,
    "Lasso_Coeff": lasso.coef_
})

# 🔹 Convert Lasso Coefficients to absolute values
lasso_ranking["Abs_Lasso_Coeff"] = np.abs(lasso_ranking["Lasso_Coeff"])

# 🔹 Sort ALL features by absolute LASSO coefficient (higher = better)
lasso_ranking = lasso_ranking.sort_values(by="Abs_Lasso_Coeff", ascending=False)

# 🔹 Store ALL important features (i.e., non-zero coefficients)
selected_features_lasso = lasso_ranking[lasso_ranking["Abs_Lasso_Coeff"] > 0]["Feature"].tolist()

# 🔹 Print the full feature importance ranking
print("\n🔹 Ranked Features by Importance (LASSO):")
print(lasso_ranking[["Feature", "Abs_Lasso_Coeff"]])

print(f"\n✅ Selected {len(selected_features_lasso)} important features using LASSO.")


# Unweighted combination of lasso and RFE

In [None]:
'''
# Combine RFE and LASSO features into a DataFrame
feature_ranking = pd.DataFrame({
    "Feature": X_train.columns
})

# 🔹 Assign scores based on selection:
feature_ranking["RFE_Score"] = feature_ranking["Feature"].apply(lambda x: 1 if x in selected_features_rfe else 0)
feature_ranking["Lasso_Score"] = feature_ranking["Feature"].apply(lambda x: 1 if x in selected_features_lasso else 0)

# 🔹 Compute final importance score (sum of both methods)
feature_ranking["Final_Score"] = feature_ranking["RFE_Score"] + feature_ranking["Lasso_Score"]

# Sort by final score (higher score = more important)
feature_ranking = feature_ranking.sort_values(by="Final_Score", ascending=False)

# Select the top 30 final features
selected_features_final = feature_ranking["Feature"].head(30).tolist()

# 🔹 Print the top 30 final selected features
print("\n🔹 Final Top 30 Features (Based on RFE & LASSO):")
print(feature_ranking.head(30))

print(f"\n✅ Final selection: {len(selected_features_final)} most important features.")

# 🔹 Filter dataset to keep only selected features
X_train_selected = X_train[selected_features_final]
X_val_selected = X_val[selected_features_final]



🔹 Final Top 30 Features (Based on RFE & LASSO):
           Feature  RFE_Score  Lasso_Score  Final_Score
1         RATCAT_A          1            1            2
62     HOUYRSLIV_A          1            1            2
128        PHQ42_A          1            1            2
129        PHQ41_A          1            1            2
96      AHEARST1_A          1            1            2
134      DEPFREQ_A          1            1            2
137      ANXFREQ_A          1            1            2
13       MARSTAT_A          1            1            2
125     DISCRIM1_A          1            1            2
188      DENPREV_A          1            1            2
254       PHSTAT_A          1            1            2
59      TRANSPOR_A          0            1            1
99      AVISEXAM_A          1            0            1
73      CEVOLUN1_A          0            1            1
77     EMPSICKLV_A          0            1            1
57      CEVOTELC_A          0            1            1

# Weighted combination of lasso and RFE
-Requires RFE to rank ALL features

In [None]:
# Create DataFrame to store feature rankings
feature_ranking = pd.DataFrame({"Feature": X_train.columns})

# 🔹 Assign scores based on rankings & importance:
feature_ranking["RFE_Rank"] = feature_ranking["Feature"].map(lambda x: rfe_ranking_dict.get(x, np.nan))  # Lower is better
feature_ranking["Lasso_Coeff"] = feature_ranking["Feature"].map(lambda x: lasso_coeff_dict.get(x, 0))  # Higher is better

# 🔹 Normalize scores for fair comparison
feature_ranking["Norm_RFE"] = (feature_ranking["RFE_Rank"].max() - feature_ranking["RFE_Rank"]) / (feature_ranking["RFE_Rank"].max() - feature_ranking["RFE_Rank"].min())  # Invert so higher = better
feature_ranking["Norm_Lasso"] = np.abs(feature_ranking["Lasso_Coeff"]) / np.abs(feature_ranking["Lasso_Coeff"]).max()  # Higher = better

# 🔹 Compute final weighted importance score
feature_ranking["Final_Score"] = (
    feature_ranking["Norm_RFE"] * 0.5 +  # RFE importance (50%)
    feature_ranking["Norm_Lasso"] * 0.5  # LASSO importance (50%)
)

# 🔹 Sort by final importance score
feature_ranking = feature_ranking.sort_values(by="Final_Score", ascending=False)

# 🔹 Select top 30 most important features
selected_features_final = feature_ranking["Feature"].head(30).tolist()

# 🔹 Print the final ranked top 30 features
print("\n🔹 Final Top 30 Features (Weighted Ranking):")
print(feature_ranking[["Feature", "Final_Score"]].head(30))

print(f"\n✅ Final selection: {len(selected_features_final)} most important features.")

# 🔹 Filter dataset to keep only selected features
X_train_selected = X_train[selected_features_final]
X_val_selected = X_val[selected_features_final]


# Apparently this is the final optimized RFE combined with lasso

In [59]:
#  Run RFECV to Select Optimal Features
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=0, class_weight="balanced")

rfecv = RFECV(estimator=rf_classifier, step=1, cv=StratifiedKFold(5), scoring='accuracy', n_jobs=-1)
rfecv.fit(X_train, y_train)

# Get RFECV-selected features
rfecv_features = set(X_train.columns[rfecv.support_])
num_rfecv_features = len(rfecv_features)
print(f"✅ RFECV selected {num_rfecv_features} features.")

# 2️⃣ Run LASSO to Identify Important Features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_train_scaled, y_train)

# Get LASSO-selected features (non-zero coefficients)
lasso_features = set(X_train.columns[lasso.coef_ != 0])
num_lasso_features = len(lasso_features)
print(f"✅ LASSO selected {num_lasso_features} features.")

# Combine RFECV + LASSO Features
final_features = rfecv_features.union(lasso_features)  # Merge both sets
num_final_features = len(final_features)

# Print the final feature selection results
print(f"\n🔹 Final feature set includes {num_final_features} features (RFECV + LASSO).")
print(f"📝 Features Selected: {list(final_features)}")

# Filter Dataset to Keep Only Selected Features
X_train_selected = X_train[list(final_features)]
X_val_selected = X_val[list(final_features)]


KeyboardInterrupt: 

# Modelling

# Linear Regression

In [36]:
model = LogisticRegression()

In [37]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# Calculate predictions for both training and validation sets
lr_train_preds = model.predict(X_train)
lr_val_preds = model.predict(X_val)

In [43]:
# Function to evaluate classification performance
def evaluate_classification(y_true, y_pred, dataset="Validation"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"🔹 {dataset} Set Metrics:")
    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall: {recall:.4f}")
    print(f"  - F1-score: {f1:.4f}\n")

# Evaluate on Training Set
evaluate_classification(y_train, lr_train_preds, dataset="Training")

# Evaluate on Validation Set
evaluate_classification(y_val, lr_val_preds, dataset="Validation")

🔹 Training Set Metrics:
  - Accuracy: 0.5154
  - Precision: 0.4513
  - Recall: 0.5154
  - F1-score: 0.3625

🔹 Validation Set Metrics:
  - Accuracy: 0.5140
  - Precision: 0.4348
  - Recall: 0.5140
  - F1-score: 0.3612



In [15]:
# Get feature importance from model coefficients (for Linear Regression)
feature_importance = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": model.coef_
})

# Sort by absolute coefficient value (most important first)
feature_importance["Abs_Coefficient"] = feature_importance["Coefficient"].abs()
feature_importance = feature_importance.sort_values(by="Abs_Coefficient", ascending=False)

# Display the top 20 most important features
print("Top 20 Most Important Features:")
print(feature_importance.head(20))


Top 20 Most Important Features:
          Feature  Coefficient  Abs_Coefficient
33         CHIP_A    -0.310883         0.310883
150    HIKIND07_A     0.275841         0.275841
31          IHS_A    -0.241593         0.241593
197      PHSTAT_A     0.146455         0.146455
152    HIKIND05_A     0.106269         0.106269
191       STREV_A    -0.088309         0.088309
24     PCNT18UPTC    -0.083287         0.083287
184     DEMENEV_A     0.079963         0.079963
107       PHQ42_A     0.076982         0.076982
113     DEPFREQ_A    -0.071777         0.071777
192        MIEV_A     0.067566         0.067566
134    MEDDL12M_A    -0.064623         0.064623
178      PSOREV_A    -0.059176         0.059176
148    HIKIND09_A     0.059033         0.059033
177       CFSEV_A     0.057343         0.057343
55     TRANSPOR_A    -0.056124         0.056124
42     PCNTADLT_A     0.055730         0.055730
40    MLTFAMFLG_A    -0.053319         0.053319
68    NATUSBORN_A     0.049873         0.049873
16   EMP

# Random Forest

In [54]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(
    n_estimators=200,  # More trees
    max_depth=10,  # Prevent overfitting
    min_samples_split=5,  # Require more samples to split
    min_samples_leaf=2,  # Require more samples per leaf
    class_weight='balanced',  # Adjust for imbalanced data
    random_state=0
)

# Train classifier
rf_classifier.fit(X_train_selected, y_train)

# Make predictions
rf_train_preds = rf_classifier.predict(X_train_selected)
rf_val_preds = rf_classifier.predict(X_val_selected)


In [55]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate classification performance
def evaluate_classification(y_true, y_pred, dataset="Validation"):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"🔹 {dataset} Set Metrics:")
    print(f"  - Accuracy: {accuracy:.4f}")
    print(f"  - Precision: {precision:.4f}")
    print(f"  - Recall: {recall:.4f}")
    print(f"  - F1-score: {f1:.4f}\n")

# Evaluate on Training Set
evaluate_classification(y_train, rf_train_preds, dataset="Training")

# Evaluate on Validation Set
evaluate_classification(y_val, rf_val_preds, dataset="Validation")


🔹 Training Set Metrics:
  - Accuracy: 0.7098
  - Precision: 0.7203
  - Recall: 0.7098
  - F1-score: 0.7102

🔹 Validation Set Metrics:
  - Accuracy: 0.6377
  - Precision: 0.6478
  - Recall: 0.6377
  - F1-score: 0.6399



Current High-Score: F1 = .6399 using Unweighted selection of top 30 RFE + Lasso

Tried:
All features
Top 30 RFE + Lasso Unweighted