In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
# Load the dataset
df = pd.read_csv('NSCH_2021E_TOPICAL.csv')

# Specify covariate and dependent variable columns
covariates = ['K10Q11', 'K10Q40_R', 'SC_SEX', 'SC_RACE_R', 'TENURE', 'SC_AGE_YEARS']
dependents = ['OVERWEIGHT', 'PHYSACTIV', 'K4Q22_R']

# Convert the covariate columns to strings to ensure they are treated as categorical
df[covariates] = df[covariates].astype(str)

# Apply one-hot encoding to covariates
df_covariates = pd.get_dummies(df[covariates], drop_first=False)

# Extract the dependent variable
y = df['OVERWEIGHT']

  df = pd.read_csv('NSCH_2021E_TOPICAL.csv')


In [47]:
# Standardize the covariates
scaler = StandardScaler()
X = scaler.fit_transform(df_covariates)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



In [48]:
# Initialize the logistic regression model with Ridge regularization (L2 penalty)
log_reg_ridge = LogisticRegression(penalty='l2', C=1.0, max_iter=1000)

# Fit the model on the training data
log_reg_ridge.fit(X_train, y_train)

In [49]:
# Make predictions on the test data
y_pred = log_reg_ridge.predict(X_test)
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9355514802200681


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00       906
           2       0.94      1.00      0.97     14284
           M       0.00      0.00      0.00        78

    accuracy                           0.94     15268
   macro avg       0.31      0.33      0.32     15268
weighted avg       0.88      0.94      0.90     15268

Confusion Matrix:
 [[    0   906     0]
 [    0 14284     0]
 [    0    78     0]]


  _warn_prf(average, modifier, msg_start, len(result))


In [50]:
# Extract the coefficients of the model
coefficients = log_reg_ridge.coef_[0]

# Create a DataFrame to hold the feature names and their corresponding coefficients
coefficients_df = pd.DataFrame({
    'Feature': df_covariates.columns,
    'Coefficient': coefficients
})

# Calculate the odds ratios
coefficients_df['Odds Ratio'] = np.exp(coefficients_df['Coefficient'])

# Print the coefficients and odds ratios
print("\nCoefficients and Odds Ratios:\n", coefficients_df)



Coefficients and Odds Ratios:
             Feature  Coefficient  Odds Ratio
0          K10Q11_1    -0.002016    0.997986
1          K10Q11_2     0.036307    1.036974
2          K10Q11_M    -0.091999    0.912106
3        K10Q40_R_1    -0.031942    0.968562
4        K10Q40_R_2     0.038866    1.039631
5        K10Q40_R_3     0.041763    1.042647
6        K10Q40_R_4     0.013930    1.014027
7        K10Q40_R_M    -0.058179    0.943481
8          SC_SEX_1     0.019274    1.019461
9          SC_SEX_2    -0.019274    0.980911
10      SC_RACE_R_1    -0.012955    0.987129
11      SC_RACE_R_2     0.024773    1.025083
12      SC_RACE_R_3     0.028888    1.029309
13      SC_RACE_R_4    -0.046800    0.954278
14      SC_RACE_R_5     0.001974    1.001976
15      SC_RACE_R_7     0.025475    1.025803
16         TENURE_1    -0.036539    0.964120
17         TENURE_2     0.016481    1.016618
18         TENURE_3     0.026010    1.026352
19         TENURE_4     0.017727    1.017885
20   SC_AGE_YEARS_0    

In [51]:
print("95% CIs:", pd.DataFrame({'Feature': df_covariates.columns, '95% CI Lower': np.exp(coefficients - 1.96 * np.std(coefficients)), '95% CI Upper': np.exp(coefficients + 1.96 * np.std(coefficients))}))


95% CIs:             Feature  95% CI Lower  95% CI Upper
0          K10Q11_1      0.837806      1.188790
1          K10Q11_2      0.870537      1.235233
2          K10Q11_M      0.765710      1.086490
3        K10Q40_R_1      0.813105      1.153741
4        K10Q40_R_2      0.872767      1.238397
5        K10Q40_R_3      0.875299      1.241990
6        K10Q40_R_4      0.851273      1.207899
7        K10Q40_R_M      0.792049      1.123864
8          SC_SEX_1      0.855835      1.214371
9          SC_SEX_2      0.823472      1.168450
10      SC_RACE_R_1      0.828692      1.175857
11      SC_RACE_R_2      0.860554      1.221068
12      SC_RACE_R_3      0.864102      1.226102
13      SC_RACE_R_4      0.801114      1.136726
14      SC_RACE_R_5      0.841156      1.193543
15      SC_RACE_R_7      0.861158      1.221925
16         TENURE_1      0.809376      1.148449
17         TENURE_2      0.853448      1.210984
18         TENURE_3      0.861619      1.222579
19         TENURE_4      0.8545