In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# === 1. Load the dataset ===
df = pd.read_csv("C:/Dell/All Documents/Vợ iu/University/DSTI/Course 1 - Machine Learning with Python Labs/Project 2/Patient_Characteristics_Survey__PCS___2019_Cleaned.csv")

# === 2. Define column groupings ===
output_columns = ["Principal Diagnosis Class", "Additional Diagnosis Class"]

demographic_cols = [
    "Survey Year", "Region Served", "Age Group", "Sex", "Transgender", "Sexual Orientation",
    "Hispanic Ethnicity", "Race", "Preferred Language", "Religious Preference", 
    "Three Digit Residence Zip Code"
]

social_cols = [
    "Living Situation", "Household Composition", "Veteran Status",
    "Employment Status", "Number Of Hours Worked Each Week", "Education Status", 
    "Special Education Services", "Criminal Justice Status"
]

clinic_cols = [
    "Mental Illness", "Intellectual Disability", "Autism Spectrum", "Other Developmental Disability",
    "Alcohol Related Disorder", "Drug Substance Disorder", "Opioid Related Disorder",
    "Mobility Impairment Disorder", "Hearing Impairment", "Visual Impairment", "Speech Impairment",
    "Hyperlipidemia", "High Blood Pressure", "Diabetes", "Obesity", "Heart Attack", "Stroke",
    "Other Cardiac", "Pulmonary Asthma", "Alzheimer or Dementia", "Kidney Disease", "Liver Disease",
    "Endocrine Condition", "Neurological Condition", "Traumatic Brain Injury", "Joint Disease", 
    "Cancer", "Other Chronic Med Condition", "No Chronic Med Condition", "Unknown Chronic Med Condition",
    "Cannabis Recreational Use", "Cannabis Medicinal Use", "Smokes", "Received Smoking Medication",
    "Received Smoking Counseling", "Serious Mental Illness", "Alcohol 12m Service", "Opioid 12m Service",
    "Drug/Substance 12m Service", "Program Category"
]

insurance_cols = [
    "SSI Cash Assistance", "SSDI Cash Assistance", "Veterans Disability Benefits", 
    "Veterans Cash Assistance", "Public Assistance Cash Program", "Other Cash Benefits",
    "Medicaid and Medicare Insurance", "No Insurance", "Unknown Insurance Coverage", 
    "Medicaid Insurance", "Medicaid Managed Insurance", "Medicare Insurance", 
    "Private Insurance", "Child Health Plus Insurance", "Other Insurance"
]

# Combine all input columns
input_columns = demographic_cols + social_cols + clinic_cols + insurance_cols

# === 3. Encode categorical variables ===
df_encoded = df[input_columns].copy()
label_encoders = {}

for col in df_encoded.columns:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col].astype(str))
    label_encoders[col] = le

# === 4. Correlation analysis (on a sample for performance) ===
sample = df_encoded.sample(n=190000, random_state=42)
correlation_matrix = sample.corr().abs()

# === 5. Identify redundant columns (correlation > 0.9) ===
redundant_columns = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if correlation_matrix.iloc[i, j] > 0.9:
            colname = correlation_matrix.columns[i]
            redundant_columns.add(colname)

# === 6. Keep only non-redundant representative columns ===
non_redundant_columns = [col for col in input_columns if col not in redundant_columns]

# === 7. Select top ~20 based on correlation to output (optional logic here) ===
# For simplicity, select first 20 non-redundant ones
final_input_columns = non_redundant_columns[:20]

# Combine with output columns for export
final_columns = final_input_columns + output_columns

# === 8. Export the reduced dataset ===
df_reduced = df[final_columns]
df_reduced.to_csv("reduced_patient_dataset.csv", index=False)

# === 9. Summary ===
print("Initial input columns:", len(input_columns))
print("Redundant columns removed:", len(redundant_columns))
print("Final selected input columns (~20):")
for col in final_input_columns:
    print(" -", col)
print("\nReduced dataset saved to 'reduced_patient_dataset.csv'")


Initial input columns: 74
Redundant columns removed: 1
Final selected input columns (~20):
 - Survey Year
 - Region Served
 - Age Group
 - Sex
 - Transgender
 - Sexual Orientation
 - Hispanic Ethnicity
 - Race
 - Preferred Language
 - Religious Preference
 - Three Digit Residence Zip Code
 - Living Situation
 - Household Composition
 - Veteran Status
 - Employment Status
 - Number Of Hours Worked Each Week
 - Education Status
 - Special Education Services
 - Criminal Justice Status
 - Mental Illness

Reduced dataset saved to 'reduced_patient_dataset.csv'


In [5]:
# === Export reduced dataset to CSV ===
df_reduced.to_csv("C:/Dell/All Documents/Vợ iu/University/DSTI/Course 1 - Machine Learning with Python Labs/Project 2/reduced_patient_dataset.csv", index=False)

print("✅ Reduced dataset exported successfully to:")
print("C:/Dell/All Documents/Vợ iu/University/DSTI/Course 1 - Machine Learning with Python Labs/Project 2/reduced_patient_dataset.csv")


✅ Reduced dataset exported successfully to:
C:/Dell/All Documents/Vợ iu/University/DSTI/Course 1 - Machine Learning with Python Labs/Project 2/reduced_patient_dataset.csv


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib

# === 1. Load reduced dataset ===
df = pd.read_csv("C:/Dell/All Documents/Vợ iu/University/DSTI/Course 1 - Machine Learning with Python Labs/Project 2/reduced_patient_dataset.csv")

# === 2. Select inputs and output ===
output_col = "Principal Diagnosis Class"  # You can change this to "Additional Diagnosis Class"
X = df.drop(columns=["Principal Diagnosis Class", "Additional Diagnosis Class"])
y = df[output_col]

# === 3. Encode target if it's categorical ===
if y.dtype == 'object':
    le_target = LabelEncoder()
    y = le_target.fit_transform(y)
    joblib.dump(le_target, "label_encoder_target.pkl")  # Save encoder for later decoding

# === 4. Encode categorical inputs if needed ===
X_encoded = X.copy()
label_encoders = {}
for col in X_encoded.columns:
    if X_encoded[col].dtype == "object":
        le = LabelEncoder()
        X_encoded[col] = le.fit_transform(X_encoded[col].astype(str))
        label_encoders[col] = le

# Save input encoders
joblib.dump(label_encoders, "input_label_encoders.pkl")

# === 5. Split into training and testing sets ===
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# === 6. Train a Random Forest classifier ===
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# === 7. Make predictions and evaluate ===
y_pred = model.predict(X_test)
print("=== Classification Report ===")
print(classification_report(y_test, y_pred))

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

# === 8. Save the model ===
joblib.dump(model, "mental_illness_predictor_model.pkl")
print("✅ Model saved as 'mental_illness_predictor_model.pkl'")


=== Classification Report ===
              precision    recall  f1-score   support

           0       0.93      0.98      0.96     35613
           1       0.04      0.01      0.02       307
           2       0.12      0.02      0.03       716
           3       0.19      0.04      0.06       336
           4       0.27      0.08      0.12       708
           5       0.67      0.41      0.51      1541

    accuracy                           0.91     39221
   macro avg       0.37      0.26      0.28     39221
weighted avg       0.88      0.91      0.89     39221

=== Confusion Matrix ===
[[35066    65    90    47   124   221]
 [  288     3     1     2     3    10]
 [  683     0    14     1     1    17]
 [  286     1     2    13     6    28]
 [  609     0     4     1    56    38]
 [  884     2     2     4    17   632]]
✅ Model saved as 'mental_illness_predictor_model.pkl'
