In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv("student_mental_health_dataset.csv")

# Separate features and target
X = df.drop("MentalHealthIssue", axis=1)
y = df["MentalHealthIssue"]

# Identify categorical and numeric columns
categorical_cols = ['Gender', 'Race', 'Year', 'DisabilityStatus', 'FirstGen', 'Sexuality']
numeric_cols = ['KesslerScore', 'PHQScore', 'PC_PTSDScore']

# Numeric columns: fill gaps with the column mean, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Split the raw rows FIRST so the held-out rows never influence the imputation
# values, scaling statistics, or one-hot vocabulary (avoids train/test leakage).
# The same random_state on the same number of rows selects the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn preprocessing parameters from the training rows only, then reuse them
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so code that expects the fully transformed matrix still finds this name
X_preprocessed = preprocessor.transform(X)

# Shapes for sanity check
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (400, 18)
X_test shape: (100, 18)


In [None]:
# 1. Setup & Libraries

!pip install pandas scikit-learn tensorflow joblib --quiet

import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib


# 2. Load & Clean Data

df = pd.read_csv("student_mental_health_dataset.csv")

# Seeded generator: the synthetic fallback columns below are identical on every run
rng = np.random.default_rng(42)

# Optional fallback creation for testing.
# NOTE(review): any column created from `rng` here is random noise, not real data;
# a model trained on it learns nothing meaningful from that feature.
if "Age" not in df.columns:
    df["Age"] = rng.integers(17, 30, size=len(df))
if "PHQScore" in df.columns:
    df["DepressionScore"] = df["PHQScore"]
if "AnxietyScore" not in df.columns:
    df["AnxietyScore"] = rng.uniform(5, 15, size=len(df))
if "PC_PTSDScore" not in df.columns:
    df["TraumaScore"] = rng.integers(0, 5, size=len(df))
else:
    df["TraumaScore"] = df["PC_PTSDScore"]

# Required columns ("Gender" already has the expected name, so only one rename is needed)
df = df.rename(columns={
    "DisabilityStatus": "Disability",
})

required = ["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore", "MentalHealthIssue"]
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

X = df[["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore"]]
y = df["MentalHealthIssue"]

# 3. Preprocessing

cat_cols = ["Gender", "Disability"]
num_cols = ["Age", "DepressionScore", "AnxietyScore", "TraumaScore"]

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("cat", cat_transformer, cat_cols),
    ("num", num_transformer, num_cols)
])

# Split the raw rows before fitting so test rows do not leak into the
# imputation/scaling statistics; same random_state -> same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so code that expects the fully transformed matrix still finds this name
X_processed = preprocessor.transform(X)

# Save preprocessor (the file holds the whole ColumnTransformer, not only a scaler)
joblib.dump(preprocessor, "scaler.pkl")


# 4. Neural Network Model

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# The test split is passed as validation data for monitoring only; no callback
# or model selection in this cell depends on it.
model.fit(X_train, y_train, epochs=30, batch_size=16, validation_data=(X_test, y_test), verbose=0)


# 5. Save Model

model.save("mental_health_risk_model.h5")
print("✅ Trained & saved model and scaler with 6 features.")




✅ Trained & saved model and scaler with 6 features.


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic-regression baseline on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
log_reg_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows
y_pred = log_reg_model.predict(X_test)

# Report overall accuracy, per-class metrics, and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%\n")
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 71.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.43      0.51        35
           1       0.74      0.86      0.79        65

    accuracy                           0.71       100
   macro avg       0.68      0.65      0.65       100
weighted avg       0.70      0.71      0.69       100

Confusion Matrix:
[[15 20]
 [ 9 56]]


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix

# Build the model.
# The input width is declared with an explicit Input layer instead of passing
# `input_shape=` to the first Dense layer, which Keras warns about inside Sequential.
model = Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Setup early stopping to avoid overfitting: stop after 5 epochs without a
# val_loss improvement and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model; validation_split carves the validation rows out of the
# training data, so the test split stays untouched until evaluation.
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=16,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate on test data
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {accuracy * 100:.2f}%")

# Predict and report: sigmoid outputs above 0.5 are labelled 1
y_pred_nn = (model.predict(X_test) > 0.5).astype("int32")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_nn))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_nn))


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 27ms/step - accuracy: 0.5061 - loss: 0.7178 - val_accuracy: 0.6250 - val_loss: 0.6212
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6770 - loss: 0.6112 - val_accuracy: 0.6625 - val_loss: 0.5798
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6713 - loss: 0.5868 - val_accuracy: 0.6625 - val_loss: 0.5631
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7267 - loss: 0.5425 - val_accuracy: 0.6750 - val_loss: 0.5570
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7856 - loss: 0.4796 - val_accuracy: 0.7125 - val_loss: 0.5507
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7440 - loss: 0.5243 - val_accuracy: 0.7375 - val_loss: 0.5562
Epoch 7/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━

In [None]:
# Persist both trained classifiers so they can be reloaded later
import joblib

# scikit-learn estimator -> joblib pickle
joblib.dump(value=log_reg_model, filename='logistic_model.pkl')

# Keras network -> HDF5 file
model.save(filepath='neural_net_model.h5')




In [None]:
def predict_mental_health(input_data, model_type='logistic'):
    """Predict class labels with one of the models saved to disk.

    Parameters
    ----------
    input_data : array-like
        Feature rows passed straight to the loaded model's ``predict``.
        Presumably these must already be preprocessed the same way as the
        training matrix the saved models were fit on -- verify against caller.
    model_type : {'logistic', 'neural'}, default 'logistic'
        'logistic' loads ``logistic_model.pkl``; 'neural' loads
        ``neural_net_model.h5``.

    Returns
    -------
    For 'logistic', whatever the loaded estimator's ``predict`` returns.
    For 'neural', the network outputs thresholded at 0.5 and cast to int32
    (same shape as the network output).

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names (previously the
        function fell through and returned None without any signal).
    """
    supported = ('logistic', 'neural')
    if model_type not in supported:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported}"
        )

    if model_type == 'logistic':
        # Imported here so the function does not depend on another cell's import
        import joblib
        model = joblib.load('logistic_model.pkl')
        return model.predict(input_data)

    # model_type == 'neural'
    from tensorflow.keras.models import load_model
    model = load_model('neural_net_model.h5')
    prediction = model.predict(input_data)
    return (prediction > 0.5).astype("int32")


In [None]:
# Install if not available
# !pip install xgboost

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train an XGBoost classifier.
# `use_label_encoder` is intentionally not passed: the installed XGBoost reports
# it as an unused parameter, so it only produced a warning.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate
print(f"XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_xgb))

# Save the model
import joblib
joblib.dump(xgb_model, "xgboost_model.pkl")


XGBoost Accuracy: 82.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.69      0.73        35
           1       0.84      0.89      0.87        65

    accuracy                           0.82       100
   macro avg       0.81      0.79      0.80       100
weighted avg       0.82      0.82      0.82       100


Confusion Matrix:
[[24 11]
 [ 7 58]]


Parameters: { "use_label_encoder" } are not used.



['xgboost_model.pkl']

In [None]:
from xgboost import XGBClassifier

# Smaller, more regularized XGBoost configuration: shallow trees, a low learning
# rate, and row/column subsampling of 80% per tree.
# `use_label_encoder` is not passed; the installed XGBoost reports it as unused.
classifier = XGBClassifier(
    eval_metric='logloss',
    max_depth=3,
    n_estimators=50,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)


In [None]:
# Re-import necessary libraries if needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import xgboost as xgb


# Define the correct categorical and numerical columns based on the current X
cat_cols = ["Gender", "Disability"]
num_cols = ["Age", "DepressionScore", "AnxietyScore", "TraumaScore"]

# Redefine the preprocessor with the correct columns
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("cat", cat_transformer, cat_cols),
    ("num", num_transformer, num_cols)
])

# Define the classifier (using XGBoost as an example based on subsequent code).
# `use_label_encoder` is not passed (the installed XGBoost reports it as unused);
# random_state is pinned to match the other XGBoost cells in this notebook.
classifier = xgb.XGBClassifier(
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)

# Create the pipeline including preprocessing and the classifier.
# Because preprocessing lives inside the pipeline, each CV fold fits the
# imputers/scaler/encoder on that fold's training rows only.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])

# Perform cross-validation using the newly defined pipeline
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation scores: [0.82 0.79 0.76 0.82 0.79]
Mean CV accuracy: 0.796


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [None]:

!pip install pandas scikit-learn xgboost matplotlib --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import xgboost as xgb
import matplotlib.pyplot as plt

df = pd.read_csv("student_mental_health_dataset.csv")

# Rename columns to match pipeline expectations
df = df.rename(columns={
    "DisabilityStatus": "Disability",
    "PHQScore": "DepressionScore",
    "PC_PTSDScore": "TraumaScore"
})

# Seeded generator: the synthetic fallback columns below are identical on every run
rng = np.random.default_rng(42)

# Add missing columns if needed (for testing only).
# NOTE(review): columns created here are random noise, not real measurements.
if "AnxietyScore" not in df.columns:
    df["AnxietyScore"] = rng.uniform(5, 15, size=len(df))
if "Age" not in df.columns:
    df["Age"] = rng.integers(17, 30, size=len(df))

# Check required columns
required = ["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore", "MentalHealthIssue"]
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"❌ Missing required columns: {missing}")

# Define features and target
X = df[["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore"]]
y = df["MentalHealthIssue"]


cat_cols = ["Gender", "Disability"]
num_cols = ["Age", "DepressionScore", "AnxietyScore", "TraumaScore"]

cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

preprocessor = ColumnTransformer([
    ("cat", cat_transformer, cat_cols),
    ("num", num_transformer, num_cols)
])

# ========================
# 5. XGBoost Classifier
# ========================
# `use_label_encoder` is not passed; the installed XGBoost reports it as unused.
classifier = xgb.XGBClassifier(
    eval_metric='logloss',
    n_estimators=100,
    learning_rate=0.1,
    random_state=42
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
])


# 6. Cross-validation

cv_scores = cross_val_score(pipeline, X, y, cv=5)
print("✅ Cross-validation scores:", cv_scores)
print("✅ Mean CV accuracy:", np.mean(cv_scores))

# 7. Hold-out evaluation, then fit final model

# Scoring a model on the rows it was trained on overstates its accuracy, so the
# report below is computed on rows the pipeline did not see while fitting.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
pipeline.fit(X_fit, y_fit)
y_pred = pipeline.predict(X_holdout)

print("\n✅ Accuracy:", accuracy_score(y_holdout, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_holdout, y_pred))
print("\n🧮 Confusion Matrix:\n", confusion_matrix(y_holdout, y_pred))

# Final model: refit on every row, as the original cell did
pipeline.fit(X, y)

# 8. Feature Importance Plot

# Get feature names in the order the ColumnTransformer emits them:
# one-hot categorical columns first ("cat"), then the numeric columns ("num").
encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"].named_steps["encoder"]
encoded_cat_names = encoder.get_feature_names_out(cat_cols).tolist()
feature_names = encoded_cat_names + num_cols

# Get importances
xgb_model = pipeline.named_steps["classifier"]
importances = xgb_model.feature_importances_
assert len(feature_names) == len(importances), "feature names do not line up with importances"

# Horizontal bars sorted so the most important feature is at the top
order = np.argsort(importances)
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(np.array(feature_names)[order], importances[order])
ax.set(title="XGBoost feature importance", xlabel="Importance", ylabel="Feature")
fig.tight_layout()
plt.show()



Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



✅ Cross-validation scores: [0.85 0.78 0.78 0.84 0.83]
✅ Mean CV accuracy: 0.8160000000000001

✅ Accuracy: 0.986

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       185
           1       0.99      0.98      0.99       315

    accuracy                           0.99       500
   macro avg       0.98      0.99      0.99       500
weighted avg       0.99      0.99      0.99       500


🧮 Confusion Matrix:
 [[183   2]
 [  5 310]]


In [None]:

!pip install scikeras[tensorflow] xgboost scikit-learn pandas --quiet

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import tensorflow as tf
from scikeras.wrappers import KerasClassifier


df = pd.read_csv("student_mental_health_dataset.csv")

# Intersectional indicator features: 1 when both conditions hold, else 0
df['is_Hispanic_Woman'] = ((df['Gender'] == 'Female') & (df['Race'] == 'Hispanic')).astype(int)
# A missing Sexuality value compares unequal to 'Heterosexual', so without the
# notna() guard rows with no recorded answer would be flagged as LGBTQA.
df['is_Disabled_LGBTQA'] = (
    (df['DisabilityStatus'] == 'Yes')
    & df['Sexuality'].notna()
    & (df['Sexuality'] != 'Heterosexual')
).astype(int)
df['is_FirstGen_Woman'] = ((df['Gender'] == 'Female') & (df['FirstGen'] == 'Yes')).astype(int)

features = ['Gender', 'Race', 'Year', 'DisabilityStatus', 'FirstGen', 'Sexuality',
            'is_Hispanic_Woman', 'is_Disabled_LGBTQA', 'is_FirstGen_Woman']
target = 'MentalHealthIssue'

X = df[features]
y = df[target]

cat_cols = ['Gender', 'Race', 'Year', 'DisabilityStatus', 'FirstGen', 'Sexuality']
num_cols = ['is_Hispanic_Woman', 'is_Disabled_LGBTQA', 'is_FirstGen_Woman']

cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols),
    ('cat', cat_transformer, cat_cols)
])

# Split the raw rows before fitting so test rows do not leak into the
# preprocessing statistics; same random_state -> same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Kept so code that expects the fully transformed matrix still finds this name
X_processed = preprocessor.transform(X)


# 4. Logistic Regression

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
print("🔵 Logistic Regression")
print(classification_report(y_test, log_pred))


# 5. XGBoost

# `use_label_encoder` is not passed; the installed XGBoost reports it as unused.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
print("🟢 XGBoost")
print(classification_report(y_test, xgb_pred))


# 6. Neural Network

def create_nn_model():
    """Build and compile the binary classifier network used by KerasClassifier.

    The input width is read from the module-level ``X_train`` at call time.
    An explicit Input layer replaces ``input_shape=`` on the first Dense layer,
    which Keras warns about inside Sequential.
    """
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

nn_model = KerasClassifier(model=create_nn_model, epochs=20, batch_size=16, verbose=0)
nn_model.fit(X_train, y_train)
nn_pred = nn_model.predict(X_test)
print("🔴 Neural Network")
print(classification_report(y_test, nn_pred))


🔵 Logistic Regression
              precision    recall  f1-score   support

           0       0.12      0.03      0.05        35
           1       0.63      0.89      0.74        65

    accuracy                           0.59       100
   macro avg       0.38      0.46      0.39       100
weighted avg       0.45      0.59      0.50       100



Parameters: { "use_label_encoder" } are not used.



🟢 XGBoost
              precision    recall  f1-score   support

           0       0.40      0.17      0.24        35
           1       0.66      0.86      0.75        65

    accuracy                           0.62       100
   macro avg       0.53      0.52      0.49       100
weighted avg       0.57      0.62      0.57       100



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


🔴 Neural Network
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.65      1.00      0.79        65

    accuracy                           0.65       100
   macro avg       0.33      0.50      0.39       100
weighted avg       0.42      0.65      0.51       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Coerce each model's predictions to an ndarray so they can be added element-wise
log_np, xgb_np, nn_np = (np.array(preds) for preds in (log_pred, xgb_pred, nn_pred))

# Majority vote: a row is labelled 1 when at least two of the three models say 1
combined = ((log_np + xgb_np + nn_np) >= 2).astype(int)

print("🟣 Ensemble Majority Vote")
print(classification_report(y_test, combined))


🟣 Ensemble Majority Vote
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.63      0.92      0.75        65

    accuracy                           0.60       100
   macro avg       0.32      0.46      0.38       100
weighted avg       0.41      0.60      0.49       100



In [None]:

df = pd.read_csv("student_mental_health_dataset.csv")

# Rename columns to match expected pipeline input
df = df.rename(columns={
    "DisabilityStatus": "Disability",
    "PHQScore": "DepressionScore",
    "PC_PTSDScore": "TraumaScore"
})

# Seeded generator: the synthetic fallback columns below are identical on every
# run, so cross-validation scores and the saved model are reproducible.
rng = np.random.default_rng(42)

# Add missing columns if needed (for testing only).
# NOTE(review): columns created here are random noise, not real measurements.
if "AnxietyScore" not in df.columns:
    df["AnxietyScore"] = rng.uniform(5, 15, size=len(df))
if "Age" not in df.columns:
    df["Age"] = rng.integers(17, 30, size=len(df))

# Check for required columns
required = ["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore", "MentalHealthIssue"]
missing = [col for col in required if col not in df.columns]
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Define features and target as a DataFrame (not numpy array!) so the
# ColumnTransformer can select columns by name
X = df[["Gender", "Disability", "Age", "DepressionScore", "AnxietyScore", "TraumaScore"]]
y = df["MentalHealthIssue"]


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Column setup
cat_cols = ["Gender", "Disability"]
num_cols = ["Age", "DepressionScore", "AnxietyScore", "TraumaScore"]

# Preprocessor
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])
preprocessor = ColumnTransformer([
    ("cat", cat_transformer, cat_cols),
    ("num", num_transformer, num_cols)
])

# Model pipeline.
# `use_label_encoder` is not passed (the installed XGBoost reports it as unused);
# random_state is pinned to match the other XGBoost cells in this notebook.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", xgb.XGBClassifier(eval_metric='logloss', random_state=42))
])

# Cross-validate the whole pipeline on the raw DataFrame so each fold fits its
# own preprocessing on that fold's training rows.
cv_scores = cross_val_score(pipeline, X, y, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", np.mean(cv_scores))


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-validation scores: [0.79 0.76 0.73 0.79 0.79]
Mean CV accuracy: 0.772


In [None]:
# Fit on the full dataset before pickling so the saved object can predict
# as soon as it is loaded.
pipeline.fit(X, y)
joblib.dump(value=pipeline, filename="mental_health_xgb_pipeline.pkl")
print("✅ Saved a FITTED pipeline.")

Parameters: { "use_label_encoder" } are not used.



✅ Saved a FITTED pipeline.


In [None]:
# NOTE(review): this cell repeats the fit-and-save of the cell directly above it;
# fit() returns the pipeline itself, so it can be fitted and dumped in one call.
joblib.dump(pipeline.fit(X, y), "mental_health_xgb_pipeline.pkl")
print("✅ Saved a FITTED pipeline.")


Parameters: { "use_label_encoder" } are not used.



✅ Saved a FITTED pipeline.


In [None]:
# Round-trip check: rebind `pipeline` to the object read back from disk
pipeline = joblib.load(filename="mental_health_xgb_pipeline.pkl")


In [None]:
import joblib

# Persist preprocessing and classifier together as a single artifact
joblib.dump(value=pipeline, filename="mental_health_xgb_pipeline.pkl")
print("✅ Saved: mental_health_xgb_pipeline.pkl")


✅ Saved: mental_health_xgb_pipeline.pkl


In [None]:
!pip install gradio --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.2/54.2 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m323.1/323.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.2/95.2 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.5/11.5 MB[0m [31m113.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.0/72.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import gradio as gr
import pandas as pd
import numpy as np
import joblib

# Restore the fitted preprocessing + XGBoost pipeline that the app will query
pipeline = joblib.load(filename="mental_health_xgb_pipeline.pkl")

# Prediction function
def predict_risk(gender, disability, age, depression, anxiety, trauma):
    """Return a one-line risk label for a single set of form inputs.

    Parameters
    ----------
    gender, disability : str or None
        Selected radio options; None (or an empty string) means nothing was
        selected yet.
    age, depression, anxiety, trauma : number
        Slider values, passed to the pipeline unchanged.

    Returns
    -------
    str
        "🔴 High Risk (..% chance)" or "🟢 Low Risk (..% chance)" on success,
        a "⚠️ Please select ..." prompt when a radio input is missing, or a
        "⚠️ Error: ..." message if the pipeline raises.
    """
    # Ask for the missing selections up front instead of sending None to the
    # model or surfacing a raw exception message to the user.
    unselected = [
        label
        for label, value in (("Gender", gender), ("Disability Status", disability))
        if value is None or value == ""
    ]
    if unselected:
        return f"⚠️ Please select: {', '.join(unselected)}"

    try:
        # Construct DataFrame for a single row; keys must match the column
        # names the pipeline was fitted on
        input_df = pd.DataFrame([{
            "Gender": gender,
            "Disability": disability,
            "Age": age,
            "DepressionScore": depression,
            "AnxietyScore": anxiety,
            "TraumaScore": trauma
        }])

        # Predict using pipeline: hard label plus the probability in column 1
        # (assumes column 1 corresponds to label 1 -- TODO confirm via classes_)
        pred = pipeline.predict(input_df)[0]
        prob = pipeline.predict_proba(input_df)[0][1]

        result = "🔴 High Risk" if pred == 1 else "🟢 Low Risk"
        return f"{result} ({prob:.2%} chance)"

    except Exception as e:
        # Deliberate best-effort: the UI shows the failure instead of crashing
        return f"⚠️ Error: {str(e)}"

# Gradio UI: input widgets in the left column, prediction text in the right one
with gr.Blocks(theme=gr.themes.Soft()) as app:
    # Page heading and short description
    gr.Markdown(value="<h1 style='text-align:center'>🧠 Mental Health Risk Prediction</h1>")
    gr.Markdown(value="<p style='text-align:center'>This app uses an XGBoost model to assess mental health risk based on your inputs.</p>")

    with gr.Row():
        with gr.Column():
            gender = gr.Radio(choices=["Female", "Male", "Other"], label="Gender")
            disability = gr.Radio(choices=["Yes", "No"], label="Disability Status")
            age = gr.Slider(minimum=17, maximum=30, value=20, step=1, label="Age")
            depression = gr.Slider(minimum=0, maximum=27, value=10, step=0.1, label="Depression Score")
            anxiety = gr.Slider(minimum=0, maximum=21, value=9, step=0.1, label="Anxiety Score")
            trauma = gr.Slider(minimum=0, maximum=5, value=2, step=1, label="Trauma Score")
            submit = gr.Button(value="🚀 Submit")

        with gr.Column():
            output = gr.Textbox(label="Mental Health Risk Level", lines=2)

    # The input order must match predict_risk's positional parameters
    submit.click(
        fn=predict_risk,
        inputs=[gender, disability, age, depression, anxiety, trauma],
        outputs=output,
    )

# Launch the app
app.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cac24a644a43719ffd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


