In [2]:
import pandas as pd
data = pd.read_csv('../data/preprocessed_data.csv')
print(data.head())

   Age  Number of Children  Physical Activity Level  Employment Status  \
0   31                   2                        2                  0   
1   55                   1                        0                  1   
2   78                   1                        0                  1   
3   58                   3                        1                  0   
4   18                   0                        0                  0   

      Income  Alcohol Consumption  Dietary Habits  Sleep Patterns  \
0   26265.67                    1               1               1   
1   42710.36                    2               0               1   
2  125332.79                    0               0               2   
3    9992.78                    1               1               0   
4    8595.08                    0               1               1   

   History of Mental Illness  History of Substance Abuse  ...  \
0                          1                           0  ...   
1         

split the data set

In [3]:
from sklearn.model_selection import train_test_split
X = data.drop('History of Mental Illness', axis=1)
y = data['History of Mental Illness']

X_train, X_test , y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

verify the split 

In [4]:
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

Training features shape: (331014, 20)
Testing features shape: (82754, 20)
Training target shape: (331014,)
Testing target shape: (82754,)


train the model 

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the model with class weights
model = RandomForestClassifier(random_state=42, class_weight='balanced')

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Accuracy: 0.6642216690431882
Confusion Matrix:
 [[51817  5654]
 [22133  3150]]
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.90      0.79     57471
           1       0.36      0.12      0.18     25283

    accuracy                           0.66     82754
   macro avg       0.53      0.51      0.49     82754
weighted avg       0.60      0.66      0.60     82754



install the imblearn library 

In [7]:
!pip install imbalanced-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Downloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the model on the resampled data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Accuracy: 0.6220968170722866
Confusion Matrix:
 [[44982 12489]
 [18784  6499]]
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.78      0.74     57471
           1       0.34      0.26      0.29     25283

    accuracy                           0.62     82754
   macro avg       0.52      0.52      0.52     82754
weighted avg       0.59      0.62      0.61     82754



In [6]:
!pip install tensorflow 

Defaulting to user installation because normal site-packages is not writeable
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-p

  You can safely remove it manually.
  You can safely remove it manually.


build and train the neural network 

In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Build the Neural Network
model = Sequential()

# Input layer
model.add(Dense(128, input_dim=X_train_scaled.shape[1], activation='relu'))  # 128 neurons, ReLU activation
model.add(Dropout(0.2))  # Dropout to prevent overfitting

# Hidden layers
model.add(Dense(64, activation='relu'))  # 64 neurons, ReLU activation
model.add(Dropout(0.2))  # Dropout

# Output layer
model.add(Dense(1, activation='sigmoid'))  # 1 neuron, sigmoid activation for binary classification

# Step 3: Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model
history = model.fit(X_train_scaled, y_train, epochs=20, batch_size=32, validation_split=0.2, class_weight={0: 1, 1: 2})

# Step 5: Make predictions
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 3ms/step - accuracy: 0.6089 - loss: 0.8881 - val_accuracy: 0.6265 - val_loss: 0.6506
Epoch 2/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.6252 - loss: 0.8822 - val_accuracy: 0.6274 - val_loss: 0.6696
Epoch 3/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2ms/step - accuracy: 0.6282 - loss: 0.8810 - val_accuracy: 0.6304 - val_loss: 0.6563
Epoch 4/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.6285 - loss: 0.8813 - val_accuracy: 0.6320 - val_loss: 0.6533
Epoch 5/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.6284 - loss: 0.8827 - val_accuracy: 0.6316 - val_loss: 0.6592
Epoch 6/20
[1m8276/8276[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.6306 - loss: 0.8808 - val_accuracy: 0.6319 - val_loss: 0.6557
Epoch 7/20

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Build the Neural Network
model = Sequential()

# Input layer
model.add(Dense(256, input_dim=X_train_scaled.shape[1], activation='relu'))  # 256 neurons, ReLU activation
model.add(Dropout(0.3))  # 30% dropout

# Hidden layers
model.add(Dense(128, activation='relu'))  # 128 neurons, ReLU activation
model.add(Dropout(0.3))  # 30% dropout

model.add(Dense(64, activation='relu'))  # 64 neurons, ReLU activation
model.add(Dropout(0.3))  # 30% dropout

# Output layer
model.add(Dense(1, activation='sigmoid'))  # 1 neuron, sigmoid activation for binary classification

# Step 3: Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Step 4: Train the model with early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train_scaled, y_train, epochs=30, batch_size=64, validation_split=0.2, 
                    class_weight={0: 1, 1: 2}, callbacks=[early_stopping])

# Step 5: Make predictions
y_pred_prob = model.predict(X_test_scaled)
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Step 6: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.6084 - loss: 0.8892 - val_accuracy: 0.6188 - val_loss: 0.6624
Epoch 2/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.6221 - loss: 0.8813 - val_accuracy: 0.6277 - val_loss: 0.6631
Epoch 3/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.6243 - loss: 0.8826 - val_accuracy: 0.6314 - val_loss: 0.6498
Epoch 4/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.6266 - loss: 0.8836 - val_accuracy: 0.6309 - val_loss: 0.6594
Epoch 5/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 4ms/step - accuracy: 0.6261 - loss: 0.8828 - val_accuracy: 0.6325 - val_loss: 0.6531
Epoch 6/30
[1m4138/4138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 3ms/step - accuracy: 0.6302 - loss: 0.8817 - val_accuracy: 0.6320 - val_loss: 0.6503
Epoch 7/30

save the trained model 

In [9]:
import joblib

# Save the model
joblib.dump(model, '../models/mental_health_model.pkl')

['../models/mental_health_model.pkl']