# Kaggle Challenge - Exploring Mental Health

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

### Load Dataset

In [3]:
# Load the CSV file into a DataFrame
df = pd.read_csv('../data/train.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


## Clean the data

In [4]:
# Remove the 'Name' column
df_cleaned = df.drop(columns=['Name', 'id'])

# Get the depression column and save it as a separate DataFrame
depression = df_cleaned['Depression']

# Remove the depression column from the original DataFrame
df_cleaned = df_cleaned.drop(columns=['Depression'])

# Assign a unique value for each column
for column in df_cleaned.columns:
    df_cleaned[column] = pd.factorize(df_cleaned[column])[0]

df_cleaned.head()

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,0,0,0,0,0,-1,0,-1,-1,0,0,0,0,0,0,0,0
1,1,1,1,0,1,-1,1,-1,-1,1,1,1,1,1,1,1,0
2,1,2,2,1,-1,0,-1,0,0,-1,2,0,2,1,2,2,0
3,1,3,3,0,1,-1,0,-1,-1,2,1,2,3,1,3,2,1
4,0,4,4,0,2,-1,2,-1,-1,2,2,1,3,1,4,3,1


In [5]:
print(df_cleaned.shape, depression.shape)

(140700, 17) (140700,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned, depression,
    test_size=0.2, random_state=42
)
print(X_train.shape, y_train.shape)

(112560, 17) (112560,)


In [7]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # Reshape to (batch_size, 1)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)  # Reshape to (batch_size, 1)


In [8]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [32]:
class DepressionModel(nn.Module):
    def __init__(self, input_dim):
        super(DepressionModel, self).__init__()
        # Define a fully connected neural network with 3 layers
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),  # First layer with 128 neurons
            nn.ReLU(),  # Activation function
            # nn.Dropout(0.3),  # Dropout for regularization
            nn.Linear(128, 64),  # Second layer with 64 neurons
            nn.ReLU(),  # Activation function
            nn.Dropout(0.2),  # Dropout for regularization
            nn.Linear(64, 1),  # Output layer with 1 neuron
            nn.Sigmoid()  # Sigmoid activation for binary classification
            
            # nn.Linear(input_dim, 64),
            # nn.ReLU(),
            # nn.Linear(64, 32),
            # nn.ReLU(),
            # nn.Linear(32, 1),
            # nn.Sigmoid()
        )

    def forward(self, x):
        # Forward pass through the network
        return self.fc(x)

### Initialize the Model, Loss Function, and Optimizer

In this step, we initialize the model, define the loss function, and set up the optimizer.

- **Model Initialization**: We create an instance of the `DepressionModel` class, passing the number of input features as the argument.
- **Loss Function**: We use Binary Cross-Entropy Loss (`BCELoss`) which is suitable for binary classification tasks.
- **Optimizer**: We use the Adam optimizer with a learning rate of 0.001 to update the model's parameters during training.


In [29]:
model = DepressionModel(input_dim=df_cleaned.shape[1])
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Early Stopping

The following logic checks whether there is an increase in accuracy using a 'patience' and 'threshold' parameter and conditionally stops training

In [30]:
class EarlyStopping:
    def __init__(self, patience=5, thresh=0.001):
        """
        Args:
            patience (int, optional): Number of epochs to wait before stopping training if there is no increase in accuracy. Defaults to 5.
            thresh (float, optional): Value of expected accuracy increase. Defaults to 0.001.
        """
        self.patience=patience
        self.thresh=thresh
        self.best_acc=00.0
        self.counte=0

    def check(self, current_acc):
        """
        Check if early stopping should be triggered

        Args:
            current_acc (float): current validation accuracy that will be checked against best accuracy 
        """
        if self.thresh < current_acc - self.best_acc: 
            self.best_acc = current_acc
            self.counter=0
        else:
            self.counter+=1
        
        return self.counter >= self.patience
    

### Training the Model

In this step, we train the model using the training data and evaluate it on the validation data. The training process involves the following steps:

1. **Model Training**:
    In this step, we train the model using the training data and evaluate it on the validation data. The training process involves several key steps. First, we set the model to training mode and iterate over the training data in batches using `train_loader`. For each batch, we zero the gradients using `optimizer.zero_grad()`, perform a forward pass to compute the model's predictions, compute the loss using the criterion, perform a backward pass to compute the gradients, and finally update the model's weights using `optimizer.step()`.


2. **Model Evaluation** (optional):
    We evaluate the model by setting it to evaluation mode and iterate over the validation data in batches using `val_loader`. For each batch, we compute the model's predictions, accumulate the validation loss, and compute the number of correct predictions.

3. **Compute Accuracy**:
    Finally, we compute the accuracy of the model on the validation data. This involves calculating the accuracy based on the model's performance on the validation dataset.


In [31]:
num_epochs = 100  # Number of epochs for training
early_stopping = EarlyStopping(patience=5, thresh=0.001)

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # Zero the gradients

        outputs = model(inputs)  # Forward pass

        loss = criterion(outputs, labels)  # Compute the loss

        loss.backward()  # Backward pass

        optimizer.step()  # Update the weights

    # Validation (optional)
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        val_loss = 0
        correct = 0
        total = 0
        for inputs, labels in val_loader:
            outputs = model(inputs)  # Forward pass
            val_loss += criterion(outputs, labels).item()  # Compute the validation loss
            predicted = (outputs > 0.5).float()  # Apply threshold for binary classification
            total += labels.size(0)  # Total number of samples
            correct += (predicted == labels).sum().item()  # Number of correct predictions

    accuracy = 100 * correct / total  # Compute accuracy
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}, Accuracy: {accuracy:.4f}%")

    
    if early_stopping.check(accuracy):
        print(f"Early Stopping at epoch {epoch+1}, Best Accuracy: {early_stopping.best_acc}")
        break

Epoch [1/100], Loss: 0.1670, Validation Loss: 0.2747, Accuracy: 88.4186%
Epoch [2/100], Loss: 0.1763, Validation Loss: 0.2465, Accuracy: 90.2416%
Epoch [3/100], Loss: 0.3877, Validation Loss: 0.2442, Accuracy: 90.3447%
Epoch [4/100], Loss: 0.4216, Validation Loss: 0.2500, Accuracy: 89.9112%
Epoch [5/100], Loss: 0.3705, Validation Loss: 0.2591, Accuracy: 89.6553%
Epoch [6/100], Loss: 0.1457, Validation Loss: 0.2350, Accuracy: 90.5828%
Epoch [7/100], Loss: 0.1364, Validation Loss: 0.2347, Accuracy: 90.5579%
Epoch [8/100], Loss: 0.2535, Validation Loss: 0.2321, Accuracy: 90.6610%
Epoch [9/100], Loss: 0.1710, Validation Loss: 0.2309, Accuracy: 90.6787%
Epoch [10/100], Loss: 0.4653, Validation Loss: 0.2308, Accuracy: 90.5650%
Epoch [11/100], Loss: 0.5605, Validation Loss: 0.2263, Accuracy: 90.8138%
Epoch [12/100], Loss: 0.3030, Validation Loss: 0.2295, Accuracy: 90.7711%
Epoch [13/100], Loss: 0.3183, Validation Loss: 0.2346, Accuracy: 90.1777%
Epoch [14/100], Loss: 0.3881, Validation Loss: 

### Save the Model

After training the model, it is important to save it so that it can be used later without retraining. The model is saved using the `torch.save` function, which saves the model's state dictionary to a file. This allows us to load the model later and use it for making predictions on new data.

In [39]:
torch.save(model.state_dict(), '../models/trained.pth')
print("Model saved to '../models/trained.pth'")

Model saved to '../models/trained.pth'


In [37]:
# 8. Load and Preprocess the Test Data
test_df = pd.read_csv('../data/test.csv')

# Remove unnecessary columns
test_cleaned = test_df.drop(columns=['Name', 'id'])

# Factorize categorical columns (apply the same transformations as training data)
for column in test_cleaned.columns:
    test_cleaned[column] = pd.factorize(test_cleaned[column])[0]

# Convert to tensors
X_test_tensor = torch.tensor(test_cleaned.values, dtype=torch.float32)

# 9. Make Predictions on the Test Set
test_loader = DataLoader(X_test_tensor, batch_size=32, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch)
        preds = (outputs > 0.5).float()  # Apply threshold for binary classification
        predictions.extend(preds.cpu().numpy())

# Save predictions to a CSV file
predictions = pd.DataFrame(predictions, columns=['Predicted_Depression'])
predictions.to_csv('../data/test_predictions.csv', index=False)

print("Predictions saved to '../data/test_predictions.csv'")

Predictions saved to '../data/test_predictions.csv'
