# Kaggle Challenge - Exploring Mental Health

### Load Dataset

In [57]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim

In [58]:
# Load the CSV file into a DataFrame
df = pd.read_csv('../data/train.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


## Clean the data

In [59]:
# Remove the 'Name' column
df_cleaned = df.drop(columns=['Name', 'id'])

# Get the depression column and save it as a separate DataFrame
depression = df_cleaned['Depression']

# Remove the depression column from the original DataFrame
df_cleaned = df_cleaned.drop(columns=['Depression'])

# Assign a unique value for each column
for column in df_cleaned.columns:
    df_cleaned[column] = pd.factorize(df_cleaned[column])[0]

df_cleaned.head()

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,0,0,0,0,0,-1,0,-1,-1,0,0,0,0,0,0,0,0
1,1,1,1,0,1,-1,1,-1,-1,1,1,1,1,1,1,1,0
2,1,2,2,1,-1,0,-1,0,0,-1,2,0,2,1,2,2,0
3,1,3,3,0,1,-1,0,-1,-1,2,1,2,3,1,3,2,1
4,0,4,4,0,2,-1,2,-1,-1,2,2,1,3,1,4,3,1


In [60]:
print(df_cleaned.shape, depression.shape)

(140700, 17) (140700,)


In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned, depression,
    test_size=0.2, random_state=42
)
print(X_train.shape, y_train.shape)

(112560, 17) (112560,)


In [62]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)  # Reshape to (batch_size, 1)

X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)  # Reshape to (batch_size, 1)


In [63]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [64]:
class DepressionModel(nn.Module):
    def __init__(self, input_dim):
        super(DepressionModel, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()  # Sigmoid for binary classification
        )

    def forward(self, x):
        return self.fc(x)

In [65]:
model = DepressionModel(input_dim=df_cleaned.shape[1])
criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [66]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for inputs, labels in train_loader:
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

    # Validation (optional)
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}")


Epoch [1/10], Loss: 0.1693, Validation Loss: 0.2506
Epoch [2/10], Loss: 0.3610, Validation Loss: 0.2561
Epoch [3/10], Loss: 0.1960, Validation Loss: 0.2689
Epoch [4/10], Loss: 0.7872, Validation Loss: 0.2526
Epoch [5/10], Loss: 0.0961, Validation Loss: 0.2503
Epoch [6/10], Loss: 0.1649, Validation Loss: 0.2659
Epoch [7/10], Loss: 0.3105, Validation Loss: 0.2425
Epoch [8/10], Loss: 0.2035, Validation Loss: 0.2380
Epoch [9/10], Loss: 0.3831, Validation Loss: 0.2362
Epoch [10/10], Loss: 0.5671, Validation Loss: 0.2342


In [67]:
# 8. Load and Preprocess the Test Data
test_df = pd.read_csv('../data/test.csv')

# Remove unnecessary columns
test_cleaned = test_df.drop(columns=['Name', 'id'])

# Factorize categorical columns (apply the same transformations as training data)
for column in test_cleaned.columns:
    test_cleaned[column] = pd.factorize(test_cleaned[column])[0]

# Convert to tensors
X_test_tensor = torch.tensor(test_cleaned.values, dtype=torch.float32)

# 9. Make Predictions on the Test Set
test_loader = DataLoader(X_test_tensor, batch_size=32, shuffle=False)

model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch)
        preds = (outputs > 0.5).float()  # Apply threshold for binary classification
        predictions.extend(preds.cpu().numpy())

# Save predictions to a CSV file
predictions = pd.DataFrame(predictions, columns=['Predicted_Depression'])
predictions.to_csv('../data/test_predictions.csv', index=False)

print("Predictions saved to '../data/test_predictions.csv'")

Predictions saved to '../data/test_predictions.csv'
