In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
df = pd.read_csv("train.csv")

# Drop columns not useful for prediction
df_clean = df.drop(columns=["id", "Name"])

# Identify categorical and numerical columns
categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()
numerical_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('Depression')  # Target column

# Fill missing values
df_clean[categorical_cols] = df_clean[categorical_cols].fillna('Unknown')
df_clean[numerical_cols] = df_clean[numerical_cols].fillna(df_clean[numerical_cols].median())

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = StandardScaler()
df_clean[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])

# Split into features and target
X = df_clean.drop(columns='Depression')
y = df_clean['Depression']

# Train-validation split (80-20), stratified to preserve class distribution
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Final shapes
print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Training target distribution:\n", y_train.value_counts())
print("Validation target distribution:\n", y_val.value_counts())


Training set: (112560, 17)
Validation set: (28140, 17)
Training target distribution:
 Depression
0    92106
1    20454
Name: count, dtype: int64
Validation target distribution:
 Depression
0    23027
1     5113
Name: count, dtype: int64


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib

# Load the dataset
df = pd.read_csv("train.csv")

# Drop columns not useful for prediction
df_clean = df.drop(columns=["id", "Name"])

# Identify categorical and numerical columns
categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()
numerical_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('Depression')  # Target column

# Fill missing values
df_clean[categorical_cols] = df_clean[categorical_cols].fillna('Unknown')
df_clean[numerical_cols] = df_clean[numerical_cols].fillna(df_clean[numerical_cols].median())

# Save medians for test-time imputation
medians = df_clean[numerical_cols].median()
joblib.dump(medians, "medians.pkl")

# Encode categorical variables
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# Save label encoders
joblib.dump(label_encoders, "label_encoders.pkl")

# Normalize numerical features
scaler = StandardScaler()
df_clean[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])

# Save the scaler
joblib.dump(scaler, "scaler.pkl")

# Save column names for reuse
joblib.dump(categorical_cols, "categorical_cols.pkl")
joblib.dump(numerical_cols, "numerical_cols.pkl")

# Split into features and target
X = df_clean.drop(columns='Depression')
y = df_clean['Depression']

# Train-validation split (80-20), stratified to preserve class distribution
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Final shapes
print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Training target distribution:\n", y_train.value_counts())
print("Validation target distribution:\n", y_val.value_counts())


Training set: (112560, 17)
Validation set: (28140, 17)
Training target distribution:
 Depression
0    92106
1    20454
Name: count, dtype: int64
Validation target distribution:
 Depression
0    23027
1     5113
Name: count, dtype: int64


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Convert the NumPy data from preprocessing into PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# Create DataLoader objects
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the MLP architecture
class DepressionMLP(nn.Module):
    def __init__(self, input_dim):
        super(DepressionMLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Initialize model, loss, and optimizer
model = DepressionMLP(input_dim=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Evaluation on validation set
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()
        val_preds = (val_outputs >= 0.5).float()
        val_accuracy = (val_preds == y_val_tensor).float().mean().item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")


Epoch 1/10, Loss: 472.0682, Val Loss: 0.1698, Val Accuracy: 0.9317
Epoch 2/10, Loss: 315.4934, Val Loss: 0.1634, Val Accuracy: 0.9335
Epoch 3/10, Loss: 304.1693, Val Loss: 0.1670, Val Accuracy: 0.9337
Epoch 4/10, Loss: 300.0641, Val Loss: 0.1681, Val Accuracy: 0.9331
Epoch 5/10, Loss: 296.6951, Val Loss: 0.1691, Val Accuracy: 0.9323
Epoch 6/10, Loss: 294.9240, Val Loss: 0.1611, Val Accuracy: 0.9356
Epoch 7/10, Loss: 293.0425, Val Loss: 0.1615, Val Accuracy: 0.9369
Epoch 8/10, Loss: 291.4882, Val Loss: 0.1614, Val Accuracy: 0.9373
Epoch 9/10, Loss: 292.6860, Val Loss: 0.1629, Val Accuracy: 0.9367
Epoch 10/10, Loss: 292.0347, Val Loss: 0.1612, Val Accuracy: 0.9362


In [None]:
import pandas as pd
# Load dataset
df = pd.read_csv("test.csv")

<bound method NDFrame.describe of            id     Name  Gender   Age           City  \
0      140700   Shivam    Male  53.0  Visakhapatnam   
1      140701    Sanya  Female  58.0        Kolkata   
2      140702     Yash    Male  53.0         Jaipur   
3      140703   Nalini  Female  23.0         Rajkot   
4      140704  Shaurya    Male  47.0         Kalyan   
...       ...      ...     ...   ...            ...   
93795  234495     Zoya  Female  49.0         Jaipur   
93796  234496    Shlok    Male  29.0      Ahmedabad   
93797  234497    Rishi    Male  24.0  Visakhapatnam   
93798  234498   Eshita  Female  23.0         Kalyan   
93799  234499    Gauri  Female  43.0       Varanasi   

      Working Professional or Student              Profession  \
0                Working Professional                   Judge   
1                Working Professional  Educational Consultant   
2                Working Professional                 Teacher   
3                             Student        

In [13]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch.utils.data import TensorDataset, DataLoader

# Load dataset
df = pd.read_csv("train.csv")
df = df.drop(columns=["id", "Name"])

# Preprocessing
categorical_cols = df.select_dtypes(include='object').columns.tolist()
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('Depression')

df[categorical_cols] = df[categorical_cols].fillna('Unknown')
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split data
X = df.drop(columns='Depression')
y = df['Depression']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# DataLoaders
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=64, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=64)

# MLP Model
class DepressionMLP(nn.Module):
    def __init__(self, input_dim):
        super(DepressionMLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Training and Evaluation Pipeline
def train_model(model, train_loader, val_loader, epochs=10, lr=0.001):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

        # Evaluate
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val_tensor)
            val_preds = (val_outputs >= 0.5).float()

            acc = accuracy_score(y_val_tensor, val_preds)
            prec = precision_score(y_val_tensor, val_preds)
            rec = recall_score(y_val_tensor, val_preds)
            f1 = f1_score(y_val_tensor, val_preds)

        print(f"Epoch {epoch+1}/{epochs} | Accuracy: {acc:.4f} | Precision: {prec:.4f} "
              f"| Recall: {rec:.4f} | F1 Score: {f1:.4f}")

# Initialize and train
input_dim = X_train.shape[1]
model = DepressionMLP(input_dim)
train_model(model, train_loader, val_loader)


Epoch 1/10 | Accuracy: 0.9308 | Precision: 0.8599 | Recall: 0.7397 | F1 Score: 0.7953
Epoch 2/10 | Accuracy: 0.9333 | Precision: 0.8208 | Recall: 0.8099 | F1 Score: 0.8153
Epoch 3/10 | Accuracy: 0.9348 | Precision: 0.8557 | Recall: 0.7710 | F1 Score: 0.8111
Epoch 4/10 | Accuracy: 0.9353 | Precision: 0.8508 | Recall: 0.7810 | F1 Score: 0.8144
Epoch 5/10 | Accuracy: 0.9357 | Precision: 0.8359 | Recall: 0.8040 | F1 Score: 0.8197
Epoch 6/10 | Accuracy: 0.9353 | Precision: 0.8136 | Recall: 0.8349 | F1 Score: 0.8241
Epoch 7/10 | Accuracy: 0.9361 | Precision: 0.8288 | Recall: 0.8171 | F1 Score: 0.8229
Epoch 8/10 | Accuracy: 0.9330 | Precision: 0.8829 | Recall: 0.7281 | F1 Score: 0.7981
Epoch 9/10 | Accuracy: 0.9366 | Precision: 0.8396 | Recall: 0.8046 | F1 Score: 0.8217
Epoch 10/10 | Accuracy: 0.9359 | Precision: 0.8580 | Recall: 0.7753 | F1 Score: 0.8145


In [3]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import numpy as np

# Load dataset
df = pd.read_csv("train.csv")

# Drop unnecessary columns
df_clean = df.drop(columns=["id", "Name"])

# Identify categorical and numerical columns
categorical_cols = df_clean.select_dtypes(include='object').columns.tolist()
numerical_cols = df_clean.select_dtypes(include=['float64', 'int64']).columns.tolist()
numerical_cols.remove('Depression')  # Target column

# Handle missing values
df_clean[categorical_cols] = df_clean[categorical_cols].fillna('Unknown')
df_clean[numerical_cols] = df_clean[numerical_cols].fillna(df_clean[numerical_cols].median())

# Save medians
medians = df_clean[numerical_cols].median()
joblib.dump(medians, "medians.pkl")

# Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df_clean[col] = le.fit_transform(df_clean[col])
    label_encoders[col] = le
joblib.dump(label_encoders, "label_encoders.pkl")

# Normalize numerical features
scaler = StandardScaler()
df_clean[numerical_cols] = scaler.fit_transform(df_clean[numerical_cols])
joblib.dump(scaler, "scaler.pkl")

# Save column names
joblib.dump(categorical_cols, "categorical_cols.pkl")
joblib.dump(numerical_cols, "numerical_cols.pkl")

# Features and target
X = df_clean.drop(columns='Depression')
y = df_clean['Depression']

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Convert to tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# Define a simple MLP model
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()  # Binary classification
        )

    def forward(self, x):
        return self.model(x)

# Initialize model
input_dim = X_train.shape[1]
model = MLP(input_dim)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
n_epochs = 50
for epoch in range(n_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_tensor)
        val_loss = criterion(val_preds, y_val_tensor)
        val_acc = ((val_preds >= 0.5).float() == y_val_tensor).float().mean()
    print(f"Epoch {epoch+1}/{n_epochs}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss.item():.4f}, Val Acc: {val_acc.item():.4f}")

# Save trained model
torch.save(model.state_dict(), "model.pth")

print("Model and preprocessing artifacts saved.")


Epoch 1/50, Train Loss: 1.9177, Val Loss: 1.2790, Val Acc: 0.2819
Epoch 2/50, Train Loss: 1.3894, Val Loss: 0.8427, Val Acc: 0.4483
Epoch 3/50, Train Loss: 0.9820, Val Loss: 0.5810, Val Acc: 0.7140
Epoch 4/50, Train Loss: 0.7196, Val Loss: 0.4918, Val Acc: 0.8183
Epoch 5/50, Train Loss: 0.5957, Val Loss: 0.5079, Val Acc: 0.8183
Epoch 6/50, Train Loss: 0.5696, Val Loss: 0.5610, Val Acc: 0.8183
Epoch 7/50, Train Loss: 0.5914, Val Loss: 0.6163, Val Acc: 0.8183
Epoch 8/50, Train Loss: 0.6280, Val Loss: 0.6608, Val Acc: 0.8183
Epoch 9/50, Train Loss: 0.6628, Val Loss: 0.6910, Val Acc: 0.8183
Epoch 10/50, Train Loss: 0.6878, Val Loss: 0.7071, Val Acc: 0.8183
Epoch 11/50, Train Loss: 0.7020, Val Loss: 0.7108, Val Acc: 0.8183
Epoch 12/50, Train Loss: 0.7064, Val Loss: 0.7039, Val Acc: 0.8183
Epoch 13/50, Train Loss: 0.7026, Val Loss: 0.6885, Val Acc: 0.8183
Epoch 14/50, Train Loss: 0.6877, Val Loss: 0.6665, Val Acc: 0.8183
Epoch 15/50, Train Loss: 0.6695, Val Loss: 0.6400, Val Acc: 0.8183
Epoc

In [5]:
import torch
import joblib
import torch.nn as nn

# Define the same model class used during training
class MLP(nn.Module):
    def __init__(self, input_dim):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.model(x)

# Load categorical and numerical column lists
categorical_cols = joblib.load("categorical_cols.pkl")
numerical_cols = joblib.load("numerical_cols.pkl")
input_dim = len(categorical_cols) + len(numerical_cols)

# Load model weights
model = MLP(input_dim)
model.load_state_dict(torch.load("model.pth"))
model.eval()


MLP(
  (model): Sequential(
    (0): Linear(in_features=17, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=32, bias=True)
    (4): ReLU()
    (5): Linear(in_features=32, out_features=1, bias=True)
    (6): Sigmoid()
  )
)

In [6]:
import pandas as pd
import torch
import numpy as np
import joblib

# Load preprocessing artifacts from training
label_encoders = joblib.load("label_encoders.pkl")
scaler = joblib.load("scaler.pkl")
categorical_cols = joblib.load("categorical_cols.pkl")
numerical_cols = joblib.load("numerical_cols.pkl")
medians = joblib.load("medians.pkl")

# Load the test dataset
test_df = pd.read_csv("test.csv")

# Drop unnecessary columns
test_df_clean = test_df.drop(columns=["id", "Name"])

# Handle missing values
test_df_clean[categorical_cols] = test_df_clean[categorical_cols].fillna('Unknown')
test_df_clean[numerical_cols] = test_df_clean[numerical_cols].fillna(medians)

# Replace unseen categories and encode
for col in categorical_cols:
    le = label_encoders[col]
    if 'Unknown' not in le.classes_:
        le.classes_ = np.append(le.classes_, 'Unknown')
    known_classes = set(le.classes_)
    test_df_clean[col] = test_df_clean[col].apply(lambda x: x if x in known_classes else 'Unknown')
    test_df_clean[col] = le.transform(test_df_clean[col])

# Normalize numerical columns
test_df_clean[numerical_cols] = scaler.transform(test_df_clean[numerical_cols])

# Convert to tensor for model input
X_test_tensor = torch.tensor(test_df_clean.values, dtype=torch.float32)

# Predict using the trained PyTorch model
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_preds = (test_outputs >= 0.5).int().numpy().flatten()

# Add predictions to the original test set
test_results = test_df.copy()
test_results['Predicted_Depression'] = test_preds

# Save to CSV (optional)
test_results.to_csv("test_predictions.csv", index=False)

# Print sample results
print("Predictions complete. Sample output:")
print(test_results[['Predicted_Depression']].head())


Predictions complete. Sample output:
   Predicted_Depression
0                     0
1                     0
2                     0
3                     0
4                     0
