In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print(train)

            id      Name  Gender   Age           City  \
0            0  Aaradhya  Female  49.0       Ludhiana   
1            1     Vivan    Male  26.0       Varanasi   
2            2    Yuvraj    Male  33.0  Visakhapatnam   
3            3    Yuvraj    Male  22.0         Mumbai   
4            4      Rhea  Female  30.0         Kanpur   
...        ...       ...     ...   ...            ...   
140695  140695     Vidya  Female  18.0      Ahmedabad   
140696  140696      Lata  Female  41.0      Hyderabad   
140697  140697   Aanchal  Female  24.0        Kolkata   
140698  140698    Prachi  Female  49.0       Srinagar   
140699  140699       Sai    Male  27.0          Patna   

       Working Professional or Student         Profession  Academic Pressure  \
0                 Working Professional               Chef                NaN   
1                 Working Professional            Teacher                NaN   
2                              Student                NaN                5.

In [2]:
# Check for distinct values in the 'Job Satisfaction' column
test['Job Satisfaction'].value_counts()

Job Satisfaction
2.0    16971
5.0    14827
1.0    14793
3.0    14464
4.0    13971
Name: count, dtype: int64

In [3]:
train.duplicated().sum()

# No duplicates

0

In [4]:
train['Job Satisfaction'] = train['Job Satisfaction'].fillna('-1')
test['Job Satisfaction'] = test['Job Satisfaction'].fillna('-1')

In [6]:
# Need to remove ID and Name
train = train.drop(columns='Name')
test = test.drop(columns='Name')

train = train.drop(columns='id')

In [7]:
# Replacing Degree categories with less than 50 occurrences with 'Unknown'
degree_counts = train['Degree'].value_counts()
rare_degrees = degree_counts[degree_counts < 50].index

train['Degree'] = train['Degree'].replace(rare_degrees, 'Unknown')
test['Degree'] = test['Degree'].replace(rare_degrees, 'Unknown')

category_mapping = {
    'Class 12': 'High School',
    'B.Ed': 'Professional',
    'B.Arch': 'Undergraduate',
    'B.Com': 'Undergraduate',
    'B.Pharm': 'Undergraduate',
    'BCA': 'Undergraduate',
    'M.Ed': 'Postgraduate',
    'MCA': 'Postgraduate',
    'BBA': 'Undergraduate',
    'BSc': 'Undergraduate',
    'MSc': 'Postgraduate',
    'LLM': 'Professional',
    'M.Pharm': 'Postgraduate',
    'M.Tech': 'Postgraduate',
    'B.Tech': 'Undergraduate',
    'LLB': 'Professional',
    'BHM': 'Undergraduate',
    'MBA': 'Postgraduate',
    'BA': 'Undergraduate',
    'ME': 'Postgraduate',
    'MD': 'Professional',
    'MHM': 'Postgraduate',
    'BE': 'Undergraduate',
    'PhD': 'Professional',
    'M.Com': 'Postgraduate',
    'MBBS': 'Professional',
    'MA': 'Postgraduate',
    'Unknown': 'Unknown'
}

test['Degree'] = test['Degree'].map(category_mapping)
train['Degree'] = train['Degree'].map(category_mapping)

In [8]:
train.columns

Index(['Gender', 'Age', 'City', 'Working Professional or Student',
       'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness', 'Depression'],
      dtype='object')

In [9]:
# Replacing Profession categories with less than 50 occurrences with 'Unknown'
profession_counts = train['Profession'].value_counts()
rare_professions = profession_counts[profession_counts < 50].index

train['Profession'] = train['Profession'].replace(rare_professions, 'Unknown')
test['Profession'] = test['Profession'].replace(rare_professions, 'Unknown')

train['Profession'].value_counts()

Profession
Teacher                   24906
Content Writer             7814
Architect                  4370
Consultant                 4229
HR Manager                 4022
Pharmacist                 3893
Doctor                     3255
Business Analyst           3161
Entrepreneur               2968
Chemist                    2967
Chef                       2862
Educational Consultant     2852
Data Scientist             2390
Researcher                 2328
Lawyer                     2212
Customer Support           2055
Marketing Manager          1976
Pilot                      1913
Travel Consultant          1860
Plumber                    1748
Sales Executive            1739
Manager                    1737
Judge                      1712
Electrician                1582
Financial Analyst          1574
Software Engineer          1510
Civil Engineer             1470
UX/UI Designer             1452
Digital Marketer           1372
Accountant                 1339
Finanancial Analyst        13

In [10]:
# Checking the unique values in the 'Sleep Duration' column
sleep_duration_counts = train['Sleep Duration'].value_counts()
print(sleep_duration_counts)

# After viewing the counts, set a threshold and group others as 'Unknown'
rare_sleep_durations = sleep_duration_counts[sleep_duration_counts < 50].index

# Replace rare categories with 'Unknown'
train['Sleep Duration'] = train['Sleep Duration'].replace(rare_sleep_durations, 'Unknown')
test['Sleep Duration'] = test['Sleep Duration'].replace(rare_sleep_durations, 'Unknown')

# Confirming the changes
print(train['Sleep Duration'].value_counts())

Sleep Duration
Less than 5 hours    38784
7-8 hours            36969
More than 8 hours    32726
5-6 hours            32142
3-4 hours               12
6-7 hours                8
4-5 hours                7
2-3 hours                5
4-6 hours                5
6-8 hours                4
1-6 hours                4
No                       4
9-11 hours               2
10-11 hours              2
Sleep_Duration           2
Unhealthy                2
45                       2
8-9 hours                2
10-6 hours               1
9-5                      1
45-48 hours              1
3-6 hours                1
Work_Study_Hours         1
49 hours                 1
than 5 hours             1
Pune                     1
9-6 hours                1
8 hours                  1
35-36 hours              1
Indore                   1
1-3 hours                1
55-66 hours              1
Moderate                 1
40-45 hours              1
1-2 hours                1
9-5 hours                1
Name: count, 

In [11]:
train['Sleep Duration'].unique()

array(['More than 8 hours', 'Less than 5 hours', '5-6 hours', '7-8 hours',
       'Unknown'], dtype=object)

In [12]:
# Mapping function to categorize
def categorize_sleep(duration):
    duration = str(duration).lower()  
    if "less than" in duration or "1-" in duration or "2-" in duration or "3-" in duration or "4-" in duration:
        return "Less than 5 hours"
    elif "5-8" in duration or "5-" in duration or "6-" in duration or "7-" in duration:
        return "5-8 hours"
    elif "more than" in duration or "8-" in duration or "9-" in duration or "10-" in duration:
        return "More than 8 hours"
    else:
        return "Unknown"

# Apply mapping
train["Sleep Category"] = train["Sleep Duration"].apply(categorize_sleep)
train = train.drop(columns='Sleep Duration')

# Drop the Unknown labels and then do all of the same things for test
train["Sleep Category"].value_counts()

test["Sleep Category"] = test["Sleep Duration"].apply(categorize_sleep)
test = test.drop(columns='Sleep Duration')

In [13]:
# Display columns in the train DataFrame with null values and their counts
null_counts = train.isnull().sum()
null_counts = null_counts[null_counts > 0]  
print(null_counts)

Profession             36630
Academic Pressure     112803
Work Pressure          27918
CGPA                  112802
Study Satisfaction    112803
Dietary Habits             4
Degree                     2
Financial Stress           4
dtype: int64


In [14]:
train

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Sleep Category
0,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,Healthy,Undergraduate,No,1.0,2.0,No,0,More than 8 hours
1,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Unhealthy,Professional,Yes,7.0,3.0,No,1,Less than 5 hours
2,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,-1,Healthy,Undergraduate,Yes,3.0,1.0,No,1,5-8 hours
3,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Moderate,Undergraduate,Yes,10.0,1.0,Yes,1,Less than 5 hours
4,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,Unhealthy,Undergraduate,Yes,9.0,4.0,Yes,0,5-8 hours
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,Female,18.0,Ahmedabad,Working Professional,,,5.0,,,4.0,Unhealthy,High School,No,2.0,4.0,Yes,1,5-8 hours
140696,Female,41.0,Hyderabad,Working Professional,Content Writer,,5.0,,,4.0,Moderate,Undergraduate,Yes,6.0,5.0,Yes,0,5-8 hours
140697,Female,24.0,Kolkata,Working Professional,Marketing Manager,,3.0,,,1.0,Moderate,Undergraduate,No,4.0,4.0,No,0,More than 8 hours
140698,Female,49.0,Srinagar,Working Professional,Plumber,,5.0,,,2.0,Moderate,Postgraduate,Yes,10.0,1.0,No,0,5-8 hours


In [15]:
# Convert categorical variables to dummy variables
train = pd.get_dummies(train, dtype=int, drop_first=True)
test = pd.get_dummies(test, dtype=int, drop_first=True)


In [16]:
# Replace null values with -1
train = train.fillna(-1)
test = test.fillna(-1)

train

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Work/Study Hours,Financial Stress,Depression,Gender_Male,City_Aaradhya,...,Dietary Habits_Yes,Degree_Postgraduate,Degree_Professional,Degree_Undergraduate,Degree_Unknown,Have you ever had suicidal thoughts ?_Yes,Family History of Mental Illness_Yes,Sleep Category_Less than 5 hours,Sleep Category_More than 8 hours,Sleep Category_Unknown
0,49.0,-1.0,5.0,-1.00,-1.0,1.0,2.0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,26.0,-1.0,4.0,-1.00,-1.0,7.0,3.0,1,1,0,...,0,0,1,0,0,1,0,1,0,0
2,33.0,5.0,-1.0,8.97,2.0,3.0,1.0,1,1,0,...,0,0,0,1,0,1,0,0,0,0
3,22.0,-1.0,5.0,-1.00,-1.0,10.0,1.0,1,1,0,...,0,0,0,1,0,1,1,1,0,0
4,30.0,-1.0,1.0,-1.00,-1.0,9.0,4.0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,18.0,-1.0,5.0,-1.00,-1.0,2.0,4.0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
140696,41.0,-1.0,5.0,-1.00,-1.0,6.0,5.0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
140697,24.0,-1.0,3.0,-1.00,-1.0,4.0,4.0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
140698,49.0,-1.0,5.0,-1.00,-1.0,10.0,1.0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


In [17]:
# using this code to select important features

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

# Train-test split
X = train.drop(columns='Depression')
y = train['Depression']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Feature importances from Random Forest
importances = clf.feature_importances_

# Create a DataFrame for better visualization
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

In [18]:
pd.set_option('display.max_rows', None) 

print(feature_importance_df)

pd.reset_option('display.max_rows')

                                               Feature    Importance
0                                                  Age  2.126138e-01
1                                    Academic Pressure  9.767996e-02
172          Have you ever had suicidal thoughts ?_Yes  8.691788e-02
3                                                 CGPA  7.189926e-02
2                                        Work Pressure  6.725194e-02
6                                     Financial Stress  5.652866e-02
5                                     Work/Study Hours  5.277968e-02
4                                   Study Satisfaction  4.501387e-02
145                                Job Satisfaction_-1  3.023426e-02
105  Working Professional or Student_Working Profes...  1.800898e-02
7                                          Gender_Male  1.292176e-02
173               Family History of Mental Illness_Yes  1.209194e-02
165                           Dietary Habits_Unhealthy  1.163188e-02
174                   Sleep Catego

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [22]:
from sklearn.model_selection import GridSearchCV

X = train.drop(columns='Depression')
y = train['Depression']

# Split train into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Make copy of test set
X_test = test  

# Step 2: Train the Random Forest model on the training set to get feature importance
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Step 3: Feature importances
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Step 4: Filter important features (e.g., Importance > 0.01)
important_features = feature_importance_df[feature_importance_df['Importance'] > 0.01]['Feature']

# Step 5: Select only the important features in training, validation, and test sets
X_train_important = X_train[important_features]
X_val_important = X_val[important_features]
X_test_important = X_test[important_features]  

# Step 6: Perform hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           scoring='accuracy', 
                           cv=3, 
                           n_jobs=-1)

grid_search.fit(X_train_important, y_train)

# Get the best model from grid search
best_rf = grid_search.best_estimator_

# Step 7: Predict on the validation set for evaluation
y_val_pred = best_rf.predict(X_val_important)

# Step 8: Evaluate the model on the validation set
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.2f}")
print("Classification Report (Validation):")
print(classification_report(y_val, y_val_pred))

# Step 9: Predict on the test set (without labels)
y_test_pred = best_rf.predict(X_test_important)
print("Test Set Predictions:")
print(y_test_pred)

Validation Accuracy: 0.93
Classification Report (Validation):
              precision    recall  f1-score   support

           0       0.95      0.97      0.96     22986
           1       0.84      0.79      0.81      5154

    accuracy                           0.93     28140
   macro avg       0.90      0.88      0.89     28140
weighted avg       0.93      0.93      0.93     28140

Test Set Predictions:
[0 0 0 ... 0 1 0]


In [23]:
# Print the predictions to a CSV file
test_predictions_df = pd.DataFrame({
    'id': test['id'],  
    'Depression': y_test_pred  
})

# test_predictions_df.to_csv('test_predictions.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=84030c87-ce8f-463e-9038-cd547633a9bf' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>