In [1]:
import sklearn
print(sklearn.__version__)


1.2.2


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the dataset
file_path = '/kaggle/input/student-depression-dataset/Student Depression Dataset.csv'
df = pd.read_csv(file_path)

# Fill missing values in 'Financial Stress' with the median
df['Financial Stress'].fillna(df['Financial Stress'].median(), inplace=True)

# Handle 'Sleep Duration' (map to numerical values)
sleep_duration_map = {
    'Less than 5 hours': 1,
    '5-6 hours': 2,
    '7-8 hours': 3,
    'More than 8 hours': 4
}
df['Sleep Duration'] = df['Sleep Duration'].map(sleep_duration_map)

# Label encoding for binary categorical variables
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])  # Male: 1, Female: 0
df['Have you ever had suicidal thoughts ?'] = label_encoder.fit_transform(df['Have you ever had suicidal thoughts ?'])  # Yes: 1, No: 0
df['Family History of Mental Illness'] = label_encoder.fit_transform(df['Family History of Mental Illness'])  # Yes: 1, No: 0

# One-Hot Encoding for other categorical variables
df = pd.get_dummies(df, columns=['City', 'Profession', 'Dietary Habits', 'Degree'], drop_first=True)

# Scale numerical features (Age, CGPA, Academic Pressure, Work Pressure, etc.)
scaler = StandardScaler()
numerical_features = ['Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 
                      'Work/Study Hours', 'Financial Stress']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Check for any missing values after preprocessing
print("Missing values after preprocessing:\n", df.isnull().sum())

# Fill missing values if any (using median for all columns)
df.fillna(df.median(), inplace=True)

# Split the data into features (X) and target (y)
X = df.drop('Depression', axis=1)  # Features
y = df['Depression']  # Target variable

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if there are any missing values in X_train or y_train
print("Missing values in X_train:", X_train.isnull().sum().sum())
print("Missing values in y_train:", y_train.isnull().sum())

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Display the preprocessed data shape
print("Preprocessed Data Shape:", X_train.shape, X_test.shape)


Missing values after preprocessing:
 id                   0
Gender               0
Age                  0
Academic Pressure    0
Work Pressure        0
                    ..
Degree_ME            0
Degree_MHM           0
Degree_MSc           0
Degree_Others        0
Degree_PhD           0
Length: 108, dtype: int64
Missing values in X_train: 0
Missing values in y_train: 0
Preprocessed Data Shape: (22320, 107) (5581, 107)


In [3]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Accuracy Score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification Report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 83.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.77      0.79      2343
           1       0.84      0.87      0.86      3238

    accuracy                           0.83      5581
   macro avg       0.83      0.82      0.82      5581
weighted avg       0.83      0.83      0.83      5581


Confusion Matrix:
[[1808  535]
 [ 414 2824]]


In [4]:
import joblib

# Save the model
joblib.dump(model, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [5]:
path = '/kaggle/input/student-depression-dataset/Student Depression Dataset.csv'
df1 = pd.read_csv(path)

In [6]:
print(df1['City'].unique())
print(df1['Profession'].unique())
print(df1['Dietary Habits'].unique())
print(df1['Degree'].unique())


['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Saanvi' 'M.Tech' 'Bhavna' 'Less Delhi' 'City' '3.0'
 'Less than 5 Kalyan' 'Mira' 'Harsha' 'Vaanya' 'Gaurav' 'Harsh' 'Reyansh'
 'Kibara' 'Rashi' 'ME' 'M.Com' 'Nalyan' 'Mihir' 'Nalini' 'Nandini'
 'Khaziabad']
['Student' 'Civil Engineer' 'Architect' 'UX/UI Designer'
 'Digital Marketer' 'Content Writer' 'Educational Consultant' 'Teacher'
 'Manager' 'Chef' 'Doctor' 'Lawyer' 'Entrepreneur' 'Pharmacist']
['Healthy' 'Moderate' 'Unhealthy' 'Others']
['B.Pharm' 'BSc' 'BA' 'BCA' 'M.Tech' 'PhD' 'Class 12' 'B.Ed' 'LLB' 'BE'
 'M.Ed' 'MSc' 'BHM' 'M.Pharm' 'MCA' 'MA' 'B.Com' 'MD' 'MBA' 'MBBS' 'M.Com'
 'B.Arch' 'LLM' 'B.Tech' 'BBA' 'ME' 'MHM' 'Others']


In [7]:
column_names = X.columns.tolist()

# Print the column names
print(column_names)

['id', 'Gender', 'Age', 'Academic Pressure', 'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours', 'Financial Stress', 'Family History of Mental Illness', 'City_Agra', 'City_Ahmedabad', 'City_Bangalore', 'City_Bhavna', 'City_Bhopal', 'City_Chennai', 'City_City', 'City_Delhi', 'City_Faridabad', 'City_Gaurav', 'City_Ghaziabad', 'City_Harsh', 'City_Harsha', 'City_Hyderabad', 'City_Indore', 'City_Jaipur', 'City_Kalyan', 'City_Kanpur', 'City_Khaziabad', 'City_Kibara', 'City_Kolkata', 'City_Less Delhi', 'City_Less than 5 Kalyan', 'City_Lucknow', 'City_Ludhiana', 'City_M.Com', 'City_M.Tech', 'City_ME', 'City_Meerut', 'City_Mihir', 'City_Mira', 'City_Mumbai', 'City_Nagpur', 'City_Nalini', 'City_Nalyan', 'City_Nandini', 'City_Nashik', 'City_Patna', 'City_Pune', 'City_Rajkot', 'City_Rashi', 'City_Reyansh', 'City_Saanvi', 'City_Srinagar', 'City_Surat', 'City_Thane', 'City_Vaanya', 'City_Vadodara', 'City_Va