# Data Cleaning

In [None]:
import pandas as pd

# Loading the dataset
# The path is relative, so it is resolved against the kernel's current working
# directory when this cell is run.
file_path = "../Dataset/final_depression_dataset.csv"
df = pd.read_csv(file_path)
# Print a structural summary (column names, non-null counts, dtypes) so the
# columns containing missing values are visible before any cleaning is done.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 2556 non-null   object 
 1   Age                                    2556 non-null   int64  
 2   City                                   2556 non-null   object 
 3   Working Professional or Student        2556 non-null   object 
 4   Profession                             1883 non-null   object 
 5   Academic Pressure                      502 non-null    float64
 6   Work Pressure                          2054 non-null   float64
 7   CGPA                                   502 non-null    float64
 8   Study Satisfaction                     502 non-null    float64
 9   Job Satisfaction                       2054 non-null   float64
 10  Sleep Duration                         2556 non-null   object 
 11  Diet

In [None]:
# Count rows that are exact repeats of an earlier row
n_duplicate_rows = df.duplicated().sum()
print("Number of duplicate rows:", n_duplicate_rows)

# Tally the null entries column by column
null_counts_by_column = df.isna().sum()
print("Missing values in each column:\n", null_counts_by_column)

Number of duplicate rows: 0
Missing values in each column:
 Gender                                      0
Age                                         0
City                                        0
Working Professional or Student             0
Profession                                673
Academic Pressure                        2054
Work Pressure                             502
CGPA                                     2054
Study Satisfaction                       2054
Job Satisfaction                          502
Sleep Duration                              0
Dietary Habits                              0
Degree                                      0
Have you ever had suicidal thoughts ?       0
Work/Study Hours                            0
Financial Stress                            0
Family History of Mental Illness            0
Depression                                  0
dtype: int64


In [None]:
# Numeric columns that contain nulls; each null is replaced by the sentinel -1
numerical_columns_with_na = [
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
]
for column_name in numerical_columns_with_na:
    df[column_name] = df[column_name].fillna(-1)

# A null 'Profession' becomes the explicit placeholder label 'Not Described'
df['Profession'] = df['Profession'].fillna('Not Described')

In [None]:
import os
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import joblib

# Directory where fitted encoders are persisted; created on first run.
encoders_dir = "../Results/Encoders"
os.makedirs(encoders_dir, exist_ok=True)

# Features to ordinal-encode and, for each one, its category list.
# The position of a value inside its list is the integer code it receives,
# and categories[i] applies to features[i].
features = ['Sleep Duration', 'Dietary Habits', 'Degree']
categories = [
    ['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours'],
    ['Unhealthy', 'Moderate', 'Healthy'],
    # FIX: 'MA' and 'MBA' were previously written without a separating comma.
    # Python concatenates adjacent string literals, so the list contained the
    # single bogus category 'MAMBA' and every real 'MA' / 'MBA' row was encoded
    # as the unknown value -1. Restoring the comma shifts the codes of all
    # degrees listed after 'MA' up by one, so anything built on the previously
    # saved encoder must be regenerated.
    ['Class 12', 'B.Com', 'M.Com', 'MD', 'BE', 'MCA', 'BA', 'LLM', 'BCA', 'B.Ed', 'M.Tech', 'LLB', 'B.Arch', 'ME', 'MA', 'MBA',
     'M.Pharm', 'MBBS', 'PhD', 'BSc', 'MSc', 'MHM', 'BBA', 'BHM', 'B.Tech', 'M.Ed', 'B.Pharm'],
]

# Performing Ordinal Encoding
# Values are compared as strings against the category lists above; any value
# not listed is encoded as -1 instead of raising.
# NOTE: the columns are overwritten in place, so re-running this cell without
# reloading df would stringify the already-encoded numbers and map every row
# to -1.
df[features] = df[features].astype(str)
encoder = OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1)
df[features] = encoder.fit_transform(df[features])

# Saving encoder so the identical mapping can be reapplied later
joblib.dump(encoder, os.path.join(encoders_dir, 'ordinal_encoder.pkl'))

['../Results/Encoders\\ordinal_encoder.pkl']