# Data Cleaning

In [None]:
import pandas as pd

# Location of the raw CSV, given relative to the notebook's working directory.
file_path = "../Dataset/final_depression_dataset.csv"

# Parse the CSV into a DataFrame, then print its schema summary
# (column names, non-null counts and dtypes).
df = pd.read_csv(file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 2556 non-null   object 
 1   Age                                    2556 non-null   int64  
 2   City                                   2556 non-null   object 
 3   Working Professional or Student        2556 non-null   object 
 4   Profession                             1883 non-null   object 
 5   Academic Pressure                      502 non-null    float64
 6   Work Pressure                          2054 non-null   float64
 7   CGPA                                   502 non-null    float64
 8   Study Satisfaction                     502 non-null    float64
 9   Job Satisfaction                       2054 non-null   float64
 10  Sleep Duration                         2556 non-null   object 
 11  Diet

In [None]:
# Tally rows flagged by DataFrame.duplicated() and report the total.
duplicate_count = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)

# Tally missing entries per column and report them as a Series.
missing_per_column = df.isna().sum()
print("Missing values in each column:\n", missing_per_column)

Number of duplicate rows: 0
Missing values in each column:
 Gender                                      0
Age                                         0
City                                        0
Working Professional or Student             0
Profession                                673
Academic Pressure                        2054
Work Pressure                             502
CGPA                                     2054
Study Satisfaction                       2054
Job Satisfaction                          502
Sleep Duration                              0
Dietary Habits                              0
Degree                                      0
Have you ever had suicidal thoughts ?       0
Work/Study Hours                            0
Financial Stress                            0
Family History of Mental Illness            0
Depression                                  0
dtype: int64


In [None]:
# Numeric columns that contain NaNs; every NaN in them is replaced by the
# placeholder value -1.
numerical_columns_with_na = [
    'Academic Pressure',
    'Work Pressure',
    'CGPA',
    'Study Satisfaction',
    'Job Satisfaction',
]
df[numerical_columns_with_na] = df[numerical_columns_with_na].fillna(value=-1)

# A missing 'Profession' becomes the explicit text label 'Not Described'.
df['Profession'] = df['Profession'].fillna(value='Not Described')

In [None]:
# Bare last expression: the notebook renders the DataFrame as a table (long frames are
# shown truncated), which lets the -1 / 'Not Described' fills be inspected visually.
df  # Displaying the updated DataFrame

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Female,37,Ghaziabad,Working Professional,Teacher,-1.0,2.0,-1.0,-1.0,4.0,7-8 hours,Moderate,MA,No,6,2,No,No
1,Male,60,Kalyan,Working Professional,Financial Analyst,-1.0,4.0,-1.0,-1.0,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
2,Female,42,Bhopal,Working Professional,Teacher,-1.0,2.0,-1.0,-1.0,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No
3,Female,44,Thane,Working Professional,Teacher,-1.0,3.0,-1.0,-1.0,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
4,Male,48,Indore,Working Professional,UX/UI Designer,-1.0,4.0,-1.0,-1.0,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,Male,25,Bangalore,Working Professional,Consultant,-1.0,1.0,-1.0,-1.0,5.0,5-6 hours,Healthy,BBA,Yes,12,3,Yes,No
2552,Female,23,Pune,Working Professional,Teacher,-1.0,3.0,-1.0,-1.0,1.0,Less than 5 hours,Moderate,MA,Yes,8,3,No,Yes
2553,Female,24,Srinagar,Working Professional,HR Manager,-1.0,1.0,-1.0,-1.0,4.0,Less than 5 hours,Moderate,BA,Yes,4,4,No,No
2554,Female,56,Bangalore,Working Professional,Business Analyst,-1.0,2.0,-1.0,-1.0,3.0,7-8 hours,Healthy,BBA,No,4,5,Yes,No


In [None]:
import os  # For directory operations

import joblib   # To save and load Python objects
from sklearn.preprocessing import OrdinalEncoder   # For encoding ordinal categorical data

encoders_dir = "../Results/Encoders"  # Defining path to save encoders
os.makedirs(encoders_dir, exist_ok=True)  # Creating the directory if it doesn't exist

# Features to encode and, for each one, its categories listed in the order
# that maps to the integer codes 0, 1, 2, ...
features = ['Sleep Duration', 'Dietary Habits', 'Degree']
categories = [
    ['Less than 5 hours', '5-6 hours', '7-8 hours', 'More than 8 hours'],
    ['Unhealthy', 'Moderate', 'Healthy'],
    # 'MA' and 'MBA' must be separate entries: without the comma between them
    # Python joins adjacent string literals into the single category 'MAMBA',
    # and every real 'MA' / 'MBA' row is then encoded as the unknown value -1.
    # NOTE(review): the order of the degrees below does not look like a ranking;
    # confirm an ordinal encoding is intended for 'Degree'.
    ['Class 12', 'B.Com', 'M.Com', 'MD', 'BE', 'MCA', 'BA', 'LLM', 'BCA', 'B.Ed', 'M.Tech', 'LLB', 'B.Arch', 'ME', 'MA', 'MBA',
     'M.Pharm', 'MBBS', 'PhD', 'BSc', 'MSc', 'MHM', 'BBA', 'BHM', 'B.Tech', 'M.Ed', 'B.Pharm'],
]

# Performing Ordinal Encoding
df[features] = df[features].astype(str)   # Converting ordinal columns to string type for encoding

# The encoder maps any value missing from `categories` to -1 without raising,
# so report such values here; otherwise a typo in the lists above goes unnoticed.
for feature, known_values in zip(features, categories):
    unseen_values = sorted(set(df[feature]) - set(known_values))
    if unseen_values:
        print(f"Warning: {feature!r} has values outside the declared categories "
              f"(they will be encoded as -1): {unseen_values}")

encoder = OrdinalEncoder(categories=categories, handle_unknown='use_encoded_value', unknown_value=-1)  # Unknown values become -1 instead of raising
df[features] = encoder.fit_transform(df[features])  # Fitting the encoder and replacing the columns with their codes

# Saving the fitted encoder so the same mapping can be loaded again later
joblib.dump(encoder, os.path.join(encoders_dir, 'ordinal_encoder.pkl'))

['../Results/Encoders\\ordinal_encoder.pkl']