In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("data/train.csv")

In [3]:
df.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [6]:
df.isnull().mean()*100

id                                        0.000000
Name                                      0.000000
Gender                                    0.000000
Age                                       0.000000
City                                      0.000000
Working Professional or Student           0.000000
Profession                               26.034115
Academic Pressure                        80.172708
Work Pressure                            19.842217
CGPA                                     80.171997
Study Satisfaction                       80.172708
Job Satisfaction                         19.836532
Sleep Duration                            0.000000
Dietary Habits                            0.002843
Degree                                    0.001421
Have you ever had suicidal thoughts ?     0.000000
Work/Study Hours                          0.000000
Financial Stress                          0.002843
Family History of Mental Illness          0.000000
Depression                     

In [7]:
# Columns with very high missingness (e.g., >80%) may not contribute meaningful information.
threshold = 0.8  # Drop columns with >80% missing values
df = df.loc[:, df.isnull().mean() < threshold]

In [9]:
from sklearn.impute import KNNImputer

# Example for numerical imputation with KNN
imputer = KNNImputer(n_neighbors=5)
numerical_cols = ['Job Satisfaction', 'Work Pressure']
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

In [10]:
df['Degree'] = df['Degree'].fillna('Unknown')

In [12]:
df.isnull().mean()*100

id                                        0.000000
Name                                      0.000000
Gender                                    0.000000
Age                                       0.000000
City                                      0.000000
Working Professional or Student           0.000000
Profession                               26.034115
Work Pressure                             0.000000
Job Satisfaction                          0.000000
Sleep Duration                            0.000000
Dietary Habits                            0.002843
Degree                                    0.000000
Have you ever had suicidal thoughts ?     0.000000
Work/Study Hours                          0.000000
Financial Stress                          0.002843
Family History of Mental Illness          0.000000
Depression                                0.000000
dtype: float64

In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode Dietary Habits column
df['Dietary_Habits_encoded'] = LabelEncoder().fit_transform(df['Dietary Habits'].astype(str))

In [18]:
cols_to_impute = ['Profession_encoded', 'Dietary_Habits_encoded', 'Financial Stress']

In [19]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Configure IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=42)
df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])

In [20]:
# Reverse mapping (if required)
inverse_map = dict(enumerate(LabelEncoder().fit(df['Profession'].astype(str)).classes_))
df['Profession'] = df['Profession_encoded'].map(inverse_map)

In [21]:
inverse_map = dict(enumerate(LabelEncoder().fit(df['Dietary Habits'].astype(str)).classes_))
df['Dietary Habits'] = df['Dietary_Habits_encoded'].map(inverse_map)

In [22]:
print(df.isnull().sum())


id                                       0
Name                                     0
Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Work Pressure                            0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
Profession_encoded                       0
Dietary_Habits_encoded                   0
dtype: int64


In [24]:
# Map textual data to numeric values for 'Work/Study Hours'
work_hours_mapping = {
    "Less than 4 hours": 1,
    "4-6 hours": 2,
    "6-8 hours": 3,
    "More than 8 hours": 4
}

df['Work/Study Hours Encoded'] = df['Work/Study Hours'].map(work_hours_mapping)

# Check if all data in numerical_cols is now numeric
numerical_cols = [
    'Age', 'Work Pressure', 'Job Satisfaction', 'Sleep Duration', 
    'Financial Stress', 'Profession_encoded', 'Dietary_Habits_encoded', 
    'Work/Study Hours Encoded'
]

print(df[numerical_cols].dtypes)  # Should now all be numeric


Age                         float64
Work Pressure               float64
Job Satisfaction            float64
Sleep Duration               object
Financial Stress            float64
Profession_encoded          float64
Dietary_Habits_encoded      float64
Work/Study Hours Encoded    float64
dtype: object


In [25]:
from sklearn.preprocessing import StandardScaler

# Standardize the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df[numerical_cols])

ValueError: could not convert string to float: 'More than 8 hours'