In [2]:

#Basic
import pandas as pd
import numpy as np

#Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

#Transformation/Procesing 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

#Model
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import RidgeCV
from lightgbm import LGBMRegressor
import xgboost as xgb
from mlxtend.regressor import StackingCVRegressor
from sklearn.model_selection import KFold

#Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score


In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
train.head(10)

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0
5,5,Vani,Female,59.0,Ahmedabad,Working Professional,Finanancial Analyst,,2.0,,,5.0,5-6 hours,Healthy,MCA,No,7.0,5.0,No,0
6,6,Ritvik,Male,47.0,Thane,Working Professional,Chemist,,5.0,,,2.0,7-8 hours,Moderate,MD,No,6.0,2.0,No,0
7,7,Rajveer,Male,38.0,Nashik,Working Professional,Teacher,,3.0,,,4.0,7-8 hours,Unhealthy,B.Pharm,No,10.0,3.0,Yes,0
8,8,Aishwarya,Female,24.0,Bangalore,Student,,2.0,,5.9,5.0,,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
9,9,Simran,Female,42.0,Patna,Working Professional,Electrician,,4.0,,,1.0,5-6 hours,Healthy,ME,Yes,7.0,2.0,Yes,0


Filling NA

In [5]:
train.isnull().sum()

id                                            0
Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working Professional or Student               0
Profession                                36630
Academic Pressure                        112803
Work Pressure                             27918
CGPA                                     112802
Study Satisfaction                       112803
Job Satisfaction                          27910
Sleep Duration                                0
Dietary Habits                                4
Degree                                        2
Have you ever had suicidal thoughts ?         0
Work/Study Hours                              0
Financial Stress                              4
Family History of Mental Illness              0
Depression                                    0
dtype: int64

In [6]:
test.isnull().sum()

id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               24632
Academic Pressure                        75033
Work Pressure                            18778
CGPA                                     75034
Study Satisfaction                       75033
Job Satisfaction                         18774
Sleep Duration                               0
Dietary Habits                               5
Degree                                       2
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
dtype: int64

In [7]:
train.shape


(140700, 20)

In [8]:
train = train.drop(['id'], axis=1)

In [9]:
train['Profession'].fillna("", inplace=True)
train['Degree'].fillna("", inplace=True)

In [10]:
train['Academic Pressure'].fillna(0, inplace=True)
train['Work Pressure'].fillna(0, inplace=True)

In [11]:
train['CGPA'].fillna(-1, inplace=True)
train['Study Satisfaction'].fillna(-1, inplace=True)
train['Job Satisfaction'].fillna(-1, inplace=True)

In [12]:
train['Financial Stress'].fillna(train['Financial Stress'].mode()[0], inplace=True)
train['Dietary Habits'].fillna(train['Dietary Habits'].mode()[0], inplace=True)

In [13]:
test_id = test['id']
test = test.drop(['id'], axis=1)
test['Profession'].fillna("", inplace=True)
test['Degree'].fillna("", inplace=True)
test['Academic Pressure'].fillna(0, inplace=True)
test['Work Pressure'].fillna(0, inplace=True)
test['CGPA'].fillna(-1, inplace=True)
test['Study Satisfaction'].fillna(-1, inplace=True)
test['Job Satisfaction'].fillna(-1, inplace=True)
test['Financial Stress'].fillna(test['Financial Stress'].mode()[0], inplace=True)
test['Dietary Habits'].fillna(test['Dietary Habits'].mode()[0], inplace=True)


In [14]:
test.isnull().sum()

Name                                     0
Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
dtype: int64

In [15]:
train.isnull().sum()

Name                                     0
Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

EDA


In [16]:
valeur_counts_name = train['Name'].value_counts()
valeur_counts_name

Name
Rohan       3178
Aarav       2336
Rupak       2176
Aaradhya    2045
Anvi        2035
            ... 
Anhil          1
Rieta          1
Zahra          1
Jathesh        1
Aarun          1
Name: count, Length: 422, dtype: int64

In [17]:
valeur_counts_city = train['City'].value_counts()
valeur_counts_city

City
Kalyan         6591
Patna          5924
Vasai-Virar    5765
Kolkata        5689
Ahmedabad      5613
               ... 
Shrey             1
Ivaan             1
Vaanya            1
Gaurav            1
Unirar            1
Name: count, Length: 98, dtype: int64

In [18]:
valeur_counts_Profession = train['Profession'].value_counts()
valeur_counts_Profession

Profession
                     36630
Teacher              24906
Content Writer        7814
Architect             4370
Consultant            4229
                     ...  
Dev                      1
BE                       1
B.Com                    1
Family Consultant        1
Yuvraj                   1
Name: count, Length: 65, dtype: int64

In [19]:
valeur_counts_Degree = train['Degree'].value_counts()
valeur_counts_Degree

Degree
Class 12    14729
B.Ed        11691
B.Arch       8742
B.Com        8113
B.Pharm      5856
            ...  
Vivaan          1
MTech           1
29              1
LLCom           1
Advait          1
Name: count, Length: 116, dtype: int64

In [20]:
valeur_counts_Sleep = train['Sleep Duration'].value_counts()
valeur_counts_Sleep

Sleep Duration
Less than 5 hours    38784
7-8 hours            36969
More than 8 hours    32726
5-6 hours            32142
3-4 hours               12
6-7 hours                8
4-5 hours                7
2-3 hours                5
4-6 hours                5
6-8 hours                4
1-6 hours                4
No                       4
9-11 hours               2
10-11 hours              2
Sleep_Duration           2
Unhealthy                2
45                       2
8-9 hours                2
10-6 hours               1
9-5                      1
45-48 hours              1
3-6 hours                1
Work_Study_Hours         1
49 hours                 1
than 5 hours             1
Pune                     1
9-6 hours                1
8 hours                  1
35-36 hours              1
Indore                   1
1-3 hours                1
55-66 hours              1
Moderate                 1
40-45 hours              1
1-2 hours                1
9-5 hours                1
Name: count, 

In [21]:
valeur_counts_Dietary = train['Dietary Habits'].value_counts()
valeur_counts_Dietary

Dietary Habits
Moderate             49709
Unhealthy            46227
Healthy              44741
Yes                      2
No                       2
More Healthy             2
No Healthy               1
Class 12                 1
Indoor                   1
Male                     1
Vegas                    1
M.Tech                   1
Less Healthy             1
1.0                      1
Electrician              1
Hormonal                 1
Mihir                    1
Less than Healthy        1
3                        1
Gender                   1
BSc                      1
Pratham                  1
2                        1
Name: count, dtype: int64

In [22]:
valeur_counts_Dietary_t = test['Dietary Habits'].value_counts()
aleur_counts_Dietary_t = test['Sleep Duration'].value_counts()

Transformation


In [None]:
threshold = 100  

train['Dietary Habits'] = train['Dietary Habits'].apply(lambda x: x if valeur_counts_Dietary[x] >= threshold else 'Moderate')
train['Sleep Duration'] = train['Sleep Duration'].apply(lambda x: x if valeur_counts_Sleep[x] >= threshold else 'Moderate') 

test['Dietary Habits'] = test['Dietary Habits'].apply(lambda x: x if valeur_counts_Dietary_t[x] >= threshold else 'Moderate')
test['Sleep Duration'] = test['Sleep Duration'].apply(lambda x: x if aleur_counts_Dietary_t[x] >= threshold else 'Moderate') 



In [24]:
train['Working Professional or Student'] = train['Working Professional or Student'].replace({
    'Working Professional': 1,
    'Student': 0
})

train['Have you ever had suicidal thoughts ?'] = train['Have you ever had suicidal thoughts ?'].replace({
    'Yes': 1,
    'No': 0
})

train['Gender'] = train['Gender'].replace({
    'Male': 1,
    'Female': 0
})

train['Family History of Mental Illness'] = train['Family History of Mental Illness'].replace({
    'Yes': 1,
    'No': 0
})



In [25]:
test['Working Professional or Student'] = test['Working Professional or Student'].replace({
    'Working Professional': 1,
    'Student': 0
})

test['Have you ever had suicidal thoughts ?'] = test['Have you ever had suicidal thoughts ?'].replace({
    'Yes': 1,
    'No': 0
})

test['Gender'] = test['Gender'].replace({
    'Male': 1,
    'Female': 0
})

test['Family History of Mental Illness'] = test['Family History of Mental Illness'].replace({
    'Yes': 1,
    'No': 0
})


In [26]:
train['City_encoded'] = train['City'].map(valeur_counts_city)
train['Degree_encoded'] = train['Degree'].map(valeur_counts_Degree)
train['Name_encoded'] = train['Name'].map(valeur_counts_name)
train['Profession_encoded'] = train['Profession'].map(valeur_counts_Profession)

In [27]:
valeur_counts_city_t = test['City'].value_counts()
valeur_counts_Degree_t = test['Degree'].value_counts()
valeur_counts_name_t = test['Name'].value_counts()
valeur_counts_Profession_t = test['Profession'].value_counts()

In [28]:
test['City_encoded'] = test['City'].map(valeur_counts_city_t)
test['Degree_encoded'] = test['Degree'].map(valeur_counts_Degree_t)
test['Name_encoded'] = test['Name'].map(valeur_counts_name_t)
test['Profession_encoded'] = test['Profession'].map(valeur_counts_Profession_t)


In [29]:
train.drop(['City', 'Degree', 'Name', 'Profession'], axis=1, inplace=True)
test.drop(['City', 'Degree', 'Name', 'Profession'], axis=1, inplace=True)

In [30]:
train = pd.get_dummies(train, columns=['Sleep Duration', 'Dietary Habits'])
test = pd.get_dummies(test, columns=['Sleep Duration', 'Dietary Habits'])

In [31]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [32]:
test.head()

Unnamed: 0,Gender,Age,Working Professional or Student,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,City_encoded,Degree_encoded,Name_encoded,Profession_encoded,Sleep Duration_5-6 hours,Sleep Duration_7-8 hours,Sleep Duration_Less than 5 hours,Sleep Duration_Moderate,Sleep Duration_More than 8 hours,Dietary Habits_Healthy,Dietary Habits_Moderate,Dietary Habits_Unhealthy
0,1,53.0,1,0.0,2.0,-1.0,-1.0,5.0,0,9.0,3.0,1,3416,2938,717,1189,False,False,True,False,False,False,True,False
1,0,58.0,1,0.0,2.0,-1.0,-1.0,4.0,0,6.0,4.0,0,3726,7762,897,1827,False,False,True,False,False,False,True,False
2,1,53.0,1,0.0,4.0,-1.0,-1.0,1.0,1,12.0,4.0,0,2874,6037,296,16385,False,True,False,False,False,False,True,False
3,0,23.0,0,5.0,0.0,6.84,1.0,-1.0,1,10.0,4.0,0,3450,3314,228,24632,False,False,False,False,True,False,True,False
4,1,47.0,1,0.0,5.0,-1.0,-1.0,5.0,1,3.0,4.0,0,4387,3869,609,16385,False,True,False,False,False,False,True,False


In [33]:
train.head()

Unnamed: 0,Gender,Age,Working Professional or Student,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,City_encoded,Degree_encoded,Name_encoded,Profession_encoded,Sleep Duration_5-6 hours,Sleep Duration_7-8 hours,Sleep Duration_Less than 5 hours,Sleep Duration_Moderate,Sleep Duration_More than 8 hours,Dietary Habits_Healthy,Dietary Habits_Moderate,Dietary Habits_Unhealthy
0,0,49.0,1,0.0,5.0,-1.0,-1.0,2.0,0,1.0,2.0,0,0,5226,4305,2045,2862,False,False,False,False,True,True,False,False
1,1,26.0,1,0.0,4.0,-1.0,-1.0,3.0,1,7.0,3.0,0,1,4606,4348,963,24906,False,False,True,False,False,False,False,True
2,1,33.0,0,5.0,0.0,8.97,2.0,-1.0,1,3.0,1.0,0,1,5176,5856,730,36630,True,False,False,False,False,True,False,False
3,1,22.0,1,0.0,5.0,-1.0,-1.0,1.0,1,10.0,1.0,1,1,4966,5030,730,24906,False,False,True,False,False,False,True,False
4,0,30.0,1,0.0,1.0,-1.0,-1.0,1.0,1,9.0,4.0,1,0,4398,5030,499,3161,True,False,False,False,False,False,False,True


In [34]:
test.shape

(93800, 24)

In [35]:
train.shape

(140700, 25)

In [None]:

train_columns = set(train.columns)
test_columns = set(test.columns)

only_in_train = train_columns - test_columns

only_in_test = test_columns - train_columns
print("Colonnes présentes uniquement dans train :", only_in_train)
print("Colonnes présentes uniquement dans test :", only_in_test)

Colonnes présentes uniquement dans train : {'Depression'}
Colonnes présentes uniquement dans test : set()


Modelling 

In [37]:
X = train.drop('Depression', axis=1)  
y = train['Depression']

In [38]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC

estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False)),
    ('svm', SVC(probability=True))
]

In [39]:
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)

In [40]:
stacking_model.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [41]:
y_train_pred = stacking_model.predict(X)
train_accuracy = accuracy_score(y, y_train_pred)
print(f"Accuracy sur l'ensemble d'entraînement avec stacking : {train_accuracy:.2f}")

Accuracy sur l'ensemble d'entraînement avec stacking : 0.98


In [43]:
accuracy = accuracy_score(y, y_train_pred)
print(f"Accuracy sur l'ensemble d'entraînement : {accuracy:.2f}")

Accuracy sur l'ensemble d'entraînement : 0.98


In [44]:
y_test_pred = stacking_model.predict(test)


In [None]:
results = pd.DataFrame({
    'id': test_id,      
    'Depression': y_test_pred
})

results.to_csv('predictions.csv', index=False)

print("Les prédictions ont été sauvegardées dans predictions.csv.")

Les prédictions ont été sauvegardées dans predictions.csv.
