In [None]:
#Packages import

import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Default size (width, height in inches) for every figure drawn below
plt.rcParams.update({'figure.figsize': (15, 8)})

In [None]:
# Load the training set and discard rows that are exact duplicates

df = pd.read_csv('/kaggle/input/playground-series-s4e11/train.csv').drop_duplicates()

df.head()


# Goal
* The main goal of the project is to analyze factors more likely to contribute to the development of depression in individuals.
* Building a model that predicts whether a person may be suffering from depression, based on specific features, is also in scope.

# 1) EDA (Exploratory Data Analysis)

In [None]:
# First look at the data.
# From the preview above, "Work Pressure" appears to be populated only for workers, while
# "Academic Pressure" appears to be populated only for students. I will merge them into a
# single "pressure" column, and build a "satisfaction" column following the same principle.

df.describe()

In [None]:
# Normalise the column names (lower case, underscores instead of spaces),
# then shorten the most verbose ones so they are easier to type

df.columns = df.columns.str.lower().str.replace(' ', '_')

short_names = {
    'working_professional_or_student': 'work_situation',
    'have_you_ever_had_suicidal_thoughts_?': 'suicidal_thoughts',
    'work/study_hours': 'effort_hours',
    'family_history_of_mental_illness': 'family_mental_illness',
}
df = df.rename(columns=short_names)

df.head()

In [None]:
# Merge the work- and study-specific columns into one "satisfaction" and one
# "pressure" column (row-wise maximum of each pair), then drop the originals

merged_pairs = {
    'satisfaction': ['study_satisfaction', 'job_satisfaction'],
    'pressure': ['academic_pressure', 'work_pressure'],
}
for merged_name, source_cols in merged_pairs.items():
    df[merged_name] = df[source_cols].max(axis=1)

df = df.drop(columns=['academic_pressure', 'work_pressure', 'study_satisfaction', 'job_satisfaction'])

df

In [None]:
# Summary statistics of the numeric columns in the current frame
df.describe()

In [None]:
# Share of missing values per column (mean of the boolean mask = missing count / row count)
df.isna().mean()

# I notice that:

* satisfaction, pressure, degree, dietary_habits and financial_stress are missing in only a tiny percentage of rows. I'll drop these rows
* name is useless, I'll drop this feature
* cgpa is missing for >80% of rows. I'll drop the feature, as I think that estimating all rows would require too many assumptions.
* profession is missing for roughly 26% of the dataframe. I will try to estimate this and see how my model behaves with this feature included.

In [None]:
# Remove the mostly-empty cgpa column, then every row that lacks a value
# in one of the columns listed below
required_cols = ['satisfaction', 'pressure', 'degree', 'dietary_habits', 'financial_stress', 'name']

df = df.drop(columns=['cgpa']).dropna(subset=required_cols)

In [None]:
# Preview the first rows of the current frame
df.head()

In [None]:
# Correlation matrix of the numeric features.
# No pair of numeric features looks strongly correlated (no multicollinearity, which is good).
# Depression seems negatively correlated with age, which may make sense: as people grow older they
# might be better able to control the variables that affect their mental wellbeing, while younger
# people might be hit harder by financial stress and pressure (maybe they can hardly bear these issues?).

numeric_corr = df.select_dtypes(include=['number']).corr().round(2)
sns.heatmap(numeric_corr, annot=True)

In [None]:
# Histograms of the numeric features, split by depression status.
# Each panel is described once as (column, number of bins, title) and drawn in a
# loop, so adding or changing a panel means editing a single line.

histogram_panels = [
    ('age', 10, 'Distribution of depression status, by age'),
    ('effort_hours', 5, 'Distribution of depression status, by hours of work/study'),
    ('pressure', 3, 'Distribution of depression status, by pressure'),
    ('financial_stress', 3, 'Distribution of depression status, by financial stress'),
    ('satisfaction', 3, 'Distribution of depression status, by satisfaction'),
]

fig, ax = plt.subplots(nrows=2, ncols=3)

# ax.flat walks the grid row by row, matching the order of the panel list
for panel_ax, (column, n_bins, title) in zip(ax.flat, histogram_panels):
    sns.histplot(ax=panel_ax,
                 data=df,
                 x=column,
                 hue='depression',
                 kde=True,
                 bins=n_bins)
    panel_ax.set_title(title)

# The grid has six slots but only five features are plotted: hide the leftover
# axes instead of showing an empty frame
for unused_ax in ax.flat[len(histogram_panels):]:
    unused_ax.set_visible(False)

# Keep titles and tick labels of neighbouring panels from overlapping
fig.tight_layout()

# A few initial observations:

* Among depressed people, age is apparently skewed toward lower values, concentrated on people <30. Depression is concentrated in younger age classes.
* Depressed people have high effort hours. Working longer hours apparently has an impact on depression.
* Low satisfaction, high financial stress and high work/study pressure are apparently associated with depression (even if I don't see a clear pattern yet).
* CGPA does not seem to be correlated with depression: the distribution for depressed and non-depressed people is similar, although more concentrated on 8-9 for depressed people.

# Next steps:
* Fillna() --> most frequent
* Encoding (including ordinal)
* Models

In [None]:
# Preview the first rows of the current frame
df.head()

In [None]:
# Column groups, by the treatment each one is meant to receive

# Columns to be removed
cols_to_be_dropped = ['name']

# Categorical columns
categorical_cols = [
    'gender',
    'city',
    'work_situation',
    'profession',
    'degree',
    'suicidal_thoughts',
    'family_mental_illness',
]

# Ordinal columns
ordinal_cols = [
    'sleep_duration',
    'dietary_habits',
]

In [None]:
# The sleep_duration column is a problem: it has too many groups that are not useful,
# plus some values that were entered incorrectly.
# The plan is a function that levels the categories out into: <5, 5-8, >8.
# Below: number of rows per raw sleep_duration label.

df.pivot_table(index=['sleep_duration'], values=['depression'], aggfunc='count')

In [None]:
# Accepted label shapes: an optional "less than"/"more than" (or "<"/">") prefix,
# one number or a "low-high" range (";" is accepted as separator too), and an
# optional trailing "hour"/"hours".
_SLEEP_LABEL = re.compile(
    r'^(?P<bound>less than|more than|<|>)?\s*'
    r'(?P<low>\d+(?:\.\d+)?)'
    r'(?:\s*[-;]\s*(?P<high>\d+(?:\.\d+)?))?'
    r'\s*(?:hours?)?$'
)

# A sleep duration cannot exceed the number of hours in a day
_MAX_SLEEP_HOURS = 24


def _sleep_bucket(label):
    """Map one raw sleep label to '<5', '5-8' or '>8' (NaN when it is unusable)."""
    if not isinstance(label, str):
        return np.nan
    match = _SLEEP_LABEL.match(label.strip().lower())
    if match is None:
        return np.nan

    has_range = match.group('high') is not None
    low = float(match.group('low'))
    high = float(match.group('high')) if has_range else low
    # Reversed ranges ("10-6") and impossible durations ("45") are data-entry errors
    if low > high or high > _MAX_SLEEP_HOURS:
        return np.nan

    bound = match.group('bound')
    if bound in ('less than', '<'):
        # "less than N" only pins the bucket down when N is at most 5
        return '<5' if not has_range and low <= 5 else np.nan
    if bound in ('more than', '>'):
        # "more than N" only pins the bucket down when N is at least 8
        return '>8' if not has_range and low >= 8 else np.nan

    # Plain number or range: classify by its midpoint
    midpoint = (low + high) / 2
    if midpoint < 5:
        return '<5'
    if midpoint <= 8:
        return '5-8'
    return '>8'


def regroup_sleep_hours(sleep_duration):
    """Regroup raw sleep-duration labels into the three classes '<5', '5-8' and '>8'.

    The input is not modified and no global state is touched; a new Series is
    returned so the caller decides where to store it.

    Parameters
    ----------
    sleep_duration : pandas.Series or sequence of labels
        Raw labels such as 'Less than 5 hours', '7-8 hours' or 'More than 8 hours'.
        The compact forms '<5', '5;6' and '>8' are accepted as well, which makes
        the function safe to apply to its own output.

    Returns
    -------
    pandas.Series
        One value per input label, on the same index: '<5', '5-8' or '>8'.
        Labels that are missing, are not strings, cannot be parsed or describe
        an impossible duration become NaN.
    """
    # dtype=object keeps missing values as-is so _sleep_bucket sees every element
    labels = pd.Series(sleep_duration, dtype=object)
    return labels.map(_sleep_bucket)
    

In [None]:
# Keep the regrouped labels: store the result back on the frame instead of
# discarding it (nothing is written when the helper returns None)
sleep_regrouped = regroup_sleep_hours(df['sleep_duration'])
if sleep_regrouped is not None:
    df['sleep_duration'] = sleep_regrouped

In [None]:
# Show the current content of the sleep_duration column
df['sleep_duration']

In [None]:
# Scratch check: turning a string into a list yields one element per character
A = '5,6'
B = [*A]
print(B)

In [None]:
# Drop the separator, then convert the remaining characters to integers
B.remove(',')
B = [int(char) for char in B]
print(B)

In [None]:
# Largest element of B as a plain int (Python ints have no .astype method)
int(max(B))

In [None]:
list o