In [1]:
import altair as alt
import numpy as np
import pandas as pd
from IPython.display import HTML
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder,StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the raw survey responses; the path is relative to the working directory.
file_path = 'survey.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Preview the first five rows of the raw frame
data.head(n=5)

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,8/27/2014 11:29,37,Female,United States,IL,,No,Yes,Often,45468,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,8/27/2014 11:29,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,8/27/2014 11:29,32,Male,Canada,,,No,No,Rarely,45468,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,8/27/2014 11:29,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,8/27/2014 11:30,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [4]:
# Report the dimensions of the frame
print(f"Shape of Data: {data.shape}")
# Exploring the values of dataframe.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
data.info()

Shape of Data: (1259, 27)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Timestamp                  1259 non-null   object
 1   Age                        1259 non-null   int64 
 2   Gender                     1259 non-null   object
 3   Country                    1259 non-null   object
 4   state                      744 non-null    object
 5   self_employed              1241 non-null   object
 6   family_history             1259 non-null   object
 7   treatment                  1259 non-null   object
 8   work_interfere             995 non-null    object
 9   no_employees               1259 non-null   object
 10  remote_work                1259 non-null   object
 11  tech_company               1259 non-null   object
 12  benefits                   1259 non-null   object
 13  care_options               1259 non-n

In [5]:
# Keep the column dtypes at hand and count the missing entries per column
datatypes = data.dtypes
missing_values = data.isna().sum()
print(missing_values)

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64


In [6]:
# Drop the 'Timestamp', 'comments' and 'state' columns from the DataFrame:
#  - 'comments': free text that would need substantial natural language
#    processing to be useful, which is not practical for an initial model.
#  - 'state': 515 missing values; it is mainly relevant to the US and a
#    'Country' column is already present.
#  - 'Timestamp': removed together with the two columns above.
data = data.drop(columns=['Timestamp', 'comments', 'state'])

# Two 'no_employees' answers arrive as the numeric strings '45468' and '45296'
# instead of a range. NOTE(review): these look like spreadsheet date serials
# produced from '6-25' and '1-5' -- confirm against the data source.
employee_range_fixes = {'45468': '6-25', '45296': '1-5'}
data['no_employees'] = data['no_employees'].replace(employee_range_fixes)

In [7]:
# For every object-typed column, gather its distinct values as strings
# (so a missing value appears as 'nan') in sorted order.
unique_values_in_object_columns = (
    data
    .select_dtypes(include=[object])
    .apply(lambda column: sorted(column.astype(str).unique()))
)

# Render through HTML so the long value lists are shown without truncation
HTML(unique_values_in_object_columns.to_frame().to_html())

Unnamed: 0,0
Gender,"[A little about you, Agender, All, Androgyne, Cis Female, Cis Male, Cis Man, Enby, F, Femake, Female, Female , Female (cis), Female (trans), Genderqueer, Guy (-ish) ^_^, M, Mail, Make, Mal, Male, Male , Male (CIS), Male-ish, Malr, Man, Nah, Neuter, Trans woman, Trans-female, Woman, cis male, cis-female/femme, f, femail, female, fluid, m, maile, male, male leaning androgynous, msle, non-binary, ostensibly male, unsure what that really means, p, queer, queer/she/they, something kinda male?, woman]"
Country,"[Australia, Austria, Bahamas, The, Belgium, Bosnia and Herzegovina, Brazil, Bulgaria, Canada, China, Colombia, Costa Rica, Croatia, Czech Republic, Denmark, Finland, France, Georgia, Germany, Greece, Hungary, India, Ireland, Israel, Italy, Japan, Latvia, Mexico, Moldova, Netherlands, New Zealand, Nigeria, Norway, Philippines, Poland, Portugal, Romania, Russia, Singapore, Slovenia, South Africa, Spain, Sweden, Switzerland, Thailand, United Kingdom, United States, Uruguay, Zimbabwe]"
self_employed,"[No, Yes, nan]"
family_history,"[No, Yes]"
treatment,"[No, Yes]"
work_interfere,"[Never, Often, Rarely, Sometimes, nan]"
no_employees,"[1-5, 100-500, 26-100, 500-1000, 6-25, More than 1000]"
remote_work,"[No, Yes]"
tech_company,"[No, Yes]"
benefits,"[Don't know, No, Yes]"


In [8]:
# Define a mapping function to standardize the gender values
def standardize_gender(gender):
    """Map a free-text gender answer onto one of five canonical labels.

    The answer is lower-cased and stripped of surrounding whitespace before
    it is compared with the known spellings listed below.

    Parameters
    ----------
    gender : object
        Raw survey answer. Anything that is not a string (for example NaN for
        a missing answer) is treated as unmatched.

    Returns
    -------
    str
        One of 'Male', 'Female', 'Trans Female', 'Non-binary' or 'Other'.
        Each of these labels maps to itself, so applying the function to an
        already-standardized column leaves it unchanged.
    """
    # Non-string input has no .lower(); report it as unmatched instead of raising.
    if not isinstance(gender, str):
        return 'Other'

    gender = gender.lower().strip()
    if gender in {'male', 'm', 'male-ish', 'maile', 'cis male', 'man', 'cis man', 'mal', 'male (cis)', 'make', 'msle', 'mail', 'malr'}:
        return 'Male'
    elif gender in {'female', 'f', 'cis female', 'woman', 'femail', 'female (cis)', 'femake'}:
        return 'Female'
    # 'trans female' is the normalized form of this function's own
    # 'Trans Female' label; without it a second pass would yield 'Other'.
    elif gender in {'trans-female', 'trans woman', 'female (trans)', 'trans female'}:
        return 'Trans Female'
    elif gender in {'non-binary', 'enby', 'genderqueer', 'androgyne', 'agender', 'fluid', 'queer/she/they', 'male leaning androgynous', 'neuter', 'queer', 'something kinda male?', 'ostensibly male, unsure what that really means'}:
        return 'Non-binary'
    else:
        return 'Other'  # For any values not explicitly matched above

# Run every entry of the 'Gender' column through standardize_gender
data['Gender'] = data['Gender'].map(standardize_gender)

In [9]:
# List every distinct value that occurs in the Age column
unique_values = pd.unique(data['Age'])
unique_values


array([         37,          44,          32,          31,          33,
                35,          39,          42,          23,          29,
                36,          27,          46,          41,          34,
                30,          40,          38,          50,          24,
                18,          28,          26,          22,          19,
                25,          45,          21,         -29,          43,
                56,          60,          54,         329,          55,
       99999999999,          48,          20,          57,          58,
                47,          62,          51,          65,          49,
             -1726,           5,          53,          61,           8,
                11,          -1,          72], dtype=int64)

In [10]:
# Frequency of each age, listed in ascending age order
print(data['Age'].value_counts(sort=False).sort_index())

Age
-1726            1
-29              1
-1               1
 5               1
 8               1
 11              1
 18              7
 19              9
 20              6
 21             16
 22             21
 23             51
 24             46
 25             61
 26             75
 27             71
 28             68
 29             85
 30             63
 31             67
 32             82
 33             70
 34             65
 35             55
 36             37
 37             43
 38             39
 39             33
 40             33
 41             21
 42             20
 43             28
 44             11
 45             12
 46             12
 47              2
 48              6
 49              4
 50              6
 51              5
 53              1
 54              3
 55              3
 56              4
 57              3
 58              1
 60              2
 61              1
 62              1
 65              1
 72              1
 329             1
 9999999

In [11]:
# Age: values outside the inclusive range 0 - 100 are replaced with NaN.
# The column ends up as float64 because it has to hold NaN.
age_in_range = data['Age'].between(0, 100)
data['Age'] = data['Age'].where(age_in_range, np.nan).astype('float64')

In [12]:
# Re-check the age frequencies (ascending by age)
print(data['Age'].value_counts().sort_index(ascending=True))

Age
5.0      1
8.0      1
11.0     1
18.0     7
19.0     9
20.0     6
21.0    16
22.0    21
23.0    51
24.0    46
25.0    61
26.0    75
27.0    71
28.0    68
29.0    85
30.0    63
31.0    67
32.0    82
33.0    70
34.0    65
35.0    55
36.0    37
37.0    43
38.0    39
39.0    33
40.0    33
41.0    21
42.0    20
43.0    28
44.0    11
45.0    12
46.0    12
47.0     2
48.0     6
49.0     4
50.0     6
51.0     5
53.0     1
54.0     3
55.0     3
56.0     4
57.0     3
58.0     1
60.0     2
61.0     1
62.0     1
65.0     1
72.0     1
Name: count, dtype: int64


In [13]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    data,
    random_state=666,
    test_size=0.3,
)

### Exploratory Data Analysis

In [14]:
# Summary statistics of the numerical columns (quartiles spelled out explicitly)
train_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Age
count,877.0
mean,32.019384
std,7.388028
min,5.0
25%,27.0
50%,31.0
75%,36.0
max,72.0


In [15]:
# Summary (count / unique / top / freq) of the object-typed columns
print(train_df.describe(include=[object]))

       Gender        Country self_employed family_history treatment  \
count     881            881           868            881       881   
unique      5             40             2              2         2   
top      Male  United States            No             No       Yes   
freq      682            538           761            528       461   

       work_interfere no_employees remote_work tech_company benefits  ...  \
count             698          881         881          881      881  ...   
unique              4            6           2            2        3  ...   
top         Sometimes       26-100          No          Yes      Yes  ...   
freq              328          200         618          710      350  ...   

         anonymity       leave mental_health_consequence  \
count          881         881                       881   
unique           3           5                         3   
top     Don't know  Don't know                        No   
freq           563

### 1.3 Identifying potentially important features

In [16]:
# `alt` (altair) is imported in the notebook's import cell at the top, so all
# dependencies are declared in one place.
alt.data_transformers.disable_max_rows()  # Allows us to plot big datasets

# One histogram per numeric column of the training set, drawn as overlaid
# (non-stacked) semi-transparent bars coloured by the 'treatment' label.
numeric_feature_names = train_df.select_dtypes('number').columns.to_list()

alt.Chart(train_df.sort_values('treatment')).mark_bar(opacity=0.6).encode(
    alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=50)),
    alt.Y('count()', stack=None),
    alt.Color('treatment')
).properties(
    height=200
).repeat(
    numeric_feature_names,
    columns=2
)

In [17]:
# One bar chart per object-typed column of the training set: category counts,
# overlaid (non-stacked) and coloured by the 'treatment' label.
categorical_feature_names = train_df.select_dtypes('object').columns.tolist()
train_sorted_by_target = train_df.sort_values('treatment')

(
    alt.Chart(train_sorted_by_target)
    .mark_bar(opacity=0.6)
    .encode(
        alt.X(alt.repeat(), type='nominal'),
        alt.Y('count()', stack=None),
        alt.Color('treatment'),
    )
    .properties(height=200)
    .repeat(categorical_feature_names, columns=2)
)

In [18]:

# Split each partition into predictors (X) and the 'treatment' label (y)
X_train, y_train = train_df.drop(columns=["treatment"]), train_df["treatment"]
X_test, y_test = test_df.drop(columns=["treatment"]), test_df["treatment"]

### Preprocessing

| Feature | Transformation |
| --- | ----------- |
| Age | Imputation, Scaling |
| Gender | One Hot Encoding |
| Country | One Hot Encoding |
| self_employed | Imputation, One Hot Encoding (binary) |
| family_history | One Hot Encoding (binary) |
| work_interfere | Imputation, One Hot Encoding |
| no_employees | Ordinal Encoding |
| remote_work | One Hot Encoding (binary) |
| tech_company | One Hot Encoding (binary)|
| benefits | One Hot Encoding |
| care_options | One Hot Encoding |
| wellness_program | One Hot Encoding |
| seek_help | One Hot Encoding |
| anonymity | One Hot Encoding |
| leave |  Ordinal Encoding |
| mental_health_consequence | One Hot Encoding |
| phys_health_consequence | One Hot Encoding |
| coworkers | One Hot Encoding |
| supervisor | One Hot Encoding |
| mental_health_interview | One Hot Encoding |
| phys_health_interview | One Hot Encoding |
| mental_vs_physical | One Hot Encoding |
| obs_consequence | One Hot Encoding |



In [19]:
# Keep the dtypes of the full frame at hand and count the missing entries
# per feature in the training predictors
datatypes = data.dtypes
missing_values = X_train.isna().sum()
print(missing_values)

Age                            4
Gender                         0
Country                        0
self_employed                 13
family_history                 0
work_interfere               183
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       0
care_options                   0
wellness_program               0
seek_help                      0
anonymity                      0
leave                          0
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
dtype: int64


In [20]:
# Group the feature columns by the preprocessing they receive
numerical_cols = ["Age"]
categorical_ordinal_cols = ["no_employees", "leave"]
categorical_onehot_country = ["Country"]
categorical_onehot_cols = [
    "Gender",
    "self_employed",
    "family_history",
    "work_interfere",
    "remote_work",
    "tech_company",
    "benefits",
    "care_options",
    "wellness_program",
    "seek_help",
    "anonymity",
    "mental_health_consequence",
    "phys_health_consequence",
    "coworkers",
    "supervisor",
    "mental_health_interview",
    "phys_health_interview",
    "mental_vs_physical",
    "obs_consequence",
]


In [21]:
# Categories for ordinal encoding
ordinal_categories = [
    ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000'],  # for 'no_employees'
    ['Very easy', 'Somewhat easy', "Don't know", 'Somewhat difficult', 'Very difficult']  # for 'leave'
]

specified_categories = [['Australia', 'Austria', 'Bahamas, The', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Canada', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Hungary', 'India', 'Ireland', 'Israel', 'Italy', 'Japan', 'Latvia', 'Mexico', 'Moldova', 'Netherlands', 'New Zealand', 'Nigeria', 'Norway', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Thailand', 'United Kingdom', 'United States', 'Uruguay', 'Zimbabwe']]


In [22]:
# Preprocessing pipelines

# Nominal columns: fill gaps with the literal category 'missing', then one-hot
# encode. drop="if_binary" keeps a single indicator column for two-level
# features; categories not seen during fit are encoded as all zeros.
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(sparse_output=False, dtype="int",handle_unknown='ignore',drop="if_binary"))
])

# Country: one-hot encode against the fixed list in `specified_categories`, so
# the encoded width does not depend on which countries occur in the data the
# encoder is fitted on. Values outside the list (including the imputed
# 'missing') are encoded as all zeros.
# sparse_output=False matches the encoder above: every branch of the
# ColumnTransformer is then dense, so the combined matrix is dense regardless
# of ColumnTransformer's sparse_threshold (GaussianNB needs dense input).
specified_cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(categories=specified_categories, sparse_output=False, handle_unknown='ignore'))
])

# Ordinal columns: fill gaps with the most frequent level, then map each level
# to its position in `ordinal_categories`.
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories))
])

# Numerical columns: fill gaps with the median, then standardize.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])


In [23]:
# Route each group of columns to its preprocessing pipeline.
# Each entry is (name, transformer, columns).
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('cat_onehot', onehot_transformer, categorical_onehot_cols),
    ('cat_specified', specified_cat_transformer, categorical_onehot_country),
    ('cat_ordinal', ordinal_transformer, categorical_ordinal_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [24]:
# Baseline model
# DummyClassifier(strategy='stratified') ignores the features and draws random
# labels, so it is seeded (same seed as the train/test split) to make the
# reported baseline reproducible across runs.
baseline_model = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', DummyClassifier(strategy='stratified', random_state=666))])
baseline_model.fit(X_train, y_train)

# Score the baseline on the held-out test set
y_pred_baseline = baseline_model.predict(X_test)
print("Baseline model accuracy:", accuracy_score(y_test, y_pred_baseline))
print(classification_report(y_test, y_pred_baseline))

Baseline model accuracy: 0.48412698412698413
              precision    recall  f1-score   support

          No       0.52      0.46      0.49       202
         Yes       0.45      0.51      0.48       176

    accuracy                           0.48       378
   macro avg       0.49      0.49      0.48       378
weighted avg       0.49      0.48      0.48       378



In [26]:
# SVM model: shared preprocessing followed by an SVC, tuned over C and gamma
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True)),
])
svm_param_grid = dict(
    classifier__C=[0.1, 1.0, 10, 100],
    classifier__gamma=[0.1, 1.0, 10, 100],
)
svm_grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_param_grid,
    scoring='accuracy',
    cv=5,
)

# 5-fold cross-validated grid search on the training set
svm_grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated accuracy of the winning parameter
# combination (it is what the "Best Training Score" line below reports)
svm_best_training_score = svm_grid_search.best_score_

# Accuracy of the selected model on the held-out test set
y_pred_svm = svm_grid_search.predict(X_test)
svm_test_accuracy = accuracy_score(y_test, y_pred_svm)

print("SVM Best Parameters:", svm_grid_search.best_params_)
print("SVM Best Training Score:", svm_best_training_score)
print("SVM Test Accuracy:", svm_test_accuracy)
print(classification_report(y_test, y_pred_svm))

SVM Best Parameters: {'classifier__C': 1.0, 'classifier__gamma': 0.1}
SVM Best Training Score: 0.8263032871083718
SVM Test Accuracy: 0.8095238095238095
              precision    recall  f1-score   support

          No       0.93      0.69      0.80       202
         Yes       0.73      0.94      0.82       176

    accuracy                           0.81       378
   macro avg       0.83      0.82      0.81       378
weighted avg       0.84      0.81      0.81       378



In [27]:
# Naive Bayes model: shared preprocessing followed by GaussianNB
nb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB()),
])

# Candidate var_smoothing values: 100 points log-spaced from 1e0 down to 1e-9
nb_param_grid = dict(
    classifier__var_smoothing=np.logspace(start=0, stop=-9, num=100),
)

# 5-fold cross-validated grid search, scored on accuracy
nb_grid_search = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=nb_param_grid,
    scoring='accuracy',
    cv=5,
)
nb_grid_search.fit(X_train, y_train)

# Winning hyperparameters
print("Best Hyperparameters:", nb_grid_search.best_params_)

# best_score_ is the mean cross-validated accuracy of the winning setting
print("Naive Bayes Best Training Accuracy:", nb_grid_search.best_score_)

# Evaluate the selected model on the held-out test set
y_pred_nb = nb_grid_search.predict(X_test)
print("Naive Bayes Best Test Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))

Best Hyperparameters: {'classifier__var_smoothing': 0.008111308307896872}
Naive Bayes Best Training Accuracy: 0.8138097072419106
Naive Bayes Best Test Accuracy: 0.8095238095238095
              precision    recall  f1-score   support

          No       0.87      0.76      0.81       202
         Yes       0.76      0.87      0.81       176

    accuracy                           0.81       378
   macro avg       0.81      0.81      0.81       378
weighted avg       0.82      0.81      0.81       378



In [28]:
# Decision Tree model
# random_state is fixed (same seed as the train/test split) so that the fitted
# trees, and therefore the selected depth and the reported scores, are
# reproducible across runs.
dt_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', DecisionTreeClassifier(class_weight='balanced', random_state=666))])
# Define the parameter grid: tree depths 2 through 9
dt_param_grid = {
    'classifier__max_depth': np.arange(2,10,1)
}

# Create the GridSearchCV object (5-fold CV, all cores, scored on accuracy)
dt_grid_search = GridSearchCV(dt_pipeline, dt_param_grid, cv=5, n_jobs= -1,scoring='accuracy')

# Fit the model
dt_grid_search.fit(X_train, y_train)

# The best hyperparameters
print("Best Hyperparameters:", dt_grid_search.best_params_)

# Predict on the test set
y_pred_dt = dt_grid_search.predict(X_test)

# Evaluate the model; best_score_ is the mean cross-validated accuracy of the
# winning depth
print("Decision Tree Best Training Score:", dt_grid_search.best_score_)
print("Decision Tree Best Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))

Best Hyperparameters: {'classifier__max_depth': 2}
Decision Tree Best Training Score: 0.8398947098099641
Decision Tree Best Test Accuracy: 0.8068783068783069
              precision    recall  f1-score   support

          No       0.93      0.69      0.79       202
         Yes       0.73      0.94      0.82       176

    accuracy                           0.81       378
   macro avg       0.83      0.82      0.81       378
weighted avg       0.83      0.81      0.81       378



In [29]:
#logistic regression model
logr_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', LogisticRegression(class_weight='balanced'))])
# Define the parameter grid
logr_param_grid = {
    'classifier__C': np.arange(0.5,1,0.02)
}

# Create the GridSearchCV object
logr_grid_search = GridSearchCV(logr_pipeline, logr_param_grid, cv=5, n_jobs= -1,scoring='accuracy')

# Fit the model
logr_grid_search.fit(X_train, y_train)

# The best hyperparameters
print("Best Hyperparameters:", logr_grid_search.best_params_)

# Predict on the test set
y_pred_logr = logr_grid_search.predict(X_test)

# Evaluate the model
print("logistic regression Best Training Score:", logr_grid_search.best_score_)
print("logistic regression Best Test Accuracy:", accuracy_score(y_test, y_pred_logr))
print(classification_report(y_test, y_pred_logr))

Best Hyperparameters: {'classifier__C': 0.9000000000000004}
logistic regression Best Training Score: 0.8410503338469442
logistic regression Best Test Accuracy: 0.8148148148148148
              precision    recall  f1-score   support

          No       0.88      0.76      0.81       202
         Yes       0.76      0.88      0.81       176

    accuracy                           0.81       378
   macro avg       0.82      0.82      0.81       378
weighted avg       0.82      0.81      0.81       378



In [30]:
# Random Forest model
# random_state is fixed (same seed as the train/test split): without it every
# fit draws different bootstrap samples and feature subsets, so the "best"
# n_estimators / max_depth and the reported scores change from run to run.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', RandomForestClassifier(class_weight='balanced', random_state=666))])
# Define the parameter grid: forest sizes 80, 82, ..., 148 and depths 2 through 8
rf_param_grid = {
    'classifier__n_estimators': np.arange(80,150,2),
    'classifier__max_depth': [2,3,4,5,6,7,8]
}

# Create the GridSearchCV object (5-fold CV, all cores, scored on accuracy)
rf_grid_search = GridSearchCV(rf_pipeline, rf_param_grid, cv=5, n_jobs=-1,scoring='accuracy')

# Fit the model
rf_grid_search.fit(X_train, y_train)

# The best hyperparameters
print("Best Hyperparameters:", rf_grid_search.best_params_)

# Predict on the test set
y_pred_rf = rf_grid_search.predict(X_test)

# Evaluate the model; best_score_ is the mean cross-validated accuracy of the
# winning parameter combination
print("Random Forest Best Training Score:", rf_grid_search.best_score_)
print("Random Forest Best Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Best Hyperparameters: {'classifier__max_depth': 3, 'classifier__n_estimators': 128}
Random Forest Best Training Score: 0.8433230611196713
Random Forest Best Test Accuracy: 0.828042328042328
              precision    recall  f1-score   support

          No       0.91      0.75      0.82       202
         Yes       0.76      0.91      0.83       176

    accuracy                           0.83       378
   macro avg       0.84      0.83      0.83       378
weighted avg       0.84      0.83      0.83       378

