In [103]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [104]:
# Load data into dataframe
df = pd.read_csv('mental-heath-in-tech-2016_20161114.csv')

In [105]:
print('Number of rows:', df.shape[0])
print('Number of columns:', df.shape[1])

Number of rows: 1433
Number of columns: 63


In [106]:
# Check number of NaNs in each column
for col in df.columns:
    print(col, df[col].isnull().sum())

Are you self-employed? 0
How many employees does your company or organization have? 287
Is your employer primarily a tech company/organization? 287
Is your primary role within your company related to tech/IT? 1170
Does your employer provide mental health benefits as part of healthcare coverage? 287
Do you know the options for mental health care available under your employer-provided coverage? 420
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)? 287
Does your employer offer resources to learn more about mental health concerns and options for seeking help? 287
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer? 287
If a mental health issue prompted you to request a medical leave from work, asking for that leave would be: 287
Do you think that discussing a mental health disorder with your employer would have negati

In [107]:
# Drop rows where number of non-NaN values < 50 and drop columns where number number of non-Nan values < 600
df_cleaned = df.dropna(axis=0, thresh=50).dropna(axis=1, thresh=600)

In [108]:
# Check number of NaNs in each column
for col in df_cleaned.columns:
    print(col, df_cleaned[col].isnull().sum())

Are you self-employed? 0
How many employees does your company or organization have? 0
Is your employer primarily a tech company/organization? 0
Does your employer provide mental health benefits as part of healthcare coverage? 0
Do you know the options for mental health care available under your employer-provided coverage? 31
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)? 0
Does your employer offer resources to learn more about mental health concerns and options for seeking help? 0
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer? 0
If a mental health issue prompted you to request a medical leave from work, asking for that leave would be: 0
Do you think that discussing a mental health disorder with your employer would have negative consequences? 0
Do you think that discussing a physical health issue with your

In [109]:
print('Number of rows:', df_cleaned.shape[0])
print('Number of columns:', df_cleaned.shape[1])

Number of rows: 692
Number of columns: 48


In [110]:
# Remove columns known to be unhelpful 
del df_cleaned['Which of the following best describes your work position?']
del df_cleaned['What country do you work in?']
del df_cleaned['Why or why not?']
del df_cleaned['Why or why not?.1']
del df_cleaned['If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?']
del df_cleaned['If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?']
del df_cleaned['How willing would you be to share with friends and family that you have a mental illness?']
del df_cleaned['Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?']

print('Number of rows:', df_cleaned.shape[0])
print('Number of columns:', df_cleaned.shape[1])

Number of rows: 692
Number of columns: 40


In [111]:
# SPLIT INTO X (FEATURES) AND Y
possible_y = {'Have you had a mental health disorder in the past?', 'Do you currently have a mental health disorder?', 'Have you been diagnosed with a mental health condition by a medical professional?', 'Have you ever sought treatment for a mental health issue from a mental health professional?'}
y = df_cleaned[possible_y]
X = df_cleaned.drop(possible_y, axis=1)

In [112]:
for line in X['What is your gender?']:
    print(line)

male
Female
Male
M
female
Female
Male
Male
Female
Male
Male
m
I identify as female.
M
Male
male
Male
Male
female
Bigender
Male
Male
Male
male
Female assigned at birth 
male
Male
F
Male
Male
Woman
female
Male
man
Male
female
M
Male
Male
Male
Male 
Male
Male
Male
Male
male
Female
Male
female
M
Female
Male
fm
f
Female
Male
M
Male
Male
f
F
M
Female
Cis female 
F
Female
Male
M
male
Male
Male
Male
Male
Male
female
M
Transitioned, M2F
Female
f
F
Male
Male
m
Male
m
Male
Male
Female
Male
Male
male
male
Male
Male
Female
Female
Male
Male
Male
female
male
Male
Male
Female
Male
Male
Male
female
Female or Multi-Gender Femme
Male
male
male
Male
female
Male
female
M
M
Male
Female
Male
Male
Female
woman
M
M
female
Female
male
Male
Male
female
Male
Male
M
Female
Male
Female
female
female
female
Male
Male
Male
Male
female/woman
Male
Female
Male
male
f
Male.
m
Androgynous
Female
F
Female
m
Female
male
Male
Male
male
Female
Male
male
Female
Male
male
F
M
Male
male
female
F
Male
Female
Male
Male
Male
male
M

In [113]:
# Clean up binary 
X['Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?'] = X['Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?'].map({'Yes': 1, 'No': 0})

# Convert categorial answers to numerical variables  
X = X.replace({'United States of America': 1})
X['What country do you live in?'] = X['What country do you live in?'].apply(lambda x: int(str(x).isdigit()))

# Convert if contains male or female 

male_list = ['Male', 'male', 'mail', 'M', 'Man', 'man', 'dude', 'guy']
female_list = ['Female', 'female', 'F', 'woman', 'Woman', 'girl', 'gal', 'fem']

for val in X['What is your gender?']:
    for male_name in male_list:
        if str(val).__contains__(male_name):
            X['What is your gender?'].replace(val, 'm', inplace=True)
            break
    for female_name in female_list:
        if str(val).__contains__(female_name):
            X['What is your gender?'].replace(val, 'f', inplace=True)
            break
     
    
# printing 
for col in X.columns:
    print(col, X[col])

Are you self-employed? 1       0
4       0
5       0
6       0
7       0
       ..
1422    0
1424    0
1427    0
1430    0
1431    0
Name: Are you self-employed?, Length: 692, dtype: int64
How many employees does your company or organization have? 1                 6-25
4                 6-25
5       More than 1000
6               26-100
7       More than 1000
             ...      
1422            26-100
1424           100-500
1427          500-1000
1430           100-500
1431           100-500
Name: How many employees does your company or organization have?, Length: 692, dtype: object
Is your employer primarily a tech company/organization? 1       1.0
4       0.0
5       1.0
6       1.0
7       1.0
       ... 
1422    1.0
1424    1.0
1427    1.0
1430    1.0
1431    0.0
Name: Is your employer primarily a tech company/organization?, Length: 692, dtype: float64
Does your employer provide mental health benefits as part of healthcare coverage? 1                                    No
4    

In [114]:
print(X['What is your gender?'])

1       m
4       m
5       m
6       m
7       m
       ..
1422    m
1424    m
1427    m
1430    m
1431    m
Name: What is your gender?, Length: 692, dtype: object


In [115]:
# helper function 

def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res) 

In [116]:
# if not int type then use one hot encoding 

features_to_encode = X.select_dtypes(['object']).columns

for feature in features_to_encode:
    X = encode_and_bind(X, feature)

In [119]:
# use one hot for y vals 
# if not int type then use one hot encoding 

y_to_encode = y.select_dtypes(['object']).columns

for y_val in y_to_encode:
    y = encode_and_bind(y, y_val)

In [117]:
# printing 
i = 0
for col in X.columns:
    i += 1
    print(col, X[col])

Are you self-employed? 1       0
4       0
5       0
6       0
7       0
       ..
1422    0
1424    0
1427    0
1430    0
1431    0
Name: Are you self-employed?, Length: 692, dtype: int64
number is 1
Is your employer primarily a tech company/organization? 1       1.0
4       0.0
5       1.0
6       1.0
7       1.0
       ... 
1422    1.0
1424    1.0
1427    1.0
1430    1.0
1431    0.0
Name: Is your employer primarily a tech company/organization?, Length: 692, dtype: float64
number is 2
Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace? 1       0
4       0
5       1
6       0
7       0
       ..
1422    0
1424    0
1427    0
1430    1
1431    0
Name: Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?, Length: 692, dtype: int64
number is 3
Do you have previous employers? 1       1
4       1
5       1
6       1
7       1
       

In [118]:
# Feature selection on 0.01 variance
# uses sklearn.feature_selection

sel = VarianceThreshold(0.01)
selected_features = sel.fit_transform(X)
print(len(selected_features[0]))

111


In [None]:
# next steps.. split into train/test (and possibly validation set). X and y have one hot encoding,
# Make sure to scale to mean 0 variance 1. Random shuffle as well 
# play around with data