In [201]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold

In [202]:
# Load data into dataframe
df = pd.read_csv('mental-heath-in-tech-2016_20161114.csv')

In [203]:
print('Number of rows:', df.shape[0])
print('Number of columns:', df.shape[1])

Number of rows: 1433
Number of columns: 63


In [204]:
# Check number of NaNs in each column
for col in df.columns:
    print(col, df[col].isnull().sum())

Are you self-employed? 0
How many employees does your company or organization have? 287
Is your employer primarily a tech company/organization? 287
Is your primary role within your company related to tech/IT? 1170
Does your employer provide mental health benefits as part of healthcare coverage? 287
Do you know the options for mental health care available under your employer-provided coverage? 420
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)? 287
Does your employer offer resources to learn more about mental health concerns and options for seeking help? 287
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer? 287
If a mental health issue prompted you to request a medical leave from work, asking for that leave would be: 287
Do you think that discussing a mental health disorder with your employer would have negati

In [205]:
# Drop rows where number of non-NaN values < 50 and drop columns where number number of non-Nan values < 600
df_cleaned = df.dropna(axis=0, thresh=50).dropna(axis=1, thresh=600)

In [206]:
# Check number of NaNs in each column
for col in df_cleaned.columns:
    print(col, df_cleaned[col].isnull().sum())

Are you self-employed? 0
How many employees does your company or organization have? 0
Is your employer primarily a tech company/organization? 0
Does your employer provide mental health benefits as part of healthcare coverage? 0
Do you know the options for mental health care available under your employer-provided coverage? 31
Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)? 0
Does your employer offer resources to learn more about mental health concerns and options for seeking help? 0
Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer? 0
If a mental health issue prompted you to request a medical leave from work, asking for that leave would be: 0
Do you think that discussing a mental health disorder with your employer would have negative consequences? 0
Do you think that discussing a physical health issue with your

In [207]:
print('Number of rows:', df_cleaned.shape[0])
print('Number of columns:', df_cleaned.shape[1])

Number of rows: 692
Number of columns: 48


In [208]:
# Remove columns known to be unhelpful 
del df_cleaned['Which of the following best describes your work position?']
del df_cleaned['What country do you work in?']
del df_cleaned['Why or why not?']
del df_cleaned['Why or why not?.1']
del df_cleaned['If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?']
del df_cleaned['If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?']
del df_cleaned['How willing would you be to share with friends and family that you have a mental illness?']
del df_cleaned['Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?']

print('Number of rows:', df_cleaned.shape[0])
print('Number of columns:', df_cleaned.shape[1])

Number of rows: 692
Number of columns: 40


In [209]:
# Clean up binary 
df_cleaned['Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?'] = df_cleaned['Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?'].map({'Yes': 1, 'No': 0})

# Convert categorial answers to numerical variables  
df_cleaned = df_cleaned.replace({'Yes': 2, 'No': 0})
df_cleaned = df_cleaned.replace({'Not eligible for coverage / N/A': 0})
df_cleaned = df_cleaned.replace({'Maybe': 1, "I don't know": 1, "I am not sure": 1})
df_cleaned = df_cleaned.replace({'United States of America': 1})
df_cleaned['What country do you live in?'] = df_cleaned['What country do you live in?'].apply(lambda x: int(str(x).isdigit()))


# Convert M/F to 0/1
df_cleaned = df_cleaned.replace({'Male': 0, 'male': 0, 'M': 0, 'Female': 1, 'female': 1})

# Now convert ordinal catergorical varaibles
df_cleaned['How many employees does your company or organization have?'] = df_cleaned['How many employees does your company or organization have?'].map({'6-25': 1, '26-100': 2, '100-500': 3, '500-1000': 4, 'More than 1000': 5})
df_cleaned['If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:'] = df_cleaned['If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:'].map({"I don't know": 1, 1:1, 'Somewhat difficult': 0, 'Neither easy nor difficult': 2, 'Somewhat easy': 3, 'Very easy': 4})
df_cleaned['Have your previous employers provided mental health benefits?'] = df_cleaned['Have your previous employers provided mental health benefits?'].map({1: 0, 'No, none did': 1, 'Some did': 2, 'Yes, they all did': 3})
df_cleaned['Were you aware of the options for mental health care provided by your previous employers?'] = df_cleaned['Were you aware of the options for mental health care provided by your previous employers?'].map({'N/A (not currently aware)': 1, 'No, I only became aware later': 2, 'I was aware of some': 3, 'Yes, I was aware of all of them': 4})
df_cleaned['Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?'] = df_cleaned['Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?'].map({'None did': 0, 1:1, 'Some did': 2, 'Yes, they all did': 3})
df_cleaned['Did your previous employers provide resources to learn more about mental health issues and how to seek help?'] = df_cleaned['Did your previous employers provide resources to learn more about mental health issues and how to seek help?'].map({'None did': 0, 1:1, 'Some did': 2, 'Yes, they all did': 3})
df_cleaned['Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?'] = df_cleaned['Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?'].map({0:0, 1:1, 'Sometimes': 2, "Yes, always": 3})
df_cleaned['Do you think that discussing a mental health disorder with previous employers would have negative consequences?'] = df_cleaned['Do you think that discussing a mental health disorder with previous employers would have negative consequences?'].map({'None of them': 0, 1:1, 'Some of them': 2, 'Yes, all of them': 3})
df_cleaned['Do you think that discussing a physical health issue with previous employers would have negative consequences?'] = df_cleaned['Do you think that discussing a physical health issue with previous employers would have negative consequences?'].map({'None of them': 0, 1:1, 'Some of them': 2, 'Yes, all of them': 3})
df_cleaned['Would you have been willing to discuss a mental health issue with your previous co-workers?'] = df_cleaned['Would you have been willing to discuss a mental health issue with your previous co-workers?'].map({'No, at none of my previous employers': 0, 'Some of my previous employers': 1, 'Yes, at all of my previous employers': 2})
df_cleaned['Would you have been willing to discuss a mental health issue with your direct supervisor(s)?'] = df_cleaned['Would you have been willing to discuss a mental health issue with your direct supervisor(s)?'].map({'No, at none of my previous employers': 0, 'Some of my previous employers': 1, 'Yes, at all of my previous employers': 2})
df_cleaned['Did you feel that your previous employers took mental health as seriously as physical health?'] = df_cleaned['Did you feel that your previous employers took mental health as seriously as physical health?'].map({'None did': 0, 1:1, 'Some did': 2, 'Yes, they all did': 3})
df_cleaned['Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?'] = df_cleaned['Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?'].map({'None of them': 0, 'Some of them': 1, 'Yes, all of them': 2})
df_cleaned['Do you feel that being identified as a person with a mental health issue would hurt your career?'] = df_cleaned['Do you feel that being identified as a person with a mental health issue would hurt your career?'].map({'No, it has not': 0, 1: 2, "No, I don't think it would": 1, 'Yes, I think it would': 3, "Yes, it has": 4})
df_cleaned['Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?'] = df_cleaned['Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?'].map({'No, they do not': 0, 1: 2, "No, I don't think they would": 1, 'Yes, I think they would': 3, "Yes, they do": 4})
df_cleaned['Do you work remotely?'] = df_cleaned['Do you work remotely?'].map({'Never': 0, 'Sometimes': 1, 'Always': 2})


# printing 
for col in df_cleaned.columns:
    print(col, df_cleaned[col])

Are you self-employed? 1       0
4       0
5       0
6       0
7       0
       ..
1422    0
1424    0
1427    0
1430    0
1431    0
Name: Are you self-employed?, Length: 692, dtype: int64
How many employees does your company or organization have? 1       1.0
4       1.0
5       5.0
6       2.0
7       5.0
       ... 
1422    2.0
1424    3.0
1427    4.0
1430    3.0
1431    3.0
Name: How many employees does your company or organization have?, Length: 692, dtype: float64
Is your employer primarily a tech company/organization? 1       1.0
4       0.0
5       1.0
6       1.0
7       1.0
       ... 
1422    1.0
1424    1.0
1427    1.0
1430    1.0
1431    0.0
Name: Is your employer primarily a tech company/organization?, Length: 692, dtype: float64
Does your employer provide mental health benefits as part of healthcare coverage? 1       0
4       2
5       2
6       1
7       2
       ..
1422    1
1424    0
1427    2
1430    2
1431    1
Name: Does your employer provide mental health benefits

In [210]:
# CURRENT ATTEMP AT CLASSIFYING GENDER... will attempt other ways of respectfully recognizing gender identity
# makes gender column a numerical variable
obj_columns = df_cleaned.select_dtypes(['object']).columns
for col in obj_columns:
    df_cleaned[col] = df_cleaned[col].astype('category')

cat_columns = df_cleaned.select_dtypes(['category']).columns
#print(cat_columns)

df_cleaned[cat_columns] = df_cleaned[cat_columns].apply(lambda x: x.cat.codes)

In [211]:
# SPLIT INTO X (FEATURES) AND Y
possible_y = {'Have you had a mental health disorder in the past?', 'Do you currently have a mental health disorder?', 'Have you been diagnosed with a mental health condition by a medical professional?', 'Have you ever sought treatment for a mental health issue from a mental health professional?'}
y = df_cleaned[possible_y]
X = df_cleaned.drop(possible_y, axis=1)

In [212]:
# Feature selection on 0 variance
# uses sklearn.feature_selection

sel = VarianceThreshold()
selected_features = sel.fit_transform(X)
print(len(selected_features[0]))

34


In [None]:
# next steps.. split into train/test (and possibly validation set). 
# Make sure to scale to mean 0 variance 1. Random shuffle as well 