In [1]:
# Data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# Visualization
import matplotlib.pyplot as plt

# Machine learning imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler


Initial Analysis of Data

Clean data:
1) Fix Gender column.
2) Change the rest to numeric data.
3) Determine which respondents are suffering from a mental health issue and which are not.
4) Determine which responses are about the "culture" of the organization toward mental health issues.
5) Create US and non-US files


(1) The Gender column appears to have been a free-text field with no fixed choices, so the categories will have to be defined manually.

(2) This task needs the list of possible responses for each survey question.  They can be found here:  ??? (TODO: add link)

(3) Determine which respondents are suffering from a mental health issue and which are not. (From column "work_interfere"?)

Survey Question: If you have a mental health condition, do you feel that it interferes with your work? (work_interfere)

Possible responses: Don't know, Never, Often, Rarely, Sometimes
There are also responses of N/A.  We believe a response of N/A means the respondent does not feel they have a mental health issue.

In [2]:
# Create the data frame from the raw survey export
train_df = pd.read_csv('Resources/survey.csv')

# What's the data row count? (rows, columns)
print(train_df.shape)

# What's the distribution of the data? (numeric columns only)
print(train_df.describe())

# What types of data are there, and how many non-null values per column?
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
train_df.info()

#train_df.head(20)

(1259, 27)
                Age
count  1.259000e+03
mean   7.942815e+07
std    2.818299e+09
min   -1.726000e+03
25%    2.700000e+01
50%    3.100000e+01
75%    3.600000e+01
max    1.000000e+11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
Timestamp                    1259 non-null object
Age                          1259 non-null int64
Gender                       1259 non-null object
Country                      1259 non-null object
state                        744 non-null object
self_employed                1241 non-null object
family_history               1259 non-null object
treatment                    1259 non-null object
work_interfere               995 non-null object
no_employees                 1259 non-null object
remote_work                  1259 non-null object
tech_company                 1259 non-null object
benefits                     1259 non-null object
care_options                 1259 non-null object
welln

In [3]:
# Discard the columns that are not used in this analysis: the submission
# timestamp, the free-text comments, and (for now) the state.
unused_columns = ['Timestamp', 'state', 'comments']
train_df = train_df.drop(labels=unused_columns, axis='columns')

In [4]:
# NOTE: this cell is disabled -- every statement below is commented out, so no
# US / non-US split is performed and train_df keeps respondents from all countries.
# # Split out US and non-US
# train_us_df = train_df.loc[train_df['Country'] == "United States"].copy()
# train_not_us_df = train_df.loc[train_df['Country'] != "United States"].copy()


In [5]:
# Clean NaNs: every known column gets a type-specific placeholder for missing values.

# Placeholder used for a missing value of each data type.
# Note that defaultString is the literal text 'NaN', not a real missing value.
defaultInt = 0
defaultString = 'NaN'
defaultFloat = 0.0

# Columns grouped by the data type they hold
intFeatures = ['Age']
stringFeatures = ['Gender', 'Country', 'self_employed', 'family_history', 'treatment', 'work_interfere',
                 'no_employees', 'remote_work', 'tech_company', 'anonymity', 'leave', 'mental_health_consequence',
                 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview',
                 'mental_vs_physical', 'obs_consequence', 'benefits', 'care_options', 'wellness_program',
                 'seek_help']
floatFeatures = []

# Column name -> placeholder. Entries are added float, then string, then int so
# that, should a name appear in more than one list, int takes precedence over
# string and string over float.
default_by_feature = {}
default_by_feature.update(dict.fromkeys(floatFeatures, defaultFloat))
default_by_feature.update(dict.fromkeys(stringFeatures, defaultString))
default_by_feature.update(dict.fromkeys(intFeatures, defaultInt))

# Fill the missing values column by column; report any column that is in none of the lists
for column_name in train_df.columns:
    if column_name not in default_by_feature:
        print('Error: Feature %s not recognized.' % column_name)
        continue
    train_df[column_name] = train_df[column_name].fillna(default_by_feature[column_name])

#train_df.head(5)


In [6]:
# Clean gender data: collapse the free-text answers into four groups
# ('female', 'male', 'trans', 'gen_nc') and drop answers that are not a gender.
train_df['Gender'] = train_df['Gender'].str.lower()

# Unique (lower-cased) raw answers, kept for inspection
gender_strings = train_df['Gender'].unique().tolist()

# Raw spellings that belong to each group. Matching is done against the
# lower-cased column, so an entry containing capitals can never match.
male_str = ["male", "m", "maile", "mal", "male (cis)", "make", "male ", "man", "msle", "mail",
            "malr", "cis man", "Cis Male", "cis male"]
trans_str = ["trans-female", "trans woman", "female (trans)"]
female_str = ["cis female", "f", "female", "woman",  "femake", "female ","cis-female/femme", "female (cis)",
              "femail"]
nc_str = ["male-ish", "enby", "nah", "all", "queer/she/they", "non-binary", "something kinda male?", "fluid", "genderqueer",
             "androgyne", "agender", "male leaning androgynous", "guy (-ish) ^_^", "neuter", "queer",
             "ostensibly male, unsure what that really means"]

# Raw spelling -> group label lookup built from the lists above
gender_groups = {
    'male': male_str,
    'female': female_str,
    'trans': trans_str,
    'gen_nc': nc_str,
}
gender_map = {raw: label for label, raws in gender_groups.items() for raw in raws}

# Recode the whole column in one vectorised pass. This replaces a per-row loop
# that called Series.replace(..., inplace=True) on train_df['Gender'] for every
# row: quadratic work, and a chained in-place update that pandas Copy-on-Write
# turns into a no-op. Answers found in no list are left unchanged.
gender_lower = train_df['Gender']
train_df['Gender'] = gender_lower.map(gender_map).fillna(gender_lower)

# Remove the answers that don't make any sense as a gender.
# .copy() makes the result an independent frame so that later column
# assignments do not raise SettingWithCopyWarning.
weird_list = ['a little about you', 'p']
train_df = train_df[~train_df['Gender'].isin(weird_list)].copy()
#train_df.head(20)


In [7]:
# Replace missing ages with the median age (the median, not the mean, is what is used).
# Work on an independent copy of the column: calling fillna(..., inplace=True) on
# train_df['Age'] is a chained in-place update that pandas Copy-on-Write ignores,
# and pd.Series(train_df['Age']) would only alias the column rather than copy it.
ages = train_df['Age'].copy()
ages = ages.fillna(ages.median())

# Ages below 18 or above 90 are replaced with the median as well.
# The median is re-evaluated before each replacement.
ages[ages < 18] = ages.median()
ages[ages > 90] = ages.median()
train_df['Age'] = ages

# Ranges of Age: bucket the cleaned ages into labelled intervals
train_df['age_range'] = pd.cut(train_df['Age'], [0,20,30,40,50,60,90], labels=["0-20", "20-30", "30-40", "40-50", "50-60", "60-90"], include_lowest=True)
#train_df.head(30)


In [8]:
# Only about 1.4% of respondents left self_employed blank (1241 of 1259 rows are
# non-null in the raw file), so a missing answer is treated as NOT self-employed.
# Missing answers carry the defaultString placeholder at this point.
self_employed_filled = train_df['self_employed'].replace(to_replace=[defaultString], value='No')
train_df['self_employed'] = self_employed_filled
#train_df.head(10)


In [9]:
# Fix no_employees and the missing values in work_interfere.
# Two company-size ranges appear in the file as dates ('5-Jan', '25-Jan') --
# presumably mangled by a spreadsheet; restore them to the ranges '1-5' and '6-25'.
company_size_fixes = {'5-Jan': '1-5', '25-Jan': '6-25'}
train_df['no_employees'] = train_df['no_employees'].replace(company_size_fixes)

# Blanks in the work_interfere column (now holding the defaultString placeholder)
# are assumed to have been meant as 'Never'.
work_interfere_filled = train_df['work_interfere'].replace(to_replace=[defaultString], value='Never')
train_df['work_interfere'] = work_interfere_filled


In [10]:
# Encoding data

# Remove records where work_interfere is still the missing-value placeholder,
# on the interpretation that such a respondent does not have a mental health issue.
# NOTE(review): the preceding cell already rewrites that placeholder to 'Never',
# so this filter currently removes no rows -- decide which interpretation is
# intended and drop the other step.
# .copy() gives an independent frame so that the column assignments in the loop
# below do not raise SettingWithCopyWarning.
train_df = train_df[~train_df['work_interfere'].isin([defaultString])].copy()

# Change string responses to numerical values: each column is replaced by
# integer codes, and the original labels are kept in labelDict under
# 'label_<column>' (the position of a label in the list is its code).
labelDict = {}
for feature in train_df:
    le = preprocessing.LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])
    labelDict['label_' + feature] = list(le.classes_)

for key, value in labelDict.items():
    print(key, value)

# Get rid of 'Country' and 'Age' (age is still represented by 'age_range')
train_df = train_df.drop(['Country', 'Age'], axis=1)
train_df.head()


label_Age [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56, 57, 58, 60, 61, 62, 65, 72]
label_Gender ['female', 'gen_nc', 'male', 'trans']
label_Country ['Australia', 'Austria', 'Belgium', 'Bosnia and Herzegovina', 'Brazil', 'Bulgaria', 'Canada', 'China', 'Colombia', 'Costa Rica', 'Croatia', 'Czech Republic', 'Denmark', 'Finland', 'France', 'Georgia', 'Germany', 'Greece', 'Hungary', 'India', 'Ireland', 'Israel', 'Italy', 'Japan', 'Latvia', 'Mexico', 'Moldova', 'Netherlands', 'New Zealand', 'Nigeria', 'Norway', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Singapore', 'Slovenia', 'South Africa', 'Spain', 'Sweden', 'Switzerland', 'Thailand', 'United Kingdom', 'United States', 'Uruguay', 'Zimbabwe']
label_self_employed ['No', 'Yes']
label_family_history ['No', 'Yes']
label_treatment ['No', 'Yes']
label_work_interfere ['Never', 'Often', 'Rarely', 'Sometimes']
label_no_employe

Unnamed: 0,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,benefits,care_options,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,0,0,0,1,1,4,0,1,2,1,...,2,1,1,1,2,1,0,2,0,2
1,2,0,0,0,2,5,0,0,0,0,...,0,0,1,0,0,1,1,0,0,3
2,2,0,0,0,2,4,0,1,1,0,...,1,1,1,2,2,2,2,1,0,2
3,2,0,1,1,1,2,0,1,1,2,...,1,2,2,1,0,0,0,1,1,2
4,2,0,0,0,0,1,1,1,2,0,...,0,1,1,1,2,2,2,0,0,2


In [11]:
# Write the cleaned, label-encoded data frame to a new CSV file. A single file
# covering all countries is produced; no US / non-US split is made here.
# The row index is written as the first column (the to_csv default).
output_csv = 'Resources/all_ml.csv'
train_df.to_csv(output_csv)
