In [1]:
# Data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# Visualization
import matplotlib.pyplot as plt

# Machine learning imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler


In [2]:
# Load the raw survey responses into a data frame
train_df = pd.read_csv('Resources/survey.csv')

# How many rows and columns are there?
print(train_df.shape)

# How are the numeric columns distributed?
print(train_df.describe())

# What dtype and non-null count does each column have?
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
train_df.info()

#train_df.head(20)

(1259, 27)
                Age
count  1.259000e+03
mean   7.942815e+07
std    2.818299e+09
min   -1.726000e+03
25%    2.700000e+01
50%    3.100000e+01
75%    3.600000e+01
max    1.000000e+11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 27 columns):
Timestamp                    1259 non-null object
Age                          1259 non-null int64
Gender                       1259 non-null object
Country                      1259 non-null object
state                        744 non-null object
self_employed                1241 non-null object
family_history               1259 non-null object
treatment                    1259 non-null object
work_interfere               995 non-null object
no_employees                 1259 non-null object
remote_work                  1259 non-null object
tech_company                 1259 non-null object
benefits                     1259 non-null object
care_options                 1259 non-null object
welln

In [3]:
# Keep only the rows whose Country is "United States" (all other rows are
# discarded, not stored elsewhere). The selection is copied so that later
# cells modify an independent frame.
train_df = train_df[train_df['Country'].eq("United States")].copy()


In [4]:
# Remove everything that a company doesn't have control over, or doesn't really apply.
# Every column to discard is listed exactly once and passed through the
# explicit `columns=` keyword, so the axis being dropped is unambiguous.
train_df = train_df.drop(
    columns=[
        'Timestamp', 'Age', 'Gender', 'Country', 'state', 'comments',
        'self_employed', 'family_history', 'work_interfere',
        'no_employees', 'remote_work', 'tech_company',
        'phys_health_consequence', 'mental_health_interview',
        'phys_health_interview',
    ],
)

In [5]:
# Replace missing answers with a placeholder label

# Placeholder written into every empty cell
defaultString = 'NaN'

# Column names grouped by data type (string-valued columns).
# Note: the fill below is applied to every column of the frame, not to this list.
stringFeatures = [
    'treatment', 'anonymity', 'leave', 'mental_health_consequence',
    'coworkers', 'supervisor', 'mental_vs_physical', 'obs_consequence',
    'benefits', 'care_options', 'wellness_program', 'seek_help',
]

# Fill the NaNs of all columns in a single frame-wide call
train_df = train_df.fillna(defaultString)

#train_df.head(5)


In [6]:
# Encode every column as integer codes and remember the labels behind them

# labelDict maps 'label_<column>' to the list of that column's class labels,
# in the order of the fitted encoder's `classes_`
labelDict = {}
for feature in train_df:
    le = LabelEncoder()
    # Learn the classes and replace the column with its integer codes in one step
    train_df[feature] = le.fit_transform(train_df[feature])
    # class label -> integer code, as assigned by the fitted encoder
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    # Only the labels (the mapping's keys) are stored
    labelDict['label_' + feature] = list(le_name_mapping)

# Show the labels recorded for each encoded column
for label_name, labels in labelDict.items():
    print(label_name, labels)


label_treatment ['No', 'Yes']
label_benefits ["Don't know", 'No', 'Yes']
label_care_options ['No', 'Not sure', 'Yes']
label_wellness_program ["Don't know", 'No', 'Yes']
label_seek_help ["Don't know", 'No', 'Yes']
label_anonymity ["Don't know", 'No', 'Yes']
label_leave ["Don't know", 'Somewhat difficult', 'Somewhat easy', 'Very difficult', 'Very easy']
label_mental_health_consequence ['Maybe', 'No', 'Yes']
label_coworkers ['No', 'Some of them', 'Yes']
label_supervisor ['No', 'Some of them', 'Yes']
label_mental_vs_physical ["Don't know", 'No', 'Yes']
label_obs_consequence ['No', 'Yes']


In [7]:
# Write the current (US-only) data frame to a new CSV file.
# No `index` argument is passed, so pandas' default applies and the row index
# is written out as well.
train_df.to_csv(path_or_buf='Resources/us-company-ml.csv')
