In [1]:
# Import Dependencies
import pandas as pd 
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# import tensorflow as tf

In [2]:
# Load the raw survey responses from the Resources folder
file = Path('./Resources_sdt/survey.csv')
survey_df = pd.read_csv(filepath_or_buffer=file)

# Show the first five rows as a quick sanity check
survey_df.head(n=5)

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,


In [3]:
# Drop 'comments' (little usable information) and 'Timestamp' (the entire dataset is from 2014).
# The result is reassigned instead of using inplace=True, and errors='ignore' is passed, so that
# re-running this cell does not raise KeyError once the columns are already gone.
survey_df = survey_df.drop(columns=['comments', 'Timestamp'], errors='ignore')

# Optional: drop 'state' to look at the dataset globally and because of its NaN values.
# Comment out the next line and re-run from the load cell to keep the column.
survey_df = survey_df.drop(columns=['state'], errors='ignore')

In [4]:
# Summarise the DataFrame: column names, non-null counts and dtypes
survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 24 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Age                        1259 non-null   int64 
 1   Gender                     1259 non-null   object
 2   Country                    1259 non-null   object
 3   self_employed              1241 non-null   object
 4   family_history             1259 non-null   object
 5   treatment                  1259 non-null   object
 6   work_interfere             995 non-null    object
 7   no_employees               1259 non-null   object
 8   remote_work                1259 non-null   object
 9   tech_company               1259 non-null   object
 10  benefits                   1259 non-null   object
 11  care_options               1259 non-null   object
 12  wellness_program           1259 non-null   object
 13  seek_help                  1259 non-null   object
 14  anonymit

In [5]:
# Count the distinct (non-null) values per column to spot free-text / high-cardinality fields
survey_df.nunique(axis=0, dropna=True)

Age                          53
Gender                       49
Country                      48
self_employed                 2
family_history                2
treatment                     2
work_interfere                4
no_employees                  6
remote_work                   2
tech_company                  2
benefits                      3
care_options                  3
wellness_program              3
seek_help                     3
anonymity                     3
leave                         5
mental_health_consequence     3
phys_health_consequence       3
coworkers                     3
supervisor                    3
mental_health_interview       3
phys_health_interview         3
mental_vs_physical            3
obs_consequence               2
dtype: int64

## Count Countries
Determine which countries appear in the dataset, how many there are, and how many responses came from each.

In [6]:
# Tally responses per 'Country', most frequent first
country_count = survey_df.loc[:, 'Country'].value_counts()
country_count

Country
United States             751
United Kingdom            185
Canada                     72
Germany                    45
Ireland                    27
Netherlands                27
Australia                  21
France                     13
India                      10
New Zealand                 8
Poland                      7
Switzerland                 7
Sweden                      7
Italy                       7
South Africa                6
Belgium                     6
Brazil                      6
Israel                      5
Singapore                   4
Bulgaria                    4
Austria                     3
Finland                     3
Mexico                      3
Russia                      3
Denmark                     2
Greece                      2
Colombia                    2
Croatia                     2
Portugal                    2
Moldova                     1
Georgia                     1
Bahamas, The                1
China                       1
Th

## Gender Cleanup
The raw data contains 49 unique 'Gender' values. Some are input errors (typos, inconsistent capitalization, stray whitespace) and some reflect how respondents chose to describe themselves. We need to tell these apart so the values can be consolidated into consistent categories.

In [7]:
# Tally the raw 'Gender' entries exactly as respondents typed them
gender_count = survey_df.loc[:, 'Gender'].value_counts()
gender_count

Gender
Male                                              615
male                                              206
Female                                            121
M                                                 116
female                                             62
F                                                  38
m                                                  34
f                                                  15
Make                                                4
Male                                                3
Woman                                               3
Cis Male                                            2
Man                                                 2
Female (trans)                                      2
Female                                              2
Trans woman                                         1
msle                                                1
male leaning androgynous                            1
Neuter               

In [8]:
# Number of non-null 'Gender' entries before clean-up
gender_column = survey_df.loc[:, 'Gender'].count()
gender_column

1259

In [9]:
# 'Gender' was a free-text field, so it contains misspellings, case/whitespace variants and
# some bogus answers. Collapse every known variant into one of three categories.
# Note: 'Other' includes non-specific gender and trans responses.
gender_groups = {
    'Male': ['Male', 'male', 'M', 'm', 'Male ', 'Cis Male', 'cis male', 'Man',
             'Make', 'Mail', 'Cis Man', 'Guy (-ish) ^_^', 'Male-ish', 'maile',
             'Mal', 'Male (CIS)', 'msle', 'Malr'],
    'Female': ['Female', 'female', 'F', 'f', 'Woman', 'Female ', 'Female (cis)',
               'cis-female/femme', 'femail', 'Cis Female', 'Femake', 'woman'],
    'Other': ['Nah', 'All', 'fluid', 'Genderqueer', 'ostensibly male, unsure what that really means',
              'non-binary', 'Androgyne', 'Agender', 'Enby', 'p', 'Neuter', 'queer', 'Trans woman',
              'Female (trans)', 'male leaning androgynous', 'A little about you', 'Trans-female',
              'something kinda male?', 'queer/she/they'],
}

# Build the cleaned column and assign it back to the DataFrame. Calling
# survey_df['Gender'].replace(..., inplace=True) operates on a column selection: pandas 2.2+
# warns about it and, with Copy-on-Write enabled (the default in pandas 3), the DataFrame is
# left unchanged. Explicit assignment works on every version.
gender_clean = survey_df['Gender']
for category, variants in gender_groups.items():
    gender_clean = gender_clean.replace(variants, category)
survey_df['Gender'] = gender_clean

# Check the updated value counts
gender_update = survey_df['Gender'].value_counts()
gender_update

Gender
Male      992
Female    247
Other      20
Name: count, dtype: int64

In [10]:
# Confirm the non-null 'Gender' count after the clean-up
gender_column = survey_df.loc[:, 'Gender'].count()
gender_column

1259

## Check Age Ranges in Dataset
Verify that the reported ages are plausible.

In [11]:
# Tally how often each raw 'Age' value occurs (reveals impossible entries)
age_ranges = survey_df.loc[:, 'Age'].value_counts()
age_ranges

Age
 29             85
 32             82
 26             75
 27             71
 33             70
 28             68
 31             67
 34             65
 30             63
 25             61
 35             55
 23             51
 24             46
 37             43
 38             39
 36             37
 40             33
 39             33
 43             28
 22             21
 41             21
 42             20
 21             16
 45             12
 46             12
 44             11
 19              9
 18              7
 48              6
 50              6
 20              6
 51              5
 49              4
 56              4
 57              3
 54              3
 55              3
 47              2
 60              2
 99999999999     1
 5               1
-1               1
 11              1
 8               1
 61              1
 53              1
-29              1
-1726            1
 65              1
 62              1
 58              1
 329             1
 72     

In [12]:
# Number of non-null 'Age' entries before clean-up
age_column = survey_df.loc[:, 'Age'].count()
age_column

1259

In [13]:
# Some respondents entered impossible or bogus ages. Flag those rows instead of dropping them
# so the rest of each response is retained.

# Make sure 'Age' is numeric (anything unparseable becomes NaN)
survey_df['Age'] = pd.to_numeric(survey_df['Age'], errors='coerce')

# Overwrite ages below 0 or above 99 with the placeholder 404 to mark them as an "error"
age_out_of_range = survey_df['Age'].lt(0) | survey_df['Age'].gt(99)
survey_df.loc[age_out_of_range, 'Age'] = 404

# Inspect the distribution after the replacement
age_update = survey_df['Age'].value_counts()
age_update

Age
29     85
32     82
26     75
27     71
33     70
28     68
31     67
34     65
30     63
25     61
35     55
23     51
24     46
37     43
38     39
36     37
40     33
39     33
43     28
22     21
41     21
42     20
21     16
45     12
46     12
44     11
19      9
18      7
50      6
48      6
20      6
404     5
51      5
56      4
49      4
54      3
55      3
57      3
47      2
60      2
62      1
58      1
65      1
5       1
53      1
61      1
8       1
11      1
72      1
Name: count, dtype: int64

In [14]:
# Confirm the non-null 'Age' count after the clean-up
age_column = survey_df.loc[:, 'Age'].count()
age_column

1259

In [15]:
# Bucket each respondent's 'Age' into a labelled range stored in a new 'age_range' column
# ----------------------------------------------
# Bins are right-inclusive: [0, 18], (18, 25], (25, 35], (35, 50], (50, 65], (65, 100], (100, 405].
# The final bin only exists to catch the 404 placeholder used for invalid ages ('Other').
age_bin_edges = [0, 18, 25, 35, 50, 65, 100, 405]
age_bin_labels = ['0-18', '19-25', '26-35', '36-50', '51-65', '65+', 'Other']

# include_lowest=True closes the first bin on the left. The age clean-up only replaces values
# below 0, so an age of exactly 0 is still possible; without this flag it would fall outside
# every bin and produce NaN in 'age_range'.
survey_df['age_range'] = pd.cut(
    survey_df['Age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
    include_lowest=True,
)

# Check to make sure binning was successful (dropna=False exposes any unbinned rows)
survey_df['age_range'].value_counts(dropna=False)

age_range
26-35    701
36-50    307
19-25    210
51-65     25
0-18      10
Other      5
65+        1
Name: count, dtype: int64

In [16]:
# Number of non-null entries in the new 'age_range' column
age_column = survey_df.loc[:, 'age_range'].count()
age_column

1259

## Survey Data
This section contains the columns that hold either Yes/No or short categorical answers to the general survey questions.

In [17]:
# Tally the distinct 'self_employed' answers (NaN is excluded)
self_emp_count = survey_df.loc[:, 'self_employed'].value_counts()
self_emp_count

self_employed
No     1095
Yes     146
Name: count, dtype: int64

In [18]:
# Number of non-null 'self_employed' entries
self_emp_column = survey_df.loc[:, 'self_employed'].count()
self_emp_column

1241

In [19]:
# Tally the distinct 'family_history' answers
fam_history_count = survey_df.loc[:, 'family_history'].value_counts()
fam_history_count

family_history
No     767
Yes    492
Name: count, dtype: int64

In [20]:
# Number of non-null 'family_history' entries
fam_history_column = survey_df.loc[:, 'family_history'].count()
fam_history_column

1259

In [21]:
# Tally the distinct 'treatment' answers
treatment_count = survey_df.loc[:, 'treatment'].value_counts()
treatment_count

treatment
Yes    637
No     622
Name: count, dtype: int64

In [22]:
# Number of non-null 'treatment' entries
treatment_column = survey_df.loc[:, 'treatment'].count()
treatment_column

1259

In [23]:
# Tally the distinct 'work_interfere' answers (NaN is excluded)
work_inter_count = survey_df.loc[:, 'work_interfere'].value_counts()
work_inter_count

work_interfere
Sometimes    465
Never        213
Rarely       173
Often        144
Name: count, dtype: int64

In [24]:
# Number of non-null 'work_interfere' entries
work_inter_column = survey_df.loc[:, 'work_interfere'].count()
work_inter_column

995

In [25]:
# Treat a missing answer as the negative response for the two columns that contain NaN:
# 'self_employed' -> 'No' and 'work_interfere' -> 'Never' (assumes a non-answer means "No").
missing_value_defaults = {
    'self_employed': 'No',
    'work_interfere': 'Never',
}
survey_df = survey_df.fillna(value=missing_value_defaults)

survey_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   Age                        1259 non-null   int64   
 1   Gender                     1259 non-null   object  
 2   Country                    1259 non-null   object  
 3   self_employed              1259 non-null   object  
 4   family_history             1259 non-null   object  
 5   treatment                  1259 non-null   object  
 6   work_interfere             1259 non-null   object  
 7   no_employees               1259 non-null   object  
 8   remote_work                1259 non-null   object  
 9   tech_company               1259 non-null   object  
 10  benefits                   1259 non-null   object  
 11  care_options               1259 non-null   object  
 12  wellness_program           1259 non-null   object  
 13  seek_help                  1259 n

In [26]:
# Re-tally 'self_employed' now that missing answers have been filled
self_emp_count = survey_df.loc[:, 'self_employed'].value_counts()
self_emp_count

self_employed
No     1113
Yes     146
Name: count, dtype: int64

In [27]:
# Re-tally 'work_interfere' now that missing answers have been filled
work_inter_count = survey_df.loc[:, 'work_interfere'].value_counts()
work_inter_count

work_interfere
Never        477
Sometimes    465
Rarely       173
Often        144
Name: count, dtype: int64

In [28]:
# Tally the distinct 'no_employees' answers
employees_count = survey_df.loc[:, 'no_employees'].value_counts()
employees_count

no_employees
6-25              290
26-100            289
More than 1000    282
100-500           176
1-5               162
500-1000           60
Name: count, dtype: int64

In [29]:
# Number of non-null 'no_employees' entries
employees_column = survey_df.loc[:, 'no_employees'].count()
employees_column

1259

In [30]:
# Tally the distinct 'remote_work' answers
remote_work_count = survey_df.loc[:, 'remote_work'].value_counts()
remote_work_count

remote_work
No     883
Yes    376
Name: count, dtype: int64

In [31]:
# Number of non-null 'remote_work' entries
remote_work_column = survey_df.loc[:, 'remote_work'].count()
remote_work_column

1259

In [32]:
# Tally the distinct 'tech_company' answers
tech_co_count = survey_df.loc[:, 'tech_company'].value_counts()
tech_co_count

tech_company
Yes    1031
No      228
Name: count, dtype: int64

In [33]:
# Number of non-null 'tech_company' entries
tech_co_column = survey_df.loc[:, 'tech_company'].count()
tech_co_column

1259

In [34]:
# Tally the distinct 'benefits' answers
benefits_count = survey_df.loc[:, 'benefits'].value_counts()
benefits_count

benefits
Yes           477
Don't know    408
No            374
Name: count, dtype: int64

In [35]:
# Number of non-null 'benefits' entries
benefits_column = survey_df.loc[:, 'benefits'].count()
benefits_column

1259

In [36]:
# Tally the distinct 'care_options' answers
care_opt_count = survey_df.loc[:, 'care_options'].value_counts()
care_opt_count

care_options
No          501
Yes         444
Not sure    314
Name: count, dtype: int64

In [37]:
# Number of non-null 'care_options' entries
care_opt_column = survey_df.loc[:, 'care_options'].count()
care_opt_column

1259

In [38]:
# Tally the distinct 'wellness_program' answers
well_prog_count = survey_df.loc[:, 'wellness_program'].value_counts()
well_prog_count

wellness_program
No            842
Yes           229
Don't know    188
Name: count, dtype: int64

In [39]:
# Number of non-null 'wellness_program' entries
well_prog_column = survey_df.loc[:, 'wellness_program'].count()
well_prog_column

1259

In [40]:
# Tally the distinct 'seek_help' answers
help_count = survey_df.loc[:, 'seek_help'].value_counts()
help_count

seek_help
No            646
Don't know    363
Yes           250
Name: count, dtype: int64

In [41]:
# Number of non-null 'seek_help' entries
help_column = survey_df.loc[:, 'seek_help'].count()
help_column

1259

In [42]:
# Tally the distinct 'anonymity' answers
anon_count = survey_df.loc[:, 'anonymity'].value_counts()
anon_count

anonymity
Don't know    819
Yes           375
No             65
Name: count, dtype: int64

In [43]:
# Number of non-null 'anonymity' entries
anon_column = survey_df.loc[:, 'anonymity'].count()
anon_column

1259

In [44]:
# Tally the distinct 'leave' answers
leave_count = survey_df.loc[:, 'leave'].value_counts()
leave_count

leave
Don't know            563
Somewhat easy         266
Very easy             206
Somewhat difficult    126
Very difficult         98
Name: count, dtype: int64

In [45]:
# Number of non-null 'leave' entries
leave_column = survey_df.loc[:, 'leave'].count()
leave_column

1259

In [46]:
# Tally the distinct 'mental_health_consequence' answers
mental_health_conseq = survey_df.loc[:, 'mental_health_consequence'].value_counts()
mental_health_conseq

mental_health_consequence
No       490
Maybe    477
Yes      292
Name: count, dtype: int64

In [47]:
# Number of non-null 'mental_health_consequence' entries
mental_health_conseq_tl = survey_df.loc[:, 'mental_health_consequence'].count()
mental_health_conseq_tl

1259

In [48]:
# Tally the distinct 'phys_health_consequence' answers
phys_health_conseq = survey_df.loc[:, 'phys_health_consequence'].value_counts()
phys_health_conseq

phys_health_consequence
No       925
Maybe    273
Yes       61
Name: count, dtype: int64

In [49]:
# Number of non-null 'phys_health_consequence' entries
phys_health_conseq_tl = survey_df.loc[:, 'phys_health_consequence'].count()
phys_health_conseq_tl

1259

In [50]:
# Tally the distinct 'coworkers' answers
coworker_trust = survey_df.loc[:, 'coworkers'].value_counts()
coworker_trust

coworkers
Some of them    774
No              260
Yes             225
Name: count, dtype: int64

In [51]:
# Number of non-null 'coworkers' entries
coworker_trust_tl = survey_df.loc[:, 'coworkers'].count()
coworker_trust_tl

1259

In [52]:
# Tally the distinct 'supervisor' answers
supervisor_trust = survey_df.loc[:, 'supervisor'].value_counts()
supervisor_trust

supervisor
Yes             516
No              393
Some of them    350
Name: count, dtype: int64

In [53]:
# Number of non-null 'supervisor' entries
supervisor_trust_tl = survey_df.loc[:, 'supervisor'].count()
supervisor_trust_tl

1259

In [54]:
# Tally the distinct 'mental_health_interview' answers
mental_health_int = survey_df.loc[:, 'mental_health_interview'].value_counts()
mental_health_int

mental_health_interview
No       1008
Maybe     207
Yes        44
Name: count, dtype: int64

In [55]:
# Number of non-null 'mental_health_interview' entries
mental_health_int_tl = survey_df.loc[:, 'mental_health_interview'].count()
mental_health_int_tl

1259

In [56]:
# Tally the distinct 'phys_health_interview' answers
phys_health_int = survey_df.loc[:, 'phys_health_interview'].value_counts()
phys_health_int

phys_health_interview
Maybe    557
No       500
Yes      202
Name: count, dtype: int64

In [57]:
# Number of non-null 'phys_health_interview' entries
phys_health_int_tl = survey_df.loc[:, 'phys_health_interview'].count()
phys_health_int_tl

1259

In [58]:
# Tally the distinct 'mental_vs_physical' answers
mental_vs_phys = survey_df.loc[:, 'mental_vs_physical'].value_counts()
mental_vs_phys

mental_vs_physical
Don't know    576
Yes           343
No            340
Name: count, dtype: int64

In [59]:
# Number of non-null 'mental_vs_physical' entries
mental_vs_phys_tl = survey_df.loc[:, 'mental_vs_physical'].count()
mental_vs_phys_tl

1259

In [60]:
# Tally the distinct 'obs_consequence' answers
obs_conseq = survey_df.loc[:, 'obs_consequence'].value_counts()
obs_conseq

obs_consequence
No     1075
Yes     184
Name: count, dtype: int64

In [61]:
# Number of non-null 'obs_consequence' entries
obs_conseq_tl = survey_df.loc[:, 'obs_consequence'].count()
obs_conseq_tl

1259

In [62]:
# Show the first five rows of the cleaned DataFrame
survey_df.head(n=5)

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,age_range
0,37,Female,United States,No,No,Yes,Often,6-25,No,Yes,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,36-50
1,44,Male,United States,No,No,No,Rarely,More than 1000,No,No,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,36-50
2,32,Male,Canada,No,No,No,Rarely,6-25,No,Yes,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,26-35
3,31,Male,United Kingdom,No,Yes,Yes,Often,26-100,No,Yes,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,26-35
4,31,Male,United States,No,No,No,Never,100-500,Yes,Yes,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,26-35
