In [1]:
"""
DSCI-663-03 Project: Data Preprocessing File
This file performs some of the discrete preprocessing tasks for our ML project

:language:      Python with pandas
:author:        Stephen Cook
:author:        Cory Maclauchlan
:author:        Robert Gentilucci
:author:        Julia Okvath
:date created:  10/18/21
:last edit:     11/05/21
"""
# import pandas and numpy
import pandas as pd
import numpy as np

In [2]:
# Task 1: Read the survey responses from mental_health_data.csv into a DataFrame
mental_health_data_filename = 'mental_health_data.csv'
mental_health_df = pd.read_csv(filepath_or_buffer=mental_health_data_filename)

In [3]:
# Task 2: Remove every record for self-employed people (column 'A' equal to 1).
# Self-employed respondents are irrelevant to our questions.
self_employed_rows = mental_health_df.loc[mental_health_df['A'] == 1].index
mental_health_df.drop(index=self_employed_rows, inplace=True)


In [4]:

# Task 3: Perform feature subset selection by removing the columns judged irrelevant
irrelevant_columns = [
    'T', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'A', 'D', 'L', 'O',
    'LL', 'NN', 'WW', 'XX', 'ZZ', 'BBB', 'CCC', 'FFF', 'JJJ', 'GGG', 'III',
    'KK', 'FF', 'C', 'Y', 'Z', 'AA', 'BB', 'CC', 'EE', 'GG', 'HH', 'II', 'JJ',
]
mental_health_df = mental_health_df.drop(labels=irrelevant_columns, axis=1)

# Note: Due to issues with these columns in our algorithm, aggregating the mental health columns has been
# put on the back burner. For now the columns are simply dropped; this may change, so the code below has been left in place.
#
#
# #Combine WW and XX
# mental_health_df['WW'] = np.where(mental_health_df['WW'] != mental_health_df['WW'], mental_health_df['XX'], mental_health_df['WW'])
#
# #Drop column XX now that it is combined
# mental_health_df = mental_health_df.drop(columns = ['XX'])
#
# #Handle updated column WW missing values
# #if no value due to "no" entered beforehand:
# mental_health_df['WW'] = np.where((mental_health_df['WW'] != mental_health_df['WW']) & (mental_health_df['VV'] == 'No'), 'N/A', mental_health_df['WW'])
# #if no value because skipped:
# mental_health_df['WW'] = np.where(mental_health_df['WW'] != mental_health_df['WW'], 'None', mental_health_df['WW'])
#
# #Column XX, WW, ZZ codify non-standard values as other
# answerString = ('Anxiety Disorder (Generalized, Social, Phobia, etc); Mood Disorder (Depression, Bipolar Disorder, etc); Psychotic Disorder (Schizophrenia, Schizoaffective, etc); Eating Disorder (Anorexia, Bulimia, etc); Attention Deficit Hyperactivity Disorder; Personality Disorder (Borderline, Antisocial, Paranoid, etc); Obsessive-Compulsive Disorder; Post-traumatic Stress Disorder; Stress Response Syndromes; Dissociative Disorder; Substance Use Disorder; Addictive Disorder; Other; N/A; None')
# answerList = answerString.split('; ')
#
# def replacement(col):
#     colList = col.split('|')
#
#     for item in range(len(colList)):
#         if colList[item] in answerList:
#             colList[item] = colList[item]
#         else:
#             colList[item] = 'Other'
#
#     return '|'.join(colList)
#
# mental_health_df['WW'] = mental_health_df['WW'].apply(replacement)
#
#
# #Need to replace Nan values in col ZZ before running the "replacement" function
# #if no value due to "no" entered beforehand:
# mental_health_df['ZZ'] = np.where((mental_health_df['ZZ'] != mental_health_df['ZZ']) & (mental_health_df['YY'] == 'No'), 'N/A', mental_health_df['ZZ'])
# #if no value because skipped:
# mental_health_df['ZZ'] = np.where(mental_health_df['ZZ'] != mental_health_df['ZZ'], 'None', mental_health_df['ZZ'])
#
# mental_health_df['ZZ'] = mental_health_df['ZZ'].apply(replacement)

In [5]:
## Task 4: Replace General Missing Values

# Per-column fill values: missing answers in the first group become 'No Entry',
# missing answers in the second group become 'Not Applicable'.
# NOTE: GGG, III, FF, Y, Z, AA, BB, CC, EE, GG, HH, II and JJ are dropped in Task 3,
# so their entries only matter if the Task 3 drop list changes.
no_Entry = dict.fromkeys(['SS', 'GGG', 'III', 'EEE', 'RR', 'F'], 'No Entry')
not_App = dict.fromkeys(['Z', 'AA', 'BB', 'CC', 'DD', 'EE', 'FF', 'GG', 'HH', 'II', 'JJ', 'Y'],
                        'Not Applicable')

mental_health_df = mental_health_df.fillna(value=no_Entry).fillna(value=not_App)

In [6]:
# Task 5: Standardize the Gender Responses, as they were free response.
# Every response ends up as one of three codes: 'F', 'M' or 'O' (other).

# Var for the gender column ID
gender_col_id = "EEE"

# regex explanation:
# (?i) - case insensitive; matches any value containing "female", "woman" or the letter "f"
# NOTE(review): the bare "f" alternative matches ANY response containing that letter,
# so unrelated free-text answers can be coded 'F' - confirm this is acceptable.
female_regex_pattern = r'(?i).*(female|woman|f).*'

# regex explanation:
# (?i) - case insensitive; matches any value containing "male", "man" or the letter "m"
# NOTE(review): same caveat as above for the bare "m" alternative.
male_regex_pattern = r'(?i).*(male|man|m).*'

# regex explanation:
# matches any value that starts with neither 'M' nor 'F'; such values get 'O'
other_regex_pattern = r'^(?!M).*^(?!F).*'

# Order matters: "female" and "woman" contain "male" and "man", so the female
# pattern must run first; the catch-all pattern must run last.
gender_replacements = [
    (female_regex_pattern, 'F'),
    (male_regex_pattern, 'M'),
    (other_regex_pattern, 'O'),
]

# for this method to work better, strip leading and ending whitespace
gender_responses = mental_health_df[gender_col_id].str.strip()

# Each result is assigned back rather than using
# `mental_health_df[col].replace(..., inplace=True)`: that chained form updates a
# temporary Series and leaves the frame untouched when pandas Copy-on-Write is active.
for gender_pattern, gender_code in gender_replacements:
    gender_responses = gender_responses.replace(to_replace=gender_pattern,
                                                value=gender_code, regex=True)

# fill any remaining missing values with other.
mental_health_df[gender_col_id] = gender_responses.fillna(value="O")

In [7]:
# Task 6: Manage Age Outliers

# Quartiles of the age column ('DDD') give the IQR used for the upper cut-off
age_values = mental_health_df['DDD']
Q1 = age_values.quantile(.25)
Q3 = age_values.quantile(.75)
IQR = Q3-Q1

# Extreme outliers are ages above Q3 + 6*IQR or below 15.
# The lower bound is a fixed 15 because an IQR-based lower bound would be negative.
above_upper_fence = age_values > Q3+6*IQR
below_minimum_age = age_values < 15
extreme_age_rows = mental_health_df.loc[above_upper_fence | below_minimum_age].index
mental_health_df.drop(index=extreme_age_rows, inplace=True)

In [8]:
# Task 7: Discretize age data
#
# Bins are left-closed (right=False) so each label matches the ages it holds:
#   [0, 25)   -> "<25"
#   [25, 36)  -> "25-35"
#   [36, 46)  -> "36-45"
#   [46, inf) -> ">45"
# With right-closed bins [0, 25, 35, 45, 100] an age of exactly 25 was labelled "<25".
# The top edge is open-ended so that any age kept by the Task 6 outlier filter
# (whose upper bound is data-dependent and may exceed 100) still gets a label
# instead of becoming NaN.
age_bin_edges = [0, 25, 36, 46, np.inf]
age_bin_labels = ["<25", "25-35", "36-45", ">45"]

mental_health_df['DDD'] = pd.cut(mental_health_df["DDD"], bins=age_bin_edges,
                                 labels=age_bin_labels, right=False)



In [9]:
# Task 8: Cast the binary column 'AAA' to boolean type (its previous dtype caused issues in Apriori)
mental_health_df = mental_health_df.astype({'AAA': 'bool'})

In [11]:
import category_encoders as ce

In [17]:
# Create one OrdinalEncoder per survey column. Every mapping below recodes the
# column's answer strings to either 0 or 1.
def _make_encoder(column, answer_codes):
    """Return a category_encoders OrdinalEncoder that recodes `column` using `answer_codes`."""
    return ce.OrdinalEncoder(cols=[column], return_df=True,
                             mapping=[{'col': column, 'mapping': answer_codes}])


encoderE = _make_encoder('E', {'No': 0, 'Yes': 1, "I don't know": 0,
                               'Not eligible for coverage / N/A': 0})
encoderF = _make_encoder('F', {'No': 0, 'Yes': 1, "No Entry": 0, 'I am not sure': 0})
encoderG = _make_encoder('G', {'No': 0, 'Yes': 1, "I don't know": 0})
encoderH = _make_encoder('H', {'No': 0, 'Yes': 1, "I don't know": 0})
encoderI = _make_encoder('I', {'No': 0, 'Yes': 1, "I don't know": 0})
encoderJ = _make_encoder('J', {'Very easy': 1, 'Somewhat easy': 1, "I don't know": 0,
                               'Neither easy nor difficult': 1, 'Very difficult': 0,
                               'Somewhat difficult': 0})
# K and P are coded the other way round: 'No' is 1 and 'Yes' is 0.
encoderK = _make_encoder('K', {'No': 1, 'Yes': 0, "Maybe": 0})
encoderM = _make_encoder('M', {'No': 0, 'Yes': 1, "Maybe": 0})
encoderN = _make_encoder('N', {'No': 0, 'Yes': 1, "Maybe": 0})
encoderP = _make_encoder('P', {'No': 1, 'Yes': 0})
encoderDD = _make_encoder('DD', {'No': 0, 'Yes, always': 1, "I don't know": 0,
                                 'Sometimes': 0, 'Not Applicable': 0})
encoderMM = _make_encoder('MM', {'No': 0, 'Yes': 1, "Maybe": 0})
# OO, PP, RR and SS: the "No" answers are coded 1, everything else 0.
encoderOO = _make_encoder('OO', {"No, I don't think it would": 1, "No, it has not": 1,
                                 'Yes, I think it would': 0, 'Yes, it has': 0, "Maybe": 0})
encoderPP = _make_encoder('PP', {"No, I don't think they would": 1, "No, they do not": 1,
                                 'Yes, I think they would': 0, 'Yes, they do': 0, "Maybe": 0})
encoderQQ = _make_encoder('QQ', {"Somewhat open": 1,
                                 "Not applicable to me (I do not have a mental illness)": 0,
                                 'Very open': 1, 'Not open at all': 0, "Neutral": 0,
                                 'Somewhat not open': 0})
encoderRR = _make_encoder('RR', {"No": 1, "Maybe/Not sure": 0, 'Yes, I experienced': 0,
                                 'Yes, I observed': 0, "No Entry": 0})
encoderSS = _make_encoder('SS', {"No": 1, "Maybe": 0, 'Yes': 0, "No Entry": 0})

In [20]:
# Run every column encoder in turn, feeding each one the output of the previous one.
# The first encoder receives the cleaned frame, mental_health_df.
column_encoders = (
    encoderE, encoderF, encoderG, encoderH, encoderI, encoderJ,
    encoderK, encoderM, encoderN, encoderP, encoderDD, encoderMM,
    encoderOO, encoderPP, encoderQQ, encoderRR, encoderSS,
)

mental_health_df_encoding_test = mental_health_df
for column_encoder in column_encoders:
    mental_health_df_encoding_test = column_encoder.fit_transform(mental_health_df_encoding_test)

In [39]:
# Build three composite score columns, each the row-wise sum of a group of encoded columns.
def _add_columns(frame, column_ids):
    """Add the named columns of `frame` element-wise, left to right, and return the result."""
    running_total = frame[column_ids[0]]
    for column_id in column_ids[1:]:
        running_total = running_total + frame[column_id]
    return running_total


mental_health_df_encoding_test['Workplace_Culture'] = _add_columns(
    mental_health_df_encoding_test,
    ['E', 'F', 'G', 'H', 'I', 'J', 'P', 'DD', 'RR', 'SS'])

mental_health_df_encoding_test['Employee_Comfort_w/_MH_@_Workplace'] = _add_columns(
    mental_health_df_encoding_test,
    ['PP', 'OO', 'M', 'N', 'K'])

mental_health_df_encoding_test['Employee_Comfort_w/_MH'] = _add_columns(
    mental_health_df_encoding_test,
    ['MM', 'QQ'])

In [42]:
# The individual encoded columns have been folded into the composite scores,
# so remove them from the reduced frame.
encoded_source_columns = [
    'E', 'F', 'G', 'H', 'I', 'J', 'K', 'M', 'N', 'P',
    'DD', 'MM', 'OO', 'PP', 'QQ', 'RR', 'SS',
]
mental_health_df_redux = mental_health_df_encoding_test.drop(labels=encoded_source_columns, axis=1)

In [43]:
# Display the reduced DataFrame (remaining survey columns plus the three composite scores)
mental_health_df_redux

Unnamed: 0,B,TT,UU,VV,YY,AAA,DDD,EEE,HHH,LLL,Workplace_Culture,Employee_Comfort_w/_MH_@_Workplace,Employee_Comfort_w/_MH
0,26-100,No,Yes,No,Yes,False,36-45,M,United Kingdom,Sometimes,3,3,1
1,6-25,Yes,Yes,Yes,Yes,True,25-35,M,United States of America,Never,8,4,1
2,6-25,No,Maybe,No,No,True,36-45,M,United Kingdom,Always,2,0,2
4,6-25,Yes,Yes,Yes,Yes,True,36-45,F,United States of America,Sometimes,4,0,1
5,More than 1000,No,No,Yes,No,True,36-45,M,United Kingdom,Sometimes,5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1426,100-500,I don't know,Yes,Yes,Yes,True,25-35,F,Canada,Sometimes,2,0,1
1427,500-1000,Yes,Yes,Yes,Yes,True,36-45,F,United States of America,Always,4,4,1
1430,100-500,Yes,Yes,Maybe,Yes,True,>45,M,United States of America,Sometimes,4,3,1
1431,100-500,Yes,Maybe,Yes,Yes,False,25-35,F,United States of America,Sometimes,2,3,1


In [44]:
# Summary statistics for the numeric columns of the reduced DataFrame
mental_health_df_redux.describe()

Unnamed: 0,Workplace_Culture,Employee_Comfort_w/_MH_@_Workplace,Employee_Comfort_w/_MH
count,1143.0,1143.0,1143.0
mean,3.694663,1.422572,0.701662
std,1.890255,1.535025,0.56864
min,0.0,0.0,0.0
25%,2.0,0.0,0.0
50%,3.0,1.0,1.0
75%,5.0,3.0,1.0
max,9.0,5.0,2.0


In [45]:
# Task 9: Write the cleaned frame to CSV.
# The row index is not written; quoting=1 is the value of csv.QUOTE_ALL (every field quoted).
clean_output_filename = 'mental_health_redux_CLEAN.csv'
mental_health_df_redux.to_csv(path_or_buf=clean_output_filename, index=False, quoting=1)

