In [21]:
"""
DSCI-663-03 Project: Data Preprocessing File
This file performs some of our discrete preprocessing tasks for our ML

:language:      Python with pandas
:author:        Stephen Cook
:author:        Cory Maclauchlan
:author:        Robert Gentilucci
:author:        Julia Okvath
:date created:  10/18/21
:last edit:     11/05/21
"""
# import pandas and numpy
import pandas as pd
import numpy as np

In [22]:
# Task 1: Read the raw survey data, mental_health_data.csv, into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
mental_health_data_filename = 'mental_health_data.csv'
mental_health_df = pd.read_csv(filepath_or_buffer=mental_health_data_filename)

In [23]:
# Task 2: Remove every record belonging to a self-employed respondent
# (rows where column 'A' equals 1); those respondents are irrelevant to our questions.
is_self_employed = mental_health_df['A'] == 1
self_employed_rows = mental_health_df.loc[is_self_employed].index
# Dropped in place so the same DataFrame object keeps being used downstream.
mental_health_df.drop(index=self_employed_rows, inplace=True)


In [24]:

# Task 3: Perform feature subset selection by removing the columns judged irrelevant.
# drop() raises KeyError if any listed column is absent from the frame.
irrelevant_columns = [
    'T', 'Q', 'R', 'S', 'U', 'V', 'W', 'X', 'A', 'D', 'LL',
    'NN', 'WW', 'XX', 'ZZ', 'JJJ', 'GGG', 'III', 'KK', 'FF', 'C', 'Y', 'Z',
    'AA', 'BB', 'CC', 'EE', 'GG', 'HH', 'II', 'JJ',
]
mental_health_df = mental_health_df.drop(columns=irrelevant_columns)

# Note: Due to issues with the columns in our algorithm, the aggregation of the mental health column has been
# put on the back burner. For now the columns are simply dropped; this may change, so the code below has been kept.
#
#
# #Combine WW and XX
# mental_health_df['WW'] = np.where(mental_health_df['WW'] != mental_health_df['WW'], mental_health_df['XX'], mental_health_df['WW'])
#
# #Drop column XX now that it is combined
# mental_health_df = mental_health_df.drop(columns = ['XX'])
#
# #Handle updated column WW missing values
# #if no value due to "no" entered beforehand:
# mental_health_df['WW'] = np.where((mental_health_df['WW'] != mental_health_df['WW']) & (mental_health_df['VV'] == 'No'), 'N/A', mental_health_df['WW'])
# #if no value because skipped:
# mental_health_df['WW'] = np.where(mental_health_df['WW'] != mental_health_df['WW'], 'None', mental_health_df['WW'])
#
# #Column XX, WW, ZZ codify non-standard values as other
# answerString = ('Anxiety Disorder (Generalized, Social, Phobia, etc); Mood Disorder (Depression, Bipolar Disorder, etc); Psychotic Disorder (Schizophrenia, Schizoaffective, etc); Eating Disorder (Anorexia, Bulimia, etc); Attention Deficit Hyperactivity Disorder; Personality Disorder (Borderline, Antisocial, Paranoid, etc); Obsessive-Compulsive Disorder; Post-traumatic Stress Disorder; Stress Response Syndromes; Dissociative Disorder; Substance Use Disorder; Addictive Disorder; Other; N/A; None')
# answerList = answerString.split('; ')
#
# def replacement(col):
#     colList = col.split('|')
#
#     for item in range(len(colList)):
#         if colList[item] in answerList:
#             colList[item] = colList[item]
#         else:
#             colList[item] = 'Other'
#
#     return '|'.join(colList)
#
# mental_health_df['WW'] = mental_health_df['WW'].apply(replacement)
#
#
# #Need to replace Nan values in col ZZ before running the "replacement" function
# #if no value due to "no" entered beforehand:
# mental_health_df['ZZ'] = np.where((mental_health_df['ZZ'] != mental_health_df['ZZ']) & (mental_health_df['YY'] == 'No'), 'N/A', mental_health_df['ZZ'])
# #if no value because skipped:
# mental_health_df['ZZ'] = np.where(mental_health_df['ZZ'] != mental_health_df['ZZ'], 'None', mental_health_df['ZZ'])
#
# mental_health_df['ZZ'] = mental_health_df['ZZ'].apply(replacement)

In [25]:
## Task 4: Replace General Missing Values
# Each dict maps a column name to the placeholder written into that column's
# NaN cells. Several of the listed columns are dropped in Task 3; their entries
# are kept so the mapping still applies if those columns are restored.

# Columns whose missing cells are labelled 'No Entry'.
no_Entry = dict.fromkeys(['SS', 'GGG', 'III', 'EEE', 'RR', 'F'], 'No Entry')
mental_health_df = mental_health_df.fillna(value=no_Entry)

# Columns whose missing cells are labelled 'Not Applicable'.
not_App = dict.fromkeys(['Z', 'AA', 'BB', 'CC', 'DD', 'EE', 'FF', 'GG', 'HH', 'II', 'JJ', 'Y'],
                        'Not Applicable')
mental_health_df = mental_health_df.fillna(value=not_App)

# Noted special case: per the original note, row 183 of 'GG' holds 0 instead of
# NaN, so fillna() above does not catch it and it is overwritten by hand.
# Guarded because assigning through .at to a missing label ENLARGES the frame:
# with 'GG' dropped in Task 3 (or row 183 already removed) an unguarded write
# would re-create a mostly-NaN 'GG' column / row in the cleaned output.
if 'GG' in mental_health_df.columns and 183 in mental_health_df.index:
    mental_health_df.at[183, 'GG'] = 'Not Applicable'

In [26]:
# Task 5: Standardize the Gender Responses, as they were free response.
# Every response is collapsed to one of three codes: 'F', 'M' or 'O' (other).

# Var for the gender column ID; keeping it in one variable means the column
# name is only spelled out once.
gender_col_id = "EEE"

# for this method to work better, strip leading and ending whitespace
mental_health_df[gender_col_id] = mental_health_df[gender_col_id].str.strip()

# regex explanation:
# (?i) - case insensitive, looks for female or woman or f anywhere in the response
# NOTE(review): the bare 'f' alternative matches ANY response containing the
# letter f, not only a stand-alone "f" - confirm this breadth is intended.
female_regex_pattern = r'(?i).*(female|woman|f).*'

# regex explanation:
# (?i) - case insensitive, looks for male or man or m anywhere in the response
# NOTE(review): likewise, the bare 'm' matches any response containing an m.
male_regex_pattern = r'(?i).*(male|man|m).*'

# regex explanation:
# any value that does not start with M and does not start with F gets O
other_regex_pattern = r'^(?!M).*^(?!F).*'

# The replacements are applied in sequence and the order matters: "female" and
# "woman" contain "male" and "man", so the female pattern must run first; the
# "other" pattern runs last and catches whatever is not 'F' or 'M' by then.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on mental_health_df[gender_col_id]: a chained
# in-place call acts on a temporary Series, which pandas warns about and which
# no longer updates the DataFrame under Copy-on-Write.
mental_health_df[gender_col_id] = (
    mental_health_df[gender_col_id]
    .replace(to_replace=female_regex_pattern, value='F', regex=True)
    .replace(to_replace=male_regex_pattern, value='M', regex=True)
    .replace(to_replace=other_regex_pattern, value='O', regex=True)
    # fill na spaces with other.
    .fillna(value="O")
)

In [27]:
# Task 6: Manage Age Outliers

# Quartiles and inter-quartile range of the age column ('DDD'), used to set
# the cut-off for extreme outliers.
Q1 = mental_health_df['DDD'].quantile(.25)
Q3 = mental_health_df['DDD'].quantile(.75)
IQR = Q3 - Q1

# Upper bound comes from the IQR rule (Q3 + 6 * IQR). The matching IQR-based
# lower bound would be negative, which is meaningless for an age, so a fixed
# minimum age of 15 is used instead.
upper_age_limit = Q3 + 6 * IQR
above_upper_limit = mental_health_df['DDD'] > upper_age_limit
below_minimum_age = mental_health_df['DDD'] < 15

# Remove the extreme-outlier rows in place.
outlier_rows = mental_health_df.loc[above_upper_limit | below_minimum_age].index
mental_health_df.drop(index=outlier_rows, inplace=True)

In [28]:
# Task 7: Discretize age data
# pd.cut uses right-closed intervals by default, so the edges below produce the
# bins (0, 25], (25, 35], (35, 45] and (45, 100]; ages outside them become NaN.
age_bin_edges = [0, 25, 35, 45, 100]
# NOTE(review): the first label reads ">25" but is attached to the (0, 25] bin,
# and an age of exactly 25 lands in that first bin rather than in "25-35".
# Labels are left untouched because consumers of the cleaned file may match on them.
age_bin_labels = [">25", "25-35", "36-45", "45<"]

mental_health_df['DDD'] = pd.cut(mental_health_df['DDD'],
                                 bins=age_bin_edges,
                                 labels=age_bin_labels)



In [29]:
# Task 8: Cast the binary column 'AAA' to boolean type; leaving it un-cast
# caused issues in Apriori.
# NOTE(review): assumes 'AAA' holds only 0/1 - TODO confirm. astype(bool) maps
# every other truthy value (any non-empty string, NaN) to True.
aaa_as_bool = mental_health_df['AAA'].astype(bool)
mental_health_df['AAA'] = aaa_as_bool

In [30]:
# Task 9: Write the cleaned DataFrame out as a CSV file.
# index=False leaves the row index out of the file; quoting=1 is the value of
# csv.QUOTE_ALL, i.e. every field is wrapped in quotes.
mental_health_clean_filename = 'mental_health_CLEAN.csv'
mental_health_df.to_csv(mental_health_clean_filename,
                        index=False,
                        quoting=1)

