In [1]:
#Julia Branch

import pandas as pd
import numpy as np

In [2]:
# Read the input CSV into the DataFrame that the rest of the notebook works on.
mental_health_data_filename = 'mental_health_data_true.csv'
mental_health_df = pd.read_csv(filepath_or_buffer=mental_health_data_filename)

In [3]:
# Remove self-employed records (rows where column 'A' equals 1),
# printing the row count before and after the drop.
rows_before = len(mental_health_df)
print("Records before drop = ",rows_before)
self_employed_labels = mental_health_df.index[mental_health_df['A'] == 1]
mental_health_df.drop(index=self_employed_labels, inplace=True)
rows_after = len(mental_health_df)
print("Records after drop = ",rows_after)

Records before drop =  1433
Records after drop =  1146


In [4]:
# Drop the listed columns, printing the column count before and after.
columns_to_drop = ['T','Q', 'R', 'S', 'U', 'V', 'W', 'X', 'A', 'D', 'LL', 'NN']
print("Columns before drop = ",mental_health_df.shape[1])
mental_health_df = mental_health_df.drop(labels=columns_to_drop, axis=1)
print("Columns after drop = ",mental_health_df.shape[1])

Columns before drop =  63
Columns after drop =  51


In [5]:
# Combine WW and XX: wherever WW is missing, take the value of XX from the same row.
ww_is_missing = mental_health_df['WW'].isna()
mental_health_df['WW'] = mental_health_df['WW'].mask(ww_is_missing, mental_health_df['XX'])

In [6]:
# Drop column XX now that it has been combined into WW,
# printing the column count before and after.
print("Columns before drop = ",mental_health_df.shape[1])
mental_health_df = mental_health_df.drop(labels='XX', axis=1)
print("Columns after drop = ",mental_health_df.shape[1])

Columns before drop =  51
Columns after drop =  50


In [7]:
# Fill the missing values that remain in the combined WW column.
ww_filled = mental_health_df['WW']
# Missing and 'No' was entered in column VV -> 'N/A'
ww_filled = ww_filled.mask(ww_filled.isna() & (mental_health_df['VV'] == 'No'), 'N/A')
# Still missing after that (question skipped) -> 'None'
mental_health_df['WW'] = ww_filled.mask(ww_filled.isna(), 'None')

In [8]:
#Column XX, WW, ZZ codify non-standard values as other

# Accepted answers, written as one '; '-separated string and split into a list.
answerString = ('Anxiety Disorder (Generalized, Social, Phobia, etc); Mood Disorder (Depression, Bipolar Disorder, etc); Psychotic Disorder (Schizophrenia, Schizoaffective, etc); Eating Disorder (Anorexia, Bulimia, etc); Attention Deficit Hyperactivity Disorder; Personality Disorder (Borderline, Antisocial, Paranoid, etc); Obsessive-Compulsive Disorder; Post-traumatic Stress Disorder; Stress Response Syndromes; Dissociative Disorder; Substance Use Disorder; Addictive Disorder; Other; N/A; None')
answerList = answerString.split('; ')

def replacement(col):
    """Replace every non-standard answer in a '|'-separated cell with 'Other'.

    Parameters
    ----------
    col : str
        One cell value holding one or more answers separated by '|'.

    Returns
    -------
    str
        The same answers in the same order, re-joined with '|', where each
        answer that is not an exact member of ``answerList`` has been replaced
        by 'Other'. An empty string yields 'Other'.

    Notes
    -----
    Non-string input (for example a NaN left in the column) is returned
    unchanged instead of raising AttributeError on ``.split``.
    """
    if not isinstance(col, str):
        # Nothing to split; leave missing / non-text cells for the caller to handle.
        return col

    return '|'.join(
        answer if answer in answerList else 'Other'
        for answer in col.split('|')
    )

In [9]:
# Run every WW cell through replacement() and store the result back in WW.
mental_health_df['WW'] = mental_health_df['WW'].map(replacement)

In [10]:
# NaN values in column ZZ must be replaced before the "replacement" function is run on it.
zz_filled = mental_health_df['ZZ']
# Missing and 'No' was entered in column YY -> 'N/A'
zz_filled = zz_filled.mask(zz_filled.isna() & (mental_health_df['YY'] == 'No'), 'N/A')
# Still missing after that (question skipped) -> 'None'
mental_health_df['ZZ'] = zz_filled.mask(zz_filled.isna(), 'None')

In [11]:
# Run every ZZ cell through replacement() and store the result back in ZZ.
mental_health_df['ZZ'] = mental_health_df['ZZ'].map(replacement)

In [12]:
#Replace General Missing Values

# Columns whose missing values are filled with 'No Entry'.
no_Entry = dict.fromkeys(['SS', 'GGG', 'III', 'EEE', 'RR', 'F'], 'No Entry')

# Columns whose missing values are filled with 'Not Applicable'.
not_App = dict.fromkeys(
    ['Z', 'AA', 'BB', 'CC', 'DD', 'EE', 'FF', 'GG', 'HH', 'II', 'JJ', 'Y'],
    'Not Applicable',
)

# The two dictionaries name disjoint columns, so the fills are applied one after the other.
mental_health_df = mental_health_df.fillna(value = no_Entry).fillna(value = not_App)

In [13]:
#Handle Gender Somehow
# Collapse the free-text gender column into three codes: 'F', 'M' and 'O'.
#
# The original cell contained this whole sequence twice (a paste error) and
# called .replace(..., inplace=True) on mental_health_df[col]. With pandas
# Copy-on-Write that chained in-place call modifies a temporary object and
# leaves the DataFrame unchanged, so the result is assigned back explicitly.

# This var makes my life a little easier
# TODO: gender_col_id may change after integration, double check this
gender_col_id = "EEE"

# regex explanation:
# (?i) - case insensitive; matches any value containing "female", "woman" or "f"
female_regex_pattern = r'(?i).*(female|woman|f).*'

# regex explanation:
# (?i) - case insensitive; matches any value containing "male", "man" or "m"
male_regex_pattern = r'(?i).*(male|man|m).*'

# regex explanation:
# the second ^ can only match at position 0, so this matches any value that
# does not start with an upper-case "M" or "F"; such values get O
other_regex_pattern = r'^(?!M).*^(?!F).*'

# for this method to work better, strip leading and ending whitespace
gender_codes = mental_health_df[gender_col_id].str.strip()

# Order matters: "female" and "woman" contain "male" and "man", so the female
# pattern has to run before the male pattern.
gender_codes = gender_codes.replace(to_replace=female_regex_pattern, value='F', regex=True)
gender_codes = gender_codes.replace(to_replace=male_regex_pattern, value='M', regex=True)
gender_codes = gender_codes.replace(to_replace=other_regex_pattern, value='O', regex=True)

# fill na spaces with other.
mental_health_df[gender_col_id] = gender_codes.fillna(value="O")

In [14]:
# Display how many rows carry each distinct value of column EEE.
mental_health_df['EEE'].value_counts()

M    847
F    280
O     19
Name: EEE, dtype: int64

In [18]:
#Age Outliers
# Drop rows whose age (column DDD) is above MAX_AGE or below MIN_AGE.
# Rows with a missing age match neither comparison and are kept.
MIN_AGE = 15
MAX_AGE = 75

print("Records before drop = ",mental_health_df.shape[0])

# Select outliers with a boolean mask aligned on the index. The previous loop
# counted row *positions* but passed the counter to drop(), which removes rows
# by index *label*. The index is not reset after the earlier row drop, so
# positions and labels differ and the wrong rows were removed (or KeyError was
# raised when the label no longer existed).
age = mental_health_df['DDD']
is_age_outlier = (age > MAX_AGE) | (age < MIN_AGE)
mental_health_df = mental_health_df[~is_age_outlier].copy()

print("Records after drop = ",mental_health_df.shape[0])

Records before drop =  1146
Records after drop =  1143


In [21]:
# Write the current DataFrame to CSV without the index column.
clean_output_filename = 'mental_health_CLEAN.csv'
mental_health_df.to_csv(path_or_buf=clean_output_filename, index=False)

In [26]:
#Encode yes,no,ect values 
conda install -c conda-forge category_encoders

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\okvat\AppData\Local\Continuum\anaconda3

  added / updated specs:
    - category_encoders


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    category_encoders-2.3.0    |     pyhd8ed1ab_0          57 KB  conda-forge
    conda-4.10.3               |   py37h03978a9_2         3.1 MB  conda-forge
    python_abi-3.7             |          2_cp37m           4 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.1 MB

The following NEW packages will be INSTALLED:

  category_encoders  conda-forge/noarch::category_encoders-2.3.0-pyhd8ed1ab_0
  python_abi         conda-forge/win-64::python_abi-3.7-2_cp37m

The following packages will be UPDATED:

  

In [27]:
import category_encoders as ce

In [28]:
# Build an ordinal encoder for column F with an explicit value-to-integer mapping.
f_value_to_code = {'No Entry':0,'Yes':1,'No':2,'I am not sure':3}
encoder = ce.OrdinalEncoder(
    cols=['F'],
    mapping=[{'col':'F', 'mapping':f_value_to_code}],
    return_df=True,
)

In [29]:
# Fit the current encoder on mental_health_df and keep the transformed result
# under a separate name; mental_health_df itself is not reassigned here.
mental_health_df_encoding_test = encoder.fit_transform(mental_health_df)

In [31]:
# Value counts of column F before encoding, for comparison with the encoded counts.
mental_health_df['F'].value_counts()

I am not sure    352
No               351
Yes              307
No Entry         133
Name: F, dtype: int64

In [32]:
# Value counts of column F in the encoder's output.
mental_health_df_encoding_test['F'].value_counts()

3    352
2    351
1    307
0    133
Name: F, dtype: int64

In [33]:
# Build an ordinal encoder for column YY with an explicit value-to-integer mapping.
yy_value_to_code = {'No':0,'Yes':1}
encoder = ce.OrdinalEncoder(
    cols=['YY'],
    mapping=[{'col':'YY', 'mapping':yy_value_to_code}],
    return_df=True,
)

In [34]:
# Fit the current encoder on mental_health_df and store the transformed result.
# NOTE(review): this reuses the name mental_health_df_encoding_test, so any
# earlier value held under that name is replaced.
mental_health_df_encoding_test = encoder.fit_transform(mental_health_df)

In [35]:
# Value counts of column YY before encoding, for comparison with the encoded counts.
mental_health_df['YY'].value_counts()

No     577
Yes    566
Name: YY, dtype: int64

In [36]:
# Value counts of column YY in the encoder's output.
mental_health_df_encoding_test['YY'].value_counts()

0    577
1    566
Name: YY, dtype: int64