<a href="https://colab.research.google.com/github/qweliant/GenderAndSex/blob/master/NYpcs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import plotly.graph_objects as go 
import plotly.express as px

In [0]:
url = "https://raw.githubusercontent.com/qweliant/GenderAndSex/master/pcs2017.csv?token=AII7DUPIC5EVAMELYX4PIZK6MRTNW" 
pcs = pd.read_csv(url)


In [0]:
def format(data_frame):
    vals = {
        'NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK' : 'out',
        'UNEMPLOYED, LOOKING FOR WORK' : 'looking', 
        'EMPLOYED' : 'employed',
        'UNKNOWN EMPLOYMENT STATUS' : 'unknown', 
        'NON-PAID/VOLUNTEER' : 'pro-bono'
    }
    data_frame.replace(vals, inplace=True)
    data_frame.columns = data_frame.columns.str.replace(' ', '_')
    data_frame.columns = map(str.lower, data_frame.columns)


In [0]:
pcs.head()

In [0]:
pcs.shape

I want to check out employment_status and living_situation because it would bias assumptions about the data with a tilt too far one way or another


In [0]:
pcs[pcs.isnull().any(axis=1)]

In [0]:
# no nulls so yay
format(pcs)

In [0]:
# I can probs drob the survey date, lets make sure
pcs['survey_year'].unique()

In [0]:
pcs.drop(columns=['survey_year'], inplace=True)

In [0]:
pcs['employment_status'].unique()

In [0]:
vals = {
    'NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK' : 'out',
    'UNEMPLOYED, LOOKING FOR WORK' : 'looking', 
    'EMPLOYED' : 'employed',
    'UNKNOWN EMPLOYMENT STATUS' : 'unknown', 
    'NON-PAID/VOLUNTEER' : 'pro-bono'
}

pcs['employment_status'].replace(vals, inplace=True)

In [0]:
import seaborn as sns
from matplotlib import pyplot
# pcs['employment_status'].value_counts().plot(kind='bar')
f, axes = pyplot.subplots( sharey=True, figsize=(11, 4))
sns.countplot(x="employment_status", palette="ch:.25", data=pcs, )


the overwhelming majority of people are not in the workforce. lets see if age group has anything to do with it

In [0]:
f, axes = plt.subplots( sharey=True, figsize=(15, 8))
sns.countplot(x="age_group", palette="ch:.25", data=pcs, )

This does not really inform me of anything. Which means I cannot tell if these are majority older people, Lets try looking at `medicare_insurance`


In [0]:
f, axes = plt.subplots( sharey=True, figsize=(15, 8))
sns.countplot(x="medicare_insurance", color="c", data=pcs, )

In [0]:
print(pcs['medicare_insurance'].value_counts(normalize = True))
# https://en.wikipedia.org/wiki/Medicare_(United_States)
# most medicare recepiants are older. lets check percentages to see fewest retirees

In [0]:
print(pcs['employment_status'].value_counts(normalize = True))

In [0]:
# ttest of one sample because same group with two percentages
# Null Hypothesis: number of people on medicare outside of workforce is small
# Alternate Hypothesis: number of people on medicare outside workforce is large


In [0]:
pcs.columns

In [0]:
pd.crosstab( pcs.diabetes, columns=[pcs.employment_status],  margins=True, normalize=True)

### Normalizing for population shows that those outside the workforce experience diabetes at a markedly higher rate, 12% versus the 2% in those looking for work or are employed. Unfortunately there isn't any information in the dataset that would allow me to be aware of the variance in specific occurances of those leaving thee workforce. But there are two other dataset that will allow for some comparison across time

In [0]:
# I will start by setting creating a new 2017, 2015, and 2013 dataset
url2017 = "https://raw.githubusercontent.com/qweliant/GenderAndSex/master/pcs2017.csv?token=AII7DUPIC5EVAMELYX4PIZK6MRTNW" 
url2015 = "https://raw.githubusercontent.com/qweliant/GenderAndSex/master/pcs2015.csv?token=AII7DUICYVI4PRR3WQC5PAC6MRTFC"
url2013 = "https://raw.githubusercontent.com/qweliant/GenderAndSex/master/pcs2013.csv?token=AII7DULFZ4LD3FIMZJD5SKK6MRTLK"

In [0]:
pcs2017 = pd.read_csv(url2017)
pcs2015 = pd.read_csv(url2015)
pcs2013 = pd.read_csv(url2013)

In [0]:
print(pcs2017.shape)
print(pcs2015.shape)
print(pcs2013.shape)

In [0]:
format(pcs2017)
format(pcs2015)
format(pcs2013)

In [0]:
pcs2013.head()

In [0]:
sns.heatmap(
    pd.crosstab(
        [pcs2017.mental_illness], 
        [pcs2017.employment_status], 
        normalize=True),
    cmap="YlGnBu", annot=True, cbar=False)

In [0]:
pcs2017['mental_illness'].value_counts(normalize=True)

In [0]:
pcs2015['mental_illness'].value_counts(normalize=True)


In [0]:
pcs2013['mental_illness'].value_counts(normalize=True)


Mental illness in out of work respondants accounts for a signaficant portion of those out of work. But does this make sense? It would seem the reason behind the large amount of people in this dataset absent from the workforce would be the overwhelming amount of participants with a mental illness, but that is supposition. What can we do now if the majority of the population exist outside of the workforce. 


In [0]:
# sns.heatmap(pd.crosstab([pcs2017.employment_status], [pcs2017.living_situation], normalize=True),
#             cmap="YlGnBu", annot=True, cbar=False)

In [0]:
sns.heatmap(pd.crosstab([pcs2017.race, pcs2017.hispanic_ethnicity], [pcs2017.mental_illness], normalize=True),
            cmap="YlGnBu", annot=True, cbar=False)

In [0]:
sns.heatmap(pd.crosstab([pcs2017.race, pcs2017.hispanic_ethnicity], [pcs2017.hearing_visual_impairment], normalize=True),
            cmap="YlGnBu", annot=True, cbar=False)

In [0]:
sns.heatmap(pd.crosstab([pcs2017.sex], [pcs2017.hearing_visual_impairment], normalize=True),
            cmap="YlGnBu", annot=True, cbar=False)
# sex differences do not seem to play a role 

In [0]:
# since a longitudinal approach would not be valid given the one week period of data collection and anonymized results,
# I will merge the sets and begin looking at various demographics for to understand which grouo is at risk for what
assert len(pcs2017.columns) == len(pcs2015.columns)
assert len(pcs2017.columns) == len(pcs2013.columns)

In [0]:
pcs = pcs2017.append(pcs2015, sort=False, ignore_index=True)

In [0]:
pcs = pcs.append(pcs2013, sort=False, ignore_index=True)

In [0]:
pcs2015.head()

In [0]:
pcs2013.head()

In [0]:
format(pcs)
pcs.columns

In [0]:
# this line from above, lets see what we get now
sns.heatmap(pd.crosstab([pcs.sex], [pcs.diabetes], normalize=True),
            cmap="YlGnBu", annot=True, cbar=False)
# sex differences do not seem to play a role 


In [0]:
sns.heatmap(pd.crosstab([pcs.sex], [pcs.pulmonary_asthma], normalize=True),
            cmap="YlGnBu", annot=True, cbar=False)

In [0]:
f, axes = pyplot.subplots( sharey=True, figsize=(11, 4))
sns.heatmap(
    pd.crosstab([pcs.sex], [pcs.principal_diagnosis_class], 
    normalize=True),
    cmap="YlGnBu", 
    annot=True, 
    cbar=False)