In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

# loading required modules
import pandas as pd
import pprint
pp = pprint.PrettyPrinter(indent=4)
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
import math
from sklearn import svm
%matplotlib inline

#List Files
from subprocess import check_output
# Show what is available in the input directory before loading anything.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Raise both pandas display limits so large frames are shown in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)


# Load the 2015 US Bureau of Labor Statistics data
# (source spreadsheet: http://www.bls.gov/cps/cpsaat39.xlsx).
print("Loading.....")
data = pd.read_csv("../input/inc_occ_gender.csv")
print("Finished.")

In [None]:
# Keep only the occupation label and the three weekly income columns.
# .copy() makes `t` an independent frame, so the column assignment below never
# writes into a slice of `data` (chained assignment / SettingWithCopyWarning).
weekly_cols = ['All_weekly', 'M_weekly', 'F_weekly']
t = data[['Occupation'] + weekly_cols].copy()

# Missing figures are stored as the string 'Na' (or as NaN); map both to 0.0
# so the columns can be cast to float in a single pass.
t[weekly_cols] = t[weekly_cols].replace({'Na': 0.0}).fillna(0).astype(float)

# A 0 now means "no figure available": drop occupations that lack either the
# male or the female value, since they cannot be compared.
t = t.loc[(t['M_weekly'] != 0) & (t['F_weekly'] != 0)]

In [None]:
# Names of the occupations in which the female weekly income is higher than
# the male one (both figures must be present, i.e. greater than 0).
#
# The comparison is done column-wise on the whole frame. This keeps the row
# order of `t`, avoids re-filtering the frame once per occupation, and does not
# depend on float() of a one-row Series, which fails when an occupation name
# occurs more than once.
women_earn_more = (
    (t['M_weekly'] < t['F_weekly'])
    & (t['F_weekly'] > 0)
    & (t['M_weekly'] > 0)
)
occupations_women_make_more = t.loc[women_earn_more, 'Occupation'].tolist()

pp.pprint(occupations_women_make_more)

In [None]:
# Split `t` in two: `f` holds the occupations listed in
# occupations_women_make_more, `m` holds every other occupation.
in_women_list = t['Occupation'].isin(occupations_women_make_more)
f = t.loc[in_women_list]
m = t.loc[~in_women_list]

In [None]:
def getDFRange(bar_i=0, bar_interval=15, df=None):
    """Return the index labels of one "page" of rows of a DataFrame.

    Pages are counted by row position, not by index label, so the function
    also works on frames whose index is no longer contiguous (for example
    after rows have been filtered out).

    Parameters
    ----------
    bar_i : int
        Zero-based page number.
    bar_interval : int
        Number of rows per page.
    df : pandas.DataFrame, optional
        Frame to paginate. When omitted, the notebook-level frame `m` is
        looked up at call time, so a rebuilt `m` is picked up automatically.

    Returns
    -------
    pandas.Index
        Labels of rows ``bar_i*bar_interval`` up to (not including)
        ``(bar_i+1)*bar_interval``, clamped to the number of rows. The result
        is empty when the page lies entirely past the end of the frame.
    """
    if df is None:
        df = m

    # Clamp against the row count; index labels say nothing about position.
    row_count = len(df.index)
    bar_low = min(row_count, bar_i * bar_interval)
    bar_high = min(row_count, (bar_i + 1) * bar_interval)

    return df.index[bar_low:bar_high]

In [None]:
#Plot the Occupations for which males make more on average
# One horizontal bar chart per page of `rows_per_page` occupations. Pages are
# cut by row position (iloc) and their number is derived from the size of `m`,
# so every row is plotted exactly once and no page runs past the end.
# No error bars are drawn: the data holds one value per column and occupation.
rows_per_page = 14
for page_no, start in enumerate(range(0, len(m.index), rows_per_page), start=1):
    page = m.iloc[start:start + rows_per_page]
    page.plot.barh(
        title='Median Weekly Income for occupations where men make more (pg ' + str(page_no) + ')',
        x='Occupation',
    )


#Plot the Occupations for which females make more on average
# Skipped when there is no such occupation, so there is nothing to draw.
if not f.empty:
    f.plot.barh(
        title='Median Weekly Income for occupations where women make more',
        x='Occupation',
    )

In [None]:
# Figures used by the conclusions printout.

# --- Occupations in `m` ---
# Weekly income gap per occupation: male minus female.
m_diff = m['M_weekly'] - m['F_weekly']

# Row label, size and occupation name of the largest gap.
m_diff_max_i = m_diff.idxmax()
m_diff_max_val = m_diff.at[m_diff_max_i]
m_diff_max_occ = m.at[m_diff_max_i, 'Occupation']

# Gap as a fraction of the male income, and the number of occupations.
m_diff_percentage = m_diff.div(m['M_weekly'])
m_count = m.shape[0]


# --- Occupations in `f` ---
# Weekly income gap per occupation: female minus male
# (the male-minus-female difference with its sign flipped).
f_diff = -(f['M_weekly'] - f['F_weekly'])

# Row label, size and occupation name of the largest gap.
f_diff_max_i = f_diff.idxmax()
f_diff_max_val = f_diff.at[f_diff_max_i]
f_diff_max_occ = f.at[f_diff_max_i, 'Occupation']

# Gap as a fraction of the female income, and the number of occupations.
f_diff_percentage = f_diff.div(f['F_weekly'])
f_count = f.shape[0]

In [None]:
#Conclusions
# NOTE(review): the interpretive sentences printed below are fixed text; they
# are not derived from the numbers printed next to them.
def _print_comparison(heading, male_parts, female_parts):
    """Print a heading followed by a 'Male' and a 'Female' entry.

    Every part is handed to print() as its own argument, so the pieces are
    joined by single spaces.
    """
    print(heading, "\n\tMale: ", *male_parts, "\n\tFemale: ", *female_parts)


section_break = "\n--- --- --- --- --- --- --- --- --- --- --- --- --- ---\n"
occupation_total = m_count + f_count

_print_comparison(
    "1. Number of occupations where (M/F) make more",
    (m_count, "(", str(100*m_count/occupation_total), "%)"),
    (f_count, "(", str(100*f_count/occupation_total), "%)"),
)
print("\n\t i. There are more occupations where males make more.")

print(section_break)
_print_comparison(
    "2 a. Average difference in weekly income in occupations where (M/F) make more.",
    ("$", m_diff.mean()),
    ("$", f_diff.mean()),
)
print("\n\t i. In occupations where women make more they don't earn as much more as men do when they work in occupation that pays men more.")
print("\n\t ii. Smaller advantage for women in the fewer occupations where they get paid more.")

print(section_break)
_print_comparison(
    "2 b. Average percentage difference in weekly income earned over opposite sex in occupations where (M/F) make more.",
    (100*m_diff_percentage.mean(), "%"),
    (100*f_diff_percentage.mean(), "%"),
)
print("\n\t i. In occupations where women make more the percentage differnce they make as compared to males is less than the reverse case.")


print(section_break)
_print_comparison(
    "3. Average weekly income of occupations where (M/F) made more.",
    ("$", m['All_weekly'].mean()),
    ("$", f['All_weekly'].mean()),
)
print("\n\t i. Occupations where women make more than men make less than occupations where men make more than women.")

print(section_break)
_print_comparison(
    "4. The biggest difference in weekly income where (M/F) make more",
    ("$", m_diff.max(), "(", m_diff_max_occ, ")"),
    ("$", f_diff.max(), "(", f_diff_max_occ, ")"),
)

In [None]:
import requests
import json
import prettytable

# Request two series for 2011-2014 from the BLS public API (v2) and write one
# text table per series to "<seriesID>.txt" in the working directory.
headers = {'Content-type': 'application/json'}
# Named `payload` rather than `data` so the DataFrame that the notebook loads
# into `data` is not overwritten by this JSON string.
payload = json.dumps({"seriesid": ['CUUR0000SA0','SUUR0000SA0'],"startyear":"2011", "endyear":"2014"})
p = requests.post(
    'https://api.bls.gov/publicAPI/v2/timeseries/data/',
    data=payload,
    headers=headers,
    timeout=30,  # do not let a stalled connection hang the notebook
)
p.raise_for_status()  # fail here on an HTTP error, not later while parsing
json_data = json.loads(p.text)
for series in json_data['Results']['series']:
    x = prettytable.PrettyTable(["series id","year","period","value","footnotes"])
    seriesId = series['seriesID']
    for item in series['data']:
        year = item['year']
        period = item['period']
        value = item['value']
        # Join the text of all non-empty footnotes with commas; the trailing
        # comma is cut off when the row is added.
        footnotes = ""
        for footnote in item['footnotes']:
            if footnote:
                footnotes = footnotes + footnote['text'] + ','
        # Keep only period codes 'M01'..'M12'; other codes are skipped
        # (presumably non-monthly entries such as annual averages - confirm
        # against the BLS documentation).
        if 'M01' <= period <= 'M12':
            x.add_row([seriesId,year,period,value,footnotes[0:-1]])
    # The context manager closes the file even if writing fails.
    with open(seriesId + '.txt','w') as output:
        output.write(x.get_string())
    