Focusing on the data available for all ages, **what does the distribution of unemployment rates look like among the different major categories?**  Come up with a _graphical display_ that allows a reader to easily make sense of the information.


In addition to the comprehensive, all-ages dataset, the github repository _also contains data regarding just **recent college graduates (ages < 28)**_. Comparing this subset of data to the whole dataset that it comes from (all-ages) can provide us with some information about recent trends. **Which majors appear to have experienced a relative boom** among recent graduates and **which majors are dropping off** in popularity? Again, explore visual ways of describing the answer as well as numerical ones.


In [213]:
import pandas as pd
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)


import missingno as msno
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
import pylab
import json
import collections
import pprint
pp = pprint.PrettyPrinter()
import warnings 
warnings.filterwarnings('ignore')

In [6]:
all_ages = pd.read_csv("data-college-majors/all-ages.csv")

In [50]:
grad_students = pd.read_csv("data-college-majors/grad-students.csv")

In [12]:
recent_grads = pd.read_csv("data-college-majors/recent-grads.csv")

In [13]:
majors_list = pd.read_csv("data-college-majors/majors-list.csv")

In [41]:
projected_occupation = pd.read_excel("data-college-majors/occupation.xlsx",sheet_name = 1)[:-4]
#data via https://www.bls.gov/emp/tables/emp-by-major-occupational-group.htm | 2019 - 2029 projections

In [218]:
# all_ages.groupby(["Major_category","Major","Employed","Unemployed","Unemployment_rate"]).mean()

In [10]:
all_ages.dtypes

Major_code                         int64
Major                             object
Major_category                    object
Total                              int64
Employed                           int64
Employed_full_time_year_round      int64
Unemployed                         int64
Unemployment_rate                float64
Median                             int64
P25th                              int64
P75th                            float64
dtype: object

In [219]:
# all_ages.groupby("Major_category").describe()["Unemployment_rate"].sort_values("max")

In [48]:
recent_grads.sample(5)

Unnamed: 0,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
148,149,6006,ART HISTORY AND CRITICISM,21030.0,3240.0,17790.0,Humanities & Liberal Arts,0.845934,204,17579,13262,6140,9965,1128,0.060298,31000,23000,40000,5139,9738,3426
54,55,4006,COGNITIVE SCIENCE AND BIOPSYCHOLOGY,3831.0,1667.0,2164.0,Biology & Life Science,0.564866,25,2741,2470,711,1584,223,0.075236,41000,20000,60000,1369,921,135
102,103,5503,CRIMINOLOGY,19879.0,10031.0,9848.0,Social Science,0.495397,214,16181,13616,4543,10548,1743,0.097244,35000,25000,45000,3373,10605,1895
127,128,6211,HOSPITALITY MANAGEMENT,43647.0,15204.0,28443.0,Business,0.65166,546,36728,32160,7494,23106,2393,0.061169,33000,25000,42000,2325,23341,9063
118,119,6110,COMMUNITY AND PUBLIC HEALTH,19735.0,4103.0,15632.0,Health,0.792095,130,14512,10099,6377,7460,1833,0.112144,34000,21000,45000,5225,7385,1854


In [51]:
grad_students.sample(5)

Unnamed: 0,Major_code,Major,Major_category,Grad_total,Grad_sample_size,Grad_employed,Grad_full_time_year_round,Grad_unemployed,Grad_unemployment_rate,Grad_median,Grad_P25,Grad_P75,Nongrad_total,Nongrad_employed,Nongrad_full_time_year_round,Nongrad_unemployed,Nongrad_unemployment_rate,Nongrad_median,Nongrad_P25,Nongrad_P75,Grad_share,Grad_premium
28,2100,COMPUTER AND INFORMATION SYSTEMS,Computers & Mathematics,71527,1425,60858,53807,2539,0.040049,80000.0,55000,104000.0,242194,209994,184959,10439,0.047357,65000.0,45000,90000.0,0.227996,0.230769
146,2305,MATHEMATICS TEACHER EDUCATION,Education,80826,1194,51750,34672,748,0.014248,60000.0,47500,80000.0,63346,42354,27419,1610,0.036621,45000.0,35000,62000.0,0.560622,0.333333
1,6004,COMMERCIAL ART AND GRAPHIC DESIGN,Arts,53864,882,40492,29553,2482,0.057756,60000.0,40000,89000.0,461977,347166,250596,25484,0.068386,48000.0,34000,71000.0,0.10442,0.25
112,5502,ANTHROPOLOGY AND ARCHEOLOGY,Humanities & Liberal Arts,107888,1971,83632,59545,4374,0.049701,65000.0,45100,100000.0,126116,90622,62339,6369,0.065666,45000.0,30000,70000.0,0.461052,0.444444
131,3700,MATHEMATICS,Computers & Mathematics,418056,6906,287467,217363,11245,0.037645,89000.0,60000,127000.0,407046,262174,202078,14142,0.051181,68000.0,43000,100000.0,0.506672,0.308824


In [216]:
all_ages.dtypes

Major_code                         int64
Major                             object
Major_category                    object
Total                              int64
Employed                           int64
Employed_full_time_year_round      int64
Unemployed                         int64
Unemployment_rate                float64
Median                             int64
P25th                              int64
P75th                            float64
dtype: object

## Re-format for D3 

In [203]:
from collections import defaultdict

https://stackoverflow.com/questions/50929768/pandas-multiindex-more-than-2-levels-dataframe-to-nested-dict-json

In [196]:
all_ages_h = all_ages.set_index(['Major_category',"Major"])

In [206]:
all_ages_h.to_csv("all_ages_h.csv")

In [204]:
all_ages_h.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Major_code,Total,Employed,Employed_full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th
Major_category,Major,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Agriculture & Natural Resources,GENERAL AGRICULTURE,1100,128148,90245,74078,2423,0.026147,50000,34000,80000.0
Agriculture & Natural Resources,AGRICULTURE PRODUCTION AND MANAGEMENT,1101,95326,76865,64240,2266,0.028636,54000,36000,80000.0
Agriculture & Natural Resources,AGRICULTURAL ECONOMICS,1102,33955,26321,22810,821,0.030248,63000,40000,98000.0
Agriculture & Natural Resources,ANIMAL SCIENCES,1103,103549,81177,64937,3619,0.042679,46000,30000,72000.0
Agriculture & Natural Resources,FOOD SCIENCE,1104,24280,17281,12722,894,0.049188,62000,38500,90000.0


In [191]:
def nest(d):
    result = {}
    for key, value in d.items():
        target = result
        for k in key[:-1]:  # traverse all keys but the last
            target = target.setdefault(k, {})
        target[key[-1]] = value
    return result

In [205]:
tree = lambda: defaultdict(tree)  # a recursive defaultdict
d = tree()

for _, (Total, Employed, Unemployed, unemployment_rate) in all_ages_h.iterrows():
    print(_,)
# for _, (region, type, name, value) in all_ages_h.iterrows():
    #d['children'][region]['name'] = region
    

#json.dumps(d)

ValueError: too many values to unpack (expected 4)

In [212]:
pd.read_json("flare.json")

Unnamed: 0,name,children
0,flare,"{'name': 'Agriculture & Natural Resources', 'c..."
1,flare,"{'name': 'Arts', 'children': [{'name': 'DRAMA ..."
2,flare,"{'name': 'Biology & Life Science', 'children':..."
3,flare,"{'name': 'Business', 'children': [{'name': 'AC..."
4,flare,"{'name': 'Communications & Journalism', 'child..."
5,flare,"{'name': 'Computers & Mathematics', 'children'..."
6,flare,"{'name': 'Education', 'children': [{'name': 'E..."
7,flare,"{'name': 'Engineering', 'children': [{'name': ..."
8,flare,"{'name': 'Health', 'children': [{'name': 'COMM..."
9,flare,"{'name': 'Humanities & Liberal Arts', 'childre..."


## Slope Plots

In [221]:
recent_grads.head(2)

Unnamed: 0,Rank,Major_code,Major,Total,Men,Women,Major_category,ShareWomen,Sample_size,Employed,Full_time,Part_time,Full_time_year_round,Unemployed,Unemployment_rate,Median,P25th,P75th,College_jobs,Non_college_jobs,Low_wage_jobs
0,1,2419,PETROLEUM ENGINEERING,2339.0,2057.0,282.0,Engineering,0.120564,36,1976,1849,270,1207,37,0.018381,110000,95000,125000,1534,364,193
1,2,2416,MINING AND MINERAL ENGINEERING,756.0,679.0,77.0,Engineering,0.101852,7,640,556,170,388,85,0.117241,75000,55000,90000,350,257,50


In [275]:
res_recent = []
for i in recent_grads.groupby("Major_category").sum()["Total"]:
    res_recent.append(i/recent_grads["Total"].sum())

In [289]:
x_recent = list(zip(list(recent_grads.groupby("Major_category").sum()["Total"].to_frame().reset_index()["Major_category"]),
res_recent))


In [292]:
x_recent

[('Agriculture & Natural Resources', 0.011167138781751105),
 ('Arts', 0.052738961559465385),
 ('Biology & Life Science', 0.06702380245653425),
 ('Business', 0.19232760563371962),
 ('Communications & Journalism', 0.05797712050851978),
 ('Computers & Mathematics', 0.04415582958018824),
 ('Education', 0.08256904443139003),
 ('Engineering', 0.07938725162272024),
 ('Health', 0.06840721631672263),
 ('Humanities & Liberal Arts', 0.10536096498728376),
 ('Industrial Arts & Consumer Services', 0.03393439771140108),
 ('Interdisciplinary', 0.001815804528701555),
 ('Law & Public Policy', 0.026449520309218398),
 ('Physical Sciences', 0.027390501641105704),
 ('Psychology & Social Work', 0.0710324242792086),
 ('Social Science', 0.07826241565206964)]

In [294]:
final = []
for n in range(len(x_recent)):
    final.append((x_recent[n][0],x_overall[n][1],x_recent[n][1]))

In [306]:
percentage = pd.DataFrame(final)
percentage.columns = ["Major_category","Proportion_overall","Proportion_recent"]

In [307]:
def change(row):
    return row["Proportion_recent"] - row["Proportion_overall"]

In [308]:
percentage["Change"] = percentage.apply(change,axis = 1)

In [309]:
res_overall = []
for i in all_ages.groupby("Major_category").sum()["Total"]:
    res_overall.append(i/all_ages["Total"].sum())

In [310]:
percentage.sort_values('Change')

Unnamed: 0,Major_category,Proportion_overall,Proportion_recent,Change
3,Business,0.247493,0.192328,-0.055166
6,Education,0.117991,0.082569,-0.035422
7,Engineering,0.089772,0.079387,-0.010385
8,Health,0.074078,0.068407,-0.005671
0,Agriculture & Natural Resources,0.015877,0.011167,-0.00471
5,Computers & Mathematics,0.04472,0.044156,-0.000564
11,Interdisciplinary,0.001135,0.001816,0.000681
13,Physical Sciences,0.02574,0.027391,0.001651
12,Law & Public Policy,0.022667,0.02645,0.003783
1,Arts,0.045334,0.052739,0.007405


In [290]:
x_overall = list(zip(list(all_ages.groupby("Major_category").sum()["Total"].to_frame().reset_index()["Major_category"]),
res_overall))

In [291]:
x_overall

[('Agriculture & Natural Resources', 0.01587665514613777),
 ('Arts', 0.045334311315561995),
 ('Biology & Life Science', 0.03359372972073031),
 ('Business', 0.24749315905313796),
 ('Communications & Journalism', 0.04528302398344265),
 ('Computers & Mathematics', 0.04471959134414433),
 ('Education', 0.11799144046308921),
 ('Engineering', 0.08977198550860489),
 ('Health', 0.07407816229581278),
 ('Humanities & Liberal Arts', 0.09384690588269967),
 ('Industrial Arts & Consumer Services', 0.02595239420964765),
 ('Interdisciplinary', 0.0011346726013030245),
 ('Law & Public Policy', 0.02266699248222604),
 ('Physical Sciences', 0.025739512870258514),
 ('Psychology & Social Work', 0.049888490846529174),
 ('Social Science', 0.06662897227667404)]

In [229]:
all_ages.groupby("Major_category").sum()["Total"]

Major_category
Agriculture & Natural Resources         632437
Arts                                   1805865
Biology & Life Science                 1338186
Business                               9858741
Communications & Journalism            1803822
Computers & Mathematics                1781378
Education                              4700118
Engineering                            3576013
Health                                 2950859
Humanities & Liberal Arts              3738335
Industrial Arts & Consumer Services    1033798
Interdisciplinary                        45199
Law & Public Policy                     902926
Physical Sciences                      1025318
Psychology & Social Work               1987278
Social Science                         2654125
Name: Total, dtype: int64

In [224]:
all_ages["Total"].sum()

39834398

In [None]:
newgrads = pd.read_csv("majordata/recent-grads.csv")
newgrads.sort_values("Major_code", ascending = True)
totalgradsnew = newgrads['Total'].sum() 
newgrads['Proportion'] = newgrads['Total']/totalgradsnew
newprops = newgrads[["Major","Proportion"]]
newprops.rename(columns={'Proportion':'Proportion of recent graduates'}, inplace=True)
totalprops = majordata[["Major","Proportion"]]
merged_data = pd.merge(left=totalprops,right=newprops, how='left', left_on='Major', right_on='Major')
merged_data["Shift"] = merged_data["Proportion of recent graduates"] - merged_data['Proportion']
merged_data.rename(columns={'Proportion':'Proportion of all graduates'}, inplace=True)