In [1]:
%load_ext autoreload 
%autoreload 2

In [27]:
from scipeds.data.queries import QueryFilters, TaxonomyRollup
from scipeds.data.completions import CompletionsQueryEngine
from scipeds.data.enums import FieldTaxonomy

from pathlib import Path

import pandas as pd

In [28]:
db_path = Path('data/processed/ipeds.duckdb')

In [29]:
engine = CompletionsQueryEngine(db_path=db_path)

In [30]:
qf = QueryFilters(start_year=1984, end_year=1994)


First let's diagnose the problem -- how many of the CIP codes (converted to their cip2020 version) in the 1984-1994 data are unknown / not in NCSES?

In [31]:
df = engine.field_totals_by_grouping(grouping='gender', query_filters=qf, taxonomy=FieldTaxonomy.cip, by_year=True)
df = df.reset_index()
df.head()

Unnamed: 0,FieldTaxonomy.cip,gender,year,field_degrees_within_gender,field_degrees_total,uni_degrees_within_gender,uni_degrees_total
0,1.0,men,1984,1779,2354,1968133,3987235
1,1.0,men,1985,1649,2222,1962701,4011107
2,1.0,men,1986,1451,1974,1951076,4017624
3,1.0,men,1987,1218,1705,1036892,2163263
4,1.0,men,1988,1192,1644,1112371,2324652


In [32]:
cipdf = engine.get_cip_table()
cipdf[cipdf['cip_title'] == "Unknown"]

Unnamed: 0_level_0,cip_title,ncses_sci_group,ncses_field_group,ncses_detailed_field_group,nsf_broad_field,dhs_stem
cip2020,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15.1301,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,True
15.0302,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
50.0402,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
52.0201,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
52.0302,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
...,...,...,...,...,...,...
18.1029,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
17.0703,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
17.0603,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
01.0202,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False


In [33]:
df_w_cip = pd.merge(
    df, cipdf,
    left_on=FieldTaxonomy.cip, left_index=False,
    right_index=True, 
    validate='many_to_one'
)
df_w_cip.head()

Unnamed: 0,FieldTaxonomy.cip,gender,year,field_degrees_within_gender,field_degrees_total,uni_degrees_within_gender,uni_degrees_total,cip_title,ncses_sci_group,ncses_field_group,ncses_detailed_field_group,nsf_broad_field,dhs_stem
0,1.0,men,1984,1779,2354,1968133,3987235,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
1,1.0,men,1985,1649,2222,1962701,4011107,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
2,1.0,men,1986,1451,1974,1951076,4017624,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
3,1.0,men,1987,1218,1705,1036892,2163263,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
4,1.0,men,1988,1192,1644,1112371,2324652,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False


In [34]:
# How many unique unclassified CIPs per year?
df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip]].drop_duplicates().groupby('year').size()

year
1984    439
1985    456
1986    447
1987    391
1988    383
1989    387
1990    408
1991    400
1992    109
1993    109
1994    109
dtype: int64

In [35]:
# How many total CIPs per year?
df_w_cip[['year', FieldTaxonomy.cip]].drop_duplicates().groupby('year').size()

year
1984    969
1985    989
1986    978
1987    938
1988    927
1989    931
1990    953
1991    941
1992    829
1993    829
1994    825
dtype: int64

Ok so for earlier years it's like half of the CIPs that aren't classified.

We need to figure out why these CIPs aren't being classified. Likely becuase they're not in the 85 -> 90 classifier?

In [36]:
unknown_cips = df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip]].drop_duplicates()
unknown_cips.head()

Unnamed: 0,year,FieldTaxonomy.cip
176,1985,1.0202
177,1984,1.0203
178,1985,1.0203
179,1986,1.0203
180,1987,1.0203


In [37]:
unknown_cips[unknown_cips[FieldTaxonomy.cip] == "01.0202"]

Unnamed: 0,year,FieldTaxonomy.cip
176,1985,1.0202


01.0202
- listed as "10202 - Agricultural Electrification Power and Controls" in the 1985 data dictionary.
- Only has one degree?

Let's see how many _students_ these unknowns represent per year

In [38]:
df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip, "field_degrees_total"]].drop_duplicates().groupby('year')["field_degrees_total"].sum()

year
1984    2725851
1985    2743180
1986    2747222
1987     791467
1988     912598
1989     966911
1990     855664
1991     899410
1992     828244
1993     843760
1994     858496
Name: field_degrees_total, dtype: int64

Uhhh wait is that _all_ of the degrees in 1984?

In [39]:
df_w_cip[['year', 'uni_degrees_total']].drop_duplicates()

Unnamed: 0,year,uni_degrees_total
0,1984,3987235
1,1985,4011107
2,1986,4017624
3,1987,2163263
4,1988,2324652
5,1989,2412546
6,1990,2320375
7,1991,2408979
8,1992,2550460
9,1993,2616641


In [40]:
proportion_missing = pd.merge(
    df_w_cip[['year', 'uni_degrees_total']].drop_duplicates(),
    df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip, "field_degrees_total"]].drop_duplicates().groupby('year')["field_degrees_total"].sum(),
    right_index=True,
    left_on='year'
).rename(columns={'field_degrees_total': 'unknown_cip_total'})

proportion_missing['fraction_unknown_cips'] = proportion_missing['unknown_cip_total'] / proportion_missing['uni_degrees_total']

proportion_missing

Unnamed: 0,year,uni_degrees_total,unknown_cip_total,fraction_unknown_cips
0,1984,3987235,2725851,0.683644
1,1985,4011107,2743180,0.683896
2,1986,4017624,2747222,0.683793
3,1987,2163263,791467,0.365867
4,1988,2324652,912598,0.392574
5,1989,2412546,966911,0.400784
6,1990,2320375,855664,0.368761
7,1991,2408979,899410,0.373357
8,1992,2550460,828244,0.324743
9,1993,2616641,843760,0.322459


Ok, let's start with the biggest unknowns for 84-86 and see if we can find them.

In [41]:
(
    df_w_cip[
        (df_w_cip['cip_title'] == "Unknown") & (df_w_cip['year'].isin([1984, 1985, 1986]))
    ]
    .groupby([FieldTaxonomy.cip])
    ['field_degrees_total']
    .sum()
).sort_values(ascending=False)

  .groupby([FieldTaxonomy.cip])


FieldTaxonomy.cip
99.0000    12022994
52.0201      713528
51.3801      471772
52.0101      418684
52.0301      347698
             ...   
16.0599           0
16.0502           0
16.0501           0
16.0499           0
19.0702           0
Name: field_degrees_total, Length: 1654, dtype: int64

06.0401 is in the CIP85toCIP90 crosswalk! 

`06.0401	Business Administration	52.0201	Business Administration & Management, General`

Oh, we just weren't reading in that crosswalk. Let's try adding that to the code and then...

Hm, now 06.0401 is being mapped correctly to its 1990 CIP code of 52.0201 but nothing after that.

In [43]:
cipdf.loc['52.0201']

cip_title                                                Unknown
ncses_sci_group               Not categorized in NCSES crosswalk
ncses_field_group             Not categorized in NCSES crosswalk
ncses_detailed_field_group    Not categorized in NCSES crosswalk
nsf_broad_field                      Non-science and engineering
dhs_stem                                                   False
Name: 52.0201, dtype: object

Ok it's being mapped from 1985 to 1990 correctly, but then it's not present in the 1990 crosswalk. Does that mean it should just be propagated? Or it needs to be in the dict?