In [233]:
%load_ext autoreload 
%autoreload 2

In [234]:
from scipeds.data.queries import QueryFilters, TaxonomyRollup
from scipeds.data.completions import CompletionsQueryEngine
from scipeds.data.enums import FieldTaxonomy

from pathlib import Path

import pandas as pd

In [235]:
db_path = Path('data/processed/ipeds.duckdb')

In [236]:
engine = CompletionsQueryEngine(db_path=db_path)

In [237]:
qf = QueryFilters(start_year=1984, end_year=1994)


First let's diagnose the problem -- how many of the CIP codes (converted to their cip2020 version) in the 1984-1994 data are unknown / not in NCSES?

In [238]:
df = engine.field_totals_by_grouping(grouping='gender', query_filters=qf, taxonomy=FieldTaxonomy.cip, by_year=True)
df = df.reset_index()
df.head()

Unnamed: 0,FieldTaxonomy.cip,gender,year,field_degrees_within_gender,field_degrees_total,uni_degrees_within_gender,uni_degrees_total
0,1.0,men,1984,1779,2354,983227,1991889
1,1.0,men,1985,1649,2222,980838,2004285
2,1.0,men,1986,1451,1974,975329,2008295
3,1.0,men,1987,1218,1705,1036892,2163263
4,1.0,men,1988,1192,1644,1112371,2324652


In [239]:
cipdf = engine.get_cip_table()
cipdf[cipdf['cip_title'] == "Unknown"]

Unnamed: 0_level_0,cip_title,ncses_sci_group,ncses_field_group,ncses_detailed_field_group,nsf_broad_field,dhs_stem
cip2020,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28.0301,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
06.1799,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
20.0102,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
20.0101,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
20.0205,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
...,...,...,...,...,...,...
48.0210,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
08.0904,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
08.0806,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False
08.1002,Unknown,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Not categorized in NCSES crosswalk,Non-science and engineering,False


In [240]:
df_w_cip = pd.merge(
    df, cipdf,
    left_on=FieldTaxonomy.cip, left_index=False,
    right_index=True, 
    validate='many_to_one'
)
df_w_cip.head()

Unnamed: 0,FieldTaxonomy.cip,gender,year,field_degrees_within_gender,field_degrees_total,uni_degrees_within_gender,uni_degrees_total,cip_title,ncses_sci_group,ncses_field_group,ncses_detailed_field_group,nsf_broad_field,dhs_stem
0,1.0,men,1984,1779,2354,983227,1991889,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
1,1.0,men,1985,1649,2222,980838,2004285,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
2,1.0,men,1986,1451,1974,975329,2008295,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
3,1.0,men,1987,1218,1705,1036892,2163263,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False
4,1.0,men,1988,1192,1644,1112371,2324652,"Agriculture, Agriculture Operations And Relate...",Science and engineering,Life Sciences,Agricultural Sciences,Agricultural and biological sciences,False


In [241]:
# How many unique unclassified CIPs per year?
df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip]].drop_duplicates().groupby('year').size()

year
1984    123
1985    132
1986    129
1987     52
1988     41
1989     42
1990    403
1991    394
1992      1
1993      1
1994      1
dtype: int64

In [242]:
# How many total CIPs per year?
df_w_cip[['year', FieldTaxonomy.cip]].drop_duplicates().groupby('year').size()

year
1984    806
1985    819
1986    814
1987    754
1988    742
1989    744
1990    953
1991    941
1992    829
1993    829
1994    825
dtype: int64

Ok so for earlier years it's like half of the CIPs that aren't classified.

We need to figure out why these CIPs aren't being classified. Likely becuase they're not in the 85 -> 90 classifier?

In [243]:
unknown_cips = df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip]].drop_duplicates()
unknown_cips.head()

Unnamed: 0,year,FieldTaxonomy.cip
176,1990,1.0203
178,1991,1.0203
201,1990,1.0206
202,1991,1.0206
313,1990,1.0305


In [244]:
unknown_cips[unknown_cips[FieldTaxonomy.cip] == "01.0202"]

Unnamed: 0,year,FieldTaxonomy.cip


01.0202
- listed as "10202 - Agricultural Electrification Power and Controls" in the 1985 data dictionary.
- Only has one degree?

Let's see how many _students_ these unknowns represent per year

In [245]:
df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip, "field_degrees_total"]].drop_duplicates().groupby('year')["field_degrees_total"].sum()

year
1984     20380
1985     21083
1986     23703
1987      3981
1988     80657
1989    138020
1990    844741
1991    887294
1992     51136
1993     30515
1994     20746
Name: field_degrees_total, dtype: int64

Uhhh wait is that _all_ of the degrees in 1984?

In [246]:
df_w_cip[['year', 'uni_degrees_total']].drop_duplicates()

Unnamed: 0,year,uni_degrees_total
0,1984,1991889
1,1985,2004285
2,1986,2008295
3,1987,2163263
4,1988,2324652
5,1989,2412546
6,1990,2320375
7,1991,2408979
8,1992,2550460
9,1993,2616641


In [247]:
proportion_missing = pd.merge(
    df_w_cip[['year', 'uni_degrees_total']].drop_duplicates(),
    df_w_cip[df_w_cip['cip_title'] == "Unknown"][['year', FieldTaxonomy.cip, "field_degrees_total"]].drop_duplicates().groupby('year')["field_degrees_total"].sum(),
    right_index=True,
    left_on='year'
).rename(columns={'field_degrees_total': 'unknown_cip_total'})

proportion_missing['fraction_unknown_cips'] = proportion_missing['unknown_cip_total'] / proportion_missing['uni_degrees_total']

proportion_missing

Unnamed: 0,year,uni_degrees_total,unknown_cip_total,fraction_unknown_cips
0,1984,1991889,20380,0.010231
1,1985,2004285,21083,0.010519
2,1986,2008295,23703,0.011803
3,1987,2163263,3981,0.00184
4,1988,2324652,80657,0.034696
5,1989,2412546,138020,0.057209
6,1990,2320375,844741,0.364054
7,1991,2408979,887294,0.368328
8,1992,2550460,51136,0.02005
9,1993,2616641,30515,0.011662


Ok, let's start with the biggest unknowns for 84-86 and see if we can find them.

In [248]:
(
    df_w_cip[
        (df_w_cip['cip_title'] == "Unknown") & (df_w_cip['year'].isin([1984, 1985, 1986]))
    ]
    .groupby([FieldTaxonomy.cip])
    ['field_degrees_total']
    .sum()
).sort_values(ascending=False)

  .groupby([FieldTaxonomy.cip])


FieldTaxonomy.cip
07.0608    27320
50.0801    14874
47.0602    14642
20.0102     9914
15.0604     6398
           ...  
15.1701        0
15.1501        0
15.1399        0
15.1306        0
95.9500        0
Name: field_degrees_total, Length: 1689, dtype: int64

In [249]:
# Now 1990 is the worst offender, what's up?
(
    df_w_cip[
        (df_w_cip['cip_title'] == "Unknown") & (df_w_cip['year'].isin([1990, 1991]))
    ]
    .groupby([FieldTaxonomy.cip])
    ['field_degrees_total']
    .sum()
).sort_values(ascending=False)

  .groupby([FieldTaxonomy.cip])


FieldTaxonomy.cip
06.0401    533038
18.1101    340192
06.0101    286266
06.0201    227630
95.0000    176038
            ...  
16.0302         0
16.0301         0
16.0201         0
16.0104         0
95.9500         0
Name: field_degrees_total, Length: 1689, dtype: int64

# 2/21 update:

Uhhh... 06.0401 is back to causing trouble. I've gotten all the other years way down, need to figure out what's up with the 1990 and 1991 CIPs but it's too late and my brain is tired.

06.0401 is in the CIP85toCIP90 crosswalk! 

`06.0401	Business Administration	52.0201	Business Administration & Management, General`

Oh, we just weren't reading in that crosswalk. Let's try adding that to the code and then...

Hm, now 06.0401 is being mapped correctly to its 1990 CIP code of 52.0201 but nothing after that.

In [17]:
cipdf.loc['52.0201']

cip_title                     Business Administration and Management, General
ncses_sci_group                                   Non-science and engineering
ncses_field_group                                     Business and Management
ncses_detailed_field_group                            Business and Management
nsf_broad_field                                   Non-science and engineering
dhs_stem                                                                False
Name: 52.0201, dtype: object

Ok it's being mapped from 1985 to 1990 correctly, but then it's not present in the 1990 crosswalk. Does that mean it should just be propagated? Or it needs to be in the dict?

Update: it was becuase the NCSES classifier was only classifying the original CIP code, not the 2020 one. And this original cip code (1) doesn't have a cip title (bc the original crosswalk file didn't have one) and (2) isn't in the NCSES classifier.

Ok so now we still have a bunch of missing codes, let's dig into those.

Update: the vast majority were "total" cips that weren't being seen. I fixed that.


Now it looks like the early 90's are most problematic. What's going on here? Let's look at 07.0305 as an example...

In [18]:
cipdf.loc["07.0305"]

cip_title                                                Unknown
ncses_sci_group               Not categorized in NCSES crosswalk
ncses_field_group             Not categorized in NCSES crosswalk
ncses_detailed_field_group    Not categorized in NCSES crosswalk
nsf_broad_field                      Non-science and engineering
dhs_stem                                                   False
Name: 07.0305, dtype: object

07.0305 Business Data Programming (from [here](https://nces.ed.gov/pubs91/91396.pdf))

Not finding it in the NSF data or in the Excel with all the crosswalks. Uh oh...

But also in that pdf, it looks like it SHOULD be in the 85 -> 90 crosswalk:

07.0305 52.1202 Business Computer Programming/Programmer

What about the next most common one, `17.0602`?

This one is also in the pdf but not in the Excel file...

Same with this guy: 43.0105

# Read the crosswalk from the CIP 90 pdf

https://nces.ed.gov/pubs91/91396.pdf

I then opened the pdf in preview and dragged 

In [15]:
import fitz  # PyMuPDF
from pathlib import Path
import pandas as pd

In [201]:


# Open the PDF file
pdf_path = Path("/Users/claireduvallet/Downloads/91396 (dragged).pdf")
doc = fitz.open(pdf_path)

# Extract text from all pages
pdf_text = '\n'.join([page.get_text("text") for page in doc])

pdf_text = pdf_text.split('\n')

pdf_text = pdf_text[16:]

# Remove the Chapter lines
ch_idx_to_remove = {
    'CHAPTER TWO': [-1, 1], # Ch. 2 has an empty string before the CHAPTER TWO line
    'CHAPTER THREE': [0, 1],
    'CHAPTER FOUR': [0, 1],
    'CHAPTER FIVE': [-1, 1],
    'CHAPTER SIX': [-1, 1]
}
rm_idxs = []
for k, v in ch_idx_to_remove.items():
    i = pdf_text.index(k)
    rm_idxs += list(range(i + v[0], i + v[1] + 1))

pdf_text = [t for i, t in enumerate(pdf_text) if i not in rm_idxs]

# Remove deleted CIP codes, they don't have a 90 CIP
deleted_idxs = [i for i, t in enumerate(pdf_text) if t == "Deleted"]
rm_idxs = []
for i in deleted_idxs:
    rm_idxs += [i-1, i]

pdf_text = [t for i, t in enumerate(pdf_text) if i not in rm_idxs]

# Remove one "Assign to Specific Hobby (see Appendix D)", also doesn't have a 90 CIP
deleted_idxs = [i for i, t in enumerate(pdf_text) if t == "Assign to Specific Hobby (see Appendix D)"]
rm_idxs = []
for i in deleted_idxs:
    rm_idxs += [i-1, i]

pdf_text = [t for i, t in enumerate(pdf_text) if i not in rm_idxs]

# Remove any remaining empty strings
pdf_text = [t for t in pdf_text if t != '']

# For some reason only one line got parsed incorrectly
idx = 1004
pdf_text = pdf_text[:idx] + [' '.join(pdf_text[idx:idx+8])] + pdf_text[idx+8:]

with open('extracted_text.txt', 'w') as f:
    f.write('\n'.join(pdf_text))

pdf_df = pd.DataFrame(
    data=[pdf_text[::3], pdf_text[1::3], pdf_text[2::3]],
    index=['CIP85', 'CIP90', 'CIP90TITLE']
).T.replace('', None).dropna(how='all')

pdf_df.head()


Unnamed: 0,CIP85,CIP90,CIP90TITLE
0,1.0101,1.0101,"Agricultural Business and Management, General"
1,1.0102,1.0102,Agricultural Business/Agribusiness Operations
2,1.0103,1.0103,Agricultural Economics
3,1.0104,1.0104,Farm and Ranch Management
4,1.0199,1.0199,"Agricultural Business and Management, Other"


In [205]:
from pipeline.cip_crosswalk import CIPCodeCrosswalk

In [206]:
cw = CIPCodeCrosswalk()
cw._load_1985_to_1990_crosswalk()

In [207]:
original_crosswalk = cw.crosswalk[(1984, 1989)]['cip_map']
# original_crosswalk

In [213]:
pdf_crosswalk = dict(zip(pdf_df['CIP85'], pdf_df['CIP90']))
pdf_crosswalk = {k: v for k, v in pdf_crosswalk.items() if k != v}
# pdf_crosswalk

In [214]:
len(original_crosswalk), len(pdf_crosswalk)

(136, 472)

In [215]:
# Any duplicates in the PDF crosswalk?
len(set(pdf_crosswalk.keys()))

472

In [217]:
len([k for k in pdf_crosswalk.keys() if k not in original_crosswalk.keys()])

337

In [219]:
# Keys present in both dictionaries
len(set(pdf_crosswalk.keys()) & set(original_crosswalk.keys()))

135

In [221]:
# Unique keys in each dictionary
unique_to_orig = set(original_crosswalk.keys()) - set(pdf_crosswalk.keys())
unique_to_pdf = set(pdf_crosswalk.keys()) - set(original_crosswalk.keys())

unique_to_orig

{'36.0108'}

In [223]:
len(unique_to_pdf)

337

In [227]:
# Of the keys that are in both dicts, do they map to the same values?
dup_keys = list(set(pdf_crosswalk.keys()) & set(original_crosswalk.keys()))
sum([original_crosswalk[k] != pdf_crosswalk[k] for k in dup_keys])

4

In [232]:
[(k, original_crosswalk[k], pdf_crosswalk[k]) for k in dup_keys if original_crosswalk[k] != pdf_crosswalk[k]]

[('17.0499', '51.0301', '51.1599'),
 ('13.0801', '13.0802', '42.1701'),
 ('26.0201', '26.0202', '26.0203'),
 ('18.0701', '51.0701', '51.0702')]

17.0499 is in the pdf twice, one maps to 51.0301 (what the excel maps to) and the other maps to 51.1599...
same with 13.0801


In [230]:
updated_dict = {**dict2, **dict1}  # dict1 values overwrite dict2 where keys overlap

('51.0301', '51.1599')