In [1]:
import pandas as pd
import numpy as np

Excel files downloaded from:
https://ncsesdata.nsf.gov/doctoratework/2017/

"Survey of Doctorate Recipients Survey Year 2017"
From the website above: "The Survey of Doctorate Recipients (SDR) provides demographic, education, and career history information from individuals with a U.S. research doctoral degree in a science, engineering, or health (SEH) field. The SDR is sponsored by the National Center for Science and Engineering Statistics and by the National Institutes of Health. Conducted since 1973, the SDR is a unique source of information about the educational and occupational achievements and career movement of U.S.-trained doctoral scientists and engineers in the United States and abroad."



Third sheet to import is Table 54. Median annual salaries of U.S. residing full-time employed doctoral scientists and engineers, by field of doctorate and sector of employment: 2017

Footnotes from Table 54:
Codes used in data tables: * = suppressed when population estimate < 25. D = suppressed to avoid disclosure of confidential information. na = not applicable. S = suppressed for reliability; coefficient of variation exceeds publication standards.								
SE = standard error.								
a Includes 4-year colleges or universities, medical schools (including university-affiliated hospitals or medical centers), and university-affiliated research institutes.								
b Includes 2-year colleges, community colleges, or technical institutes, and other precollege institutions.								
c Includes those self-employed in an incorporated business.								
d Self-employed or business owner in a nonincorporated business.								
e Includes employers not broken out separately.								
NOTES: Median annual salaries are for principal job and are rounded to nearest $1,000. Standard errors are rounded up to the nearest $500. Residence location is based on reported living location on 1 February 2017.								
SOURCE:  National Science Foundation, National Center for Science and Engineering Statistics, Survey of Doctorate Recipients: 2017.								

In [2]:
# Load Table 54 (median salaries by field of doctorate and employment sector).
# skiprows=3 skips the three spreadsheet lines above the header row, so the
# sector names become the column labels.
# The path uses forward slashes so it resolves on Linux/macOS as well as on
# Windows (a raw string with backslashes is only a valid path on Windows).
sectorDF = pd.read_excel('../data/raw/sdr2017_dst_54.xlsx', skiprows=3)
sectorDF.head()

Unnamed: 0,Field of study,All full-time employed,Unnamed: 2,4-year educational institutiona,Unnamed: 4,Other educational institutionb,Unnamed: 6,"Private, for profitc",Unnamed: 8,"Private, nonprofit",Unnamed: 10,Federal government,Unnamed: 12,State or local government,Unnamed: 14,Self-employedd,Unnamed: 16,Othere,Unnamed: 18
0,,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE,Median salary,SE
1,All fields,110000,500,90000,500,71000,1000,137000,2000,114000,2000,120000,500,90000,1000,99000,500,124000,4500
2,Science,104000,1000,88000,1000,71000,1000,135000,1000,109000,500,118000,2000,86000,3000,100000,2000,124000,4500
3,"Biological, agricultural, and environmental li...",100000,500,84000,1500,69000,1000,130000,1000,105000,4000,110000,3000,79000,2500,99000,9500,89000,8000
4,Agricultural and food sciences,106000,3000,94000,3000,88000,20000,125000,2500,100000,9500,115000,3000,79000,11000,89000,16500,65000,18000


In [3]:
# Look at the last 12 rows to see where the data ends and the footnotes begin.
sectorDF.iloc[-12:]

Unnamed: 0,Field of study,All full-time employed,Unnamed: 2,4-year educational institutiona,Unnamed: 4,Other educational institutionb,Unnamed: 6,"Private, for profitc",Unnamed: 8,"Private, nonprofit",Unnamed: 10,Federal government,Unnamed: 12,State or local government,Unnamed: 14,Self-employedd,Unnamed: 16,Othere,Unnamed: 18
31,Other engineering,123000.0,3000.0,100000.0,2500.0,70000.0,12500.0,134000.0,3000.0,125000.0,5000.0,129000.0,5500.0,112000.0,7500.0,115000,22000,99000.0,19000.0
32,Health,104000.0,1500.0,91000.0,2000.0,84000.0,5000.0,140000.0,8000.0,134000.0,8000.0,119000.0,4500.0,108000.0,7500.0,S,S,96000.0,6500.0
33,,,,,,,,,,,,,,,,,,,
34,Codes used in data tables: * = suppressed when...,,,,,,,,,,,,,,,,,,
35,SE = standard error.,,,,,,,,,,,,,,,,,,
36,"a Includes 4-year colleges or universities, me...",,,,,,,,,,,,,,,,,,
37,"b Includes 2-year colleges, community colleges...",,,,,,,,,,,,,,,,,,
38,c Includes those self-employed in an incorpora...,,,,,,,,,,,,,,,,,,
39,d Self-employed or business owner in a noninco...,,,,,,,,,,,,,,,,,,
40,e Includes employers not broken out separately.,,,,,,,,,,,,,,,,,,


In [4]:
# Keep only the real data rows of Table 54.
#
# Three kinds of rows must go:
#   * row 0, the "Median salary / SE" subheading row (no field name),
#   * the blank separator row after the data (no field name, no values),
#   * the footnote rows at the bottom (text in the first column, no values).
#
# Rows are selected by content instead of by position (drop the last 10, then
# the first). A positional in-place drop removes additional, real rows every
# time the cell is re-run; this filter gives the same result on a fresh frame
# and is a no-op when applied again.
is_data_row = (
    sectorDF['Field of study'].notna()
    & sectorDF.drop(columns='Field of study').notna().any(axis=1)
)
# .copy() so the later in-place edits act on an independent frame
# (avoids SettingWithCopyWarning).
sectorDF = sectorDF.loc[is_data_row].copy()

In [5]:
# Only the median salaries are of interest, so discard the standard-error (SE)
# columns. The subheading row (row 0 of the raw sheet) shows that the columns
# pandas labelled "Unnamed: N" are exactly the SE columns.
se_columns = [label for label in sectorDF.columns if 'Unnamed' in label]
sectorDF.drop(columns=se_columns, inplace=True)


In [6]:
# Index the table by field of doctorate.
sectorDF.set_index('Field of study', inplace=True)

# Several spreadsheet headings end with a footnote marker letter (a-e);
# map each of them to the clean heading.
footnoted_headings = {
    '4-year educational institutiona': '4-year educational institution',
    'Other educational institutionb': 'Other educational institution',
    'Private, for profitc': 'Private, for profit',
    'Self-employedd': 'Self-employed',
    'Othere': 'Other',
}
sectorDF.rename(columns=footnoted_headings, inplace=True)

# Summary of the cleaned frame: row count, column names, dtypes.
sectorDF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, All fields to Health
Data columns (total 9 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   All full-time employed          32 non-null     object
 1   4-year educational institution  32 non-null     object
 2   Other educational institution   32 non-null     object
 3   Private, for profit             32 non-null     object
 4   Private, nonprofit              32 non-null     object
 5   Federal government              32 non-null     object
 6   State or local government       32 non-null     object
 7   Self-employed                   32 non-null     object
 8   Other                           32 non-null     object
dtypes: object(9)
memory usage: 2.5+ KB


In [7]:
# Preview the first five rows of the cleaned salary table.
sectorDF.head(5)

Unnamed: 0_level_0,All full-time employed,4-year educational institution,Other educational institution,"Private, for profit","Private, nonprofit",Federal government,State or local government,Self-employed,Other
Field of study,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
All fields,110000,90000,71000,137000,114000,120000,90000,99000,124000
Science,104000,88000,71000,135000,109000,118000,86000,100000,124000
"Biological, agricultural, and environmental life sciences",100000,84000,69000,130000,105000,110000,79000,99000,89000
Agricultural and food sciences,106000,94000,88000,125000,100000,115000,79000,89000,65000
Biochemistry and biophysics,109000,82000,57000,129000,131000,114000,108000,68000,D


Fourth sheet to import is Table 1-1. U.S. residing doctoral scientists and engineers, by fine field of doctorate and employment status: 2017

Footnotes from Table 1-1:
Codes used in data tables: * = suppressed when population estimate < 25. D = suppressed to avoid disclosure of confidential information. na = not applicable. S = suppressed for reliability; coefficient of variation exceeds publication standards.										
SE = standard error.										
a Unemployed includes individuals who were not working during the survey reference week but had been seeking work in the prior 4 weeks or who were on layoff from their job.										
b Not employed or not seeking work includes individuals who were not working during the survey reference week and had not been seeking work in the prior 4 weeks because of family responsibilities, chronic illness, or other reasons.										
NOTES: Numbers are rounded to the nearest 50. Standard errors are rounded up to the nearest 25. Detail may not add to total because of rounding. Designation of full-time and part-time employment status is based on principal job only, not on all jobs held in labor force. For example, an individual could work part time in his or her principal job but full time in the labor force. Residence location is based on reported living location on 1 February 2017.										
SOURCE:  National Science Foundation, National Center for Science and Engineering Statistics, Survey of Doctorate Recipients: 2017.										

In [8]:
# Load Table 1-1 (doctoral scientists and engineers by field and employment
# status). skiprows=3 skips the three spreadsheet lines above the header row.
# The path uses forward slashes so it resolves on Linux/macOS as well as on
# Windows (a raw string with backslashes is only a valid path on Windows).
numbersDF = pd.read_excel('../data/raw/sdr2017_dst_1-1.xlsx', skiprows=3)
numbersDF.head()

Unnamed: 0,Field of study,Total,Unnamed: 2,Employed,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unemployeda,Unnamed: 10,Retired,Unnamed: 12,Not employed or not seeking workb,Unnamed: 14
0,,,,Total,,Full time,,Part time,,,,,,,
1,,Number,SE,Number,SE,Number,SE,Number,SE,Number,SE,Number,SE,Number,SE
2,All fields,967500,1225,815100,1400,721950,1925,93150,1575,17100,625,113450,1250,21850,750
3,Science,736700,1250,612450,1500,533800,1875,78700,1425,12800,575,93550,1050,17900,700
4,"Biological, agricultural, and environmental li...",246850,725,206900,925,188650,1100,18250,625,5150,350,28100,750,6700,400


In [9]:
# Move the column labels into the table body: transposing, resetting the
# index and transposing back adds a first row (labelled 'index') that holds
# the original column names, next to the two subheading rows of the sheet.
numbersDF = numbersDF.T.reset_index().T

In [10]:
# Positional row 2 is the "Number / SE" subheading row. Its first cell (above
# the field names) is empty, so give it a placeholder string -- presumably so
# that every column label is a string for the `.str` filter applied later.
number_se_row = 2
numbersDF.iloc[number_se_row, 0] = 'dummy'

# Temporarily use that subheading row as the column labels.
numbersDF.columns = numbersDF.iloc[number_se_row]

In [11]:
# Discard every column whose current label contains 'SE' (standard error);
# only the 'Number' columns and the field-name column remain.
is_se_column = numbersDF.columns.str.contains('SE')
numbersDF.drop(columns=numbersDF.columns[is_se_column], inplace=True)


In [12]:
# Check which columns are left and how many non-null values each one holds.
numbersDF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108 entries, index to 106
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   dummy   106 non-null    object
 1   Number  100 non-null    object
 2   Number  101 non-null    object
 3   Number  101 non-null    object
 4   Number  101 non-null    object
 5   Number  100 non-null    object
 6   Number  100 non-null    object
 7   Number  100 non-null    object
dtypes: object(8)
memory usage: 7.6+ KB


In [13]:
# Switch the column labels to the first row, which holds the spreadsheet's
# original headings (e.g. 'Field of study', 'Total', 'Employed').
numbersDF.columns = numbersDF.iloc[0]

In [14]:
# Preview the first five rows; the top three are still header rows.
numbersDF.head(5)

index,Field of study,Total,Employed,Unnamed: 5,Unnamed: 7,Unemployeda,Retired,Not employed or not seeking workb
index,Field of study,Total,Employed,Unnamed: 5,Unnamed: 7,Unemployeda,Retired,Not employed or not seeking workb
0,,,Total,Full time,Part time,,,
1,dummy,Number,Number,Number,Number,Number,Number,Number
2,All fields,967500,815100,721950,93150,17100,113450,21850
3,Science,736700,612450,533800,78700,12800,93550,17900


In [15]:
# Confirm the column labels and non-null counts before the final renaming.
numbersDF.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108 entries, index to 106
Data columns (total 8 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   Field of study                     106 non-null    object
 1   Total                              100 non-null    object
 2   Employed                           101 non-null    object
 3   Unnamed: 5                         101 non-null    object
 4   Unnamed: 7                         101 non-null    object
 5   Unemployeda                        100 non-null    object
 6   Retired                            100 non-null    object
 7   Not employed or not seeking workb  100 non-null    object
dtypes: object(8)
memory usage: 7.6+ KB


In [16]:
# Final column names: fill in the two 'Unnamed' headings from the sheet's
# second header row (Full time / Part time) and strip the footnote marker
# letters from 'Unemployeda' and 'Not employed or not seeking workb'.
numbersDF.rename(columns={
    'Total': 'Graduates',
    'Employed': 'Total Employed',
    'Unnamed: 5': 'Full Time',
    'Unnamed: 7': 'Part Time',
    'Unemployeda': 'Unemployed',
    'Not employed or not seeking work' + 'b': 'Not employed or not seeking work'}, inplace = True)

# The first three rows are header rows (original headings, Total/Full time/
# Part time subheadings, Number/SE subheadings), not data.
numbersDF.drop(numbersDF.index[0:3], inplace = True)

# The sheet ends with a blank separator row followed by footnote rows; they
# have no counts (and the blank row has no field name). Remove them so they
# are not exported as if they were fields of study. Suppression codes such as
# '*' are strings, not missing values, so real data rows are kept.
numbersDF.dropna(subset=['Field of study', 'Graduates'], inplace = True)

numbersDF.set_index('Field of study', inplace = True)

In [17]:
# Preview the first five rows of the cleaned counts table.
numbersDF.head(5)

index,Graduates,Total Employed,Full Time,Part Time,Unemployed,Retired,Not employed or not seeking work
Field of study,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
All fields,967500,815100,721950,93150,17100,113450,21850
Science,736700,612450,533800,78700,12800,93550,17900
"Biological, agricultural, and environmental life sciences",246850,206900,188650,18250,5150,28100,6700
Agricultural and food sciences,20800,16350,15050,1300,200,3700,500
Agricultural sciences,1400,1000,800,200,*,400,*


In [18]:
# Save both cleaned tables as intermediate CSV files for later processing.
sectorDF.to_csv(path_or_buf='../data/inProcess/sector.csv')
numbersDF.to_csv(path_or_buf='../data/inProcess/numbers.csv')