## Import libraries

In [2]:
import numpy as np
import pandas as pd

## Read the data

In [6]:
# Load the NLS97 CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="raw_data/nls97.csv")

In [8]:
# Promote personid to the row index. Rebinding (instead of inplace=True) plus the
# guard lets this cell be re-run without a KeyError once personid is already the index.
if df.index.name != "personid":
    df = df.set_index("personid")

In [10]:
# List every column label of the loaded frame.
df.columns

Index(['gender', 'birthmonth', 'birthyear', 'highestgradecompleted',
       'maritalstatus', 'childathome', 'childnotathome', 'wageincome',
       'weeklyhrscomputer', 'weeklyhrstv', 'nightlyhrssleep', 'satverbal',
       'satmath', 'gpaoverall', 'gpaenglish', 'gpamath', 'gpascience',
       'highestdegree', 'govprovidejobs', 'govpricecontrols', 'govhealthcare',
       'govelderliving', 'govindhelp', 'govunemp', 'govincomediff',
       'govcollegefinance', 'govdecenthousing', 'govprotectenvironment',
       'weeksworked00', 'weeksworked01', 'weeksworked02', 'weeksworked03',
       'weeksworked04', 'weeksworked05', 'weeksworked06', 'weeksworked07',
       'weeksworked08', 'weeksworked09', 'weeksworked10', 'weeksworked11',
       'weeksworked12', 'weeksworked13', 'weeksworked14', 'weeksworked15',
       'weeksworked16', 'weeksworked17', 'colenrfeb97', 'colenroct97',
       'colenrfeb98', 'colenroct98', 'colenrfeb99', 'colenroct99',
       'colenrfeb00', 'colenroct00', 'colenrfeb01', 'col

## Look at the employment and education data

In [13]:
# Preview the income and education columns for the first three rows.
employment_education_cols = ["wageincome", "highestgradecompleted", "highestdegree"]
df.loc[:, employment_education_cols].head(3)

Unnamed: 0_level_0,wageincome,highestgradecompleted,highestdegree
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100061,12500.0,13.0,2. High School
100139,120000.0,12.0,2. High School
100284,58000.0,7.0,0. None


## Look at weeksworked between 2012 and 2017

In [16]:
# .loc label slices include both endpoints, so this spans weeksworked12 through weeksworked17.
weeks_2012_2017 = df.loc[:, "weeksworked12":"weeksworked17"]
weeks_2012_2017.head(3)

Unnamed: 0_level_0,weeksworked12,weeksworked13,weeksworked14,weeksworked15,weeksworked16,weeksworked17
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100061,40.0,52.0,52.0,52.0,48.0,48.0
100139,52.0,52.0,52.0,52.0,53.0,52.0
100284,0.0,,11.0,52.0,47.0,0.0


## Look at College enrolment between 2009 and 2014

In [19]:
# Inclusive label slice over the enrolment columns from October 2009 to October 2014.
enrolment_2009_2014 = df.loc[:, "colenroct09":"colenroct14"]
enrolment_2009_2014.head(3)

Unnamed: 0_level_0,colenroct09,colenrfeb10,colenroct10,colenrfeb11,colenroct11,colenrfeb12,colenroct12,colenrfeb13,colenroct13,colenrfeb14,colenroct14
personid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
100061,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,3. 4-year college,3. 4-year college,3. 4-year college,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled
100139,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled
100284,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled,1. Not enrolled


## Show individuals with wage income but no weeks worked in 2016

In [22]:
# Each comparison is evaluated on its own before combining: `&` binds more tightly
# than `>`, so `(a) & (df.wageincome) > 0` would be parsed as `((a) & df.wageincome) > 0`
# rather than the intended "zero weeks worked AND positive wage income".
no_weeks_worked_2016 = df["weeksworked16"] == 0
has_wage_income = df["wageincome"] > 0
df.loc[no_weeks_worked_2016 & has_wage_income, ["weeksworked16", "wageincome"]]

Unnamed: 0_level_0,weeksworked16,wageincome
personid,Unnamed: 1_level_1,Unnamed: 2_level_1
102625,0.0,1200.0
109403,0.0,5000.0
118704,0.0,25000.0
130701,0.0,12000.0
131151,0.0,65000.0
...,...,...
957344,0.0,90000.0
966697,0.0,65000.0
969334,0.0,5000.0
991756,0.0,9000.0


## Unique values of college enrolment

In [25]:
# Distinct values (including NaN) recorded for October 2014 enrolment.
df["colenroct14"].unique()

array(['1. Not enrolled', '2. 2-year college ', nan,
       '4. Graduate program', '3. 4-year college'], dtype=object)

## Check whether an individual was enrolled in a 4-year college course

In [28]:
# For every enrolment column, flag values whose first character is "3"
# (the "3. 4-year college" category); missing values compare as False.
# Show the first four respondents, transposed so periods run down the rows.
enrolment_cols = df.filter(like="colenr")
is_four_year = enrolment_cols.apply(lambda col: col.str[:1].eq("3"))
is_four_year.head(4).T

personid,100061,100139,100284,100292
colenrfeb97,False,False,False,False
colenroct97,False,False,False,False
colenrfeb98,False,False,False,False
colenroct98,False,False,False,False
colenrfeb99,False,False,False,False
colenroct99,False,False,False,False
colenrfeb00,False,False,False,False
colenroct00,False,False,False,True
colenrfeb01,False,False,False,True
colenroct01,False,False,False,True


In [30]:
# Collapse the per-period flags to one boolean per respondent:
# True when any enrolment column starts with "3".
four_year_flags = df.filter(like="colenr").apply(lambda col: col.str[:1].eq("3"))
four_year_flags.any(axis=1).head(2)

personid
100061     True
100139    False
dtype: bool

## Show individuals with a post-graduate enrolment but no bachelor's enrolment

In [33]:
# Enrolment category labels; the leading digit is what the masks below match on.
df["colenroct14"].unique()

array(['1. Not enrolled', '2. 2-year college ', nan,
       '4. Graduate program', '3. 4-year college'], dtype=object)

In [35]:
# Respondents with a graduate-program enrolment ("4") in at least one period
# but no 4-year-college enrolment ("3") in any period.
enrolment = df.filter(like="colenr")
ever_graduate = enrolment.apply(lambda col: col.str[:1].eq("4")).any(axis=1)
ever_four_year = enrolment.apply(lambda col: col.str[:1].eq("3")).any(axis=1)
no_bachelors = df.loc[ever_graduate & ~ever_four_year, "colenrfeb97":"colenroct17"]

In [37]:
# Number of respondents matched by the mask (row count).
no_bachelors.shape[0]

22

In [None]:
# NOTE(review): this cell has no execution count and the same call appears again
# a few cells further down; consider removing one of the two.
df["highestdegree"].value_counts()

In [39]:
# First two matched respondents, transposed so each enrolment period is a row.
no_bachelors.iloc[:2].transpose()

personid,153051,154535
colenrfeb97,,
colenroct97,1. Not enrolled,1. Not enrolled
colenrfeb98,1. Not enrolled,1. Not enrolled
colenroct98,1. Not enrolled,1. Not enrolled
colenrfeb99,1. Not enrolled,1. Not enrolled
colenroct99,1. Not enrolled,1. Not enrolled
colenrfeb00,1. Not enrolled,1. Not enrolled
colenroct00,2. 2-year college,1. Not enrolled
colenrfeb01,1. Not enrolled,1. Not enrolled
colenroct01,2. 2-year college,1. Not enrolled


In [42]:
# Frequency of each highest-degree category (missing values are excluded by default).
df["highestdegree"].value_counts()

highestdegree
2. High School     3667
4. Bachelors       1673
1. GED             1146
0. None             953
3. Associates       737
5. Masters          603
7. Professional     120
6. PhD               54
Name: count, dtype: int64

## Show individuals with bachelor's degree or more but no 4-year college enrolment

In [45]:
# Highest degree code 4-7 (Bachelors, Masters, PhD, Professional) but no
# enrolment column ever starting with "3" (4-year college).
bachelors_or_higher = df["highestdegree"].str[:1].isin(list("4567"))
ever_four_year = df.filter(like="colenr").apply(lambda col: col.str[:1].eq("3")).any(axis=1)
no_4_year_enrolment = df.loc[bachelors_or_higher & ~ever_four_year, "colenrfeb97":"colenroct17"]

In [47]:
# Number of respondents matched by the mask (row count).
no_4_year_enrolment.shape[0]

39

## Show individuals with high wage income (more than three standard deviations above the mean)

In [50]:
# Keep wage incomes more than three standard deviations above the mean.
wage_cutoff = df["wageincome"].mean() + 3 * df["wageincome"].std()
highwages = df.loc[df["wageincome"] > wage_cutoff, ["wageincome"]]

In [54]:
# First three rows of the high-wage subset.
highwages.iloc[:3]

Unnamed: 0_level_0,wageincome
personid,Unnamed: 1_level_1
131858,235884.0
133619,235884.0
151863,235884.0


## Show individuals with large changes in weeks worked for the most recent year (2012–2016 average below half or above double the 2017 value)

In [57]:
# Compare each respondent's 2012-2016 average weeks worked with their 2017 value.
# Keep rows where 2017 is present and the average falls outside [0.5 * 2017, 2 * 2017].
avg_weeks_2012_2016 = df.loc[:, "weeksworked12":"weeksworked16"].mean(axis=1)
within_half_to_double = avg_weeks_2012_2016.between(
    df["weeksworked17"] * 0.5, df["weeksworked17"] * 2
)
has_2017_value = df["weeksworked17"].notna()
workchanges = df.loc[~within_half_to_double & has_2017_value, "weeksworked12":"weeksworked17"]

In [59]:
# Number of respondents flagged (row count).
workchanges.shape[0]

1160

In [61]:
# Draw three flagged respondents for a spot check. A fixed random_state makes the
# same rows appear on every Restart & Run All; without it the sample changes per run.
workchanges.sample(3, random_state=1).T

personid,308579,372006,891588
weeksworked12,0.0,52.0,52.0
weeksworked13,0.0,52.0,44.0
weeksworked14,0.0,52.0,20.0
weeksworked15,0.0,52.0,45.0
weeksworked16,5.0,10.0,31.0
weeksworked17,44.0,0.0,19.0


## Show inconsistencies in the highest grade completed and highest degree

In [64]:
# Frequency of each highest-grade-completed value (missing values are excluded by default).
df["highestgradecompleted"].value_counts()

highestgradecompleted
12.0    1389
16.0    1016
14.0     750
13.0     538
18.0     407
15.0     397
17.0     388
11.0     364
20.0     333
10.0     329
9.0      292
19.0     203
8.0      198
7.0       31
6.0       16
95.0      11
5.0        1
Name: count, dtype: int64

In [66]:
# Highest-degree frequencies, shown next to the grade counts for comparison.
df["highestdegree"].value_counts()

highestdegree
2. High School     3667
4. Bachelors       1673
1. GED             1146
0. None             953
3. Associates       737
5. Masters          603
7. Professional     120
6. PhD               54
Name: count, dtype: int64

In [68]:
# Respondents whose highest grade completed is below 12, with their highest degree.
below_grade_12 = df["highestgradecompleted"].lt(12)
less_than_12 = df.loc[below_grade_12, ["highestgradecompleted", "highestdegree"]]

## Use the crosstab function to create a frequency table of highest grade completed by highest degree

In [71]:
# Rows: highest grade completed; columns: highest degree; cells: respondent counts.
pd.crosstab(
    index=less_than_12["highestgradecompleted"],
    columns=less_than_12["highestdegree"],
)

highestdegree,0. None,1. GED,2. High School
highestgradecompleted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5.0,0,0,1
6.0,11,5,0
7.0,24,6,1
8.0,113,78,7
9.0,112,169,8
10.0,111,204,13
11.0,120,200,41
