In [6]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last
# one. Cells below that contain several bare expressions (e.g. a length
# followed by a shape) rely on this setting to display all of them.
InteractiveShell.ast_node_interactivity = "all"  # shows multiple outputs

import warnings
# NOTE(review): this filter hides every warning for the rest of the session,
# including pandas deprecation/dtype warnings that may matter when loading the
# data — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

# Get Data

In [9]:
# Shell escape: list the contents of the notebook's working directory.
!ls 

LICENSE                   data                      processed
README.md                 data_features-Copy1.ipynb
Untitled.ipynb            data_features.ipynb


In [10]:
# Shell escape: list the files available in the data/ folder.
!ls data

hsls_16_student_v1_0.csv


In [11]:
# Path of the student CSV, relative to the notebook's working directory.
data_path = 'data/hsls_16_student_v1_0.csv'

# Read the complete file into the raw frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=data_path)

# Inspect/Clean Data

In [13]:
# Show the raw frame; the notebook renders a truncated preview of its rows
# and columns rather than the full table.
df

Unnamed: 0,STU_ID,SCH_ID,X1NCESID,X2NCESID,STRAT_ID,PSU,X2UNIV1,X2UNIV2A,X2UNIV2B,X3UNIV1,...,W3W1SCITCH191,W3W1SCITCH192,W3W1SCITCH193,W3W1SCITCH194,W3W1SCITCH195,W3W1SCITCH196,W3W1SCITCH197,W3W1SCITCH198,W3W1SCITCH199,W3W1SCITCH200
0,10001,-5,-5,-5,-5,-5,11,1,1,1111,...,0.000000,798.114506,727.021953,0.000000,1004.686071,0.000000,0.000000,935.662473,0.000000,884.466000
1,10002,-5,-5,-5,-5,-5,11,1,1,1111,...,387.312514,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,391.183800,495.032338,0.000000
2,10003,-5,-5,-5,-5,-5,11,1,1,1111,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,10004,-5,-5,-5,-5,-5,10,1,7,1001,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,10005,-5,-5,-5,-5,-5,11,1,1,1111,...,0.000000,379.461779,325.279970,369.953871,0.000000,318.240346,612.951272,0.000000,444.985831,382.924975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23498,35202,-5,-5,-5,-5,-5,11,1,6,1111,...,0.000000,387.594547,0.000000,454.760907,256.529583,0.000000,271.003180,442.435571,0.000000,413.111710
23499,35203,-5,-5,-5,-5,-5,11,1,1,1111,...,0.000000,0.000000,10.641808,0.000000,0.000000,0.000000,44.764566,0.000000,0.000000,42.746630
23500,35204,-5,-5,-5,-5,-5,11,1,1,1111,...,297.979615,276.424086,0.000000,194.234727,0.000000,0.000000,581.039117,0.000000,0.000000,0.000000
23501,35205,-5,-5,-5,-5,-5,11,1,1,1111,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [96]:
# From the documentation, a value of -5 marks restricted-use data, so the
# affected features are removed.
#
# A column is treated as restricted only when *every* row holds -5. Checking
# the first row alone would also discard a column whose first value merely
# happens to be -5 while other rows carry usable values, and it depends on
# the index containing the label 0.
is_restricted = df.eq(-5).all(axis=0)
restricted_cols = df.columns[is_restricted.to_numpy()].to_list()
len(restricted_cols)  # number of columns about to be dropped

# Build a new frame and leave the raw `df` untouched, so this cell gives the
# same result every time it is re-run.
df_un = df.drop(columns=restricted_cols)
df_un.shape

887

(23503, 7622)

In [44]:
# Check whether any cell of the unrestricted frame is NaN/None.
# NOTE(review): the dataset appears to use negative sentinel codes (such as
# the -5 handled above) rather than NaN, so a False here may not mean the
# data is complete — confirm against the codebook.

df_un.isna().to_numpy().any()

False

# Initial Feature Selection

### Choose subset of features from among the >7000 available

In [54]:
# Shell escape: list the documentation files in the data_docs/ folder.
!ls data_docs

Codebook.txt                           HSLS09_VariableList_BY-F2_revised.xlsx


In [55]:
# Load the variable-name -> variable-label lookup table from the Excel
# variable list. Only the first two sheet columns are read; they are renamed
# and the name column becomes the index so labels can be fetched with .loc.

vd_path = './data_docs/HSLS09_VariableList_BY-F2_revised.xlsx'
df_var_desc = pd.read_excel(
    vd_path,
    sheet_name='All Student-Level Variables',
    usecols=[0, 1],
    header=0,
    names=['Variable_Name', 'Variable_Label'],
    index_col=0,
)

In [77]:
# Column labels of the unrestricted frame, searched by the selection cells.
cols_list = df_un.columns
# Accumulator for the names of the columns chosen as features; running this
# cell resets it to empty.
features_list = list()

In [78]:
# Students' assessment of their math teachers: every column whose name
# begins with 'S1MTCH'.

rex = re.compile('S1MTCH')
math_teach = list(filter(rex.match, cols_list))
features_list += math_teach

# Allow up to 100 characters per displayed cell so the variable labels fit,
# then show the labels of the selected variables left-aligned.
pd.options.display.max_colwidth = 100
df_var_desc.loc[math_teach, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
S1MTCHVALUES,S1 C11A 9th grader's fall 2009 math teacher values/listens to students' ideas
S1MTCHRESPCT,S1 C11B 9th grader's fall 2009 math teacher treats students with respect
S1MTCHFAIR,S1 C11C 9th grader's fall 2009 math teacher treats every student fairly
S1MTCHCONF,S1 C11D 9th grader's fall 2009 math teacher thinks all student can be successful
S1MTCHMISTKE,S1 C11E 9th grader's fall 2009 math teacher thinks mistakes OK if students learn
S1MTCHTREAT,S1 C11F 9th grader's fall 2009 math teacher treats some kids better than others
S1MTCHINTRST,S1 C11G 9th grader's fall 2009 math teacher makes math interesting
S1MTCHMFDIFF,S1 C11H 9th grader's fall 2009 math teacher treats males/females differently
S1MTCHEASY,S1 C11I 9th grader's fall 2009 math teacher makes math easy to understand


In [79]:
# Students' assessment of their science teachers: every column whose name
# begins with 'S1STCH'.

rex = re.compile('S1STCH')
sci_teach = list(filter(rex.match, cols_list))
features_list += sci_teach
#df_var_desc.loc[sci_teach].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
S1STCHVALUES,S1 D11A 9th grader's fall 2009 science teacher values/listens to students' ideas
S1STCHRESPCT,S1 D11B 9th grader's fall 2009 science teacher treats students with respect
S1STCHFAIR,S1 D11C 9th grader's fall 2009 science teacher treats every student fairly
S1STCHCONF,S1 D11D 9th grader's fall 09 science teacher think all student can be successful
S1STCHMISTKE,S1 D11E 9th grader's fall 09 science teacher think mistakes OK if students learn
S1STCHTREAT,S1 D11F 9th grader's fall 09 science teacher treats some kids better than others
S1STCHINTRST,S1 D11G 9th grader's fall 2009 science teacher makes science interesting
S1STCHMFDIFF,S1 D11H 9th grader's fall 2009 science teacher treats males/females differently
S1STCHEASY,S1 D11I 9th grader's fall 2009 science teacher makes science easy to understand


In [97]:
# Students' self-assessment of math ability: the contiguous run of columns
# from 'S1MPERSON1' through 'S1MASSEXCL' (both included), taken in the column
# order of df_un.

ind_self_beg, ind_self_end = (
    df_un.columns.get_loc(label) for label in ('S1MPERSON1', 'S1MASSEXCL')
)

# +1 because positional slicing excludes the end position.
math_self = df_un.columns[ind_self_beg:ind_self_end + 1].to_list()
features_list += math_self
#df_var_desc.loc[math_self].style.set_properties(**{'text-align': 'left'})

In [98]:
# Students' self-assessment of science ability: the contiguous run of columns
# from 'S1SPERSON1' through 'S1SASSEXCL' (both included), taken in the column
# order of df_un.

ind_self_beg, ind_self_end = (
    df_un.columns.get_loc(label) for label in ('S1SPERSON1', 'S1SASSEXCL')
)

# +1 because positional slicing excludes the end position.
sci_self = df_un.columns[ind_self_beg:ind_self_end + 1].to_list()
features_list += sci_self
#df_var_desc.loc[sci_self].style.set_properties(**{'text-align': 'left'})

In [87]:
# Whether the student has taken AP math / AP science courses.

AP_ms = ['S3APMATH', 'S3APSCIENCE']
features_list += AP_ms
# Show the labels of the chosen variables, left-aligned.
df_var_desc.loc[AP_ms, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
S3APMATH,S3 A14A Has taken AP math course(s)
S3APSCIENCE,S3 A14B Has taken AP science course(s)


In [90]:
# Student race/ethnicity composite variable.

race = ['X1RACE']
features_list += race
# Show the label of the chosen variable, left-aligned.
df_var_desc.loc[race, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
X1RACE,X1 Student's race/ethnicity-composite


In [91]:
# Individualized Education Plan (IEP) flag.

iep = ['X1IEPFLAG']
features_list += iep
# Show the label of the chosen variable, left-aligned.
df_var_desc.loc[iep, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
X1IEPFLAG,X1 Individualized Education Plan


In [92]:
# High school completion status (graduation / GED / dropout).

grad = ['X3HSCOMPSTAT']
features_list += grad
# Show the label of the chosen variable, left-aligned.
df_var_desc.loc[grad, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
X3HSCOMPSTAT,X3 High school completion status (transcript and GED source updated)


In [93]:
# Highest level of education of parent 1 and parent 2.

pared = ['X1PAR1EDU', 'X1PAR2EDU']
features_list += pared
# Show the labels of the chosen variables, left-aligned.
df_var_desc.loc[pared, :].style.set_properties(**{'text-align': 'left'})

Unnamed: 0_level_0,Variable_Label
Variable_Name,Unnamed: 1_level_1
X1PAR1EDU,X1 Parent 1: highest level of education
X1PAR2EDU,X1 Parent 2: highest level of education


In [99]:
# Build the frame of selected features.
#
# features_list is filled by `extend` calls in the selection cells, so
# re-running any of those cells without first resetting the list appends the
# same names again. Indexing with repeated names would silently produce
# duplicate columns, so the names are de-duplicated here while keeping their
# first-seen order (dict keys preserve insertion order).
unique_features = list(dict.fromkeys(features_list))
df_select = df_un[unique_features]
df_select.head()

Unnamed: 0,S1MTCHVALUES,S1MTCHRESPCT,S1MTCHFAIR,S1MTCHCONF,S1MTCHMISTKE,S1MTCHTREAT,S1MTCHINTRST,S1MTCHMFDIFF,S1MTCHEASY,S1STCHVALUES,...,S1SENJOYING,S1SWASTE,S1SBORING,S1SUSELIFE,S1SUSECLG,S1SUSEJOB,S1STESTS,S1STEXTBOOK,S1SSKILLS,S1SASSEXCL
0,1,1,1,1,1,3,2,3,1,1,...,2,3,2,3,2,2,1,2,2,1
1,2,2,1,1,1,4,2,4,2,1,...,2,3,3,1,1,1,2,2,2,1
2,1,1,1,1,1,4,1,4,1,1,...,2,4,4,2,2,2,2,3,2,2
3,2,2,2,2,2,3,2,3,2,2,...,3,3,2,2,2,2,2,2,2,2
4,3,3,3,3,2,1,1,4,2,4,...,4,2,2,4,2,4,2,2,2,2


# Save Processed Data

In [95]:
# Write the selected-feature frame to the processed/ folder as CSV.
# The row index is written too (pandas default), so it appears as an unnamed
# first column in the output file.

out_path = './processed/hsls_16_student_processed.csv'
df_select.to_csv(path_or_buf=out_path)