In [4]:
import pandas as pd

# Cap rendered DataFrames at 30 rows so large frames are shown truncated.
pd.options.display.max_rows = 30

### Loading the data dictionary from the file

In [5]:
# Read the institution data dictionary CSV into a DataFrame and report its
# (rows, columns) shape.
file_name = 'institution_data_dictionary.csv'
df_data_dict_inst = pd.read_csv(filepath_or_buffer=file_name)
df_data_dict_inst.shape

(3251, 11)

### This shows that the data dictionary has 3251 rows. We can treat these rows as the candidate features in our dataset.

In [35]:
# Displaying some sample values of the data dictionary.
# The frame is longer than the display.max_rows limit configured at the top of
# the notebook, so only the leading and trailing rows are rendered.
df_data_dict_inst

Unnamed: 0,NAME OF DATA ELEMENT,dev-category,developer-friendly name,API data type,INDEX,VARIABLE NAME,VALUE,LABEL,SOURCE,SHOWN/USE ON SITE,NOTES
0,Unit ID for institution,root,id,integer,,UNITID,,,IPEDS,Yes,
1,8-digit OPE ID for institution,root,ope8_id,string,varchar(10),OPEID,,,IPEDS,Yes,
2,6-digit OPE ID for institution,root,ope6_id,string,varchar(10),OPEID6,,,IPEDS,Yes,
3,Institution name,school,name,autocomplete,fulltext,INSTNM,,,IPEDS,Yes,
4,City,school,city,autocomplete,varchar(200),CITY,,,IPEDS,Yes,
...,...,...,...,...,...,...,...,...,...,...,...
3246,Median earnings of students working and not en...,earnings,10_yrs_after_entry.median_earnings.highest_ter...,integer,,MD_EARN_WNE_INC3_P10,,,Treasury,,
3247,Median earnings of independent students workin...,earnings,10_yrs_after_entry.median_earnings.independent...,integer,,MD_EARN_WNE_INDEP1_P10,,,Treasury,,
3248,Median earnings of dependent students working ...,earnings,10_yrs_after_entry.median_earnings.dependent_s...,integer,,MD_EARN_WNE_INDEP0_P10,,,Treasury,,
3249,Median earnings of non-male students working a...,earnings,10_yrs_after_entry.median_earnings.non_male_st...,integer,,MD_EARN_WNE_MALE0_P10,,,Treasury,,


In [37]:
# Preview the first ten dictionary entries, restricted to the columns that
# help decide which variables to pick for feature selection.
df_data_dict_inst.loc[:, ['NAME OF DATA ELEMENT', 'VARIABLE NAME', 'VALUE', 'LABEL']].head(10)

Unnamed: 0,NAME OF DATA ELEMENT,VARIABLE NAME,VALUE,LABEL
0,Unit ID for institution,UNITID,,
1,8-digit OPE ID for institution,OPEID,,
2,6-digit OPE ID for institution,OPEID6,,
3,Institution name,INSTNM,,
4,City,CITY,,
5,State postcode,STABBR,,
6,ZIP code,ZIP,,
7,Accreditor for institution,ACCREDAGENCY,,
8,URL for institution's homepage,INSTURL,,
9,URL for institution's net price calculator,NPCURL,,


In [None]:
# Load the most recent data file with the institution names.
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, which avoids mixed-dtype inference on this very wide CSV.
data_file_inst = 'dataset/CollegeScorecard_Raw_Data_03142022/Most-Recent-Cohorts-Institution.csv'
df_inst = pd.read_csv(data_file_inst, low_memory=False)

In [90]:
# (rows, columns) of the loaded institution data
df_inst.shape

(6694, 2989)

## Preparing the feature list from the dictionary

In [89]:
# Initial set of columns I want to pull from the institution data for the analysis.
list_of_all_features = [
    # name / location columns
    'INSTNM', 'CITY', 'STABBR', 'ST_FIPS',
    # institution-level attributes
    'PREDDEG', 'ICLEVEL', 'CONTROL', 'ADM_RATE', 'CURROPER',
    # SAT* columns
    'SATVR25', 'SATVR75', 'SATMT25', 'SATMT75', 'SATWR25', 'SATWR75',
    # ACT* columns
    'ACTCM25', 'ACTCM75', 'ACTEN25', 'ACTEN75', 'ACTMT25', 'ACTMT75', 'ACTWR25', 'ACTWR75',
]

In [91]:
# Keep only the selected feature columns (all rows) from the institution data.
df_features_inst = df_inst.loc[:, list_of_all_features]
df_features_inst.shape

(6694, 23)

In [72]:
# Name, location, predominant-degree and operating-status columns for every institution.
# Rows are selected with `:` rather than an explicit list of 6694 labels: the
# hard-coded count would raise a KeyError if the file had fewer rows and
# silently drop rows if it had more.
df_features_inst_subset_1 = df_features_inst.loc[:, ['INSTNM','CITY','STABBR','ST_FIPS','PREDDEG','CURROPER']]
df_features_inst_subset_1.shape

(6694, 6)

In [73]:
# Institution name plus the CURROPER flag for every institution.
# Rows are selected with `:` rather than an explicit list of 6694 labels so the
# cell does not depend on the exact row count of the loaded file.
df_features_inst_subset_2 = df_features_inst.loc[:, ['INSTNM','CURROPER']]
df_features_inst_subset_2.shape

(6694, 2)

In [74]:
# Count institutions for each value of the CURROPER flag.
byCurrOperation = df_features_inst_subset_2.groupby(by='CURROPER')
byCurrOperation.count()

Unnamed: 0_level_0,INSTNM
CURROPER,Unnamed: 1_level_1
0,330
1,6364


### The output above shows that there are 330 institutions that are not currently certified as operating institutions

In [92]:
# Keep only the institutions flagged as currently operating (CURROPER == 1).
df_features_inst_curroper = df_features_inst.loc[lambda frame: frame['CURROPER'] == 1]
df_features_inst_curroper.shape

(6364, 23)

In [103]:
# Admission-rate view of the operating institutions: name, city, degree/level
# codes and ADM_RATE for every row.
df_coll_adm_rate = df_features_inst_curroper[['INSTNM', 'CITY', 'PREDDEG', 'ICLEVEL', 'ADM_RATE']]
df_coll_adm_rate.shape

(6364, 5)

In [104]:
# Preview the admission-rate frame; ADM_RATE is missing (NaN) for some institutions.
df_coll_adm_rate

Unnamed: 0,INSTNM,CITY,PREDDEG,ICLEVEL,ADM_RATE
0,Alabama A & M University,Normal,3,1,0.9175
1,University of Alabama at Birmingham,Birmingham,3,1,0.7366
2,Amridge University,Montgomery,2,1,
3,University of Alabama in Huntsville,Huntsville,3,1,0.8257
4,Alabama State University,Montgomery,3,1,0.9690
...,...,...,...,...,...
6689,Georgia Military College - Eastman,Eastman,0,1,
6690,American College of Barbering - Florence,Florence,0,3,
6691,HCI College - Fort Lauderdale Campus,Fort Lauderdale,0,2,
6692,ABC Adult School - Cabrillo Lane,Cerritos,0,3,


In [105]:
# Keep rows whose admission rate is strictly between 0 and 1 (this also drops
# rows with a missing rate), then show them from most to least selective.
# The sorted frame is only displayed; df_coll_adm_rate keeps its original order.
df_coll_adm_rate = df_coll_adm_rate.loc[
    lambda frame: (frame['ADM_RATE'] > 0.0) & (frame['ADM_RATE'] < 1.0)
]
df_coll_adm_rate.sort_values(by='ADM_RATE')


Unnamed: 0,INSTNM,CITY,PREDDEG,ICLEVEL,ADM_RATE
1464,Hampshire College,Amherst,3,1,0.0197
2806,Curtis Institute of Music,Philadelphia,3,1,0.0393
3765,Stanford University,Stanford,3,1,0.0434
1465,Harvard University,Cambridge,3,1,0.0464
391,Pacific Oaks College,Pasadena,3,1,0.0511
...,...,...,...,...,...
2461,Dickinson State University,Dickinson,3,1,0.9973
842,Lewis-Clark State College,Lewiston,3,1,0.9976
800,Morehouse College,Atlanta,3,1,0.9980
382,Notre Dame de Namur University,Belmont,3,1,0.9994


In [98]:
# Institutions whose name contains "Indiana".
# The subset is stored under its own name instead of overwriting
# df_coll_adm_rate: overwriting would leave only Indiana rows in that frame, so
# the later "bilt" name lookup would come back empty when the notebook is run
# top to bottom.
df_coll_adm_rate_indiana = df_coll_adm_rate.query('INSTNM.str.contains("Indiana")', engine='python')
df_coll_adm_rate_indiana

Unnamed: 0,INSTNM,CITY,PREDDEG,ICLEVEL,ADM_RATE
1042,Indiana University-Purdue University-Indianapolis,Indianapolis,3,1,0.8073
1043,University of Indianapolis,Indianapolis,3,1,0.9279
1044,Indiana Institute of Technology,Fort Wayne,3,1,0.5964
1045,University of Southern Indiana,Evansville,3,1,0.9332
1046,Indiana State University,Terre Haute,3,1,0.8957
1047,Indiana University-Kokomo,Kokomo,3,1,0.7364
1048,Indiana University-South Bend,South Bend,3,1,0.777
1049,Indiana University-Bloomington,Bloomington,3,1,0.7791
1050,Indiana University-Northwest,Gary,3,1,0.7462
1051,Indiana University-Southeast,New Albany,3,1,0.8191


In [106]:
# Narrow the admission-rate frame to institutions whose name contains "bilt".
df_coll_adm_rate = df_coll_adm_rate.loc[lambda frame: frame['INSTNM'].str.contains("bilt")]
df_coll_adm_rate

Unnamed: 0,INSTNM,CITY,PREDDEG,ICLEVEL,ADM_RATE
3200,Vanderbilt University,Nashville,3,1,0.0912
