# Build SQL table containing institutional characteristics

In [4]:
import pandas as pd

In [51]:
import os
from getpass import getpass

# Credentials come from the environment so that no secret is stored in the notebook.
# - MYRMEKES_DB_USER: optional; defaults to the account this notebook has always used.
# - MYRMEKES_DB_PASSWORD: if unset (or empty), the password is asked for interactively.
# NOTE(review): the password that used to be hardcoded on this line has been exposed
# in the notebook history and should be rotated.
_sql_username = os.environ.get("MYRMEKES_DB_USER", "wang159_ro")
sql_login_params = {
    "username": _sql_username,
    "password": os.environ.get("MYRMEKES_DB_PASSWORD") or getpass("Database password for %s: " % _sql_username),
}

## Load input data

R1/R2/MSI classification dataset: http://carnegieclassifications.iu.edu/downloads/CCIHE2018-PublicData.xlsx

UnitID institutional characteristics dataset: 

https://nces.ed.gov/ipeds/use-the-data

Survey Data -> "Complete Data Files" -> "2018" + "Institutional Characteristics" -> "Directory information" -> Download data file "HD2018"

In [6]:
import os
from urllib.parse import urlparse

import wget


def download_once(url):
    """Fetch ``url`` into the working directory unless a copy is already there.

    The local file name is the last path component of the URL. When a file of
    that name already exists it is reused, so re-running the cell neither hits
    the network again nor leaves renamed duplicate copies behind.

    Parameters
    ----------
    url : str
        Address of the file to download.

    Returns
    -------
    str
        Path of the local file (relative to the working directory).
    """
    local_name = os.path.basename(urlparse(url).path)
    if os.path.exists(local_name):
        return local_name
    return wget.download(url, out=local_name)


# download the files (skipped when a local copy already exists;
# delete the local file to force a fresh download)

ins_char_filename = download_once('https://nces.ed.gov/ipeds/datacenter/data/HD2018.zip')
ins_cls_filename = download_once('http://carnegieclassifications.iu.edu/downloads/CCIHE2018-PublicData.xlsx')

In [17]:
# Institutional characteristics: a zipped CSV encoded as ISO-8859-1
ins_char_df = pd.read_csv(ins_char_filename, encoding="ISO-8859-1", compression='zip')

# Classification workbook: read the three sheets of interest in a single call,
# which returns a dict of DataFrames keyed by sheet name
cls_sheets = pd.read_excel(ins_cls_filename, sheet_name=['Data', 'Labels', 'Variables'])
ins_cls_df = cls_sheets['Data']
ins_cls_label_df = cls_sheets['Labels']
ins_cls_var_df = cls_sheets['Variables']

# display the first two rows of each frame
for preview_df in (ins_char_df, ins_cls_df, ins_cls_label_df, ins_cls_var_df):
    display(preview_df.head(2))

Unnamed: 0,UNITID,INSTNM,IALIAS,ADDR,CITY,STABBR,ZIP,FIPS,OBEREG,CHFNM,...,CBSATYPE,CSA,NECTA,COUNTYCD,COUNTYNM,CNGDSTCD,LONGITUD,LATITUDE,DFRCGID,DFRCUSCG
0,100654,Alabama A & M University,AAMU,4900 Meridian Street,Normal,AL,35762,1,5,"Dr. Andrew Hugine, Jr.",...,1,290,-2,1089,Madison County,105,-86.568502,34.783368,119,1
1,100663,University of Alabama at Birmingham,,Administration Bldg Suite 1070,Birmingham,AL,35294-0110,1,5,Ray L. Watts,...,1,142,-2,1073,Jefferson County,107,-86.799345,33.505697,105,1


Unnamed: 0,UNITID,NAME,CITY,STABBR,CC2000,BASIC2005,BASIC2010,BASIC2015,BASIC2018,IPUG2018,...,NACT,NSATACT,SATV25,SATM25,SATCMB25,SATACTEQ25,ACTCMP25,ACTFINAL,Unnamed: 95,Unnamed: 96
0,177834,A T Still University of Health Sciences,Kirksville,MO,52,25,25,25,25,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,
1,180203,Aaniiih Nakoda College,Harlem,MT,60,33,33,33,33,18,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,,


Unnamed: 0,Variable,Label,Value,Label.1
0,CC2000,2000 Carnegie Classification (historical - not...,-3.0,"Not classified, not in classification universe"
1,,,15.0,Doctoral/Research Universities—Extensive


Unnamed: 0,Variable,Label,Source
0,A&SBADEG,Baccalaureate degrees conferred in the Arts & ...,CCIHE
1,ACTCAT,Final ACT category (1=inclusive; 2=selective; ...,CCIHE


In [22]:
# Clean label DF. Turn it into a single table with Variable + Value's numeric value + Value's label.
# As loaded, the sheet has two label columns: 'Label' (description of the variable)
# and 'Label.1' (label of each value). Only the value label is kept, under the name 'Label'.
# The guard makes the cell safe to re-run: once 'Label.1' has been renamed, running the
# drop again would throw away the value labels, so the step is skipped instead.
if 'Label.1' in ins_cls_label_df.columns:
    ins_cls_label_df = (
        ins_cls_label_df
        .drop(columns='Label')
        .rename(columns={'Label.1': 'Label'})
    )

In [44]:
# "Roll down" the Variable and fill empty Variable cells

# Forward-fill does the roll-down in one vectorized step: each empty Variable
# cell takes the closest non-empty value above it, and empty cells at the very
# top (with nothing above them) stay empty. This matches what the previous
# row-by-row loop produced, without per-row .loc lookups and without relying
# on a bitwise `&` between Python booleans or a `!= None` comparison.
ins_cls_label_df['Variable'] = ins_cls_label_df['Variable'].ffill()

In [47]:
# Keep only the rows that have no missing value in any column
ins_cls_label_df = ins_cls_label_df.dropna()

# display the first three rows of the cleaned table
display(ins_cls_label_df.head(3))

Unnamed: 0,Variable,Value,Label
0,CC2000,-3.0,"Not classified, not in classification universe"
1,CC2000,15.0,Doctoral/Research Universities—Extensive
2,CC2000,16.0,Doctoral/Research Universities—Intensive


## Write to DB2

In [55]:
from urllib.parse import quote_plus

import sqlalchemy as sql

# The credentials are percent-encoded before being placed in the connection URL.
# Without this, a username or password containing characters such as '@', ':',
# '/' or '%' would be parsed as URL syntax and the connection would fail or go
# to the wrong host. Purely alphanumeric credentials are left unchanged.
engine = sql.create_engine(
    'mysql+pymysql://{user}:{password}@127.0.0.1/wang159_myrmekes?charset=utf8mb4'.format(
        user=quote_plus(sql_login_params['username']),
        password=quote_plus(sql_login_params['password']),
    )
)

In [56]:
# Write the institutional characteristics frame to its own table.
# if_exists='replace' drops and recreates the table, so the cell can be re-run.
ins_char_df.to_sql(
    name='institution_characteristics',
    con=engine,
    if_exists='replace',
    chunksize=200000,
)

In [58]:
# Three column labels in the original dataset contain spaces ('HSI ' has a
# trailing space; the two 'Unnamed: N' columns contain ': '). Give them clean
# names before the table is written.
cls_column_fixes = {'HSI ': 'HSI', 'Unnamed: 95': 'Unnamed95', 'Unnamed: 96': 'Unnamed96'}
ins_cls_renamed_df = ins_cls_df.rename(columns=cls_column_fixes)

ins_cls_renamed_df.to_sql(
    name='institution_classification',
    con=engine,
    if_exists='replace',
    chunksize=200000,
)

In [59]:
# Write the cleaned value-label lookup (Variable / Value / Label) to its own table,
# replacing any existing table of the same name.
ins_cls_label_df.to_sql(
    name='institution_classification_labels',
    con=engine,
    if_exists='replace',
    chunksize=200000,
)

In [60]:
# Write the variable-description sheet to its own table,
# replacing any existing table of the same name.
ins_cls_var_df.to_sql(
    name='institution_variable_labels',
    con=engine,
    if_exists='replace',
    chunksize=200000,
)