# Retention rates for US Universities

This project is being built from the ground up to be customizable and reproducible.

After importing the libraries we need, we're going to load up a table of instructions containing which
of the 2,500+ attributes we're going to select from the IPEDS database for data mining. This is a lot,
so it seems more reasonable to externally store the data list we'll be drawing from.

Many of these, such as website addresses or mission statements, aren't going to be terribly useful. Some others we'll need to exclude as it is too closely related to retention rate, and we risk overfitting or circular logic ("hey, here's how to raise your retention rate--have more of them graduate!")

At the root, each entry in the JSON file denotes a separate table. Included alongside the table name are instructions on whether all the table should be imported as default or not, which attributes are continuous, which are discrete, and  which are strings, and whether multiple records exist for each primary key. This is all derived from the associated documentation that comes with IPEDS. 

We're going to load in each table (obviously checking that it doesn't exist first), and extract the correct tables from it. 

(Should you want to change what we're measuring, you can change the predictive variable within the JSON file.)

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import json
import wget
import sys
import os
from zipfile import ZipFile as zf

DATA_PATH = "data"

def import_data(filename):
    with open(filename) as file:
        instructions = json.load(file)
        
# does the data directory exist? if no, create it.
    
    if os.path.isdir(DATA_PATH) is False:
        try:  
            os.mkdir(DATA_PATH)
        except OSError:  
            print ("Could not create data folder.")
            raise
            
# start with an empty dataframe to fill, and get the primary key.
# the primary key should be in the first file in the data list

    ourdata = pd.DataFrame()
    pk = instructions["primarykey"]

# loop through each table. 

    for table in instructions["tables"]:
        
# first, check if the file in the table has been downloaded. if not, download it from
# the path given and throw an error if something is wrong.

        filename = table["name"]+instructions["format"]
        filelocation = DATA_PATH+"/"+filename
        csvfile = DATA_PATH+"/"+table["name"]+".csv"

        if os.path.exists(filelocation) is False:
            try:
                wget.download(instructions["url"]+filename, filelocation)
            except Exception as e:
                print("Problem downloading and saving", table["name"], ":", e)
                raise

# next, if they are zip files, unzip them

        if instructions["format"] == ".zip":
            if os.path.exists(str(csvfile).lower()) is False:
                print("Unzipping "+table["name"])
                with zf(filelocation,"r") as zip_ref:
                    zip_ref.extractall(DATA_PATH)

# load each CSV file into a temporary data frame, tdf

        tdf = pd.read_csv(str(csvfile).lower(), encoding = "ISO-8859-1")

# filter the data according to any values needed

        if "filter" in table:
            for filterinfo in table["filter"]:
                tdf = tdf.loc[tdf[filterinfo[0]] == filterinfo[1]]

# then, depending on the instructions in the JSON file, include all the headers, or include a selection.

        if table["includeall"]:

# include the whole list, excluding the specific tables (might be an empty list)
# and add the primary key to select

            to_include = list(tdf)
            to_exclude = table["exclude"]
            headers = [x for x in to_include if x not in to_exclude]
            headers.append(pk)

# otherwise, stick these three lists together plus the primary key

        else:
            headers = [pk, *table["strings"], *table["discrete"], *table["continuous"]]

        selected_headers = [x for x in tdf.columns if x in headers]

# columns that begin with an X should be removed, at least for now, because they don't describe
# anything other than how the data was collected
    
        selected_headers = [x for x in selected_headers if x[:1] is not "X"]

# ok, so do we have a primary key? if not, stop right there

        if pk not in tdf.columns:
            raise KeyError("Primary key "+pk+" not found in "+table["name"])

# now we can select the headers we want.

        tdf = tdf[selected_headers]
    
# the code below adds the table name to the headers now, to prevent issues with duplication
# to everythng other than the key

#        tdf.rename(columns = lambda x: pk if x == pk else table["name"]+"_"+x, inplace = True)

# next we must check in the JSON instructions if this table contains multiple rows for each
# unique ID. if so, we need to put them all on the same row. to do this, we change them to strings
# then read them into a multiple index, unstack, and then join the column names.

        if "multi" in table:

            multi = table["multi"]
# to use if we are adding table headers to column names
#
#            multi = [table["name"]+"_"+x for x in table["multi"]]
            tdf[multi] = tdf[multi].astype(str)
            tdf = tdf.set_index([pk, *multi])
            tdf = tdf.unstack(multi)               # need to specify ALL the levels
            tdf.columns = ['_'.join(col) for col in tdf.columns.values]

# if it's the first time around the loop, take the first set of data.

        if ourdata.empty:
            ourdata = tdf

# if not, then we need to join on the primary key

        else:
            ourdata = ourdata.merge(tdf,on=pk,how="left")

        print("Imported "+ table["name"]+ ": "+str(len(ourdata.columns))+" columns total, "
              + str(round(float(ourdata.memory_usage().sum() / 1048576), 2)) + "MB")
              
import_data("ipeds-instructions.json")

Imported HD2016: 26 columns total, 1.49MB
Imported IC2016: 131 columns total, 7.57MB
Imported IC2016_AY: 251 columns total, 14.46MB
Imported ADM2016: 289 columns total, 16.64MB
Imported EFFY2016: 379 columns total, 21.8MB
Imported EF2016A: 649 columns total, 37.3MB
Imported EF2016B: 775 columns total, 44.53MB
Imported EF2016C: 905 columns total, 51.99MB
Imported EF2016D: 916 columns total, 52.62MB
Imported EF2016A_DIST: 961 columns total, 55.2MB
Imported SFA1516: 1284 columns total, 73.73MB
Imported SFAV1516: 1302 columns total, 74.77MB


KeyError: 'includeall'