In [7]:
# Where release tarballs are downloaded and unpacked. The history.csv file
# recording previously processed releases is kept here as well.
# Paths are built by plain string concatenation, so keep the trailing slash.
RELEASE_DIR = "/data0/transfer_models/releases/"

# Pre-processed data that is copied into each release's project root so the
# models can read it.
DATA_DIR = "/data0/transfer_models/data/"

# gh CLI invocation that lists the repository's releases; this normally does
# not need to be changed.
RELEASE_CMD = "gh release list -R https://github.com/educational-technology-collective/transfer_models"

import pandas as pd
import io

# Read the record of already-processed releases, or create an empty one on the
# first run.
# Only a missing or completely empty file means "no history yet". Any other
# failure (permissions, malformed CSV, ...) is allowed to propagate so that an
# existing history file is never silently replaced by an empty one, which would
# cause every release to be re-processed.
try:
    history = pd.read_csv(f"{RELEASE_DIR}history.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    history = pd.DataFrame(columns=['desc', 'branch', 'tag', 'time'])
    history.to_csv(f"{RELEASE_DIR}history.csv", index=False)

# query for a list of available releases
releases=!$RELEASE_CMD
releases=pd.read_csv(io.StringIO("\n".join(releases)), sep='\t', names=['desc','branch','tag','time'])

# iterate through releases to begin processing them
for idx,row in releases.iterrows():
    tag=row["tag"]
    if len(history.query(f'tag==@tag'))==0:
        #download this new release
        MKDIR_CMD=f"mkdir {RELEASE_DIR}{tag}"
        DOWNLOAD_CMD=f"gh api https://api.github.com/repos/educational-technology-collective/transfer_models/tarball/{tag} | tar -zxf - -C {RELEASE_DIR}{tag}"
        !$MKDIR_CMD
        !$DOWNLOAD_CMD
        
        # determine where the project has unarchived to
        ROOT_CMD= f"ls {RELEASE_DIR}{tag}/"
        PROJECT_ROOT=!$ROOT_CMD 
        PROJECT_ROOT=f"{RELEASE_DIR}{tag}/{PROJECT_ROOT[0]}"
        
        # sync preprocessed datafiles
        !cp -R $DATA_DIR $PROJECT_ROOT
        
        # set directory to the root and begin analysis
        %cd $PROJECT_ROOT
        !/data1/home/brooksch/.conda/envs/python3.8transfer/bin/python scripts/train_dummy_model.py --src_institution um 
        
        # push files up to shared repository (drive)
        !rclone mkdir transfer:/$tag
        METRICS=PROJECT_ROOT+"/metrics"
        MODELS=PROJECT_ROOT+"/models"
        !rclone copy $METRICS transfer:/$tag/
        !rclone copy $MODELS transfer:/$tag/
        
        # check if all files in drive, if so evaluate
        
        # record that this has run
        history=history.append(row)
        # save to file in case subsequent runs cause problems, e.g. multiple releases and the first fails
        history.to_csv(f"{RELEASE_DIR}history.csv",index=False)
        
        # restore location to script root
        %cd -

/data0/transfer_models/releases/v0.0-alpha3/educational-technology-collective-transfer_models-b96d2462899f339da82866c9c8f9603dbb44634b
2022-03-07 12:55:15 INFO     src institution is um
2022-03-07 12:55:15 DEBUG    reading data from ./data/preprocessed/um.feather
2022-03-07 12:55:16 DEBUG    reading target data complete; preprocessing target data
2022-03-07 12:55:16 DEBUG    processing column sex with 3 unique values
2022-03-07 12:55:16 DEBUG    column sex_Other not in data; creating dummy
2022-03-07 12:55:16 DEBUG    processing column ethnicity with 8 unique values
2022-03-07 12:55:16 DEBUG    processing column urm_status with 3 unique values
2022-03-07 12:55:16 DEBUG    processing column cip2_major_1 with 25 unique values
2022-03-07 12:55:16 DEBUG    column cip2_major_1_01 not in data; creating dummy
2022-03-07 12:55:16 DEBUG    column cip2_major_1_02 not in data; creating dummy
2022-03-07 12:55:16 DEBUG    column cip2_major_1_06 not in data; creating dummy
2022-03-07 12:55:16 DEBUG 