In [1]:
import os
import pandas as pd
import numpy as np
import time
import simplejson as json
from subprocess import Popen
from tochiBasedPipelineResultsHelper import runForFolderDetailed10Fold
logsparamsDir = "./AutomatePipelineLogs"

serverDataPath = "." # set to "." for current directory (this should contain models and sensor combinations

THRES_PERCENT_NANS_PER_SUBJECT = 0.2 # must be less than this
THRES_NUM_NANS_PER_COL = 14 # if a column is null in more than these many people, it'll be removed.

runTimeForLog = time.time()

## TO CONFIGURE FOR EACH RUN
scenario = "pre_lock_change_feats_to_lock_mh"#"pre_lock_change_feats_to_lock_mh" # "pre_feats_to_post_lock_mh"
scenario_suffix = "per_phase"
outcome_status = "psqi_total_lbl" # post_bdi_2_auto, change_bdi_2_auto, change_bdi_2_levelsC_layer2_auto
sensorname = "scr"
n_jobs = 1

# ["blue", "calls", "hr", "loc", "scr", "slp", "steps"]
# Phone: ["blue", "calls", "loc", "scr"]
# Fitbit: ["hr", "slp", "steps"]



In [2]:
## support functions
def writeOrAppendFile(filename, df, cols):
    if os.path.exists(filename):
        df.to_csv(filename, mode='a', header=False, columns = cols)
    else:
        df.to_csv(filename, header=True, columns = cols)
        
def getFolderName(serverDataPath, outcome_status, sensorname, modelname, suffix_foldername):
    if modelname.upper() == "GBC":
        modelnameinfname = "GBC"
    elif modelname.upper() == "LOGR":
        modelnameinfname = "LOGR"
    else:
        raise Exception ("Unrecognized modelname")
#     if pre_status not in ["NO", "SC", "LBL"]:
#         raise Exception("Pre OUTCOME status type unrecognized!")
#     folderpath = serverDataPath+"/models/{0}".format(sensorname)+"_{3}_results{0}{2}prestatus{1}_10Fold".format(limflag, pre_status, modelnameinfname, outSuffix[outcome_status])+"_rlog_{0}/"
    folderpath = serverDataPath+"/models/{0}".format(sensorname)+"_{1}_{2}_results{0}_10Fold".format(modelname, outcome_status, scenario)+"_rlog_{0}/"
    folderpath = folderpath.format(suffix_foldername)
    return (folderpath)



In [3]:
def runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict):
# def runForModel(serverDataPath, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict):
    # Start running initial models with varying "C"
    cmd = ['python', '{0}/tochiBasedPipeline.py'.format(serverDataPath), serverDataPath, str(THRES_NUM_NANS_PER_COL), str(THRES_PERCENT_NANS_PER_SUBJECT), outcome_status, scenario_suffix, scenario, sensorname, modelname, str(curr_suffix_foldername), json.dumps(selParamsDict)]
    print ("calling {0}".format(" ".join(cmd)))
    curr_process = Popen(cmd)
    # Add params to paramdict file and also add to log
    startt = time.time()
    limflag = ""
    paramDictStr = "paramDict{0} = {1}".format(curr_suffix_foldername, str(selParamsDict))
    processName = "{0}_{1}_{2}_{3}_10Fold".format(outcome_status, sensorname, modelname, curr_suffix_foldername)
    dfParamOut = pd.DataFrame({"sensorname": [sensorname], "outcome_status": [outcome_status], "processName": [processName], "paramDict": [paramDictStr], "startRun": [startt]})
    writeOrAppendFile("{0}/{1}_{2}_{3}_{4}_params_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), dfParamOut, paramFileCols)
    dfLogOut = pd.DataFrame({"sensorname": [sensorname], "outcome_status": [outcome_status], "processName": [processName], "paramDict": [paramDictStr], "status": ["START"], "time": [startt], "time_since_start": [0]})
    writeOrAppendFile("{0}/{1}_{2}_{3}_{4}_log_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), dfLogOut, logFileCols)
    modelpath = getFolderName(serverDataPath, outcome_status, sensorname, modelname, curr_suffix_foldername)
    return (modelpath, processName, paramDictStr, curr_process, startt)
    
## Check status of running models repeatedly using continue and sleep. Break when all are done. 
def checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist):
    flag = 0 
    while True:
        statuslist = []
        for pi in range(0, len(processobjlist)):
            processName = processnamelist[pi]
            paramDictStr = processparamdictlist[pi]
            currT = time.time()
            sinceStart = (currT - processstarttimelist[pi])/60.0
            if processobjlist[pi].poll() is None:
                print ("STATUS CHECK: {0} is still running, since {1} minutes".format(processName, sinceStart))
                #statusline = processobjlist[pi].stdout.readline() 
                #status = "RUNNING, curr_output: {0}".format(statusline)
                status = "RUNNING"
                statuslist.append(0)
            else:
                status = "ENDED"
                statuslist.append(1)
            dfLogOut = pd.DataFrame({"sensorname": [sensorname], "outcome_status": [outcome_status], "processName": [processName], "paramDict": [paramDictStr], "status": [status], "time": [currT], "time_since_start": [sinceStart]})
            limflag = ""
            writeOrAppendFile("{0}/{1}_{2}_{3}_{4}_log_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), dfLogOut, logFileCols)
        # Check status of all and if we must exit
        if 0 not in statuslist: # all have ended
            print ("STATUS CHECK: all have ended")
            flag = 1
            break
        else:
            # Sleep for X minutes
            print ("STATUS CHECK: sleeping for 30s")
            time.sleep(30)
    return (flag) # this function should always return 1 after running



In [4]:
## Check accuracy (while keeping an eye on N for models that are done running)
def checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict):
    dfOut = pd.DataFrame()
    dfOut_per_fold = pd.DataFrame()
    cols_final = []
    cols_per_fold_final = []
    mainMetric_final = None
    mainMetricAscending_final = None
    for modelpath in modelFoldersList:
        modeltype = modelFoldersParamDict[modelpath][0]
        modeli = modelFoldersParamDict[modelpath][1]
        dfRow, dfRow_per_fold, cols, cols_per_fold, mainMetric, mainMetricAscending, N, accuracy, f1meanOrf11, error = runForFolderDetailed10Fold(modelpath, modeltype, modeli, outcome_status, printtruepreds = False, verbose = True)
        print (modelpath)
        print (cols)
        dfOut = dfOut.append(dfRow)
        dfOut_per_fold = dfOut_per_fold.append(dfRow_per_fold)
        if N > 0:
            cols_final = cols
            cols_per_fold_final = cols_per_fold
            mainMetric_final = mainMetric
            mainMetricAscending_final = mainMetricAscending
    dfOut = dfOut.sort_values(by=mainMetric_final, ascending=mainMetricAscending_final)
#     print (dfOut)
    Nmax = dfOut["N"].max()
    dfOutNmax = dfOut[dfOut["N"] >= (Nmax - 3)]
    topAcc = dfOutNmax[mainMetric_final].values[0]
    topAccModel = dfOutNmax.index[0]
    topAccModelParamDict = modelFoldersParamDict[topAccModel][2]
    return (dfOut, dfOut_per_fold, cols_final, cols_per_fold_final, topAccModel, topAcc, topAccModelParamDict)



In [5]:
#### ------------------------------------- Program ------------------------------------- ####
paramFileCols = ["sensorname", "outcome_status", "processName", "paramDict", "startRun"]
logFileCols = ["sensorname", "outcome_status", "processName", "status", "time", "time_since_start"]
modelFoldersList = []
modelFoldersParamDict = {}

init_Cs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] # default
init_selection_threshold = 0.3
init_sample_fraction = 0.80
init_scaling = 0.5
try_selection_threshold = [0.1, 0.2, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# try_selection_threshold = [0.1, 0.2]
try_scaling = [0.3, 0.4, 0.6, 0.7]
try_sample_fraction = [0.75, 0.85]
try_modelname = ["GBC", "LOGR"]


## ---------- VARY C AND RUN MODELS ---------- ##
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = 0 ## starting foldername minus 1
    for c1 in init_Cs:  
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": c1, "scaling": init_scaling, "sample_fraction": init_sample_fraction, "n_resampling": 200, "selection_threshold": init_selection_threshold,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying C - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying C")
print (modelFoldersList)



calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 1 {"scaling": 0.5, "n_resampling": 200, "C": 0.1, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 2 {"scaling": 0.5, "n_resampling": 200, "C": 0.2, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 3 {"scaling": 0.5, "n_resampling": 200, "C": 0.3, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 4 {"scaling": 0.5, "n_resampling": 200, 

STATUS CHECK: psqi_total_lbl_scr_GBC_10_10Fold is still running, since 0.522085281213 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_3_10Fold is still running, since 0.522898968061 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_4_10Fold is still running, since 0.520832383633 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_5_10Fold is still running, since 0.521694580714 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_6_10Fold is still running, since 0.521184583505 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_7_10Fold is still running, since 0.521462666988 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_8_10Fold is still running, since 0.519752017657 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_9_10Fold is still running, since 0.519678886731 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_10_10Fold is still running, since 0.519924267133 minutes
STATUS CHECK: sleeping for 30s
STATUS CHECK: psqi_total_lbl_scr_GBC_4_10Fold is still running, since 1.03628356457 minutes
STATUS CHECK: psqi_total_lbl_s

In [6]:
## Check accuracy and choose best C
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)



./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_1/
0 files found
NUMFOLDS = 0
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_1/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_2/
1 files found
NUMFOLDS = 1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_2/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/
5 files found
NUMFOLDS = 5


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_6/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_7/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_7/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precis

In [7]:
display (currModels_df.head(5))

Unnamed: 0_level_0,N,accuracy,error,f10,f11,f1allmean,modeli,modeltype,outcome_status,precision0,precision1,recall0,recall1
modelpath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_2/,10,0.7,,0.666667,0.727273,0.69697,2,LOGR,psqi_total_lbl,0.6,0.8,0.75,0.666667
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/,49,0.573333,,0.260952,0.698168,0.47956,5,GBC,psqi_total_lbl,0.333333,0.621825,0.216667,0.8
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/,49,0.593333,,0.44127,0.674326,0.557798,3,GBC,psqi_total_lbl,0.48,0.662143,0.433333,0.7
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_2/,10,0.6,,0.5,0.666667,0.583333,2,GBC,psqi_total_lbl,0.5,0.666667,0.5,0.666667
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_10/,49,0.573333,,0.359365,0.664802,0.512083,10,GBC,psqi_total_lbl,0.366667,0.648254,0.366667,0.7


In [8]:
limflag = ""
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_C_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_C_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)


print ("\nTOPMODELS")
print (topAccModel)
print (topAcc)
print (topAccModelParamDict)


TOPMODELS
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
0.6981684981684982
{'scaling': 0.5, 'n_resampling': 200, 'C': 0.5, 'normalize': False, 'sample_fraction': 0.8, 'n_jobs': 1, 'tol': 0.001, 'selection_threshold': 0.3, 'random_state': 0}


In [9]:
## ---------- VARY selection_threshold AND RUN MODELS ---------- ##
bestC = topAccModelParamDict["C"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) ## starting foldername minus 1
    for s1 in try_selection_threshold:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": init_scaling, "sample_fraction": init_sample_fraction, "n_resampling": 200, "selection_threshold": s1,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sel thres - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying selection threshold")
print (modelFoldersList)




calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 11 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.1, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 12 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.2, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 13 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.4, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 14 {"scaling": 0.5, "n_resampling": 2

In [10]:
## Check accuracy and choose best sel thres
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)


./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_1/
0 files found
NUMFOLDS = 0
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_1/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_2/
1 files found
NUMFOLDS = 1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_2/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/
['modeltype', 'modeli', 'outcome_status', '

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_11/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_12/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_12/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_13/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_13/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precis

In [11]:
display(currModels_df.head(5))
print (currModels_df_cols)

Unnamed: 0_level_0,N,accuracy,error,f10,f11,f1allmean,modeli,modeltype,outcome_status,precision0,precision1,recall0,recall1
modelpath,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_2/,10,0.7,,0.666667,0.727273,0.69697,2,LOGR,psqi_total_lbl,0.6,0.8,0.75,0.666667
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_16/,10,0.7,,0.666667,0.727273,0.69697,16,LOGR,psqi_total_lbl,0.6,0.8,0.75,0.666667
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/,49,0.573333,,0.260952,0.698168,0.47956,5,GBC,psqi_total_lbl,0.333333,0.621825,0.216667,0.8
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/,49,0.593333,,0.44127,0.674326,0.557798,3,GBC,psqi_total_lbl,0.48,0.662143,0.433333,0.7
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_16/,10,0.6,,0.5,0.666667,0.583333,16,GBC,psqi_total_lbl,0.5,0.666667,0.5,0.666667


['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']


In [12]:
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_selection_threshold_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_selection_threshold_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)


In [13]:
## ---------- VARY scaling AND RUN MODELS ---------- ##
bestC = topAccModelParamDict["C"]
bestSel = topAccModelParamDict["selection_threshold"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) ## starting foldername minus 1
    for sc1 in try_scaling:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": sc1, "sample_fraction": init_sample_fraction, "n_resampling": 200, "selection_threshold": bestSel,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying scaling - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying scaling")
print (modelFoldersList)

## Check accuracy and choose best scaling
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)



calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 20 {"scaling": 0.3, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 21 {"scaling": 0.4, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 22 {"scaling": 0.6, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 23 {"scaling": 0.7, "n_resampling": 2

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_3/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_14/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_15/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_15/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_16/
1 files found
NUMFOLDS = 1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_16/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precis

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_23/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_23/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']


In [14]:
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_scaling_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_scaling_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)


In [15]:
## ---------- VARY sample_fraction AND RUN MODELS ---------- ##
bestC = topAccModelParamDict["C"]
bestSel = topAccModelParamDict["selection_threshold"]
bestScaling = topAccModelParamDict["scaling"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) + len(try_scaling) ## starting foldername minus 1
    for sf1 in try_sample_fraction:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": bestScaling, "sample_fraction": sf1, "n_resampling": 200, "selection_threshold": bestSel,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sample_fraction - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying sample_fraction")
print (modelFoldersList)

## Check accuracy and choose best sample_fraction
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_sample_fraction_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_sample_fraction_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)


calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 24 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.75, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 25 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.85, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 24 {"scaling": 0.5, "n_resampling": 200, "C": 0.5, "normalize": false, "sample_fraction": 0.75, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 25 {"scaling": 0.5, "n_resamplin

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_8/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_8/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_11/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_11/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_loc

In [16]:
## + or - 0.05 for each of the params
## C
tryC_last = [topAccModelParamDict["C"]-0.05, topAccModelParamDict["C"]+0.05]
bestSel = topAccModelParamDict["selection_threshold"]
bestScaling = topAccModelParamDict["scaling"]
bestSampleFrac = topAccModelParamDict["sample_fraction"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) + len(try_scaling) + len(try_sample_fraction) ## starting foldername minus 1
    for c1 in tryC_last:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": c1, "scaling": bestScaling, "sample_fraction": bestSampleFrac, "n_resampling": 200, "selection_threshold": bestSel,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sample_fraction - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying best C by + or - 0.05")
print (modelFoldersList)

## Check accuracy and choose best sample_fraction
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_C_by_0.05_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_C_by_0.05_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)


calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 26 {"scaling": 0.5, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 27 {"scaling": 0.5, "n_resampling": 200, "C": 0.55, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 26 {"scaling": 0.5, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.3, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 27 {"scaling": 0.5, "n_resamplin

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_14/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_14/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_15/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_loc

In [17]:
## Selection Thres
bestC = topAccModelParamDict["C"]
trySel_last = [topAccModelParamDict["selection_threshold"]-0.05, topAccModelParamDict["selection_threshold"]+0.05]
bestScaling = topAccModelParamDict["scaling"]
bestSampleFrac = topAccModelParamDict["sample_fraction"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) + len(try_scaling) + len(try_sample_fraction) ## starting foldername minus 1
    for sel1 in trySel_last:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": bestScaling, "sample_fraction": bestSampleFrac, "n_resampling": 200, "selection_threshold": sel1,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sample_fraction - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying best selection_threshold by + or - 0.05")
print (modelFoldersList)

## Check accuracy and choose best sample_fraction
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_selection_threshold_by_0.05_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_selection_threshold_by_0.05_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)



calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 26 {"scaling": 0.5, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 27 {"scaling": 0.5, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.35, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 26 {"scaling": 0.5, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 27 {"scaling": 0.5, "n_resamp

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_11/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'p

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_27/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_26/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_26/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_27/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_27/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'pr

In [18]:
## Scaling
bestC = topAccModelParamDict["C"]
bestSel = topAccModelParamDict["selection_threshold"]
tryScaling_last = [topAccModelParamDict["scaling"]-0.05, topAccModelParamDict["scaling"]+0.05]
bestSampleFrac = topAccModelParamDict["sample_fraction"]
processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) + len(try_scaling) + len(try_sample_fraction) ## starting foldername minus 1
    for sc1 in tryScaling_last:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": sc1, "sample_fraction": bestSampleFrac, "n_resampling": 200, "selection_threshold": bestSel,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sample_fraction - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying best scaling by + or - 0.05")
print (modelFoldersList)

## Check accuracy and choose best sample_fraction
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_scaling_by_0.05_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_scaling_by_0.05_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)



calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 26 {"scaling": 0.45, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 27 {"scaling": 0.55, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 26 {"scaling": 0.45, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 27 {"scaling": 0.55, "n_re

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_4/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_15/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_16/
1 files found
NUMFOLDS = 1
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_16/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_17/
0 files found
NUMFOLDS = 0
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_17/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precis

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_23/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_24/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_24/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_25/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_25/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'preci

In [19]:
## Sample_Fraction
bestC = topAccModelParamDict["C"]
bestSel = topAccModelParamDict["selection_threshold"]
bestScaling = topAccModelParamDict["scaling"]
trySampleFrac_last = [topAccModelParamDict["sample_fraction"]-0.05, topAccModelParamDict["sample_fraction"]+0.05]
trySampleFrac_last = [x for x in trySampleFrac_last if x not in try_sample_fraction+[init_sample_fraction]]

processnamelist = []
processparamdictlist = []
processobjlist = []
processstarttimelist = []
for modelname in try_modelname:
    curr_suffix_foldername = len(init_Cs) + len(try_selection_threshold) + len(try_scaling) + len(try_sample_fraction) ## starting foldername minus 1
    for sf1 in trySampleFrac_last:
        curr_suffix_foldername = curr_suffix_foldername + 1
        selParamsDict = {"C": bestC, "scaling": bestScaling, "sample_fraction": sf1, "n_resampling": 200, "selection_threshold": bestSel,  "tol": 0.001, "normalize": False, "random_state": 0, "n_jobs": n_jobs}
        ## Running the model
        modelpath, processName, paramDictStr, curr_process, startt = runForModel(serverDataPath, THRES_NUM_NANS_PER_COL, THRES_PERCENT_NANS_PER_SUBJECT, outcome_status, sensorname, modelname, curr_suffix_foldername, selParamsDict)
        # save process related stuff
        modelFoldersParamDict[modelpath] = tuple([modelname, curr_suffix_foldername, selParamsDict])
        modelFoldersList.append(modelpath)
        processnamelist.append(processName)
        processparamdictlist.append(paramDictStr)
        processobjlist.append(curr_process)
        processstarttimelist.append(startt)

## Check status after varying sample_fraction - function will always return 1
checkIfAllDone(processobjlist, processparamdictlist, processnamelist, processstarttimelist)

print ("Model Paths To Check after varying best sample fraction by + or - 0.05")
print (modelFoldersList)

## Check accuracy and choose best sample_fraction
currModels_df, currModels_df_per_fold, currModels_df_cols, currModels_df_cols_per_fold, topAccModel, topAcc, topAccModelParamDict = checkModelsAndGetBestParam(modelFoldersList, modelFoldersParamDict)
currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_sample_fraction_by_0.05_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
currModels_df_per_fold.to_csv("{0}/{1}_{2}_{3}_{4}_models_after_varying_sample_fraction_by_0.05_10Fold_per_fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols_per_fold)

currModels_df.to_csv("{0}/{1}_{2}_{3}_{4}_models_END_10Fold.csv".format(logsparamsDir, runTimeForLog, outcome_status, sensorname, limflag), columns = currModels_df_cols)
print ("Best Model is {0} with Accuracy {1}".format(topAccModel, topAcc))

calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr GBC 26 {"scaling": 0.45, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8500000000000001, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
calling python ./tochiBasedPipeline.py . 14 0.2 psqi_total_lbl per_phase pre_lock_change_feats_to_lock_mh scr LOGR 26 {"scaling": 0.45, "n_resampling": 200, "C": 0.45, "normalize": false, "sample_fraction": 0.8500000000000001, "n_jobs": 1, "tol": 0.001, "selection_threshold": 0.25, "random_state": 0}
STATUS CHECK: psqi_total_lbl_scr_GBC_26_10Fold is still running, since 0.000751932462056 minutes
STATUS CHECK: psqi_total_lbl_scr_LOGR_26_10Fold is still running, since 0.000449351469676 minutes
STATUS CHECK: sleeping for 30s
STATUS CHECK: all have ended
Model Paths To Check after varying best sample fraction by + or - 0.05
['./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsG

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_5/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_6/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_7/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0'

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_11/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_12/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_13/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'p

./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsGBC_10Fold_rlog_27/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_26/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_26/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11', 'precision0', 'recall0', 'f10', 'precision1', 'recall1', 'f11']
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_27/
5 files found
NUMFOLDS = 5
./models/scr_psqi_total_lbl_pre_lock_change_feats_to_lock_mh_resultsLOGR_10Fold_rlog_27/
['modeltype', 'modeli', 'outcome_status', 'N', 'accuracy', 'f1allmean', 'error', 'pr