# Import

In [12]:
import pandas as pd

In [13]:
%matplotlib inline
import matplotlib.pyplot as plt

In [14]:
import numpy as np

In [15]:
import importlib
import efrc_ml_production as ml
importlib.reload(ml)

<module 'efrc_ml_production' from '/home/rgur/py_scripts/efrc_ml_production.py'>

In [16]:
from sklearn.metrics import r2_score as r2

In [17]:
import os
from os import listdir
from os.path import isfile, join

In [43]:
import rishi_utils as ru
importlib.reload(ru)

<module 'rishi_utils' from '/home/rgur/py_scripts/rishi_utils.py'>

In [18]:
os.getcwd()

'/home/rgur/py_scripts'

# Define variables

In [19]:
results_dir = '/data/rgur/efrc/ml/results/'

In [20]:
stacked_path = '/data/rgur/efrc/prep_data/all_v1/stacked.csv'
ml_d_path = '/data/rgur/efrc/prep_data/all_v1/ml_data.csv'

In [21]:
Y_DATA_PATH = '/data/rgur/efrc/data_DONOTTOUCH/hMOF_allData_March25_2013.xlsx'

In [22]:
# Read from the path variables defined in the cell above instead of repeating
# the literal paths, so there is a single place to change the data location.
stacked = pd.read_csv(stacked_path)
ml_d = pd.read_csv(ml_d_path)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
ml_d = ml.merge_data(ml_d, False, Y_DATA_PATH)

In [24]:
# Lookup table (filename, pressure) -> vol_uptake, one entry per row of `stacked`.
# Built column-wise: iterrows() constructs a Series object for every row, which
# is needlessly slow on a large frame. If a key repeats, the last row wins
# (same as the previous row-by-row loop).
stacked_map = {
    (fname, pressure): uptake
    for fname, pressure, uptake in zip(
        stacked['filename'].tolist(),
        stacked['pressure'].tolist(),
        stacked['vol_uptake'].tolist(),
    )
}

In [25]:
# Lookup table filename -> 'CH4_cm3/g_35_bar' value, one entry per row of `ml_d`.
# Built column-wise instead of via iterrows(), which creates a Series per row.
# If a filename repeats, the last row wins (same as the previous loop).
grav_map = dict(zip(ml_d['filename'].tolist(), ml_d['CH4_cm3/g_35_bar'].tolist()))

# Define functions

In [26]:
def getMetrics(true, pred, metrics):
    '''
    Evaluate every metric function in `metrics` on the same (true, pred) pair.

    Each metric is called positionally as metric(true, pred). The scores are
    returned as a list in the same order as `metrics`.
    '''
    scores = []
    for metric in metrics:
        scores.append(metric(true, pred))
    return scores

In [27]:
def checkIsoFile(filename):
    '''
    Decide whether `filename` should be included as an isotherm result file.

    A file is accepted only when its name ends in '.csv', contains 'iso', and
    ru.count_lines reports exactly 533430 lines besides the first one.
    '''
    if not filename.endswith('.csv'):
        return False
    if 'iso' not in filename:
        return False
    # One line is subtracted before comparing (presumably the CSV header row).
    n_rows = ru.count_lines(filename) - 1
    return bool(n_rows == 533430)

In [28]:
def checkGravFile(filename, expected_rows=126369):
    '''
    Decide whether `filename` should be included as a gravimetric result file.

    Parameters
    ----------
    filename : str
        Path of the candidate file.
    expected_rows : int, optional
        Number of lines the file must have besides its first line.
        Defaults to 126369, the value previously hard-coded here.

    Returns
    -------
    bool
        True only when the name ends in '.csv', contains 'grav', and the file
        has exactly `expected_rows` lines after the first one.
    '''
    if not (filename.endswith('.csv') and 'grav' in filename):
        return False
    # Count lines in binary mode: no decoding is needed to count newlines, and
    # text mode raises UnicodeDecodeError on files that are not valid UTF-8.
    with open(filename, 'rb') as handle:
        row_count = sum(1 for _ in handle) - 1
    return row_count == expected_rows

In [29]:
def get_t_pct(filename):
    '''
    Extract the training percentage encoded in a result filename.

    The percentage is the run of digits that follows 'train_', e.g.
    '..._train_60_seed_0_...' gives 60.0. The digits are matched as a whole, so
    one- and three-digit values ('train_5_', 'train_100_') parse correctly
    rather than being cut to a fixed two characters.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the filename contains no 'train_<digits>' token.
    '''
    import re  # local import: the notebook's import cells do not load `re`

    match = re.search(r'train_(\d+)', filename)
    if match is None:
        raise ValueError("no 'train_<pct>' token in filename: %r" % (filename,))
    return float(match.group(1))

In [30]:
def get_Code(filename):
    '''
    Return the (up to) four characters that follow the first 'code_' in `filename`.
    '''
    after_marker = filename.split('code_')[1]
    return after_marker[0:4]

In [31]:
def getSeed(filename):
    '''
    Return the single character that follows the first 'seed_' in `filename`.

    The result is a string; only one character is read, so a multi-digit seed
    would be cut to its first digit.
    '''
    after_marker = filename.split('seed_')[1]
    return after_marker[0:1]

In [32]:
def getAllFilenames(parent_dir):
    '''
    List the paths of all regular files directly inside `parent_dir`.

    Sub-directories are skipped. Each returned path is join(parent_dir, name),
    i.e. exactly the path that was tested with isfile, so the result is valid
    whether or not `parent_dir` ends with a path separator.
    '''
    paths = []
    for name in listdir(parent_dir):
        path = join(parent_dir, name)
        if isfile(path):
            paths.append(path)
    return paths

In [33]:
def getTruthVals(train_keys, test_keys, my_map):
    '''
    Look up the ground-truth value for every train key and every test key.

    Returns a (train_truth, test_truth) pair of lists, each in the order of the
    corresponding key list. A key missing from `my_map` raises KeyError.
    '''
    def lookup(keys):
        return [my_map[key] for key in keys]

    train_truth = lookup(train_keys)
    test_truth = lookup(test_keys)
    return train_truth, test_truth

In [34]:
def getPredsAndKeys(df):
    '''
    Split a predictions frame into train/test predictions and lookup keys.

    Rows are partitioned on the 'Class' column ('Train' / 'Test'). When the
    'Pressure' column holds a single unique value the keys are the 'Filename'
    values; otherwise each key is a (Filename, Pressure) tuple.

    Returns (train_preds, train_keys, test_preds, test_keys), all lists.
    '''
    single_pressure = len(df['Pressure'].unique()) == 1

    def keys_of(part):
        if single_pressure:
            return part['Filename'].tolist()
        return [tuple(row) for row in part[['Filename', 'Pressure']].to_numpy()]

    train_part = df[df['Class'] == 'Train']
    test_part = df[df['Class'] == 'Test']
    return (
        train_part['Prediction'].tolist(),
        keys_of(train_part),
        test_part['Prediction'].tolist(),
        keys_of(test_part),
    )

In [35]:
def get_time(f):
    '''
    Return the last six '_'-separated tokens of `f`, re-joined with '_'.

    Only the text before the first '.' in `f` is considered. For the result
    files this is the timestamp suffix, e.g. '12_34PM_on_April_17_2020'.
    '''
    stem = f.split('.')[0]
    tokens = stem.split('_')
    return '_'.join(tokens[-6:])

In [36]:
def unflatten(l, n):
    '''
    Cut `l` into consecutive slices of length `n`.

    Only complete slices are returned: trailing elements that do not fill a
    whole slice are dropped.
    '''
    return [l[start:start + n] for start in range(0, len(l) - n + 1, n)]

In [37]:
def getMetricsFromDf(f, fix_preds, stacked):
    '''
    Score the predictions stored in result file `f`.

    f         : path of a CSV readable by pd.read_csv
    fix_preds : if truthy, run fix_predictions on the frame before scoring
    stacked   : if truthy, truth values come from stacked_map, else from grav_map

    Returns (train_vals, test_vals); each is [rmse, r2] computed with
    ml.get_rmse and r2 on the truth/prediction pairs.
    '''
    predictions = pd.read_csv(f)
    if fix_preds:
        fix_predictions(predictions)
    train_preds, train_keys, test_preds, test_keys = getPredsAndKeys(predictions)
    truth_map = stacked_map if stacked else grav_map
    train_truth, test_truth = getTruthVals(train_keys, test_keys, truth_map)
    metric_fns = [ml.get_rmse, r2]
    train_vals = getMetrics(true=train_truth, pred=train_preds, metrics=metric_fns)
    test_vals = getMetrics(true=test_truth, pred=test_preds, metrics=metric_fns)
    return (train_vals, test_vals)

In [38]:
def getResultsDf(good_codes=None, good_time=None, fix_preds=True, include_geo=False, stacked=False):
    '''
    Collect train/test RMSE and R2 for every group of result files.

    Parameters
    ----------
    good_codes : list of str or None
        Codes (as returned by get_Code) to keep. When None, the codes are
        derived from the files found: those whose last character is '1' if
        `include_geo` is true, otherwise those whose last character is '0'.
        An explicit list is used as given; previously it was silently
        overwritten by the include_geo-derived list.
    good_time : collection of str or None
        When not None, keep only files whose get_time(...) value is in it.
    fix_preds : bool
        Passed to getMetricsFromDf (replace negative predictions by 0).
    include_geo : bool
        Only used to derive `good_codes` when that argument is None.
    stacked : bool
        True selects files with checkIsoFile and truth from stacked_map,
        False selects files with checkGravFile and truth from grav_map.

    Returns
    -------
    pandas.DataFrame
        Two rows ("Train", then "Test") per group of three files, with columns
        'Run k R2' / 'Run k RMSE' for k = 1..3, 'Training %', 'Code', 'Class'.

    Notes
    -----
    Files are sorted by path and cut into consecutive groups of three; each
    group is treated as the three seeds of one setting (assumes sorting puts
    them next to each other -- verify against the file naming). A trailing
    incomplete group is ignored.
    '''
    runs_per_group = 3

    check_file = checkIsoFile if stacked else checkGravFile
    relevent = [f for f in getAllFilenames(results_dir) if check_file(f)]

    if good_codes is None:
        geo_flag = '1' if include_geo else '0'
        unique_codes = list(set(get_Code(f) for f in relevent))
        good_codes = [code for code in unique_codes if code[-1] == geo_flag]
    print(good_codes)
    relevent = [f for f in relevent if get_Code(f) in good_codes]
    if good_time is not None:
        relevent = [f for f in relevent if get_time(f) in good_time]

    # Column order matches the original frame layout.
    columns = {}
    for run in range(1, runs_per_group + 1):
        columns["Run %d R2" % run] = []
        columns["Run %d RMSE" % run] = []
    columns["Training %"] = []
    columns["Code"] = []
    columns["Class"] = []

    for group in unflatten(sorted(relevent), runs_per_group):
        first = group[0]
        # vals[run][0 = train, 1 = test] == [rmse, r2]
        vals = [getMetricsFromDf(path, fix_preds, stacked) for path in group]
        for class_ind, class_name in enumerate(("Train", "Test")):
            columns["Code"].append(get_Code(first))
            columns["Training %"].append(get_t_pct(first))
            columns["Class"].append(class_name)
            for run, run_vals in enumerate(vals, start=1):
                rmse_val, r2_val = run_vals[class_ind]
                columns["Run %d R2" % run].append(r2_val)
                columns["Run %d RMSE" % run].append(rmse_val)

    return pd.DataFrame(columns)

In [39]:
def labelFromCode(code):
    '''
    Turn a four-character code into a '+'-joined legend label.

    Position 0..3 of `code` switches on 'SI', 'SD', 'LS', 'Geo' respectively
    when the character there is '1'; e.g. '1010' -> 'SI+LS', '0000' -> ''.
    '''
    tags = ('SI', 'SD', 'LS', 'Geo')
    return '+'.join(tag for pos, tag in enumerate(tags) if code[pos] == '1')

In [40]:
def fix_predictions(df):
    '''
    Replace every negative value in df['Prediction'] with 0, in place.

    The frame passed in is modified; nothing is returned.
    '''
    clipped = [0 if p < 0 else p for p in df['Prediction'].tolist()]
    df['Prediction'] = clipped

In [41]:
def plot_results(df, title='Learning Curve', good_codes=None, score_name='RMSE', ylim=None, axes=None):
    '''
    Plot train/test learning curves (mean +/- std over three runs) per code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with columns 'Code', 'Class' ('Train' / 'Test'), 'Training %'
        and 'Run 1 <score_name>' .. 'Run 3 <score_name>'.
    title : str
        Axes title.
    good_codes : list or None
        Codes to plot; None plots every unique value of df['Code'].
    score_name : str
        Score suffix of the run columns, also used as the y label.
    ylim : tuple or None
        Passed to axes.set_ylim when given.
    axes : matplotlib Axes or None
        Axes to draw on; a new 8x8 figure is created when None.

    Returns
    -------
    The matplotlib.pyplot module (kept for backward compatibility).
    '''
    colors = ['r', 'k', 'b', 'c', 'm', 'y', 'darkorange', 'slategray', 'tan', 'deeppink', 'greenyellow', 'g']
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(8, 8))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    # NOTE(review): the x values come from the 'Training %' column, so this
    # label looks inaccurate; left unchanged to keep existing figures identical.
    axes.set_xlabel("Training examples")
    axes.set_ylabel(score_name)
    # Switch the grid on once. A bare axes.grid() toggles visibility, so calling
    # it once per code turned the grid off again for an even number of codes.
    axes.grid(True)

    if good_codes is None:
        good_codes = df['Code'].unique().tolist()
    run_cols = ['Run %d %s' % (run, score_name) for run in (1, 2, 3)]

    for ind, code in enumerate(good_codes):
        LABEL = labelFromCode(code)
        color = colors[ind % len(colors)]  # wrap around instead of IndexError
        df_code = df[df['Code'] == code]
        train_df = df_code[df_code['Class'] == 'Train']
        test_df = df_code[df_code['Class'] == 'Test']

        # x values are taken from this code's own rows, so a code that has
        # fewer finished training sizes is still plotted against the right x.
        train_sizes = train_df['Training %'].to_numpy()
        test_sizes = test_df['Training %'].to_numpy()

        train_scores_mean = np.mean(train_df[run_cols], axis=1)
        train_scores_std = np.std(train_df[run_cols], axis=1)
        test_scores_mean = np.mean(test_df[run_cols], axis=1)
        test_scores_std = np.std(test_df[run_cols], axis=1)

        # Shaded band: mean +/- one standard deviation across the three runs.
        axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                          train_scores_mean + train_scores_std, alpha=0.1,
                          color=color)
        axes.fill_between(test_sizes, test_scores_mean - test_scores_std,
                          test_scores_mean + test_scores_std, alpha=0.1,
                          color=color)
        axes.plot(train_sizes, train_scores_mean, 'o-', color=color,
                  label="Training %s - %s" % (score_name, LABEL))
        axes.plot(test_sizes, test_scores_mean, 'x-', color=color,
                  label="Test %s - %s" % (score_name, LABEL))
    axes.legend(loc="best")

    return plt

# Run 

In [42]:
# Earlier selections, kept for reference:
#GOOD_CODES = ['1010', '1000']
#GOOD_TIME = ['08_37PM_on_April_18_2020', '02_56PM_on_April_20_2020']
# Timestamp suffixes (the value get_time extracts from a result filename) of the
# runs to include; getResultsDf keeps only files whose timestamp is in this list.
GOOD_TIME = ['05_52PM_on_April_26_2020', '11_15AM_on_April_28_2020']


# stacked=True   -> files are selected with checkIsoFile and scored against stacked_map
# fix_preds=True -> negative predictions are replaced by 0 before scoring
results_df = getResultsDf(good_codes=None, good_time=GOOD_TIME, fix_preds=True, include_geo=False, stacked=True)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

In [None]:
%matplotlib inline

plot_results(results_df, score_name='RMSE')

In [33]:
results_df

Unnamed: 0,Run 1 R2,Run 1 RMSE,Run 2 R2,Run 2 RMSE,Run 3 R2,Run 3 RMSE,Training %,Code,Class
0,0.970985,15.028968,0.975885,13.693248,0.980611,12.267472,50.0,100,Train
1,0.935698,22.339433,0.938589,21.844361,0.936462,22.239241,50.0,100,Test
2,0.966213,16.207153,0.975426,13.805333,0.976767,13.434579,60.0,100,Train
3,0.939073,21.758697,0.938325,21.930825,0.939624,21.671886,60.0,100,Test
4,0.95349,19.013907,0.961985,17.198969,0.968538,15.647018,70.0,100,Train
5,0.936168,22.27312,0.940732,21.435716,0.940894,21.405519,70.0,100,Test
6,0.969757,15.332945,0.971084,14.997478,0.9756,13.774696,80.0,100,Train
7,0.942017,21.224364,0.943153,20.988541,0.943738,20.893389,80.0,100,Test
8,0.967063,16.005242,0.965603,16.34968,0.968113,15.741234,90.0,100,Train
9,0.944109,20.786467,0.942142,21.224364,0.946202,20.47416,90.0,100,Test


# Scratch

In [None]:
# NOTE: despite its name this frame holds the 'Test' rows at 70% training data.
# .copy() makes it independent of results_df, so the column assignments in the
# following cells do not write to a filtered view (SettingWithCopyWarning).
train_df = results_df[(results_df['Class'] == 'Test') & (results_df['Training %'] == 70.0)].copy()

In [None]:
train_df['Average RMSE'] = np.mean(train_df[['Run 1 RMSE', 'Run 2 RMSE', 'Run 3 RMSE']], axis=1)

In [None]:
train_df['Average R2'] = np.mean(train_df[['Run 1 R2', 'Run 2 R2', 'Run 3 R2']], axis=1)

In [None]:
train_df

# Test functions

In [None]:
getMetrics([1,0,0], [1,0,0], [ml.get_rmse, r2]) 

In [None]:
a = '/data/rgur/efrc/ml/results/results_grav_code_1110_train_60_seed_0_12_34PM_on_April_17_2020.csv'

In [None]:
checkGravFile(a)

In [None]:
unflatten([0,1,2,3,4,6,7,8,9], 3)

In [None]:
labelFromCode('1111')