In [None]:
%matplotlib inline
import captest as pvc
import pandas as pd
import numpy as np
from bokeh.io import output_notebook, show

import operator

# Activate Bokeh's inline notebook output. The function has to be called;
# a bare reference to its name only displays the function object.
output_notebook()

In [None]:
def to_mindex(df):
    """
    Return a copy of `df` whose flat column names become a two-level MultiIndex.

    Each column name is split at its first underscore: the text before it is
    the 'location' level and everything after it is the 'instrument' level,
    e.g. 'met1_poa_refcell' -> ('met1', 'poa_refcell') and
    'meter_power' -> ('meter', 'power').

    Parameters
    ----------
    df : pd.DataFrame
        Frame with string column names of the form '<location>_<instrument>'.

    Returns
    -------
    pd.DataFrame
        Same data, index and column order as `df`, with MultiIndex columns
        named ('location', 'instrument') and the index name cleared.

    Raises
    ------
    ValueError
        If a column name contains no underscore, so no instrument part exists.
    """
    locations = []
    instruments = []
    for col in df.columns:
        # Splitting only at the first underscore keeps the whole remainder as the
        # instrument, so names with any number of underscores map to unique pairs.
        location, sep, inst = col.partition('_')
        if not sep:
            raise ValueError(
                "column name {!r} has no '_' separating location and instrument".format(col))
        locations.append(location)
        instruments.append(inst)

    df_mindex = df.copy()
    # Build the MultiIndex through a constructor helper instead of the raw
    # MultiIndex(levels=..., labels=...) call; the `labels` keyword no longer exists.
    df_mindex.columns = pd.MultiIndex.from_arrays(
        [locations, instruments], names=[u'location', u'instrument'])

    # Clear the index name on a private copy of the index so the input frame's
    # index object is never touched.
    new_index = df_mindex.index.copy()
    new_index.name = None
    df_mindex.index = new_index

    return df_mindex

In [None]:
def inject_err(df, col, meas_err='std', flt_val=None, flt_col=None, between_time=None, perc=100, seed=None):
    """
    Return a copy of `df` with zero-mean Gaussian noise added to one column.

    The input frame is left unmodified. Noise is added either to every row
    (no filters given) or only to the rows selected by the value filter
    and/or the time-of-day filter, optionally thinned to a random percentage.

    Parameters
    ----------
    df : dataframe
        Source data. `between_time` requires a DatetimeIndex.
    col : str
        Name of the column that receives the noise.
    meas_err : str or float
        If 'std' adds error where std is 0.03 * col.std
        If 'outlier' adds error where std is 1/6th the range of the data
        If float (or int) adds error where std is float * col.std
    flt_val : str, optional default None
        Comparison appended to a column name and passed to
        pd.DataFrame.query to select rows.
        Ex: '> 100'
    flt_col : str, optional default None
        Column name the `flt_val` comparison is applied to.
        When None the comparison is applied to `col`.
    between_time : (str, str), optional default None
        Tuple of two strings, which are converted to start and end times of
        time of day; only rows inside that window receive noise.
    perc : int or float, default 100
        Percent of the filtered rows that receive noise. Applied after the
        value and time filters; ignored when no filter is given.
    seed : integer, optional default None
        Integer to seed np.random.RandomState

    Returns
    -------
    dataframe
        Copy of `df` with noise added to `col`.

    Raises
    ------
    ValueError
        If `meas_err` is neither 'std', 'outlier' nor a number.
    """
    df_out = df.copy()
    column = df.loc[:, col]
    col_num = df.columns.get_loc(col)

    # RandomState(None) seeds itself from the OS, same as RandomState().
    rng = np.random.RandomState(seed)

    # Resolve the noise scale. Strings are compared by value (==); an
    # unsupported value fails here instead of leaving `std` undefined.
    if isinstance(meas_err, (int, float, np.integer, np.floating)):
        std = column.std() * meas_err
    elif meas_err == 'std':
        std = column.std() * 0.03
    elif meas_err == 'outlier':
        std = (column.max() - column.min()) / 6
    else:
        raise ValueError(
            "meas_err must be 'std', 'outlier' or a number, got {!r}".format(meas_err))

    # No filters: perturb every row of the column.
    if flt_val is None and between_time is None:
        df_out.loc[:, col] = std * rng.randn(df.shape[0]) + column
        return df_out

    # Attach positional row numbers to a temporary frame (not to the caller's
    # `df`) so label-based filters can be translated into .iloc positions.
    indexed = df.assign(ix=np.arange(0, df.shape[0]))

    ix_flt = None
    if flt_val is not None:
        query_col = col if flt_col is None else flt_col
        ix_flt = indexed.query(query_col + flt_val).loc[:, 'ix'].values
    if between_time is not None:
        ix_flt_time = indexed.between_time(between_time[0], between_time[1]).loc[:, 'ix'].values
        # With both filters active only rows passing both are kept.
        ix_flt = ix_flt_time if ix_flt is None else np.intersect1d(ix_flt, ix_flt_time)

    # Nothing matched the filters: there is nothing to perturb.
    if ix_flt.shape[0] == 0:
        return df_out

    # Thin the selection to `perc` percent. The draw goes through `rng` so the
    # chosen rows are reproducible for a given seed.
    if perc != 100:
        ix_flt = rng.choice(ix_flt, int(perc / 100 * ix_flt.shape[0]), replace=False)

    # Work on plain arrays so the assignment is strictly positional and does
    # not depend on index alignment.
    df_out.iloc[ix_flt, col_num] = std * rng.randn(ix_flt.shape[0]) + column.values[ix_flt]

    return df_out

    # add index column to dataframe
#     df['ix'] = np.arange(0, df.shape[0])

    # non query filter
    # indices = df[df['globHor_err'] >= 100].loc[:, 'ix'].values

    # query with operator string argument
#     operator = '>='
#     indices = df.query('globHor_err' + operator + '100').loc[:, 'ix'].values

    # select random sample from filtered indices
#     indices_rand = np.random.choice(indices, int(0.015 * df.shape[0]), replace=False)

    # get column to apply error to
#     df_egrid_err = df.loc[:, 'egrid_err']
    # calc std dev of error
#     std_dev = (df_egrid_err.max() - df_egrid_err.min()) / 6
    # determine integer index of column for use in .iloc 
#     col_num = df.columns.get_loc('egrid_err')
    # write over the randomly selected rows in the column with values adjusted to have higher error
#     df.iloc[indices_rand, col_num] = std_dev * rng.randn(indices_rand.shape[0]) + df_egrid_err.iloc[indices_rand]

In [None]:
# Instantiate a captest CapData object; the next cell loads data into it.
pvsyst = pvc.CapData()

In [None]:
# Load data into the CapData object with the load_pvsyst flag enabled.
# NOTE(review): presumably this reads PVsyst simulation output from captest's
# default data location -- confirm against the captest documentation.
pvsyst.load_data(load_pvsyst=True)

In [None]:
# Preview the first rows of the loaded data.
pvsyst.df.head()

In [None]:
# Label-slice the loaded data from 10/9/1990 through 10/14/1990 0:00.
test_per = pvsyst.df.loc['10/9/1990':'10/14/1990 0:00']

In [None]:
# Upsample the selected period to a 15-second interval and fill the new rows by interpolation.
# NOTE(review): the variable name says "5min" but the resample rule is '15s';
# the 5-minute averaging happens later, after noise has been injected.
test_per_5min = test_per.resample('15s').interpolate()

In [None]:
# Drop the final row of the upsampled frame.
# NOTE(review): this rebinds the same name, so re-running the cell drops one more row each time.
test_per_5min = test_per_5min.iloc[0:-1]

In [None]:
# Display the (rows, columns) shape of the upsampled frame.
test_per_5min.shape

In [None]:
# Keep only the PVsyst columns that the following cells map onto simulated sensor columns.
df = test_per_5min.loc[:,['GlobInc', 'GlobHor', 'TAmb', 'TArray', 'WindVel', 'E_Grid', 'EOutInv']]

In [None]:
# Preview the selected PVsyst columns.
df.head()

In [None]:
# Wherever the array temperature is below ambient, overwrite it with ambient minus 2.2.
# This only touches df's TArray column, so it can run before any column is copied.
ix = df.query('TArray - TAmb < 0').index
df.loc[ix, 'TArray'] = df.loc[ix,'TAmb'] - 2.2

# (simulated measured column, PVsyst source column) pairs.
# The order is significant: later cells select groups of columns by position.
column_sources = [
    ('met1_poa_refcell', 'GlobInc'),
    ('met2_poa_refcell', 'GlobInc'),
    ('met1_poa_pyranometer', 'GlobInc'),
    ('met2_poa_pyranometer', 'GlobInc'),
    ('met1_ghi_pyranometer', 'GlobHor'),
    ('met2_ghi_pyranometer', 'GlobHor'),
    ('met1_amb_temp', 'TAmb'),
    ('met2_amb_temp', 'TAmb'),
    ('met1_mod_temp1', 'TArray'),
    ('met1_mod_temp2', 'TArray'),
    ('met2_mod_temp1', 'TArray'),
    ('met2_mod_temp2', 'TArray'),
    ('met1_windspeed', 'WindVel'),
    ('met2_windspeed', 'WindVel'),
    ('meter_power', 'E_Grid'),
]

# Copy each PVsyst series into its simulated sensor column.
tdata_meas = pd.DataFrame()
for meas_name, pvsyst_name in column_sources:
    tdata_meas[meas_name] = df[pvsyst_name]

# Split the inverter output evenly across eight inverter columns (inv1_power .. inv8_power).
for num in range(8):
    inv = 'inv' + str(num + 1) + '_power'
    tdata_meas[inv] = df['EOutInv'] / 8

In [None]:
# Plot one met1 module temperature column together with the met1 ambient temperature column.
tdata_meas[['met1_mod_temp2', 'met1_amb_temp']].plot()

In [None]:
# ix = tdata_meas.query('met1_mod_temp1 - met1_amb_temp < 0').index

# tdata_meas.loc[ix, 'met1_mod_temp1'] = tdata_meas.loc[ix,'met1_amb_temp'] - 2.2

In [None]:
# Preview the simulated measured-data frame before any noise is injected.
tdata_meas.head()

### Creating a multi-index before exporting as CSV for import using captest

In [None]:
# tdata_meas_mindex = to_mindex(tdata_meas)

In [None]:
# tdata_meas_mindex.to_csv('./data/test_dat_before_err.csv')

In [None]:
# tdata_meas_mindex.head()

## Injecting randomness into pvsyst data to simulate measured data

In [None]:
# Work on a copy so that tdata_meas keeps the values without injected randomness.
tdm_rand = tdata_meas.copy()

In [None]:
# tdm_rand.columns.tolist()

In [None]:
# Names of all met2 sensor columns; a later cell overwrites these columns with a
# constant value over a time window in the resampled data.
met2 = ['met2_poa_refcell', 'met2_poa_pyranometer', 'met2_ghi_pyranometer', 'met2_amb_temp',
        'met2_mod_temp1', 'met2_mod_temp2', 'met2_windspeed']

In [None]:
# Columns at positions 6-13. Given the order in which tdata_meas was built these are the
# ambient temperature, module temperature and windspeed columns, which receive noise on every row.
# NOTE: this selection depends on the column insertion order.
not_no_night = tdm_rand.columns.tolist()[6:14]

In [None]:
# First six columns (the POA and GHI irradiance sensors) plus the last nine
# (meter_power and the eight inverter power columns). A later cell injects noise
# into these only where the value is above 0.1.
# NOTE: this selection depends on the column insertion order.
no_night = tdm_rand.columns.tolist()[:6]
no_night.extend(tdm_rand.columns.tolist()[-9:])

In [None]:
# One seed per column in no_night so each column gets its own reproducible noise.
seeds = [43, 34, 657, 342, 23, 238, 123, 3, 45, 90, 84, 29, 346, 34, 934]
# Add 'std'-level noise (std = 0.03 * column std) to each column, but only on rows
# where that same column is > 0.1. Each pass feeds the previous result back in.
for i, col in enumerate(no_night):
    tdm_rand = inject_err(tdm_rand, col, meas_err='std', flt_val='> 0.1', flt_col=col, seed=seeds[i])

In [None]:
# One seed per column in not_no_night.
seeds = [87, 7848, 298, 209, 983, 292, 20, 2]
# Add 'std'-level noise to every row of these columns (no value or time filter).
for i, col in enumerate(not_no_night):
    tdm_rand = inject_err(tdm_rand, col, meas_err='std', seed=seeds[i])

In [None]:
# tdm_rand = inject_err(tdm_rand, 'meter_power', meas_err=3, flt_val=' >= 100',
#                       flt_col='met1_ghi_pyranometer', perc=10, seed=234)

In [None]:
# tdm_rand = inject_err(tdm_rand, 'meter_power', meas_err=6, flt_val=' >= 100',
#                       flt_col='met1_ghi_pyranometer', perc=1, seed=234)

In [None]:
# tdm_rand = inject_err(tdm_rand, 'meter_power', meas_err=4.0, flt_val=' >= 500',
#                       flt_col='met1_ghi_pyranometer', perc=1, seed=87)

In [None]:
# One seed per irradiance column (the first six entries of no_night).
seeds = [843, 294, 82, 2854, 9483, 54]
# 'outlier' noise (std = 1/6 of the column's range) on a random 30% of the rows that fall
# between 13:00 and 16:00 and where met1_poa_pyranometer > 10.
for i,col in enumerate(no_night[:6]):
    tdm_rand = inject_err(tdm_rand, col, meas_err='outlier', flt_val=' > 10',
                          flt_col='met1_poa_pyranometer', perc=30, between_time=('13:00', '16:00'), seed=seeds[i])

In [None]:
# seeds = [843, 294, 82, 2854, 9483, 54]
# for i,col in enumerate(no_night[:6]):
#     tdm_rand = inject_err(tdm_rand, col, meas_err=0.1, flt_val=' > 10',
#                           flt_col='met1_poa_pyranometer', perc=30, between_time=('15:00', '16:00'), seed=seeds[i])

In [None]:
# Same seeds and same row filter as the 13:00-16:00 outlier pass.
seeds = [843, 294, 82, 2854, 9483, 54]
# Noise with std = 10 * column std on a random 2% of the rows between 13:00 and 16:00
# where met1_poa_pyranometer > 10.
for i,col in enumerate(no_night[:6]):
    tdm_rand = inject_err(tdm_rand, col, meas_err=10, flt_val=' > 10',
                          flt_col='met1_poa_pyranometer', perc=2, between_time=('13:00', '16:00'), seed=seeds[i])

In [None]:
# Same seeds as the afternoon passes, applied to a morning window.
seeds = [843, 294, 82, 2854, 9483, 54]
# 'outlier' noise on a random 10% of the rows between 8:30 and 8:45
# where met1_poa_pyranometer > 10.
for i,col in enumerate(no_night[:6]):
    tdm_rand = inject_err(tdm_rand, col, meas_err='outlier', flt_val=' > 10',
                          flt_col='met1_poa_pyranometer', perc=10, between_time=('8:30', '8:45'), seed=seeds[i])

In [None]:
# seeds = [843, 294, 82, 2854, 9483, 54]
# for i,col in enumerate(no_night[:6]):
#     tdm_rand = inject_err(tdm_rand, col, meas_err=12, flt_val=' > 10',
#                           flt_col='met1_poa_pyranometer', perc=1, between_time=('8:30', '8:45'), seed=seeds[i])

In [None]:
# seeds = [843, 294, 82, 2854, 9483, 54]
# for i,col in enumerate(no_night[:6]):
#     tdm_rand = inject_err(tdm_rand, col, meas_err='outlier', flt_val=' > 10',
#                           flt_col='met1_poa_pyranometer', perc=10, between_time=('14:00', '16:00'), seed=seeds[i])

In [None]:
# Index labels of the meter_power rows whose time of day is between 13:00 and 18:00.
mtr_ix = tdm_rand.loc[:,'meter_power'].between_time('13:00', '18:00').index

In [None]:
# Reduced meter power for the same 13:00-18:00 rows: scale by 0.95 and subtract 100000.
mtr_reduced = (tdm_rand.loc[:,'meter_power'].between_time('13:00', '18:00') * .95) - 100000

In [None]:
# Write the reduced values back over meter_power for the selected rows.
# NOTE(review): this mutates tdm_rand in place, so re-running the preceding cell and this one
# applies the reduction again.
tdm_rand.loc[mtr_ix, 'meter_power'] = mtr_reduced

In [None]:
# Force met1_poa_refcell to a constant 150 from 12:50 through 12:55 on 10/10/1990.
tdm_rand.loc['10/10/1990 12:50':'10/10/1990 12:55', 'met1_poa_refcell'] = 150

I am re-running the cells above to see whether the graph below stays the same with everything seeded.
It does not, because of the `np.random.choice` lines used by the percentage (`perc`) argument in `inject_err`.
I am looking for a way to seed `random.choice`.

It looks like the `choice` method of an `np.random.RandomState` object needs to be used instead.

In [None]:
# Scatter meter power against met1 reference-cell POA irradiance for the noisy 15-second data.
tdm_rand.plot(x='met1_poa_refcell', y='meter_power', kind='scatter', alpha=0.2)

In [None]:
# Aggregate a copy of the noisy data to 5-minute means.
tdm_rand_resample = tdm_rand.copy().resample('5min').mean()

In [None]:
# Same scatter as for the 15-second data, now for the 5-minute means.
tdm_rand_resample.plot(x='met1_poa_refcell', y='meter_power', kind='scatter', alpha=0.2)

In [None]:
# Blank out met1 ambient temperature from 13:05 through 14:20 on 10/09/90 to create a gap of missing data.
# Uses np.nan from the notebook's own numpy import; the pd.np alias and the np.NaN spelling
# are not available in current pandas / NumPy releases.
tdm_rand_resample.loc['10/09/90 13:05':'10/09/90 14:20','met1_amb_temp'] = np.nan

In [None]:
# For every met2 column, overwrite 16:00 through 17:15 on 10/11/90 with that column's 16:00 value,
# so the column is constant over the window.
# NOTE(review): presumably meant to mimic a frozen / stuck met station -- confirm intent.
for col in met2:
    tdm_rand_resample.loc['10/11/90 16:00':'10/11/90 17:15', col] = tdm_rand_resample.loc['10/11/90 16:00', col]

In [None]:
# Convert the flat '<location>_<instrument>' column names into (location, instrument) MultiIndex columns.
tdm_rand_resample_mindex = to_mindex(tdm_rand_resample)

In [None]:
# Write the example measured data to CSV at a path relative to the notebook's working directory.
# NOTE(review): to_csv does not create directories, so ./data presumably must already exist -- confirm.
tdm_rand_resample_mindex.to_csv('./data/example_meas_data.csv')