In [2]:
%run basic.ipynb

# General Utilities 

These generic helpers are written to `~/bin/gen/lstmutils1.py` by the next cell and are imported by the other generated modules.

In [None]:
%%writefile  ~/bin/gen/lstmutils1.py
#!/usr/local/bin/python 
'''
Some utility Functions to be used in all the apps
#=*** NOTE *** DO NOT EDIT THIS FILE - THIS iS CREATED FROM: 01_utils.ipynb
'''
import re, sklearn, sys, os, datetime, glob, argparse, json, base64, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (16, 4)
mpl.rcParams['axes.grid'] = False

pd.options.display.max_rows = 8

# sys.path entries are used verbatim, so "~" has to be expanded explicitly;
# an unexpanded "~/bin/gen" never matches a real directory.
sys.path.append(os.path.expanduser("~/bin/gen"))
import ccallbacks

#-----------------------------------------------------------------------------------
def inJupyter():
    """Return True when running under IPython/Jupyter, False otherwise.

    The check looks up the ``get_ipython`` name; when it is not defined the
    lookup raises ``NameError``, which is the only error handled here.
    """
    try:
        get_ipython  # noqa: F821 - name lookup only, never called
    except NameError:
        return False
    return True

def getconfig(cf = "config*"):
    """Read every configuration file matching the glob ``cf`` and merge them.

    Files are processed in sorted order and merged with ``dict.update``, so a
    key defined in a later file overrides the same key from an earlier one.
    A file containing a ``[START]`` marker contributes only the text between
    ``[START]`` and ``[END]`` (single quotes are converted to double quotes);
    a file without the marker is evaluated as a whole.

    Parameters:
        cf: glob pattern of the configuration files.

    Returns:
        dict with the merged configuration.

    Exits the process with status 1 when no file matches ``cf``.
    """
    confFiles = sorted(glob.glob(cf))
    if not confFiles:
        print(f"No Configuration files {cf} found!!!")
        sys.exit(1)

    # Read and merge the configuration files
    ret = {}
    for confFile in confFiles:
        print(f"#Getting Configuration from {confFile}")
        with open(confFile, "r") as f:
            text = f.read()

        if '[START]' not in text:
            body = text
        else:
            found = re.findall(r"\[START](.*)\[END]", text, flags=re.MULTILINE|re.DOTALL)
            if not found:
                # Report the file name (not its whole content) that was skipped.
                print(f"Ignoring: Configuration not found in {confFile}! no worries")
                continue
            body = found[0].replace("'", '"')

        # NOTE(security): eval() executes whatever Python the file contains.
        # Only point this at configuration files from a trusted source.
        ret.update(eval(body))

    return ret
    
#-----------------------------------------------------------------------------------
def getConfigList(conf, key=""):
    """Return the flattened list stored under ``conf[key]``.

    Each entry of the configured list is handled as follows:
      * a nested list is spliced into the result;
      * a string starting with ``$`` is a reference to another key and is
        expanded recursively;
      * anything else (plain strings, numbers, ...) is appended unchanged.

    A missing key yields an empty list.
    """
    ret = []
    for item in conf.get(key, []):
        if isinstance(item, list):
            ret += item
        elif isinstance(item, str) and item.startswith("$"):
            ret += getConfigList(conf, item[1:])
        else:
            # Non-string entries have no .startswith(); keep them as they are.
            ret.append(item)

    return ret
#-----------------------------------------------------------------------------------
def getConfigObject(conf, key=""):
    """Rebuild the Python object stored under ``conf[key]``.

    The configuration value is expected to be a sequence whose first element
    is a base64 string wrapping a pickle payload. Returns the unpickled
    object, or None when the key is missing or its value is empty.
    """
    entry = conf.get(key, "")
    if entry:
        # NOTE(security): unpickling runs code embedded in the payload; only
        # use configuration produced by a trusted source.
        payload = base64.b64decode(entry[0])
        return pickle.loads(payload, fix_imports=True)
    return None
#--------------------------------------------------------------------------------
def is_number(s):
    """Return True when ``float(s)`` succeeds, False otherwise.

    Values that float() rejects with TypeError (None, lists, ...) are reported
    as "not a number" as well, instead of propagating the exception.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
#--------------------------------------------------------------------------------
class Map(dict):
    """
    dict whose items can also be read, written and deleted as attributes.

    Reading an attribute that is not a key returns None (it goes through
    dict.get) instead of raising AttributeError.

    Example:
    m = Map({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    """
    def __init__(self, *args, **kwargs):
        super(Map, self).__init__(*args, **kwargs)
        # dict.__init__ bypasses __setitem__, so re-assign the items to mirror
        # them into the instance __dict__ as well.
        for arg in args:
            if isinstance(arg, dict):
                for k, v in arg.items():
                    self[k] = v

        # .items(): dict.iteritems() only exists in Python 2.
        for k, v in kwargs.items():
            self[k] = v

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails.
        return self.get(attr)

    def __setattr__(self, key, value):
        self.__setitem__(key, value)

    def __setitem__(self, key, value):
        super(Map, self).__setitem__(key, value)
        self.__dict__.update({key: value})

    def __delattr__(self, item):
        self.__delitem__(item)

    def __delitem__(self, key):
        super(Map, self).__delitem__(key)
        del self.__dict__[key]
        
#--------------------------------------------------------------------------------
def runMethod(pyMethod, **kwargs):
    """Import a module by name and call one of its functions.

    Parameters:
        pyMethod: dotted path ``package.module.function``; everything before
                  the last dot is the module to import.
        **kwargs: keyword arguments passed to the function.

    Returns:
        Whatever the function returns.

    Raises:
        AssertionError when ``pyMethod`` contains no dot; ImportError or
        AttributeError when the module or the function cannot be found.
    """
    import importlib

    spl = pyMethod.split('.')
    assert len(spl) >= 2, "Hmmm ... May be not what is intended!! module name"

    modName = ".".join(spl[:-1])
    funName = spl[-1]

    # Resolve exactly the requested module. Scanning sys.modules for names
    # that merely start with modName also hit unrelated sub-modules and could
    # call the function several times or fail with AttributeError.
    module = importlib.import_module(modName)
    method = getattr(module, funName)
    print(modName, type(modName), funName, method, type(method), callable(method))
    return method(**kwargs)


#--------------------------------------------------------------------------------
def getInvertedPreds(conf, yh):
    """Undo the output scaling of a batch of predictions.

    Parameters:
        conf: configuration dict providing 'scalerYString', 'scaleOutputs'
              and 'outputs'.
        yh:   predictions, one column per entry of 'outputs'.

    Returns:
        (yhdf, yidf): the predictions as a DataFrame, and a copy in which the
        'scaleOutputs' columns were passed through the Y scaler's
        inverse_transform.
    """
    # These helpers are defined in this module; the previous `utils1.` prefix
    # referred to a name that is never imported here.
    scalerY = getConfigObject(conf, "scalerYString")
    sOuputs = getConfigList(conf, 'scaleOutputs')
    ouputs  = getConfigList(conf, 'outputs')

    yhdf = pd.DataFrame(yh, columns=ouputs)                  # predictions as a frame
    yi   = scalerY.inverse_transform(yhdf[sOuputs].values)   # scaled columns mapped back
    yidf = yhdf.copy()
    yidf[sOuputs] = yi
    return yhdf, yidf
    
#--------------------------------------------------------------------------------
def getOriginal(conf, unnormdf, index=0):
    """Return the rows of ``unnormdf`` from the first window end onward.

    Parameters:
        conf:     configuration dict; reads 'inputs', 'outputs' and
                  'tsParams' ('length' is required, 'stride' defaults to 1).
        unnormdf: un-normalized data frame.
        index:    currently ignored, see the note below.

    Returns:
        (unnormdf[i:], inputs, outputs) where i is tsParams['length'].
    """
    # These helpers are defined in this module; the previous `utils1.` prefix
    # referred to a name that is never imported here.
    inputs  = getConfigList(conf, 'inputs')
    ouputs  = getConfigList(conf, 'outputs')
    tsParams= conf.get("tsParams", {})

    # NOTE(review): the caller-supplied `index` has always been overwritten
    # with 0 here; kept to preserve results - confirm whether it should be
    # honoured.
    index   = 0
    startIX = index + tsParams['length']
    batchSZ = 1 # batch size
    stride  = tsParams.get('stride', 1)
    i = startIX + batchSZ * stride * index
    return unnormdf[i:], inputs, ouputs

#--------------------------------------------------------------------------------
def plotInverted(conf, yh, unnormdf, s =-400, howmany=100):
    """Plot original against inverse-scaled predicted outputs for one window.

    The window covers rows ``s`` to ``s + howmany``; the x axis is the first
    column of the original frame converted with pd.to_datetime.
    """
    window = slice(s, s + howmany)

    _, predicted = getInvertedPreds(conf, yh)
    original, _, outputCols = getOriginal(conf, unnormdf)

    firstCol = original.columns[0]
    x = pd.to_datetime(original[firstCol][window])

    plt.plot(x, original[outputCols].values[window], marker='.', label="Original")
    plt.plot(x, predicted.values[window], marker='x', label="Predicted")
    plt.title("Plotting Inverted Values:")
    plt.grid()
    plt.legend()

def reconstructOrig(conf, unnormdf, yh, ouputs):
    '''
    Reconstruct the original diffed columns

    For every output named "<col>___diff1" whose base column "<col>" exists
    in the original data, adds "<col>" to the inverted predictions as
    original value + predicted diff shifted by -1.

    Returns the inverted prediction frame with the reconstructed columns.
    '''
    yhdf, yidf = getInvertedPreds(conf, yh)
    yorg,ips,ops = getOriginal(conf, unnormdf)

    suffix = "___diff1"
    for o in ouputs:
        if not o.endswith(suffix):
            continue
        oc = o[:-len(suffix)]
        print(f"Getting Original column for: '{oc}' ")
        if oc not in yorg.columns:
            # f-string so the column name is printed rather than a literal "{oc}"
            print(f'Cannot compute the orginal column values from diffs for {oc}')
            continue

        ## WOW <== this is heavy - undo the diffing in the opposite way
        yidf[oc] = yorg[oc].values + yidf[o].shift(-1)

    return yidf # y inverted dataframe with adjusted cols for diffs



#runMethod("gen.utils1.is_number", **{"s":"123.78"})

In [None]:
%%writefile  ~/bin/plotutils1.py
#!/usr/local/bin/python 

import matplotlib.pyplot as plt
import re, sys, os, datetime, glob, json, base64, pickle, sklearn
import pandas as pd
import numpy as np

import keras
from keras.models import Model
from keras.models import load_model
from keras.callbacks import Callback
import IPython
from IPython.display import display
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (16, 3)
mpl.rcParams['axes.grid'] = True

sys.path.append(".")
sys.path.append("gen")
sys.path.append("/opt/LMCO/git/bin/gen")
import lstmutils1;
import lstmfit;
import sklearn.metrics

def predict(modelFile, valg, model=None):
    """Return ``predict(valg)`` of ``model``.

    When ``model`` is falsy the model is loaded from ``modelFile`` with
    load_model first.
    """
    active = model if model else load_model(modelFile)
    return active.predict(valg)
            
def plot1(modelFile, valg, model=None, idx=0, n=-150, howmany=50):
    """Plot a window of targets against predictions and report the MSE.

    Targets are taken as ``valg[i][1][idx]`` for every batch ``i``; the window
    covers positions ``n`` to ``n + howmany``. The MSE is computed over all
    values, not just the plotted window.

    Returns (targets, predictions, mse).
    """
    yh = predict(modelFile, valg, model)
    yy = np.array([valg[batch][1][idx] for batch in range(len(valg))])

    stop = n + howmany
    figure = plt.gcf()
    figure.set_size_inches(22, 10, forward=True)
    plt.plot( yy[n:stop], marker='o', label="original-")
    plt.plot( yh[n:stop], marker='x', label="predicted")
    mse = sklearn.metrics.mean_squared_error(yy, yh)

    plt.title(f"{modelFile} : {model}: MSE: {mse} <==")
    plt.grid()
    plt.legend()
    plt.show()

    return yy, yh, mse


def plotyyyh(columns, xxx, yy1, yh1, s, e):
    """Plot every output whose R^2 is not below 0.5, one figure per output.

    Output ``i`` is labelled with ``columns[i+1]``. Outputs with R^2 < 0.5 are
    only reported on stdout.
    """
    window = slice(s, e)
    nOutputs = yh1.shape[-1]
    for col in range(nOutputs):
        actual, predicted = yy1[:, col], yh1[:, col]
        label = columns[col+1]

        r2 = sklearn.metrics.r2_score(actual, predicted)
        if r2 < 0.5:
            print(f'--{col}: {label} R^2: {r2}')
            continue

        plt.title(f'{col}: {label} R^2: {r2}')

        plt.plot(xxx[window], actual[window], marker='.', label="original")
        plt.plot(xxx[window], predicted[window], marker='x', label="predictd")
        plt.grid()
        plt.legend()
        plt.show()
    
def plotstuff(valg, confile, normeddf=None, model = None, mcpoint=None ):
    """Predict on ``valg`` and plot every well-fitting output.

    When ``model`` is None the configuration, normalized data, model and
    checkpoint callback are loaded through lstmfit from ``confile`` (replacing
    any ``normeddf``/``mcpoint`` passed in). Otherwise the supplied objects
    are used; ``normeddf`` must then be provided by the caller.

    Returns (model, mcpoint, normeddf, targets, predictions).
    """
    if ( model is None):
        conf, unnormdf, normeddf, inps, oups = lstmfit.getConf(confile)
        model, mcpoint = lstmfit.getModel(conf)

    # mcpoint defaults to None and is only filled in above when the model is
    # loaded here, so it must not be dereferenced unconditionally.
    haveCheckpoint = mcpoint is not None
    print(len(valg), mcpoint.best if haveCheckpoint else None)

    xxx = pd.to_datetime(normeddf.time[-len(valg):])
    if haveCheckpoint:
        mcpoint.drawLosses()

    yh1 = model.predict(valg)
    yy1 = np.array([valg[j][1][0] for j in range(len(valg))])

    s = 0
    e = s + len(valg)
    plotyyyh(normeddf.columns, xxx, yy1, yh1, s, e)

    return model, mcpoint, normeddf, yy1, yh1

# Get Stocks Data

In [None]:
%%writefile  ~/bin/gen/getstocksdata.py
#!/usr/local/bin/python 

'GENERATED BY NNBook/notebooks/NNetworks/LSTM/01_Utils.ipynb'

import os, datetime, glob, sys
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
sys.path.append(".")
sys.path.append("gen")
import time, datetime

'''
The following helpers only exist to keep the author's API keys out of the source;
they are not relevant to the problem at hand. If your keys are stored in
~/.keys/keys.json, or you set API_KEY yourself in getdata(), you may safely ignore
this code.
'''

def getkey(key='password'):
    """Return the pair (Alpha Vantage key, News API key).

    When ~/.keys/keys.json exists, the keys are read from its 'AV_API_KEY' and
    'NEWSAPI_KEY' entries. Otherwise the two embedded encrypted strings are
    decrypted with ``key``.
    """
    keyFile = os.path.expanduser('~/.keys/keys.json')
    if not os.path.exists(keyFile):
        # No key file: fall back to the embedded, encrypted keys
        # (alpha_vantage first, then the news key).
        return (
            decrypt('baTEje52rx+kAsuAN9PdxMeC03p/HuRVTzskLiso1/c=', key),
            decrypt('505Db5sDvHvptBPzE8IhsewneuanOKV3gKpN+26lS3A=', key),
        )

    with open(keyFile, 'r') as f:
        content = f.read()
    # NOTE(security): eval() executes whatever the key file contains; keep the
    # file private and trusted.
    stored = eval(content)
    return stored['AV_API_KEY'], stored['NEWSAPI_KEY']
        
def encrypt(msg_text = b'message', secret_key='password'):
    """Encrypt ``msg_text`` with AES in ECB mode and return it base64-encoded.

    String arguments are converted to UTF-8 bytes and right-justified to 32
    bytes; bytes arguments are used as given. The encoded text is printed and
    returned as a str.
    """
    # base64 is not imported at module level in this file.
    import base64
    # NOTE(review): `AES` is not imported anywhere in the visible part of this
    # module - presumably Crypto.Cipher.AES (pycryptodome); confirm and import it.

    if isinstance(msg_text, str):
        msg_text = bytes(msg_text, encoding='utf-8').rjust(32)
    if isinstance(secret_key, str):
        secret_key = bytes(secret_key, encoding='utf-8').rjust(32)

    cipher = AES.new(secret_key, AES.MODE_ECB)
    encoded = base64.b64encode(cipher.encrypt(msg_text))
    ret = encoded.decode("utf-8")
    print(ret)
    return ret

def decrypt(encoded, secret_key='password'):
    """Decrypt a base64 string produced by encrypt() and return the plain text.

    A string ``secret_key`` is converted to UTF-8 bytes and right-justified to
    32 bytes. Surrounding whitespace is stripped from the decoded text.
    """
    # base64 is not imported at module level in this file.
    import base64
    # NOTE(review): `AES` is not imported anywhere in the visible part of this
    # module - presumably Crypto.Cipher.AES (pycryptodome); confirm and import it.

    if isinstance(secret_key, str):
        secret_key = bytes(secret_key, encoding='utf-8').rjust(32)

    cipher = AES.new(secret_key, AES.MODE_ECB)
    if isinstance(encoded, str):
        encoded = bytes(encoded, encoding='utf-8')
    decoded = cipher.decrypt(base64.b64decode(encoded))
    # The decrypted secret is deliberately not printed, so that keys do not
    # end up in notebook output or logs.
    return decoded.decode("utf-8").strip()

'''
Downloads the daily series for the given symbol and saves it to
daily_<symbol>.csv in the current directory
'''
def save_data(symbol, API_KEY="", check=True):
    """Download the full daily series of ``symbol`` into daily_<symbol>.csv.

    Parameters:
        symbol:  ticker symbol requested from Alpha Vantage.
        API_KEY: Alpha Vantage API key.
        check:   when True, an existing file modified less than 4 hours ago is
                 kept and nothing is downloaded.

    Returns:
        The downloaded DataFrame (columns timestamp, open, high, low, close,
        volume), or None when the file was recent enough or the download
        failed on every attempt.
    """
    from alpha_vantage.timeseries import TimeSeries

    outf = f'daily_{symbol}.csv'
    if (check and os.path.exists(outf)):
        dt = datetime.datetime.fromtimestamp(os.path.getmtime(outf))
        age = datetime.datetime.now() - dt
        hr = (age.days * 24 * 60 * 60 + age.seconds)//60//60
        if (hr < 4): #if it was created less than 4 hours ago
            print(f"{outf:22} exists, ... recently crested < 4 at {dt} ")
            return
    ts = TimeSeries(key=API_KEY, output_format='pandas')

    # Exactly maxAttempts tries; wait only when another try will follow.
    maxAttempts = 5
    data = None
    for attempt in range(1, maxAttempts + 1):
        try:
            print(f"Getting data for {symbol}")
            data, meta_data = ts.get_daily(symbol, outputsize='full')
            break
        except ValueError:
            if attempt < maxAttempts:
                print(f"Sleep for a Minute, {attempt}/{maxAttempts} attempts")
                time.sleep(60)

    if data is None:
        print(f"Could not get data for {symbol}")
        return data

    # Turn the index into a regular first column, then name all six columns.
    data.insert(0, 'timestamp', value=data.index)

    data.columns = 'timestamp,open,high,low,close,volume'.split(',')
    data.to_csv(outf, index=False)
    return data

'''
Reads every file matching daily_* in the current directory and returns them as a
dict keyed by symbol, together with the smallest row count and the last frame read
'''
def read_data():
    """Load every ``daily_*`` file of the current directory.

    The symbol is the part of the file name (extension removed) after the
    first underscore. Each frame is sorted by 'timestamp' in descending order,
    re-indexed from 0, and every column but the first is renamed to
    '<symbol>_<column>'.

    Returns:
        (frames, minrows, last): dict symbol -> DataFrame, the smallest row
        count among the frames, and the last frame read. When no file matches,
        returns ({}, 0, None).
    """
    frames = {}
    df = None
    for f in glob.glob('daily_*'):
        symbol = os.path.basename(os.path.splitext(f)[0]).split("_")[1]
        print(f'Reading {f} Symbol: {symbol}')
        df = pd.read_csv(f)
        df = df.sort_values(by='timestamp', ascending=False).reset_index(drop=True)
        df.columns = ['timestamp'] + [f'{symbol}_{c}' for c in df.columns[1:]]
        frames[symbol] = df

    # min() of an empty sequence raises; report 0 rows when nothing was read.
    minrows = min((len(d) for d in frames.values()), default=0)
    return frames, minrows, df

def combine_data(a, outp="stockdata.csv"):
    '''
    Combines all the dataframes into one

    The problem for us to join multiple index funds from ASIA is that they have
    different holidays. Therefore we have gaps in the trading days, and only the
    timestamps for which every frame has data are kept.

    Parameters:
        a:    dict symbol -> DataFrame, each with a 'timestamp' column.
        outp: path of the CSV file the combined frame is written to.

    Returns the combined frame, sorted by ascending timestamp.
    '''
    # Different exchanges have various holidays, therefore values may be missing.
    # Collect the union of all time stamps. The per-frame arrays usually have
    # different lengths, so they are concatenated directly instead of being
    # wrapped in one (ragged) array first.
    frames = list(a.values())
    stamps = np.concatenate([d['timestamp'].values for d in frames])
    af = pd.DataFrame()
    af['timestamp'] = list(set(stamps))

    for v in frames:
        af = pd.merge(af, v, how="left", on="timestamp")
    print()
    # Rows with a gap in any frame are dropped.
    af = af.sort_values(by='timestamp', ascending=True).dropna().reset_index(drop=True)
    af.to_csv(outp, index=False)
    return af

def addDummyCols(df):
    """Add random dummy columns to ``df`` in place and save it.

    'MSFT_+ve' gets random labels 'S__0'..'S__2' and 'MSFT_-ve' random 0/1
    values; each is inserted at position 1 unless it already exists. The frame
    is written to stockdata_ext.csv and returned (same object as ``df``).
    """
    rowCount = len(df)
    if "MSFT_+ve" not in df.columns:
        labels = [f"S__{k}" for k in np.random.randint(0,3, size=rowCount)]
        df.insert(1, "MSFT_+ve", value=labels)
    if "MSFT_-ve" not in df.columns:
        flags = [k for k in np.random.randint(0,2, size=rowCount)]
        df.insert(1, "MSFT_-ve", value=flags)
    df.to_csv("stockdata_ext.csv", index=False)
    return df


def getdata(symbs='MSFT GLD GOOGL SPX AAPL IBM' , dontforce=False):
    """Download the daily data of all configured symbols.

    Parameters:
        symbs:     whitespace-separated ticker symbols to download.
        dontforce: when True and data/stockdata.csv already exists, nothing is
                   downloaded.

    Besides ``symbs`` the hard-coded Asian symbols below are downloaded too.
    """
    stockfile="data/stockdata.csv"
    if (dontforce and os.path.exists(stockfile)):
        print(f"{stockfile} exists")
        return

    API_KEY = None  # Put your API KEY if you need to test data download or just use the data
    if not API_KEY:
        # getkey() returns a pair; only unpack it when no key was set above,
        # otherwise a plain key string would be unpacked character-wise.
        API_KEY, NEWS_API_KEY = getkey()

    #print("API_KEY ", API_KEY)
    for f in symbs.split():
        save_data(f, API_KEY=API_KEY)

    # Alternating "name symbol" pairs; a trailing name without symbol is ignored.
    ASIA='''TOKYO 6758.T HITACHI 6501.T HNGKNG 0168.HK SHANGAI 601288.SS SHNZEN'''
    s=ASIA.split()
    ASIA = dict(zip(s[0::2], s[1::2]))

    for k, v in ASIA.items():
        print(f'getting data for {k} => symbol {v}')
        save_data(v, API_KEY)

def getCombined():
    """Read all daily files, merge them into stockdata.csv and add dummy columns.

    Returns the combined frame (which addDummyCols extends in place).
    """
    frames, _minrows, _lastFrame = read_data()
    combined = combine_data(frames, "stockdata.csv")
    addDummyCols(combined)
    return combined
    
#-----------------------------------------------------------------------------------
#*** NOTE: DO NOT EDIT THIS FILE - THIS iS CREATED FROM: inv_utils.ipynb
def inJupyter():
    """Return True when running under IPython/Jupyter, False otherwise.

    The check looks up the ``get_ipython`` name; when it is not defined the
    lookup raises ``NameError``, which is the only error handled here.
    """
    try:
        get_ipython  # noqa: F821 - name lookup only, never called
    except NameError:
        return False
    return True
#-----------------------------------------------------------------------------------
# Script entry point: download and combine the data, reporting the elapsed
# time. Nothing runs automatically inside Jupyter.
if __name__ == '__main__' and not inJupyter():
    t1 = datetime.datetime.now()
    getdata()
    getCombined()
    t2 = datetime.datetime.now()
    print(f"All Done in {str(t2-t1)} ***")
# Manual alternative that was kept for use inside Jupyter:
#     a, minrows, ldf  = read_data()
#     stockfile="data/stockdata.csv"
#     nf= combine_data(a, stockfile)
#     addDummyCols(nf)

# Data Configure  - dataconfig.py

In [None]:
%%writefile  ~/bin/gen/dataconfig.py
#!/usr/local/bin/python 

import re, sys, os, datetime, getopt, glob, argparse, datetime, json, base64, pickle
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
sys.path.append(".")
sys.path.append("gen")
# sys.path entries are used verbatim, so "~" has to be expanded explicitly;
# an unexpanded "~/bin/gen" never matches a real directory.
sys.path.append(os.path.expanduser("~/bin/gen"))
import lstmutils1

#~~~~ Find any sensor highly correlated with time and drop them.
def detectTimeCorrelated(df, val=0.94):
    """Find the columns that are highly correlated with time.

    When ``df`` has no 'time' column its first column is renamed to 'time'
    (this modifies the caller's frame). A non-numeric time column is converted
    with pd.to_datetime. Every numeric column is then correlated with time,
    both directly and lagged by 0..7 rows; a column is reported as soon as one
    absolute correlation reaches ``val``.

    Parameters:
        df:  data frame with the time column first (or named 'time').
        val: absolute correlation threshold.

    Returns:
        list of the names of the time-correlated columns.
    """
    timeCorSensors = []
    if 'time' not in df.columns: #assume first column is time column
        df.columns = ['time'] + [c for c in df.columns[1:]]

    timeser = pd.Series(df[['time']].values.reshape(-1))
    # Comparing a dtype with np.number does not test for "numeric"; ask pandas.
    if not pd.api.types.is_numeric_dtype(timeser):
        timeser = pd.to_datetime(timeser).astype('int64')

    for sensor in df.columns:
        if sensor == 'time':
            continue
        sensorSeries = pd.Series(df[sensor].values.reshape(-1))
        # Correlation is undefined for text columns; skip them instead of
        # aborting the whole scan.
        if not pd.api.types.is_numeric_dtype(sensorSeries):
            continue
        for lag in range(8):
            c1 = timeser[lag:].corr(sensorSeries[lag:])
            # Series.corr aligns on the index, so slicing one side does not
            # produce a lag; shift() pairs time[k] with sensor[k - lag].
            c2 = timeser.corr(sensorSeries.shift(lag))
            if np.abs(c1) >= val or np.abs(c2) >= val:
                timeCorSensors.append(sensor)
                break

    if timeCorSensors:
        print(f"#Time Cor: #{len(timeCorSensors)}, #Shape before:{df.shape}")

    return timeCorSensors
#-----------------------------------------------------------------------------------
def precheck(df):
    """Return 1 when ``df`` has no object-dtype column, else warn and return 0."""
    cols = df.columns[df.dtypes.eq('object')]
    if len(cols) == 0:
        return 1
    print(f"WARNING: *** Non numeric columns => {cols}")
    return 0
#-----------------------------------------------------------------------------------
'Return the one-hot column names (prefixed with the source column name) for the given columns'
def makeOneHotCols(tf1, oheCols=[]):
    """List the one-hot column names the given columns would expand to.

    For every column ``c`` in ``oheCols`` and every level ``k`` produced by
    pd.get_dummies, the name is '<c>___<k>'. The data itself is not encoded.
    """
    names = []
    for column in oheCols:
        levels = pd.get_dummies(tf1[column]).columns
        names.extend(f'{column}___{level}' for level in levels)
    return names
#-----------------------------------------------------------------------------------
def inJupyter():
    """Return True when running under IPython/Jupyter, False otherwise.

    The check looks up the ``get_ipython`` name; when it is not defined the
    lookup raises ``NameError``, which is the only error handled here.
    """
    try:
        get_ipython  # noqa: F821 - name lookup only, never called
    except NameError:
        return False
    return True
#-----------------------------------------------------------------------------------
def detectCols(file, nUnique=4, tcoeff=0.92):
    """Classify the columns of a data set and render a configuration template.

    Parameters:
        file:    path of a CSV file (lines starting with '#' are comments) or
                 an already loaded DataFrame.
        nUnique: largest number of distinct values for a column to count as
                 categorical.
        tcoeff:  correlation threshold passed to detectTimeCorrelated.

    Returns:
        (text, frame): the configuration template wrapped in [START]/[END]
        markers, and the DataFrame that was analysed.
    """
    tf1 = file
    if (type(tf1) == str):
        tf1 = pd.read_csv(tf1, comment="#")

    #Lets check if it has any non-numeric columns! Warning
    precheck(tf1)

    unique_vals  = tf1.nunique()
    constantCols = unique_vals[ unique_vals == 1].index                             # constant Columns
    onehotECols  = unique_vals[(unique_vals > 2 ) & (unique_vals<=nUnique)].index   # Categorical Columns
    categorCols  = unique_vals[(unique_vals >=2 ) & (unique_vals <= nUnique)].index # Categorical Columns
    binaryCols   = unique_vals[(unique_vals == 2)].index                      # Binary

    numericCols  = tf1.select_dtypes(include=np.number).columns           # numerics
    numericCols  = [c for c in numericCols if c not in categorCols]
    numericCols  = [c for c in numericCols if c not in constantCols]
    numericCols  = [c for c in numericCols if c != 'time']
    notNumerics  = tf1.select_dtypes(exclude=np.number).columns           # non - numerics
    notNumerics  = [c for c in notNumerics if c not in categorCols]       # non - numerics

    try:
        timeCorCols  = detectTimeCorrelated(tf1, tcoeff)
    except Exception as exc:
        # Best effort: keep going without time-correlated columns, but say why
        # (on stderr, so the generated configuration on stdout stays clean).
        print(f"#WARNING: time correlation detection failed: {exc!r}", file=sys.stderr)
        timeCorCols = []

    onehotEC_ext = makeOneHotCols(tf1, onehotECols)

    ret1 =f'''[START]
{{
    "file"           : {[file] if (type(file) == str) else ["??"]},
    "nrowsXncols"    : {[len(tf1), len(tf1.columns )] }     , 
    "number_Unique"  : {nUnique}            , 
    "constantCols"   : {list(constantCols )},   # No Signals
    "#constantCols"  : {len(constantCols  )},   # No Signals
    "categorCols"    : {list(categorCols  )},   # Categorical Columns
    "#categorCols"   : {len(categorCols   )},   # Categorical Columns
    "onehotECols"    : {list(onehotECols  )},   # Cats > 2 and < Unique Values
    "onehotEC_ext"   : {list(onehotEC_ext )},   # Cats > 2 and < Unique Values
    "#onehotECols"   : {len(onehotECols   )},   # Cats > 2 and < Unique Values
    "binaryCols"     : {list(binaryCols   )},   # Binary
    "#binaryCols"    : {len(binaryCols    )},   # Binary
    "notNumerics"    : {list(notNumerics  )},
    "timeCorrelation": {tcoeff             },   # Time correlated
    "timeCorrCols"   : {list(timeCorCols  )},   # Time correlated Columns
    "#timeCorrCols"  : {len(timeCorCols   )},    # Time correlated Columns
    "excludePattern" : [] , #Exclude patterns
    "includePattern" : [] , #include patterns
    "dropColumns"    : [],
    "diff_suffix"    : {['__diff1']},
    "addDiffs"       : [],
    "train_pct"      : .9,
    "#numericCols"   : {len(numericCols   )},  
    "scaleInputs"    : {list(numericCols  )},  
    "scaleOutputs"   : {["$scaleInputs"]},  
    "inputs"         : {["$binaryCols", "$scaleInputs", "$onehotECols"]},
    "outputs"        : {["$scaleOutputs"]},
#-----Copy this generated file and add customization
    "loadModel"      : 1,
    "scale"          : 1,
    "scaler"         : ["sklearn.preprocessing.StandardScaler()"],
    "scaler"         : ["sklearn.preprocessing.MinMaxScaler()"],
    "scalerXString"  : [],
    "scalerYString"  : [],
    "tsParams"       : {{"length": 60, "batch_size": 1, "stride": 1, "sampling_rate": 1}},
    "lookahead"      : 60,
    "nsteps"         : 1,
    "modelFile"      : ["lstm.56.h5"],
    "monitor"        : "val_loss",
    "modelName"      : "gen.somemodels.SimpleModel1(50, 5, 1, **{{}})"
}}
[END]
    '''

    return ret1, tf1;
#-----------------------------------------------------------------------------------

    
def process():
    """Generate and print the configuration template for the first input file.

    Reads the input files, unique-value limit and time-correlation threshold
    from the module-level ``sysargs``. Only the first file is processed.
    Returns the generated template text.
    """
    files = sysargs.input_files
    total = len(files)
    un, tc = sysargs.unique, sysargs.tcoeff
    for position, file1 in enumerate(files, start=1):
        print(f"#=>Processing {position}/{total} {file1} #unique: {un} tcoeff: {tc} - standby")
        outs, df = detectCols(file1, un, tc)
        break  # only the first file is handled
    print(outs)
    return outs
    
#-----------------------------------------------------------------------------------
sysargs=None
def addargs():
    """Parse the command line (sys.argv) and return the argparse namespace.

    Options: -u/--unique (int, default 6), -t/--tcoeff (float, default 0.94),
    followed by the positional input file(s).
    """
    parser = argparse.ArgumentParser(f"{os.path.basename(sys.argv[0])}:")
    parser.add_argument('-u', '--unique', type=int,   default=6,    help="# of unique values!")
    parser.add_argument('-t', '--tcoeff', type=float, default=0.94, help="# Time Correlation value Sensors!")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    parser.add_argument('input_files',action="store", type=str, nargs='+', help="input file(s)")

    parsed = None
    try:
        parsed = parser.parse_args(sys.argv[1:])
    except argparse.ArgumentError as exc:
        print(exc.message )

    return parsed
#-----------------------------------------------------------------------------------
# Script entry point: parse the arguments, generate the configuration and
# report the elapsed time. Nothing runs automatically inside Jupyter.
if __name__ == '__main__' and not inJupyter():
    t1 = datetime.datetime.now()
    sysargs = addargs()
    process()
    t2 = datetime.datetime.now()
    print(f"#All Done in {str(t2-t1)} ***")

In [None]:
# Scratch cell. NOTE: `f` is assigned twice, so only the second path
# (daily_MSFT.csv) is in effect when the file is read below.
f='/opt/SCHAS/NNBook/notebooks/NNetworks/LSTM/data/stockdata_ext.csv'
#r=detectCols(f)
#pp=r[0]

f='/opt/SCHAS/NNBook/notebooks/NNetworks/LSTM/data/daily_MSFT.csv'
# Last expression of the cell: the loaded frame is displayed as the cell output.
# NOTE(review): `pd` is not imported in this cell - presumably it comes from
# `%run basic.ipynb`; confirm.
pd.read_csv(f)

# Data Prepare

In [None]:
%%writefile  ~/bin/gen/dataprepare.py
#!/usr/local/bin/python 

import re, sklearn, sys, os, datetime, getopt, glob, argparse, datetime, json, base64, pickle
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

sys.path.append(".")
sys.path.append("gen")
# sys.path entries are used verbatim, so "~" has to be expanded explicitly;
# an unexpanded "~/bin/gen" never matches a real directory.
sys.path.append(os.path.expanduser("~/bin/gen"))
import lstmutils1;

'''
Make sure the data is sorted in ascending order of time; the LSTM and all of these
data preparation tools rely on it.
'''


'Convert to one-hot encoding, using the source column name as prefix for the new columns'
def makeOneHot(tf1, oheCols=[]):
    """One-hot encode the given columns of ``tf1``.

    Every column ``c`` in ``oheCols`` is expanded with pd.get_dummies and the
    resulting columns are named '<c>___<level>'.

    Returns a DataFrame holding only the encoded columns (an empty frame when
    ``oheCols`` is empty).
    """
    pieces = []
    for c in oheCols:
        one_hot = pd.get_dummies(tf1[c])
        one_hot.columns = [f'{c}___{k}' for k in one_hot.columns]
        pieces.append(one_hot)

    if not pieces:
        return pd.DataFrame()
    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(pieces, axis=1)

'''
Assuming the tf1 is sorted in ascending order of time
'''
def addDiff(tf1, col):
    """Add first-difference columns to ``tf1`` in place.

    ``col`` is a column name or a list of names. For each existing column
    ``c`` a new column '<c>___diff1' holding ``c - c.shift(1)`` is added;
    unknown names only produce a warning. Returns ``tf1``.
    """
    names = [col] if type(col) == str else col
    for name in names:
        if name in tf1.columns:
            print(f"+++ Adding {name}")
            previous = tf1[name].shift(1)
            tf1[f'{name}___diff1'] = tf1[name] - previous
        else:
            print(f"*WARNING* Column {name} Not FOUND")
    return tf1

#-----------------------------------------------------------------------------------
#*** NOTE: DO NOT EDIT THIS FILE - THIS iS CREATED FROM: inv_utils.ipynb
def inJupyter():
    """Return True when running under IPython/Jupyter, False otherwise.

    The check looks up the ``get_ipython`` name; when it is not defined the
    lookup raises ``NameError``, which is the only error handled here.
    """
    try:
        get_ipython  # noqa: F821 - name lookup only, never called
    except NameError:
        return False
    return True

#-----------------------------------------------------------------------------------
def formatConfig(out: dict):
    """Render ``out`` as configuration text wrapped in [START]/[END] markers.

    Each entry becomes one line with the double-quoted key right-aligned to 20
    characters; string values are wrapped in single quotes, other values are
    formatted as-is. A final '"end": 0' entry closes the dictionary.
    """
    rows = []
    for name, value in out.items():
        quotedKey = f'"{name}"'
        shown = f"'{value}'" if type(value) == str else value
        rows.append(f'{quotedKey:>20}: {shown},\n')

    return "[START]\n{\n" + "".join(rows) + '"end": 0 \n}\n[END]\n'

#-----------------------------------------------------------------------------------
def getFinalColumns(df, conf):
    """Return the sorted, de-duplicated union of the configured inputs and outputs.

    Asserts that every such column exists in ``df``.
    """
    wanted = set(lstmutils1.getConfigList(conf, "inputs"))
    wanted |= set(lstmutils1.getConfigList(conf, "outputs"))

    assert wanted <= set(df.columns), "Hmmm ... columns missing"
    return sorted(wanted)
#-----------------------------------------------------------------------------------
'''
This will scale the numeric columns - if this changes - you need to use dataprep
'''
def _fitOrLoadScaler(conf, key, frame, trnCnt):
    """Return the scaler stored under ``conf[key]``; fit and store one if absent."""
    stored = conf.get(key)
    if stored:
        # NOTE(security): unpickling runs code embedded in the payload; only
        # use configuration from a trusted source.
        return pickle.loads(base64.b64decode(stored[0]), fix_imports=True)

    import copy
    spec = conf.get('scaler', ["sklearn.preprocessing.MinMaxScaler()"])[0]
    # A string is evaluated into a new scaler (NOTE(security): eval of
    # configuration text). A ready-made object is copied so that the X and Y
    # scalers never share, and overwrite, one fitted instance.
    scaler = eval(spec) if isinstance(spec, str) else copy.deepcopy(spec)
    scaler = scaler.fit(frame[:trnCnt])    # fit on the training rows only
    blob = base64.b64encode(pickle.dumps(scaler, protocol=None, fix_imports=True))
    conf[key] = [blob.decode("utf-8")]
    return scaler

def scaleNumerics(df, conf={}):
    """Scale the configured numeric input and output columns of ``df``.

    Parameters:
        df:   data frame containing the 'scaleInputs' and 'scaleOutputs' columns.
        conf: configuration dict. When 'scale' is truthy it is updated with
              'train_pct', 'train_count' and, for newly fitted scalers, the
              serialized 'scalerXString' / 'scalerYString'.

    Returns:
        (inputs, outputs, scalerX, scalerY). Without scaling the frames are
        the plain column selections and both scalers are None. With scaling
        the frames hold the transformed values; scalers are restored from the
        configuration when present, otherwise fitted on the first
        ``train_pct`` share of the rows.
    """
    scaleInputCols = lstmutils1.getConfigList(conf, 'scaleInputs')
    scaleOuputCols = lstmutils1.getConfigList(conf, 'scaleOutputs')

    dfninps = df[ scaleInputCols ]
    dfnouts = df[ scaleOuputCols ]

    if not conf.get('scale', 0):
        return dfninps, dfnouts, None, None

    trnPct = conf.get('train_pct', 0.9)
    trnCnt = int(len(df) * trnPct)
    conf["train_pct"   ] = trnPct
    conf["train_count" ] = trnCnt

    scalerX = _fitOrLoadScaler(conf, "scalerXString", dfninps, trnCnt)
    scalerY = _fitOrLoadScaler(conf, "scalerYString", dfnouts, trnCnt)

    dfninpsn = pd.DataFrame(scalerX.transform(dfninps), columns=scaleInputCols )
    dfnoutsn = pd.DataFrame(scalerY.transform(dfnouts), columns=scaleOuputCols )

    return dfninpsn, dfnoutsn, scalerX, scalerY
#-----------------------------------------------------------------------------------
def process(config, input_files, output=None):
    """Prepare the first input file for training according to the configuration.

    Steps: drop the configured columns, add the configured diff columns (and
    drop the rows this leaves incomplete), one-hot encode, select the first
    column plus the configured inputs/outputs, and scale the numeric columns.

    Parameters:
        config:      glob pattern of the configuration file(s).
        input_files: list of CSV files; only the first one is processed.
        output:      when not None, the un-normalized and normalized frames
                     are written next to the current directory as
                     '<name>_Orig_<i><ext>' and '<name>_Norm_<i><ext>' and
                     recorded in the configuration.

    Returns:
        (conf, dfunNormalized, dfNormalized). The updated configuration is
        also printed in [START]/[END] format.
    """
    conf = lstmutils1.getconfig(config)

    n  = len(input_files)
    for i, file1 in enumerate(input_files):
        print(f"=>Processing {i+1}/{n} {file1} - standby")
        df = pd.read_csv(file1, comment='#')

        # columns= is required: without it drop() looks for row labels and,
        # with errors="ignore", silently removes nothing.
        drps = lstmutils1.getConfigList(conf, "dropColumns")
        df.drop(columns=drps, inplace=True, errors="ignore")

        # STEP 1: Add diffs
        cdiffs  = lstmutils1.getConfigList(conf, 'addDiffs')
        addDiff(df, cdiffs)
        df.dropna(inplace=True)
        df.reset_index(inplace=True, drop=True)

        # STEP 2: => One hot encode
        ohecols = lstmutils1.getConfigList(conf, 'onehotECols')
        if len(ohecols) > 0:
            # Use the resolved list so "$reference" entries are expanded.
            ohe = makeOneHot(df, ohecols)
            df = pd.concat([df, ohe], axis=1)

        # STEP 3: Select the final columns and scale the numeric ones
        allCols = [df.columns[0]] + getFinalColumns(df,conf)
        dfunNormalized = df[allCols]
        dfiNorm, dfoNorm, sX, sY = scaleNumerics(dfunNormalized, conf)

        dfNormalized = dfunNormalized.copy()
        dfNormalized[dfiNorm.columns] = dfiNorm
        dfNormalized[dfoNorm.columns] = dfoNorm

        #FINALLY
        if (output is not None):
            fi,ext = os.path.splitext(file1)
            nfu  = f'{os.path.basename(fi)}_Orig_{i}{ext}'
            print(f"writing unnormalized to: {nfu}")
            dfunNormalized.to_csv(nfu, index=False)

            nfn  = f'{os.path.basename(fi)}_Norm_{i}{ext}'
            print(f"writing normalized to. : {nfn}")
            dfNormalized.to_csv  (nfn, index=False)

            conf['normalizedFile']   = nfn
            conf['unnormalizedFile'] = nfu

        break  # only the first file is handled

    outj = formatConfig(conf)
    print(outj)

    return conf, dfunNormalized, dfNormalized
#-----------------------------------------------------------------------------------
sysargs=None
def addargs():
    """Parse the command line (sys.argv) and return the argparse namespace.

    Options: -c/--config (default "config.txt"), -o/--output (default 0),
    followed by the positional input file(s).
    """
    parser = argparse.ArgumentParser(f"{os.path.basename(sys.argv[0])}:")
    parser.add_argument('-c', '--config', type=str, default="config.txt", help="Config Files")
    parser.add_argument('-o', '--output', type=str, default=0, help="output file")
    parser.add_argument('args', nargs=argparse.REMAINDER)
    parser.add_argument('input_files',action="store", type=str, nargs='+', help="input file(s)")

    parsed = None
    try:
        parsed = parser.parse_args(sys.argv[1:])
    except argparse.ArgumentError as exc:
        print(exc.message )

    return parsed
    
#-----------------------------------------------------------------------------------
# Script entry point: parse the arguments, prepare the data and report the
# elapsed time. Nothing runs automatically inside Jupyter.
if __name__ == '__main__' and not inJupyter():
    t1 = datetime.datetime.now()
    sysargs = addargs()
    ret = process(sysargs.config, sysargs.input_files, sysargs.output)
    t2 = datetime.datetime.now()
    print(f"#All Done in {str(t2-t1)} ***")

In [None]:
# Scratch cell. NOTE: `f` is assigned twice, so only the second path
# (daily_MSFT.csv) is passed to process() below.
f='/opt/SCHAS/NNBook/notebooks/NNetworks/LSTM/data/stockdata_ext.csv'
f='/opt/SCHAS/NNBook/notebooks/NNetworks/LSTM/data/daily_MSFT.csv'
# NOTE(review): `process` is not defined in this cell; the call matches the
# (config, input_files, output) signature written to dataprepare.py - confirm
# how it gets into the kernel namespace.
conf, dfUnNormalized, dfNormalized = process('config.*', [f], "out.csv")

# Custom Callback for Saving and Loading Models

In [None]:
%%writefile  ~/bin/gen/ccallbacks.py
#!/usr/local/bin/python 

import matplotlib.pyplot as plt
import re, sys, os, datetime, glob, json, base64, pickle, sklearn
import pandas as pd
import numpy as np

import keras
from keras.models import Model
from keras.models import load_model
from keras.callbacks import Callback
import IPython
from IPython.display import display

sys.path.append(".")
sys.path.append("gen")
sys.path.append("/opt/LMCO/git/bin/gen")
import lstmutils1;
import sklearn.metrics

class ModelCheckAndLoad(Callback):
    """Checkpoint callback that also persists (and can restore) its own state.

    Whenever the monitored metric drops below the best value seen so far the
    model is saved to ``filepath`` and the callback state (best value, epoch
    counters, metric history, monitor name) is pickled to ``filepath + "_ext"``.
    ``load_ext`` restores both, so training can resume across sessions.

    Args:
        filepath: path of the best-model file; ``_latest`` and ``_ext`` suffixes
            are appended for the latest-model and state files.
        monitor:  key looked up in the epoch ``logs`` (lower is better).
        best:     starting best value; a falsy value (None, 0) means ``np.inf``.
        stop_at:  when true, request the end of training as soon as an
            improvement has been saved.
        verbose:  > 0 prints a line for epochs that did not improve.
        drawLoss: when true, plot the loss curves after every epoch.
    """
    def __init__(self, filepath, monitor='val_loss', best=np.inf, 
                 stop_at=False, verbose=0, drawLoss=False):
        # super(Callback, self) would skip Callback.__init__ entirely.
        super().__init__()
        self.monitor   = monitor
        self.filepath  = filepath
        self.verbose   = verbose
        self.best      = best or np.inf
        self.stop_at   = stop_at
        self.history   = {}    # metric name -> list of per-epoch values
        self.epochs    = []    # epoch indices received in this session
        self.drawLoss  = drawLoss
        self.epochNum  = 0     # total epochs seen (restored by load_ext)
        self.bestEpoch = 0     # value of epochNum when `best` was recorded
        self.numSaved  = 0     # number of improvements saved in this session
        
    def save_ext(self):
        """Pickle the callback state to ``<filepath>_ext``."""
        ef = self.filepath+"_ext"
        myParams = {
            'best'     : self.best,
            'bestEpoch': self.bestEpoch,
            'epochNum' : self.epochNum,
            'history'  : self.history,
            'monitor'  : self.monitor
        }
        with open(ef, "wb") as f:
            pickle.dump(myParams, f, protocol=pickle.HIGHEST_PROTOCOL)
                
    def save_latest(self):
        """Save the current model to ``<filepath>_latest`` plus the callback state."""
        self.model.save(self.filepath+"_latest", overwrite=True)
        self.save_ext()
            
    def load_ext(self):
        """Load a saved model and restore the callback state.

        ``<filepath>_latest`` is preferred over ``<filepath>``.

        Returns:
            The loaded model, or None when neither model file exists.
        """
        ret = None
        if ( os.path.exists(self.filepath+"_latest")):
            ret = load_model(self.filepath+"_latest")
            print("Loading from the latest:...")
        elif ( os.path.exists(self.filepath)):
            ret = load_model(self.filepath)
        
        ef = self.filepath+"_ext"
        if ( not os.path.exists(ef) or os.path.getsize(ef) <= 0):
            return ret
        
        # SECURITY: pickle.load executes arbitrary code from the file; only use
        # state files that this class wrote itself.
        with open(ef, "rb") as f:
            myParams       = pickle.load(f)
        self.best      = myParams.get('best'     , np.inf)
        self.bestEpoch = myParams.get('bestEpoch', 0)
        self.epochNum  = myParams.get('epochNum' , 0)
        self.history   = myParams.get('history'  , {})
        self.monitor   = myParams.get('monitor'  , "val_loss")
            
        print(f"Best Loaded {self.best} occurred at epoch: {self.bestEpoch} (epochs run: {self.epochNum})")
        return ret

    def drawLosses(self):
        """Plot 'loss' (left axis) and 'val_loss' (right axis) and mark the best value."""
        history, best = self.history, self.best
        #IPython.display.clear_output(wait=True)

        # No plt.clf() here: with no figure open it would create an extra empty one.
        fig, ax1 = plt.subplots()
        i, colors, marks = 0, "rgbcmykw", "v.xo+"

        color = colors[i]
        ax1.set_xlabel('epochs')
        k, v = "loss", history.get('loss', [])
        ax1.set_ylabel(k, color=color)
        l1= ax1.plot(v, color=color, marker=marks[i], label=f"{k}")

        ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis

        i +=1
        k, v = "val_loss", history.get('val_loss', [])   # absent without validation data
        color = colors[i]
        ax2.set_ylabel(k, color=color)
        l2 = ax2.plot(v, color=color, marker=marks[i], label=f"{k}")

        fig.tight_layout()  # otherwise the right y-label is slightly clipped
        # Mark the best value at the (0-based) position of the epoch that produced it.
        l3 = ax2.plot(max(self.bestEpoch - 1, 0), best, marker="o",  c="b", label=f"BEST: {best}")
        ax1.grid()

        lns  = l1 + l2 + l3
        labs = [l.get_label() for l in lns]
        plt.legend(lns, labs, loc=0)
        plt.show()
        plt.close(fig)   # one figure per epoch would otherwise accumulate
        
    def on_epoch_end(self, epoch, logs=None):
        """Record the epoch's metrics and checkpoint the model when it improved."""
        import warnings   # not imported at module level in this file

        logs = logs or {}
        self.epochs.append(epoch)
        self.epochNum += 1
        
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
            
        self.current = logs.get(self.monitor)
        if self.current is None:
            warnings.warn(f'Can save best model only with {self.monitor} available')
            return
                    
        if (self.best > self.current):
            ou= f'{self.monitor}: {self.best} > {self.current}\n'
            print(f"Epoch: {epoch+1} Saving: {ou}")
            
            self.numSaved += 1
            self.bestEpoch = self.epochNum
            self.best      = self.current
            self.model.save(self.filepath, overwrite=True)
            self.save_ext()
            # Only ever *request* a stop; assigning False here would cancel a
            # stop requested by another callback (e.g. EarlyStopping).
            if self.stop_at:
                self.model.stop_training = True
        elif self.verbose > 0:
            ou= f'{self.monitor}: {self.best} <= {self.current}'
            print(f"{epoch+1} din't improve : {ou} from {self.bestEpoch}\r", end="")
            
        if (self.drawLoss):
            self.drawLosses()


# Definition of Some Model Architectures

In [None]:
%%writefile  ~/bin/gen/somemodels.py
#!/usr/local/bin/python 

import tensorflow as tf
import keras
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate
from keras.layers import RepeatVector, TimeDistributed
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers

def SimpleModel1(history, nfeatures, nOut, **kwargs) :
    """Build and compile a single-LSTM regression model.

    Architecture: LSTM(50) -> Dropout(0.2) -> Dense(64) -> sigmoid ->
    Dense(nOut) -> linear, compiled with Adam (learning rate 0.0005) and MSE.

    Args:
        history:   number of time steps per input window.
        nfeatures: number of features per time step.
        nOut:      number of output values.
        **kwargs:  accepted and ignored.

    Returns:
        The compiled keras Model.
    """
    lstm_input = Input(shape=(history, nfeatures), name='lstm_input')
    x = LSTM(50, name='lstm_0')(lstm_input)
    x = Dropout(0.2, name='lstm_dropout_0')(x)
    x = Dense(64, name='dense_0')(x)
    x = Activation('sigmoid', name='sigmoid_0')(x)
    x = Dense(nOut, name='dense_1')(x)
    output = Activation('linear', name='linear_output')(x)

    model = Model(inputs=lstm_input, outputs=output)
    # `lr` is the deprecated spelling and is rejected by recent Keras releases.
    adam = optimizers.Adam(learning_rate=0.0005)
    model.compile(optimizer=adam, loss='mse')
    
    return model

def SimpleModel2(inps, inshape, units2=None, nsteps=1, opt="adam", loss="mse", bi=False, dropout=None):
    """Build and compile a Sequential LSTM model with one or two recurrent layers.

    Args:
        inps:    units of the first LSTM layer.
        inshape: input shape passed to the first LSTM; indexed as [0] and [1]
                 in the log line below.
        units2:  units of an optional second LSTM (relu activation); None skips it.
        nsteps:  units of the final Dense layer.
        opt:     optimizer handed to compile().
        loss:    loss handed to compile().
        bi:      wrap the first LSTM in Bidirectional when truthy.
        dropout: rate of an optional Dropout layer before the Dense layer.

    Returns:
        The compiled keras Sequential model.
    """
    s= inshape
    print(locals())
    print(f"Creating LSTM: inuts= {inps} time-steps: {s[0]}, features: {s[1]} #out: {nsteps}")
    m = keras.models.Sequential()

    # The first LSTM must emit full sequences only when a second LSTM follows it.
    stacked = units2 is not None
    first_lstm = keras.layers.LSTM(inps, return_sequences=stacked, input_shape=s)
    m.add(keras.layers.Bidirectional(first_lstm) if bi else first_lstm)

    # Optional tail layers, in order: second LSTM, dropout.
    if stacked:
        m.add(keras.layers.LSTM(units2, activation='relu'))
    if dropout is not None:
        m.add(keras.layers.Dropout(dropout))

    m.add(keras.layers.Dense(nsteps))
    m.compile(optimizer=opt, loss=loss)
    return m

def UberModel(lookBack, nFeatures, lstm_IPDim=256, lstm_OPDim=1, opt=None, loss="mse",  drop=0.3):
    """Build and compile an LSTM encoder/decoder whose decoded sequence is fed,
    together with the (dropped-out) raw input, into a final LSTM predictor.

    Args:
        lookBack:   number of time steps per input window.
        nFeatures:  number of features per time step.
        lstm_IPDim: units of the outer LSTM layers (inner ones use half of it).
        lstm_OPDim: units of the decoder's per-step Dense layer and of the output.
        opt:        optimizer; defaults to Adam with learning rate 0.0005.
        loss:       loss handed to compile().
        drop:       dropout rate applied to the raw input before concatenation.

    Returns:
        The compiled keras Model.
    """
    # `lr` is the deprecated spelling and is rejected by recent Keras releases.
    opt        = opt or optimizers.Adam(learning_rate=0.0005)
    k_rrizer   = None   # placeholder: kernel regularizer of the final LSTM
    r_rrizer   = None   # placeholder: recurrent regularizer of the final LSTM
    half_dim   = int(lstm_IPDim/2)

    # Encoder: two stacked LSTMs reduce the window to a single vector.
    input_layer  = Input(shape=(lookBack, nFeatures), dtype='float32', name='input')
    memory_layer = LSTM( lstm_IPDim, return_sequences=True, name="memory1")(input_layer)
    memory_layer = LSTM (half_dim, return_sequences=False, name="memory2")(memory_layer)

    # Decoder: repeat the vector lookBack times and expand it back to a sequence.
    repeated     = RepeatVector(lookBack)(memory_layer)
    memory_layer = LSTM (half_dim, return_sequences=True, name="first1out")(repeated)
    memory_layer = LSTM (lstm_IPDim,  return_sequences=True, name="first2out")(memory_layer)
    decoded_inputs = TimeDistributed(Dense(units=lstm_OPDim, activation='linear'))( memory_layer)

    #  Try spatial dropout?
    dropout_input = Dropout(drop)(input_layer)
    concat_layer  = concatenate([dropout_input, decoded_inputs])

    # Predictor: one LSTM over the concatenated sequence, then a linear read-out.
    memory_layer = LSTM (units=lstm_IPDim, 
                             kernel_regularizer = k_rrizer, 
                             recurrent_regularizer = r_rrizer, 
                             return_sequences=False)(concat_layer)
    preds = Dense(units=lstm_OPDim, activation='linear')(memory_layer)

    model1 = Model(input_layer, preds)
    model1.compile(optimizer = opt, loss= loss)             

    return model1

# Fit Function

In [None]:
%%writefile  ~/bin/gen/lstmfit.py
#!/usr/local/bin/python 

import matplotlib.pyplot as plt

import re, sys, os, datetime, getopt, glob, argparse, json, base64, pickle
import pandas as pd
from sklearn import preprocessing
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (16, 5)
mpl.rcParams['axes.grid'] = False

import tensorflow as tf
tf.random.set_seed(13)
pd.options.display.max_rows = 8

import keras
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers
import numpy as np
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, Callback

sys.path.append(".")
sys.path.append("gen")
# sys.path entries are used verbatim: a literal "~" is never expanded by the
# import system, so the home-relative directories must be expanded explicitly.
sys.path.append(os.path.expanduser("~/bin/gen"))
sys.path.append(os.path.expanduser("~/bin"))
import lstmutils1;
import ccallbacks

np.random.seed(4)

def getConf(cfile = "myconfig"):
    """Load the configuration and the two data files it points to.

    Args:
        cfile: configuration file name/pattern handed to lstmutils1.getconfig.

    Returns:
        (conf, unnormdf, normeddf, inputs, ouputs): the configuration dict, the
        un-normalized and normalized DataFrames, and the input/output name lists
        resolved with lstmutils1.getConfigList.
    """
    conf = lstmutils1.getconfig(cfile)
    trnFile, orgFile = conf['normalizedFile'], conf['unnormalizedFile']

    # When the normalized file is not found as written, look for both files in
    # the directory of the configuration file instead.
    ddir = ""
    if not os.path.exists(trnFile):
        ddir = os.path.dirname(os.path.abspath(cfile))
        trnFile, orgFile = f'{ddir}/{trnFile}', f'{ddir}/{orgFile}'

    normeddf, unnormdf = pd.read_csv(trnFile), pd.read_csv(orgFile)
    inputs = lstmutils1.getConfigList(conf, 'inputs')
    ouputs = lstmutils1.getConfigList(conf, 'outputs')

    # An explicit 'train_count' wins over the 'train_pct' fraction (default 90%).
    train_pct   = conf.get('train_pct', 0.9)
    train_count = conf.get('train_count', int(len(normeddf) * train_pct) )

    print(f'''
    TrnFile: {trnFile},
    I/P    : {inputs[0:4]} ...
    O/P    : {ouputs[0:4]} ...
    Shape  : {normeddf.shape}
    trnCnt : {train_count}"
    ''')

    return conf, unnormdf, normeddf, inputs, ouputs

def getGenerators(conf, normeddf, inputs, ouputs):
    """Create the training / validation / full-data TimeseriesGenerators.

    Args:
        conf:     configuration dict; reads 'modelFile', 'tsParams', 'lookahead'
                  and optionally 'train_pct' / 'train_count'.
        normeddf: normalized DataFrame holding the input and output columns.
        inputs:   names of the input columns.
        ouputs:   names of the output columns.

    Returns:
        (modelFile, history, lookahead, trng1, valg1, valg2, X, y) where trng1 and
        valg1 cover the train and test split, valg2 covers all rows with
        batch_size forced to 1, and X / y are the aligned arrays.
    """
    modelFile = conf['modelFile'] or "models/simpleModel.h5"
    tsParams  = conf['tsParams']
    lookahead = conf['lookahead']
    history   = tsParams['length']
    train_pct = conf.get('train_pct', 0.9)
    train_count = conf.get('train_count', int(len(normeddf) * train_pct) )

    # Shift the targets so that row t of X lines up with row t+lookahead-1 of y;
    # `or None` keeps every row of X when lookahead == 1.
    x_stop = (-lookahead+1) or None
    X = normeddf[inputs].values[:x_stop]
    y = normeddf[ouputs].values[lookahead-1:]

    # Chronological split: the first train_count rows train, the rest validate.
    head, tail = slice(None, train_count), slice(train_count, None)
    Xtrn, ytrn = X[head], y[head]
    Xtst, ytst = X[tail], y[tail]

    batched_params = tsParams.copy()
    single_params  = tsParams.copy()
    single_params['batch_size'] = 1

    trng1 = TimeseriesGenerator(Xtrn, ytrn, **batched_params)
    valg1 = TimeseriesGenerator(Xtst, ytst, **batched_params)
    valg2 = TimeseriesGenerator(X, y, **single_params)

    return modelFile, history, lookahead, trng1, valg1, valg2, X, y

def getModel(conf):
    """Create the checkpoint callback and obtain a model (loaded or freshly built).

    Args:
        conf: configuration dict. Reads 'modelName' (a Python expression such as
              ``module.Factory(...)`` whose first dotted component is imported),
              'tsParams', and optionally 'modelFile' (first element is used),
              'loadModel' and 'monitor'.

    Returns:
        (model, callback): the model and the ccallbacks.ModelCheckAndLoad instance.

    Raises:
        ImportError: when the module named by 'modelName' cannot be imported.
    """
    import importlib

    modelFile = conf.get('modelFile', ["model.h5"])[0]
    modelName = conf['modelName']
    loadModel = conf.get('loadModel', 1)
    tsParams  = conf['tsParams']
    monitor   = conf.get('monitor', "val_loss")
    
    importName= modelName.split(".")[0]

    # exec("import ...") followed by a bare eval() relied on the import leaking
    # through the function's locals(), which no longer happens on Python 3.13+.
    # Import explicitly and hand eval() an explicit namespace instead.
    modelModule = importlib.import_module(importName)

    mcp= ccallbacks.ModelCheckAndLoad(modelFile, monitor, best=np.inf, stop_at=False, verbose=1)
    m1 = None
    if ( loadModel and os.path.exists(modelFile)):
        m1 = mcp.load_ext()
    if ( m1 is None):
        # The expression may refer to module globals and to the locals above
        # (e.g. tsParams), exactly as with the former bare eval().
        # SECURITY: 'modelName' is executed as code; only use trusted config files.
        evalScope = dict(globals())
        evalScope.update(locals())
        evalScope[importName] = modelModule
        m1=eval(modelName, evalScope)

    print(f'''
    import: {importName}
    using : {modelName}
    saved : {modelFile}
    reload: {loadModel}, exists: {os.path.exists(modelFile)},best: {mcp.best}
    ''')
        
    return m1, mcp

#-----------------------------------------------------------------------------------
def predictyh(valg, modelFile="", model=None  ):
    """Run predict() on `valg`.

    Uses `model` when it is truthy, otherwise loads the model from `modelFile`.

    Returns:
        (predictions, model actually used)
    """
    if not model:
        model = keras.models.load_model(modelFile)
    return model.predict(valg), model

#-----------------------------------------------------------------------------------
def printModelMetrics(columns, yy1, yh1):
    """Format per-output fit metrics as fixed-width text lines.

    Args:
        columns: column labels; output ``i`` is labelled ``columns[i+1]``
                 (NOTE(review): this presumes column 0 is not an output - confirm).
        yy1:     2-D array of true values, one column per output.
        yh1:     2-D array of predictions with the same layout.

    Returns:
        list of str: a header line followed by one line per output holding the
        label, R^2, fit (1 - sqrt(SS_res/SS_tot), floored at 0), and the max,
        min and standard deviation of the residuals.
    """
    mts = ["column                       r2            fit    maxYd    minYd    stdYd"]
    for i in range(yh1.shape[-1]):
        yyy = yy1[:,i]
        yyh = yh1[:,i]
        col = columns[i+1]

        yd = yyy - yyh
        yym= np.mean(yyy)
        nm = np.sum(yd ** 2)               # residual sum of squares
        tss= np.sum((yyy - yym)**2)        # total sum of squares

        # R^2 computed directly: `sklearn` is never bound in this module (only
        # `preprocessing` is imported from it), so sklearn.metrics.r2_score raised
        # NameError.  Same convention as r2_score for a constant target:
        # 1.0 for a perfect prediction, 0.0 otherwise.
        if tss > 0:
            r2v = 1.0 - nm / tss
        else:
            r2v = 1.0 if nm == 0 else 0.0

        dn = tss+0.000000000001            # epsilon avoids division by zero
        fit= 1 - np.sqrt(nm/dn)

        mxd = max(abs(yd))
        mid = min(abs(yd))
        syd = np.std(yd)

        a = f"{col:24} {r2v:>9.4} {max(0.,fit):>11.5} {mxd:8.3} {mid:8.3} {syd:10.6}"
        mts.append(a)
    return mts

#-----------------------------------------------------------------------------------
def fit(model, trng1, valg1, mcpoint,validation_steps=50, vv =0, ep = 1, spe =200 ):
    """Train `model` on `trng1`, validating on `valg1`, with `mcpoint` as callback.

    Args:
        model:            object exposing a keras-style fit().
        trng1:            training data generator.
        valg1:            validation data generator.
        mcpoint:          callback passed to fit() (as a one-element list).
        validation_steps: validation batches evaluated per epoch.
        vv:               verbosity passed to fit().
        ep:               number of epochs.
        spe:              steps per epoch.

    Returns:
        Whatever model.fit() returns (the training History for keras models).
        Previously nothing was returned, so callers always received None.
    """
    return model.fit(trng1, verbose=vv, epochs=ep, validation_data=valg1,
                     steps_per_epoch=spe, shuffle=True,
                     validation_steps=validation_steps, callbacks=[mcpoint])

#-----------------------------------------------------------------------------------
def main(config=None, epochs=0):
    """Load configuration and data, build the generators and the model, and
    optionally train.

    Args:
        config: configuration file name/pattern handed to getConf.
        epochs: number of epochs to train; training is skipped unless > 0.

    Returns:
        (conf, unnormdf, normeddf, inps, oups, model, mcpoint, trng1, valg1,
         valg2, X, y, losses) - `losses` is fit()'s result, or None when no
        training took place.
    """
    # Step 1 - configuration and data files
    conf, unnormdf, normeddf, inps, oups = getConf(config)

    # Step 2 - generators (model file, window length and lookahead are unused here)
    generators = getGenerators(conf, normeddf, inps, oups)
    modelFile, history, lookahead, trng1, valg1, valg2, X, y = generators

    # Step 3 - model plus its checkpoint callback
    model, mcpoint = getModel(conf)

    # Step 4 - optional training; the callback state is persisted afterwards
    losses = None
    if not ( epochs > 0):
        return conf, unnormdf, normeddf, inps, oups, model, mcpoint, trng1, valg1, valg2, X, y, losses

    print(f"Calling fit : #epochs {epochs} : {mcpoint.best}")
    losses = fit(model, trng1, valg1, mcpoint, ep=epochs)
    mcpoint.save_ext()
    return conf, unnormdf, normeddf, inps, oups, model, mcpoint, trng1, valg1, valg2, X, y, losses
        
#-----------------------------------------------------------------------------------
sysargs=None
def addargs():
    """Parse the training script's command line.

    Options: -c/--config (required), -e/--epochs (int, default 10) and any
    number of positional `files`.

    Returns:
        The parsed argparse.Namespace, or None when an argparse.ArgumentError
        was caught (its message is printed).
    """
    parser = argparse.ArgumentParser(f"{os.path.basename(sys.argv[0])}:")
    option_table = (
        (('-c', '--config'), dict(type=str, required=True, default="", help="config file")),
        (('-e', '--epochs'), dict(type=int, default=10, help="# of epochs!")),
        (('files',),         dict(nargs='*')),
    )
    for flags, options in option_table:
        parser.add_argument(*flags, **options)

    parsed = None
    try:
        parsed = parser.parse_args(sys.argv[1:])
    except argparse.ArgumentError as exc:
        print(exc.message )
    return parsed
#-----------------------------------------------------------------------------------
# Script entry point: train only when executed directly *outside* Jupyter;
# inside a notebook the module just provides its functions.
if __name__ == '__main__' and not lstmutils1.inJupyter():
    t1 = datetime.datetime.now()
    sysargs = addargs()
    main(sysargs.config, sysargs.epochs)
    t2 = datetime.datetime.now()
    print(f"{sys.argv[0]}: All Done in {str(t2-t1)} ***")


In [None]:
%%writefile  ~/bin/genfit.py
#!/usr/local/bin/python 

import matplotlib.pyplot as plt

import re, sys, os, datetime, getopt, glob, argparse, json, base64, pickle, sklearn
import pandas as pd
from sklearn import preprocessing
import numpy as np
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (16, 5)
mpl.rcParams['axes.grid'] = False

import tensorflow as tf
pd.options.display.max_rows = 8

import keras
from keras.models import Model
from keras.layers import Dense, Dropout, LSTM, Input, Activation, concatenate
from keras.preprocessing.sequence import TimeseriesGenerator
from keras import optimizers
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, Callback

sys.path.append("gen")
sys.path.append("/opt/LMCO/git/bin")
import lstmutils1;
import lstmfit;
import ccallbacks

np.random.seed(4)


#-----------------------------------------------------------------------------------
def predictyh(valg, modelFile="", model=None  ):
    """Predict on `valg` with `model`, loading it from `modelFile` when no
    (truthy) model is supplied.

    Returns:
        (predictions, model actually used)
    """
    m1 = model if model else keras.models.load_model(modelFile)
    predictions = m1.predict(valg)
    return predictions, m1

#-----------------------------------------------------------------------------------
def getModelMetrics(columns, yy1, yh1):
    """Compute per-output fit metrics.

    Args:
        columns: column labels; output ``i`` is labelled ``columns[i+1]``
                 (NOTE(review): this presumes column 0 is not an output - confirm).
        yy1:     2-D array of true values, one column per output.
        yh1:     2-D array of predictions with the same layout.

    Returns:
        (rows, header): ``rows`` holds one
        ``[label, r2, fit, maxYd, minYd, std]`` list per output, where fit is
        1 - sqrt(SS_res/SS_tot) floored at 0 and the last three describe the
        residuals; ``header`` is the matching list of column names.
    """
    # The former header string contained "minYd," and split() kept the comma,
    # which ended up in the CSV header.
    mtsc = ["column", "r2", "fit", "maxYd", "minYd", "std"]
    mts  = []
    for i in range(yh1.shape[-1]):
        yyy = yy1[:,i]
        yyh = yh1[:,i]
        col = columns[i+1]

        yd = yyy - yyh
        yym= np.mean(yyy)
        nm = np.sum(yd ** 2)               # residual sum of squares
        tss= np.sum((yyy - yym)**2)        # total sum of squares

        # R^2 computed directly instead of through sklearn.metrics, which was
        # only reachable here because another module happened to import it.
        # Same convention as r2_score for a constant target:
        # 1.0 for a perfect prediction, 0.0 otherwise.
        if tss > 0:
            r2v = 1.0 - nm / tss
        else:
            r2v = 1.0 if nm == 0 else 0.0

        dn = tss+0.000000000001            # epsilon avoids division by zero
        fit= 1 - np.sqrt(nm/dn)

        mxd = max(abs(yd))
        mid = min(abs(yd))
        syd = np.std(yd)

        mts.append([col, r2v, max(0.,fit), mxd, mid, syd])
    return mts, mtsc
#-----------------------------------------------------------------------------------
def main(configFile):
    """Evaluate the configured model on the full data set and save the metrics.

    Args:
        configFile: configuration file name/pattern handed to lstmfit.getConf.

    Returns:
        pandas.DataFrame with one row of metrics per output column; the same
        table is written to ``<modelFile[0]>_metrics`` as CSV.
    """
    conf, unnormdf, normeddf, inputs, ouputs = lstmfit.getConf(configFile)

    # lstmfit defines no getGeneratorsPreds(); getGenerators() already provides a
    # batch_size=1 generator over all rows (its 6th return value), which is what
    # the per-sample indexing below needs.
    _, _, lookAhead, _, _, valg, _, _ = lstmfit.getGenerators(conf, normeddf, inputs, ouputs)
    
    model, mcpoint = lstmfit.getModel(conf)

    # valg[i] is an (inputs, targets) batch holding a single sample.
    yy1 = np.array([valg[i][1][0] for i in range(len(valg))])
    yh1,_ = predictyh(valg, None, model)

    # getModelMetrics labels output i with columns[i+1]; the targets are the
    # configured output columns, so label them by name behind one placeholder.
    labels = ["<unused>"] + list(ouputs)
    mts, mtsc=getModelMetrics(labels, yy1, yh1)
    
    opt=pd.DataFrame(mts, columns=mtsc)
    
    sfile = conf['modelFile'][0] + "_metrics"
    opt.to_csv(sfile, index=False)
    
    print(f"Saved to file: {sfile}")
    return opt

#-----------------------------------------------------------------------------------
sysargs=None
def addargs():
    """Parse the metrics script's command line (a single required -c/--config).

    Returns:
        The parsed argparse.Namespace, or None when an argparse.ArgumentError
        was caught (its message is printed).
    """
    prog_name = f"{os.path.basename(sys.argv[0])}:"
    parser = argparse.ArgumentParser(prog_name)
    parser.add_argument('-c', '--config', type=str, required=True, default="", help="config file")

    try:
        return parser.parse_args(sys.argv[1:])
    except argparse.ArgumentError as exc:
        print(exc.message )
    return None
#-----------------------------------------------------------------------------------
# Script entry point: compute the metrics only when executed directly *outside*
# Jupyter; inside a notebook the module just provides its functions.
if __name__ == '__main__' and not lstmutils1.inJupyter():
    t1 = datetime.datetime.now()
    sysargs = addargs()
    main(sysargs.config,)
    t2 = datetime.datetime.now()
    print(f"{sys.argv[0]}: All Done in {str(t2-t1)} ***")


# EXTENDED Stuff for Future

In [None]:
# Scratch demo of the lookahead alignment and train/test split on a toy series.
lookahead = 3
X, y = np.array(range(20)), np.array(range(20))
# Drop the last lookahead-1 inputs and the first lookahead-1 targets so that
# X[t] is paired with the target lookahead-1 rows later; `or None` keeps all of
# X when lookahead == 1.
X=X[:(-lookahead+1) or None]
y=y[lookahead-1:]
print(f'{X}\n{y}')

print("---------------")
tsParams  = {"length": 5, "batch_size": 1, "stride": 1, "sampling_rate": 1}
# First 60% of the rows train, the remainder tests.
train_count = len(X) * 6 //10

Xtrn,ytrn = X[:train_count], y[:train_count]
Xtst,ytst = X[train_count:], y[train_count:]
#Xtrn,ytrn = X, y 

print(Xtrn.shape, "\n", Xtrn, "\n", ytrn.shape, "\n", ytrn)
print(f'Test: {Xtst} {ytst}')
tsParams1 = tsParams.copy()
# tsParams2 is prepared but not used by any generator in this cell.
tsParams2 = tsParams.copy()
tsParams2['batch_size'] =1

trng1 = TimeseriesGenerator(Xtrn, ytrn, **tsParams1 )
valg1 = TimeseriesGenerator(Xtst, ytst, **tsParams1 )

# Print every (window, target) batch of both generators.
for i in range(len(trng1)):
    print(f"{trng1[i][0]} : {trng1[i][1]}")
print("Testing ---")
for i in range(len(valg1)):
    print(f"{valg1[i][0]} : {valg1[i][1]}")

In [None]:
# Toy single-column frame (values 0..4 under column label 0) for the diff experiment.
tf1 = pd.DataFrame(list(range(5)))
tf1

In [None]:
# Add a first-difference column: each value minus the previous row's value
# (the first row has no predecessor, so shift(1) leaves it missing).
# NOTE: mutates tf1 in place; requires the cell defining tf1 to have run.
c=0
tf1[f'{c}___diff1'] = tf1[c] - tf1[c].shift(1)
tf1

In [None]:
# Show the shifted column on its own (relies on tf1 and c from the cells above).
tf1[c].shift(1)