In [1]:
import pandas as pd
import json
import numpy as np
def load_json(fp:str):
    with open(fp, 'r') as f:
        return json.load(f)

def create_in_df(dtype: str = 'Temp'):
    if dtype not in ['Temp', 'Throughput']:
        raise ValueError("dtype must be either 'Temp' or 'Throughput'")
    tanks = []
    if dtype == 'Temp':
        tanks = ['TBD910', 'TBD911', 'TBD912']
    elif dtype == 'throughputs':
        tanks = ['TBD910', 'TBD911', 'TBD912', 'TBD913', 'TB3301', 'TBD301', 'TUT604',
                 'TUT605', 'TUT918', 'TOL400', 'TOL600', 'G354', 'G356']

    columns = ['TimeStep']
    attributes = ['Value', 'TentativeQuality', 'TrueQuality']
    types = ['OG', 'Processed']

    if dtype == 'Throughput':
        types.append('Calculated')

    for tank in tanks:
        for data_type in types:
            for attr in attributes:
                columns.append(f'{tank}{data_type}{attr}')

    return pd.DataFrame(columns=columns)


def create_out_df(dtype: str = 'Temp'):
    if dtype not in ['Temp', 'Throughput']:
        raise ValueError("dtype must be either 'Temp' or 'Throughput'")
    tanks = []
    attributes = []
    if dtype == 'Temp':
        tanks = ['TBD910', 'TBD911', 'TBD912']
        attributes = ['Status', 'MonthlyAvgTemp', 'AnnualAvgTemp',
                      'PeriodsOfGoodQualityDataPct', 'PeriodsOfGoodQualityDataDays']
    elif dtype == 'throughputs':
        tanks = ['TBD910', 'TBD911', 'TBD912', 'TBD913', 'TB3301', 'TBD301', 'TUT604',
                 'TUT605', 'TUT918', 'TOL400', 'TOL600', 'G354', 'G356']
        attributes = ['Status', 'MonthlyThroughput', 'MonthlyThroughputTarget', 'MaxPumpingRate', 'MaxPumpingRateLimit',
                      'PeriodsOfGoodQualityDataPct', 'PeriodsOfGoodQualityDataHours', 'MaxPumpRateExceedingLimitHours']

    columns = ['TimeStart', 'TimeEnd']

    for tank in tanks:
        for attr in attributes:
            columns.append(f'{tank}{attr}')
    return pd.DataFrame(columns=columns)


def process_tank_dimensions(tank_dimensions: dict):
    rows = []
    for tank, dimensions in tank_dimensions.items():
        row = {'TankID': tank}
        row.update(dimensions)
        rows.append(row)
    return pd.DataFrame(rows).T


def populate_temp_ins(df: pd.DataFrame, data: list) -> pd.DataFrame:
    tanks = ['TBD910', 'TBD911', 'TBD912']
    rows = []
    for month in data:
        og = month['OriginalData']
        pc = month['ProcessedData']
        for o, p in zip(og, pc):
            row = {'TimeStep': o['TimeStep']}
            for tank in tanks:
                row.update({
                    f'{tank}OGValue': o[tank]['Value'],
                    f'{tank}OGTentativeQuality': o[tank]['TentativeQuality'],
                    f'{tank}OGTrueQuality': o[tank]['TrueQuality'],
                    f'{tank}ProcessedValue': p[tank]['Value'],
                    f'{tank}ProcessedTentativeQuality': p[tank]['TentativeQuality'],
                    f'{tank}ProcessedTrueQuality': p[tank]['TrueQuality'],
                })
            rows.append(row)
    
    return pd.DataFrame(rows)


def populate_throughput_ins(df: pd.DataFrame, data: list) -> pd.DataFrame:
    tanks = ['TBD910', 'TBD911', 'TBD912', 'TBD913', 'TB3301', 'TBD301', 'TUT604',
             'TUT605', 'TUT918', 'TOL400', 'TOL600', 'G354', 'G356']
    rows = []
    for month in data:
        om = month['OriginalData']
        pm = month['ProcessedData']
        cm = month['CalculatedData']
        for o, p, c in zip(om, pm, cm):
            row = {'TimeStep': o['TimeStep']}
            for tank in tanks:
                row.update({
                    f'{tank}OGValue': o[tank]['Value'],
                    f'{tank}OGTentativeQuality': o[tank]['TentativeQuality'],
                    f'{tank}OGTrueQuality': o[tank]['TrueQuality'],
                    f'{tank}ProcessedValue': p[tank]['Value'],
                    f'{tank}ProcessedTentativeQuality': p[tank]['TentativeQuality'],
                    f'{tank}ProcessedTrueQuality': p[tank]['TrueQuality'],
                    f'{tank}CalculatedValue': c[tank]['Value'],
                    f'{tank}CalculatedTentativeQuality': c[tank]['TentativeQuality'],
                    f'{tank}CalculatedTrueQuality': c[tank]['TrueQuality'],
                })
            rows.append(row)
    return pd.DataFrame(rows)

def populate_temp_outs(df:pd.DataFrame, data:list) -> pd.DataFrame:
    rows = []
    for month in data:
        row = {'TimeStart' : month['OriginalData'][0]['TimeStep'], 'TimeEnd' : month['OriginalData'][-1]['TimeStep']}
        for tank in month['OutputData']:
            row.update({
                f'{tank["TankID"]}Status' : tank['Status'],
                f'{tank["TankID"]}MonthlyAvgTemp' : tank['MonthlyAvgTemp'],
                f'{tank["TankID"]}AnnualAvgTemp' : tank['AnnualAvgTemp'],
                f'{tank["TankID"]}PeriodsOfGoodQualityDataPct' : tank['PeriodsOfGoodQualityDataPct'],
                f'{tank["TankID"]}PeriodsOfGoodQualityDataDays' : tank['PeriodsOfGoodQualityDataDays']
            })
        rows.append(row)
    return pd.DataFrame(rows)
            
def populate_throughput_outs(df:pd.DataFrame, data:list) -> pd.DataFrame:
    rows = []
    for month in data:
        row = {'TimeStart' : month['OriginalData'][0]['TimeStep'], 'TimeEnd' : month['OriginalData'][-1]['TimeStep']}
        for tank in month['OutputData']:
            row.update({
                f'{tank["TankID"]}Status' : tank['Status'],
                f'{tank["TankID"]}MonthlyThroughput' : tank['MonthlyThroughput'],
                f'{tank["TankID"]}MonthlyThroughputTarget' : tank['MonthlyThroughputTarget'],
                f'{tank["TankID"]}MaxPumpingRate' : tank['MaxPumpingRate'],
                f'{tank["TankID"]}MaxPumpingRateLimit' : tank['MaxPumpingRateLimit'],
                f'{tank["TankID"]}PeriodsOfGoodQualityDataPct' : tank['PeriodsOfGoodQualityDataPct'],
                f'{tank["TankID"]}PeriodsOfGoodQualityDataHours' : tank['PeriodsOfGoodQualityDataHours'],
                f'{tank["TankID"]}MaxPumpRateExceedingLimitHours' : tank['MaxPumpRateExceedingLimitHours']
            })
        rows.append(row)
    return pd.DataFrame(rows)



In [289]:
temp_data = load_json('../Preprocessing/TempOutputs.json')
throughput_data = load_json('../Preprocessing/ThroughputOutputs.json')

tankdims = process_tank_dimensions(
    throughput_data[0]['TankDimensionsReadOnly']) # constant

temp_in_df = create_in_df('Temp')
temp_out_df = create_out_df('Temp')
throughput_in_df = create_in_df('Throughput')
throughput_out_df = create_out_df('Throughput')

In [290]:
temp_in_df = populate_temp_ins(temp_in_df, temp_data)
throughput_in_df = populate_throughput_ins(throughput_in_df, throughput_data)
temp_out_df = populate_temp_outs(temp_out_df, temp_data)
throughput_out_df = populate_throughput_outs(throughput_out_df, throughput_data)

In [291]:
tankdims # constant data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
TankID,TBD910,TBD911,TBD912,TBD913,TB3301,TBD301,TUT604,TUT605,TUT918,TOL400,TOL600,G354,G356
Diameter,21.25,21.0,21.5,15.0,20.0,,25.0,25.0,40.5,18.0,,120.0,124.0
Density,,,,,,6.366553,,,,,6.672,,
Height,,,,,,,36.0,36.0,,23.0,,,


In [292]:
temp_in_df.head(5) # daily data

Unnamed: 0,TimeStep,TBD910OGValue,TBD910OGTentativeQuality,TBD910OGTrueQuality,TBD910ProcessedValue,TBD910ProcessedTentativeQuality,TBD910ProcessedTrueQuality,TBD911OGValue,TBD911OGTentativeQuality,TBD911OGTrueQuality,TBD911ProcessedValue,TBD911ProcessedTentativeQuality,TBD911ProcessedTrueQuality,TBD912OGValue,TBD912OGTentativeQuality,TBD912OGTrueQuality,TBD912ProcessedValue,TBD912ProcessedTentativeQuality,TBD912ProcessedTrueQuality
0,2020-01-01T12:00:00,214.531876,G,False,214.531876,G,True,59.142708,G,False,59.142708,G,True,57.993233,G,False,57.993233,G,True
1,2020-01-02T12:00:00,214.483322,G,False,214.483322,G,True,59.336441,G,False,59.336441,G,True,58.69928,G,False,58.69928,G,True
2,2020-01-03T12:00:00,214.479431,G,False,214.479431,G,True,59.655926,G,False,59.655926,G,True,59.910675,G,False,59.910675,G,True
3,2020-01-04T12:00:00,214.501617,G,False,214.501617,G,True,59.205444,G,False,59.205444,G,True,58.459412,G,False,58.459412,G,True
4,2020-01-05T12:00:00,214.438065,G,False,214.438065,G,True,59.270561,G,False,59.270561,G,True,58.174763,G,False,58.174763,G,True


In [293]:
temp_out_df.head(5) # outputs from temp_in_df that correspond to a month 

Unnamed: 0,TimeStart,TimeEnd,TBD910Status,TBD910MonthlyAvgTemp,TBD910AnnualAvgTemp,TBD910PeriodsOfGoodQualityDataPct,TBD910PeriodsOfGoodQualityDataDays,TBD911Status,TBD911MonthlyAvgTemp,TBD911AnnualAvgTemp,TBD911PeriodsOfGoodQualityDataPct,TBD911PeriodsOfGoodQualityDataDays,TBD912Status,TBD912MonthlyAvgTemp,TBD912AnnualAvgTemp,TBD912PeriodsOfGoodQualityDataPct,TBD912PeriodsOfGoodQualityDataDays
0,2020-01-01T12:00:00,2020-01-31T12:00:00,IN SERVICE,214.48994,,1,31,IN SERVICE,60.226209,,1,31,IN SERVICE,60.066977,,1,31
1,2020-02-01T12:00:00,2020-02-29T12:00:00,IN SERVICE,214.489403,,1,29,IN SERVICE,58.451405,,1,29,IN SERVICE,58.793788,,1,29
2,2020-03-01T12:00:00,2020-03-31T12:00:00,IN SERVICE,214.390449,,1,31,IN SERVICE,68.948737,,1,31,IN SERVICE,70.267992,,1,31
3,2020-04-01T12:00:00,2020-04-30T12:00:00,IN SERVICE,214.378541,,1,30,IN SERVICE,72.556152,,1,30,IN SERVICE,72.401116,,1,30
4,2020-05-01T12:00:00,2020-05-31T12:00:00,IN SERVICE,214.306817,,1,31,IN SERVICE,77.703501,,1,31,IN SERVICE,77.59991,,1,31


In [294]:
throughput_in_df.head(5) # hourly data

Unnamed: 0,TimeStep,TBD910OGValue,TBD910OGTentativeQuality,TBD910OGTrueQuality,TBD910ProcessedValue,TBD910ProcessedTentativeQuality,TBD910ProcessedTrueQuality,TBD910CalculatedValue,TBD910CalculatedTentativeQuality,TBD910CalculatedTrueQuality,...,G354CalculatedTrueQuality,G356OGValue,G356OGTentativeQuality,G356OGTrueQuality,G356ProcessedValue,G356ProcessedTentativeQuality,G356ProcessedTrueQuality,G356CalculatedValue,G356CalculatedTentativeQuality,G356CalculatedTrueQuality
0,2020-01-01T12:00:00,7.719081,G,False,7.719081,G,False,0.0,G,False,...,False,,B,False,,B,False,0,B,False
1,2020-01-01T13:00:00,7.721051,G,False,7.721051,G,True,5.224666,G,True,...,True,,B,False,,B,False,0,B,False
2,2020-01-01T14:00:00,7.717679,G,False,7.717679,G,True,0.0,G,True,...,True,,B,False,,B,False,0,B,False
3,2020-01-01T15:00:00,7.719844,G,False,7.719844,G,True,5.745867,G,True,...,True,,B,False,,B,False,0,B,False
4,2020-01-01T16:00:00,7.717999,G,False,7.717999,G,True,0.0,G,True,...,True,,B,False,,B,False,0,B,False


In [295]:
throughput_out_df.head(5) # outputs from throughput_in_df that correspond to a month

Unnamed: 0,TimeStart,TimeEnd,TBD910Status,TBD910MonthlyThroughput,TBD910MonthlyThroughputTarget,TBD910MaxPumpingRate,TBD910MaxPumpingRateLimit,TBD910PeriodsOfGoodQualityDataPct,TBD910PeriodsOfGoodQualityDataHours,TBD910MaxPumpRateExceedingLimitHours,...,G354PeriodsOfGoodQualityDataHours,G354MaxPumpRateExceedingLimitHours,G356Status,G356MonthlyThroughput,G356MonthlyThroughputTarget,G356MaxPumpingRate,G356MaxPumpingRateLimit,G356PeriodsOfGoodQualityDataPct,G356PeriodsOfGoodQualityDataHours,G356MaxPumpRateExceedingLimitHours
0,2020-01-01T12:00:00,2020-01-31T23:00:00,IN SERVICE,1008.817194,25000,14.168585,12000,1,732,0,...,732,0,IN SERVICE,0,21717500,0,96000,0,0,0
1,2020-02-01T00:00:00,2020-02-29T23:00:00,IN SERVICE,1120.832511,25000,17.041519,12000,1,696,0,...,696,25,IN SERVICE,0,21717500,0,96000,0,0,0
2,2020-03-01T00:00:00,2020-03-31T23:00:00,IN SERVICE,1193.185911,25000,16.880857,12000,1,743,0,...,743,21,IN SERVICE,0,21717500,0,96000,0,0,0
3,2020-04-01T00:00:00,2020-04-30T23:00:00,IN SERVICE,1148.8686,25000,15.00858,12000,1,720,0,...,720,13,IN SERVICE,0,21717500,0,96000,0,0,0
4,2020-05-01T00:00:00,2020-05-31T23:00:00,IN SERVICE,1215.108001,25000,20.57734,12000,1,744,0,...,744,12,IN SERVICE,0,21717500,0,96000,0,0,0


In [296]:
tankdims.to_csv('./TankDimensionsDF.csv', index=False)
temp_in_df.to_csv('./TempInputsDF.csv', index=False)
temp_out_df.to_csv('./TempOutDF.csv', index=False)
throughput_in_df.to_csv('./ThroughputInputsDF.csv', index=False)
throughput_out_df.to_csv('./ThroughputOutDF.csv', index=False)

In [219]:
tankdims = pd.read_csv('./TankDimensionsDF.csv',)
temp_in_df = pd.read_csv('./TempInputsDF.csv')
temp_out_df = pd.read_csv('./TempOutDF.csv')
throughput_in_df = pd.read_csv('./ThroughputInputsDF.csv')
throughput_out_df = pd.read_csv('./ThroughputOutDF.csv')

# Temperature Model

In [220]:
temp_in_df.head(2)

Unnamed: 0,TimeStep,TBD910OGValue,TBD910OGTentativeQuality,TBD910OGTrueQuality,TBD910ProcessedValue,TBD910ProcessedTentativeQuality,TBD910ProcessedTrueQuality,TBD911OGValue,TBD911OGTentativeQuality,TBD911OGTrueQuality,TBD911ProcessedValue,TBD911ProcessedTentativeQuality,TBD911ProcessedTrueQuality,TBD912OGValue,TBD912OGTentativeQuality,TBD912OGTrueQuality,TBD912ProcessedValue,TBD912ProcessedTentativeQuality,TBD912ProcessedTrueQuality
0,2020-01-01T12:00:00,214.531876,G,False,214.531876,G,True,59.142708,G,False,59.142708,G,True,57.993233,G,False,57.993233,G,True
1,2020-01-02T12:00:00,214.483322,G,False,214.483322,G,True,59.336441,G,False,59.336441,G,True,58.69928,G,False,58.69928,G,True


In [221]:
# temp_in_df and temp_out_df to predict future temp_in_df.OriginalData.Value
# TODO: Somehow need model to assume future month Tentative and True Qualities will be good 

# Encode Timestamp frequencies (year, month, day) - day is only for input data
# Can use sin and cos transformations to make the frequencies cycle
# need both for complete interpretability of cycle!
def encode_timestamps(df, time_cols):
    """
    Adds sin and cos features for year, month, and day cycles for multiple datetime columns.
    
    Args:
        df (pd.DataFrame): Input DataFrame.
        time_cols (list): List of column names containing datetime information.
    
    Returns:
        pd.DataFrame: DataFrame with added cyclical features.
    """
    for time_col in time_cols:
        df[time_col] = pd.to_datetime(df[time_col])  # Ensure datetime format

        # Extract time features for the specific column
        df[f'{time_col}_Year'] = df[time_col].dt.year
        df[f'{time_col}_Month'] = df[time_col].dt.month
        df[f'{time_col}_Day'] = df[time_col].dt.day

        # Add cyclical features for month and day
        df[f'sin_{time_col}_month'] = np.sin(
            2 * np.pi * df[f'{time_col}_Month'] / 12)
        df[f'cos_{time_col}_month'] = np.cos(
            2 * np.pi * df[f'{time_col}_Month'] / 12)
        df[f'sin_{time_col}_day'] = np.sin(
            2 * np.pi * df[f'{time_col}_Day'] / 31)  # Approx. max days in a month
        df[f'cos_{time_col}_day'] = np.cos(
            2 * np.pi * df[f'{time_col}_Day'] / 31)

        # Optionally drop original datetime features (keeping only cyclical and year if needed)
        df.drop(columns=[f'{time_col}_Year',
                f'{time_col}_Month', f'{time_col}_Day'], inplace=True)

    return df


def one_hot_encoder(df:pd.DataFrame, columns:list):
    for col in columns:
        if col.endswith("TentativeQuality"):
            df[col] = df[col].map({"G": 1, "B": 0})  # Map "G" -> 1, "B" -> 0
        elif col.endswith("TrueQuality"):
            df[col] = df[col].astype(int)  # Convert boolean to 1/0
        elif col.endswith("Status"):
            df[col] = df[col].map({"IN SERVICE": 1, "OOS": 0})
        else:
            raise ValueError(f"Unexpected column type: {col}")
    return df

In [222]:


# One hot encoder for categorical inputs
def segment_by_month(input_df, output_df):
    """
    Segments daily input data into sequences based on monthly bounds from the output data.

    Args:
        input_df (pd.DataFrame): Daily input data
        output_df (pd.DataFrame): Monthly output data 
    """
    segmented_inputs = []

    for _, row in output_df.iterrows():
        start_date = row['TimeStart']
        end_date = row['TimeEnd']

        # Filter input rows that fall within the monthly range
        monthly_data = input_df[
            (input_df['TimeStep'] >= start_date) & (
                input_df['TimeStep'] <= end_date)
        ]
        segmented_inputs.append(monthly_data)

    #output_df.drop(columns=['TimeStart', 'TimeEnd'], inplace=True)
   # input_df.drop(columns=['TimeStep'], inplace=True)
    return segmented_inputs


In [223]:
tanks = ['TBD910', 'TBD911', 'TBD912']
types = ['OG', 'Processed']
categorical = []
continuous = []
for tank in tanks:
    for type in types:
        continuous.append(f'{tank}{type}Value')
        categorical.append(f'{tank}{type}TentativeQuality')
        categorical.append(f'{tank}{type}TrueQuality')

In [224]:
temp_in_df = one_hot_encoder(temp_in_df, categorical)
out_cat = ['TBD910Status', 'TBD911Status', 'TBD912Status']
temp_out_df = one_hot_encoder(temp_out_df, out_cat)


temp_in_df = encode_timestamps(temp_in_df, ['TimeStep'])
temp_out_df = encode_timestamps(temp_out_df, ['TimeStart', 'TimeEnd'])
x_cleaned = segment_by_month(temp_in_df, temp_out_df)
y_cleaned = temp_out_df
x = [i.drop(columns=['TimeStep']) for i in x_cleaned]
y = y_cleaned.drop(columns=['TimeStart', 'TimeEnd'])
# x = x_cleaned
# y = y_cleaned

In [225]:
# x is a list containing each month's daily temperature time series data
# y is a dataframe with each row corresopnding to a month's output
len(x), len(y)

(53, 53)

In [226]:

x[0].columns

Index(['TBD910OGValue', 'TBD910OGTentativeQuality', 'TBD910OGTrueQuality',
       'TBD910ProcessedValue', 'TBD910ProcessedTentativeQuality',
       'TBD910ProcessedTrueQuality', 'TBD911OGValue',
       'TBD911OGTentativeQuality', 'TBD911OGTrueQuality',
       'TBD911ProcessedValue', 'TBD911ProcessedTentativeQuality',
       'TBD911ProcessedTrueQuality', 'TBD912OGValue',
       'TBD912OGTentativeQuality', 'TBD912OGTrueQuality',
       'TBD912ProcessedValue', 'TBD912ProcessedTentativeQuality',
       'TBD912ProcessedTrueQuality', 'sin_TimeStep_month',
       'cos_TimeStep_month', 'sin_TimeStep_day', 'cos_TimeStep_day'],
      dtype='object')

In [227]:
# Feed X into lstm as inputs
# Then embed y
# should be predicting X.{TankID}OGValue
# this will be a list of daily OGValues for the prediction month

X_train_daily_data = x[:-1] # all months except for the last one
X_train_monthly_data = y.iloc[:-1]

test = x[-1] # all of the TankOGValues

In [228]:
# make sure to only take min and max from training set.
# otherwise we make unrealistic generalizations about the future test set

# Normalize OGValues
def normalize_column(df_list, column, min_val, max_val):
    for df in df_list:
        df[column] = (df[column] - min_val) / (max_val - min_val)
    return df_list


# Min-max normalization for OGValues
minmax_dict = {}
for tank in ['TBD910', 'TBD911', 'TBD912']:
    for type in ['OG', 'Processed']:
        column = f'{tank}{type}Value'
        values = pd.concat([df[column] for df in X_train_daily_data])
        min_val, max_val = values.min(), values.max()
        minmax_dict[column] = (min_val, max_val)
        X_train_daily_data = normalize_column(
            X_train_daily_data, column, min_val, max_val)
        test = normalize_column([test], column, min_val, max_val)[0]

In [229]:
# need to only predict tanks values that are currently in service
# assumes tank status will be the same for the next (prediction) month

def filter_in_service(df, monthly_data, tanks):
    for tank in tanks:
        status_col = f'{tank}Status'
        if monthly_data[status_col].iloc[0] == 0:  # Tank is out of service
            df.drop(columns=[f'{tank}OGValue'], inplace=True)
    return df


# Filter out tanks that are not in service
for i, df in enumerate(X_train_daily_data):
    X_train_daily_data[i] = filter_in_service(df, X_train_monthly_data.iloc[[i]], [
                                              'TBD910', 'TBD911', 'TBD912'])
test = filter_in_service(test, y.iloc[[-1]], ['TBD910', 'TBD911', 'TBD912'])

In [230]:
len(y.columns)

23

In [236]:
import numpy as np

# Check for NaN or infinite values in daily data
for i, daily_data in enumerate(X_train_daily_data):
    if not np.all(np.isfinite(daily_data.values)):
        print(f"NaN or infinite values found in daily data at index {i}")

# Check for NaN or infinite values in monthly data
if not np.all(np.isfinite(X_train_monthly_data.values)):
    print("NaN or infinite values found in monthly data")

# Check for NaN or infinite values in test data
if not np.all(np.isfinite(test.values)):
    print("NaN or infinite values found in test data")

# Check for zero ranges in normalization
for column, (min_val, max_val) in minmax_dict.items():
    if max_val - min_val == 0:
        print(f"Zero range for column: {column}")

# Check masks for each training step
for i, _ in enumerate(X_train_monthly_data):
    status = X_train_monthly_data.iloc[i][[
        f"{tank}Status" for tank in tanks
    ]].values
    if np.all(status == 0):
        print(f"All tanks are out of service at index {i}")

NaN or infinite values found in daily data at index 10
NaN or infinite values found in daily data at index 14
NaN or infinite values found in daily data at index 21
NaN or infinite values found in daily data at index 26
NaN or infinite values found in daily data at index 28
NaN or infinite values found in daily data at index 29
NaN or infinite values found in daily data at index 30
NaN or infinite values found in daily data at index 41
NaN or infinite values found in daily data at index 48
NaN or infinite values found in daily data at index 50
NaN or infinite values found in daily data at index 51
NaN or infinite values found in monthly data
NaN or infinite values found in test data


In [237]:
for i, daily_data in enumerate(X_train_daily_data):
    if not np.all(np.isfinite(daily_data.values)):
        print(f"NaN or infinite values in daily data at index {i}:")
        print(daily_data[daily_data.isnull().any(axis=1)])  # Rows with NaN

NaN or infinite values in daily data at index 10:
     TBD910OGValue  TBD910OGTentativeQuality  TBD910OGTrueQuality  \
316       0.997634                         1                    0   

     TBD910ProcessedValue  TBD910ProcessedTentativeQuality  \
316                   NaN                                1   

     TBD910ProcessedTrueQuality  TBD911OGValue  TBD911OGTentativeQuality  \
316                           0       0.613647                         1   

     TBD911OGTrueQuality  TBD911ProcessedValue  ...  TBD912OGValue  \
316                    0              0.613647  ...       0.650802   

     TBD912OGTentativeQuality  TBD912OGTrueQuality  TBD912ProcessedValue  \
316                         1                    0              0.650802   

     TBD912ProcessedTentativeQuality  TBD912ProcessedTrueQuality  \
316                                1                           1   

     sin_TimeStep_month  cos_TimeStep_month  sin_TimeStep_day  \
316                -0.5            0.

In [231]:
import tensorflow as tf
from tensorflow.keras import layers, Model


class TimeSeriesPredictor(Model):
    def __init__(self, tank_embedding_dim, embedding_dim, hidden_dim, num_tanks):
        super(TimeSeriesPredictor, self).__init__()
        # LSTM Encoder
        self.encoder = layers.LSTM(
            hidden_dim, return_sequences=False, return_state=False)
        self.encoder_to_embedding = layers.Dense(
            embedding_dim, activation='relu')

        # Monthly Context Network
        self.context_network = tf.keras.Sequential([
            layers.Dense(embedding_dim, activation='relu')
        ])

        # Tank Embedding
        self.tank_embedding = layers.Embedding(num_tanks, tank_embedding_dim)

        # Fusion Layer
        self.fusion_layer = layers.Dense(embedding_dim, activation='relu')

        # Sequential Decoder
        self.decoder_lstm = layers.LSTM(
            hidden_dim, return_sequences=True, return_state=False)
        self.decoder_output = layers.TimeDistributed(layers.Dense(num_tanks))

    def call(self, daily_inputs, monthly_context, tank_indices, num_days):
        # Encode daily inputs
        daily_embedding = self.encoder_to_embedding(self.encoder(daily_inputs))

        # Process monthly context
        context_embedding = self.context_network(monthly_context)

        # Tank embedding
        tank_embeddings = self.tank_embedding(tank_indices)
        tank_embedding_mean = tf.reduce_mean(
            tank_embeddings, axis=0, keepdims=True)

        # Combine embeddings
        fused_embedding = self.fusion_layer(
            tf.concat([daily_embedding, context_embedding,
                      tank_embedding_mean], axis=-1)
        )

        # Prepare input for the decoder
        # Shape: (batch_size, 1, embedding_dim)
        decoder_input = tf.expand_dims(fused_embedding, axis=1)
        # Repeat for num_days
        decoder_input = tf.tile(decoder_input, [1, num_days, 1])

        # Sequential decoding
        # Shape: (batch_size, num_days, hidden_dim)
        decoded_sequence = self.decoder_lstm(decoder_input)
        # Shape: (batch_size, num_days, num_tanks)
        predictions = self.decoder_output(decoded_sequence)

        return predictions

# Model Parameters
input_dim = X_train_daily_data[0].shape[1]
context_dim = X_train_monthly_data.shape[1]
embedding_dim = 128
hidden_dim = 128
num_tanks = 3
tank_embedding_dim = 16

# Create the model
model = TimeSeriesPredictor(
    tank_embedding_dim, embedding_dim, hidden_dim, num_tanks)

# Optimizer and Loss
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.001)
mse_loss = tf.keras.losses.MeanSquaredError()


og_value_columns = [
    col for col in temp_in_df.columns if col.endswith("OGValue")]
og_value_indices = [temp_in_df.columns.get_loc(
    col) for col in og_value_columns]


# Training Loop
epochs = 10

for epoch in range(epochs):
    epoch_loss = 0
    for i, daily_data in enumerate(X_train_daily_data):
        # Determine the number of days for the current month
        num_days = daily_data.shape[0]

        # Prepare daily inputs and monthly context
        daily_data_tensor = tf.convert_to_tensor(
            daily_data.values, dtype=tf.float32)
        daily_data_tensor = tf.expand_dims(
            daily_data_tensor, axis=0)  # Add batch dimension
        monthly_context = tf.convert_to_tensor(
            X_train_monthly_data.iloc[i].values, dtype=tf.float32
        )
        monthly_context = tf.expand_dims(monthly_context, axis=0)

        # Tank indices
        tank_indices = tf.convert_to_tensor([0, 1, 2], dtype=tf.int32)

        # Create mask for in-service tanks
        status = X_train_monthly_data.iloc[i][[
            f"{tank}Status" for tank in tanks
        ]].values
        mask = tf.convert_to_tensor(status, dtype=tf.float32)

        # Target OGValues for the current month
        targets = tf.convert_to_tensor(
            daily_data[og_value_columns].values, dtype=tf.float32
        )
        targets = tf.expand_dims(targets, axis=0)  # Add batch dimension

        with tf.GradientTape() as tape:
            # Forward pass
            predictions = model(daily_data_tensor,
                                monthly_context, tank_indices, num_days)

            # Apply mask
            masked_predictions = predictions * tf.expand_dims(mask, axis=0)
            masked_targets = targets * tf.expand_dims(mask, axis=0)

            # Compute loss
            loss = mse_loss(masked_predictions, masked_targets)

        # Backward pass
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        epoch_loss += loss.numpy()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {epoch_loss:.4f}")

Epoch 1/10, Loss: nan
Epoch 2/10, Loss: nan
Epoch 3/10, Loss: nan
Epoch 4/10, Loss: nan
Epoch 5/10, Loss: nan
Epoch 6/10, Loss: nan
Epoch 7/10, Loss: nan
Epoch 8/10, Loss: nan
Epoch 9/10, Loss: nan
Epoch 10/10, Loss: nan


In [232]:
# Prepare test data
num_days_test = test.shape[0]
test_daily_data = tf.convert_to_tensor(test.values, dtype=tf.float32)
test_daily_data = tf.expand_dims(
    test_daily_data, axis=0)  # Add batch dimension
test_monthly_context = tf.convert_to_tensor(
    y.iloc[-1].values, dtype=tf.float32)
test_monthly_context = tf.expand_dims(test_monthly_context, axis=0)
test_tank_indices = tf.convert_to_tensor([0, 1, 2], dtype=tf.int32)

# Create mask for in-service tanks in the test month
test_status = y.iloc[-1][[f"{tank}Status" for tank in tanks]].values
test_mask = tf.convert_to_tensor(test_status, dtype=tf.float32)

# Target OGValues for the test month
test_targets = tf.convert_to_tensor(
    test[og_value_columns].values, dtype=tf.float32
)
test_targets = tf.expand_dims(test_targets, axis=0)  # Add batch dimension

# Run inference
test_predictions = model(
    test_daily_data, test_monthly_context, test_tank_indices, num_days_test)

# Apply mask to predictions
masked_test_predictions = test_predictions * tf.expand_dims(test_mask, axis=0)
masked_test_targets = test_targets * tf.expand_dims(test_mask, axis=0)

# Compute test MSE loss
test_loss = mse_loss(masked_test_predictions, masked_test_targets)
print("Test MSE Loss:", test_loss.numpy())

# View predictions
print("Predicted OGValues for the test month:", test_predictions.numpy())

Test MSE Loss: nan
Predicted OGValues for the test month: [[[nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]
  [nan nan nan]]]
