In [146]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")


def read_wl_csv(file_path):
    wl_df = pd.read_csv(file_path)

    ##### if the csv is from lighthouse then this drop function is always true
    ##### if the csv is not from lighthouse then you will need to modify the function

    wl_df.drop(labels=range(len(wl_df)-6,len(wl_df)), axis=0, inplace=True)

    keys = wl_df.keys().to_list()
    
    wl_df['date'] = pd.to_datetime(wl_df[keys[0]])
    wl_df[keys[1]].replace([-999, -99, 99, 'NA', 'RM'], np.nan, inplace=True)
    wl_df[keys[2]].replace([-999, -99, 99, 'NA', 'RM'], np.nan, inplace=True)
    wl_df[keys[3]].replace([-999, -99, 99, 'NA', 'RM'], np.nan, inplace=True)
    wl_df['pwl'] = pd.to_numeric(wl_df[keys[1]],errors= 'coerce')
    wl_df['bwl'] = pd.to_numeric(wl_df[keys[2]],errors= 'coerce')
    wl_df['harmwl'] = pd.to_numeric(wl_df[keys[3]],errors= 'coerce')
    wl_df['pwl surge'] = wl_df['pwl'] - wl_df['harmwl']
    wl_df['bwl surge'] = wl_df['bwl'] - wl_df['harmwl']
    wl_df = wl_df.drop(columns=keys[0],axis=0)
    wl_df = wl_df.drop(columns=keys[1],axis=0)
    wl_df = wl_df.drop(columns=keys[2],axis=0)
    wl_df = wl_df.drop(columns=keys[3],axis=0)
    del keys
    return wl_df

def locate_gaps(WL_data):
    lengthMissVal = []
    dates = []
    count = 0
    for i in range(len(WL_data)):
        if pd.isna(WL_data['pwl surge'][i]):
            if count == 0:  # Start of a new NaN gap
                dates.append(WL_data['date'][i])  # Record the start date of the gap
            count += 1  # Increment the gap length

        else:
            if count > 0:  # End of a NaN gap
                lengthMissVal.append(count)
                count = 0  # Reset count after recording the gap length

    # Finalize the DataFrame
    WL_data_gaps = pd.DataFrame()
    WL_data_gaps['date'] = pd.to_datetime(dates)
    WL_data_gaps['gapLength'] = lengthMissVal
    WL_data_gaps['gapTime(min)'] = WL_data_gaps['gapLength'] * 6

    del lengthMissVal,dates,count

    return WL_data_gaps

def eligible_gap_length(WL_gaps): #Function to sort the lengh of the gaps into three categories
    WL_gaps_filter_6min = WL_gaps['gapLength'] == 1
    WL_gaps_filter = (WL_gaps['gapLength'] <= 576) & (WL_gaps['gapLength'] > 1)

    #filters the data into individual dataframes
    linear_gaps = WL_gaps[WL_gaps_filter_6min]
    gaps_less_5_days = WL_gaps[WL_gaps_filter]

    del WL_gaps_filter,WL_gaps_filter_6min

    return linear_gaps,gaps_less_5_days




def linear_fill(Wl_data,linear_gaps): #function to fill in gaps with length of 1 using linear approach

    if len(linear_gaps) > 0:

        matching_dates = Wl_data[Wl_data['date'].isin(linear_gaps['date'])]

        index_locations = matching_dates.index.tolist()

        for i in range(len(index_locations)):
            new_value = ((Wl_data.loc[(index_locations[i])-1,'pwl surge']+ Wl_data.loc[index_locations[i]+1,'pwl surge']) / 2) + Wl_data.loc[index_locations[i],'harmwl']
            Wl_data.loc[index_locations[i],'pwl'] = new_value

        del matching_dates, index_locations, new_value
        
        return Wl_data
    
    else:
        print('No single gaps to fill')

        return Wl_data


def check_bwl(Wl_data,gaps):

    if len(gaps) > 0:

        matching_dates = Wl_data[Wl_data['date'].isin(gaps['date'])]

        index_locations = matching_dates.index.tolist()

        gap_length = gaps['gapLength'].tolist()

        valid_gaps = []

        for i in range(len(index_locations)):

            is_valid = Wl_data['bwl surge'][index_locations[i]:index_locations[i]+gap_length[i]].isna().sum() == 0
            valid_gaps.append(is_valid)
        
        filtered_gaps = gaps[valid_gaps].reset_index(drop=True)

        del matching_dates, index_locations, gap_length, valid_gaps, is_valid

        return filtered_gaps
    
    else:
        print('No gaps avaliable to fill')

        return gaps



def poly_gap_fill(Wl_data, gaps):

    if len(gaps) > 0:

        poly_df_list = list()
        
        matching_dates = Wl_data[Wl_data['date'].isin(gaps['date'])]

        index_locations = matching_dates.index.tolist()

        gap_length = gaps['gapLength'].tolist()

        gap_date_list = list()

        for i in range(len(matching_dates)):

            gap_date_df = pd.DataFrame()

            gap_date_df['date'] = Wl_data['date'][index_locations[i]:index_locations[i]+gap_length[i]]

            gap_date_list.append(gap_date_df)
            


        for i in range(len(index_locations)):

            if index_locations[i]- 2161  > 0 and index_locations[i]+2161+gap_length[i] < len(Wl_data):

                pwl_30_days = Wl_data['pwl surge'][(index_locations[i]- 2160):index_locations[i]+2160+gap_length[i]].tolist()

                bwl_30_days = Wl_data['bwl surge'][(index_locations[i]- 2160):index_locations[i]+2160+gap_length[i]].tolist()

                dates = Wl_data['date'][(index_locations[i]- 2160):index_locations[i]+2160+gap_length[i]].tolist()

                model = sm.OLS(pwl_30_days, sm.add_constant(bwl_30_days), missing='drop')

                results = model.fit()

                slope = results.params[1]

                intercept = results.params[0]

                poly_df = pd.DataFrame({'bwl surge': bwl_30_days, 'pwl surge': pwl_30_days,'date' : pd.to_datetime(dates)})

                poly_df['mwl surge'] = intercept + slope*poly_df['bwl surge']

                poly_df.loc[abs(poly_df['mwl surge'] - poly_df['pwl surge']) > 0.1, ['mwl surge', 'pwl surge']] = np.nan

                
                if poly_df['bwl surge'].isna().sum() + poly_df['pwl surge'].isna().sum() < len(poly_df)*0.1:


                    poly_df_copy = poly_df.copy()

                    poly_df_copy.dropna(inplace=True)

                    poly =np.polynomial.polynomial.Polynomial.fit(poly_df_copy['pwl surge'],poly_df_copy['bwl surge'],4)

                    pred_values = poly(poly_df['bwl surge'].values)

                    poly_df['mwl surge'] = pred_values

                    poly_df_list.append(poly_df)

                    del poly_df_copy, poly, pred_values

                    print(poly_df_list)


                    matched_dates1 = []
                    matched_dates2 = []

                    for df1, df2 in zip(gap_date_list, poly_df_list):


                        df1['date'] = pd.to_datetime(df1['date'])

                        #print('df1',df1)
                        #print('df2', df2)


                        df2['date'] = pd.to_datetime(df2['date'])

                        
                    
                        common_dates = df1['date'][df1['date'].isin(df2['date'])]

                        
                        filtered_df1 = df1[df1['date'].isin(common_dates)]
                        filtered_df2 = df2[df2['date'].isin(common_dates)]
                        
                        
                        matched_dates1.append(filtered_df1)
                        matched_dates2.append(filtered_df2)


                        #print(matched_dates1)
                        #print(matched_dates2)


                    
                    match_df_1 = pd.concat(matched_dates1, ignore_index=True)
                    match_df_2 = pd.concat(matched_dates2, ignore_index=True)

                    print(match_df_1)
                    print(match_df_2)
                    


                    Wl_data_total = match_df_2.merge(Wl_data,on='date', how='outer')

                    Wl_data_total = Wl_data_total.drop(columns='bwl surge_x',axis=0)
                    Wl_data_total = Wl_data_total.drop(columns='pwl surge_x',axis=0)

                    Wl_data_total['pwl surge'] = Wl_data_total['pwl surge_y']
                    Wl_data_total['bwl surge'] = Wl_data_total['bwl surge_y']

                    Wl_data_total = Wl_data_total.drop(columns='pwl surge_y',axis=0)
                    Wl_data_total = Wl_data_total.drop(columns='bwl surge_y',axis=0)


                    del poly_df_list, matching_dates, gap_date_list, matched_dates1, matched_dates2, match_df_1, match_df_2


                    return Wl_data_total, index_locations,gap_length
    

                else:
                   print('Can not fill gap not enough points')


            else:
                print('Can not fill gap out of bounds')

    else:
        print('No gaps to Fill')

        return Wl_data
    

def adjustment(filled_data,index_locations,gap_length):

    for i in range(len(index_locations)):

        average_before_gap = np.mean(filled_data['pwl surge'][(index_locations[i]-6):index_locations[i]].tolist())
        average_after_gap = np.mean(filled_data['pwl surge'][(index_locations[i]+1+gap_length[i]):index_locations[i]+6+gap_length[i]].tolist())
        n_length = gap_length[i]

        for k in range(n_length):

            adjustment = average_after_gap+(k/n_length) * (average_before_gap - average_after_gap)

            print(adjustment)

        

def create_gaps(dataset):

    import random

    wl_data =  dataset.copy() #pd.DataFrame(dataset)

    random_index = [random.randint(0,len(wl_data))for _ in range(1000)]

    max_gap_size = 100
    random_index = random.sample(range(len(wl_data) - max_gap_size), 1000)

    #print(len(random_index))

    #create one six min gap

    wl_data.loc[random_index[0], 'pwl surge'] = np.nan
    random_index = random_index[1:]


    # create 5 30 min gaps

    for i in range(5):

        wl_data.loc[random_index[i]:random_index[i] + 4, 'pwl surge'] = np.nan
    
    random_index = random_index[5:]

    #create 10 1hr gaps

    for i in range(10):

        wl_data.loc[random_index[i]:random_index[i] + 9, 'pwl surge'] = np.nan
    
    random_index = random_index[10:]

    #creates 50 5 hr gaps

    for i in range(50):

        wl_data.loc[random_index[i]:random_index[i] + 49, 'pwl surge'] = np.nan
    
    random_index = random_index[50:]

    #creates 100 10hr gaps

    for i in range(100):

        wl_data.loc[random_index[i]:random_index[i] + 99, 'pwl surge'] = np.nan
    
    random_index = random_index[100:]






    #print((wl_data.isna().sum()))

    return wl_data




    

In [147]:
def cbi_gapfill(filepath):

    print('Reading dataset')
    wl_dataset = read_wl_csv(filepath)

    
    #wl_dataset_gaps = create_gaps(wl_dataset)
    #print('Gaps Created')

    Wl_gaps = locate_gaps(wl_dataset)

    print('Total number of gaps: ', len(Wl_gaps))

    linear_gaps,multi_gaps = eligible_gap_length(Wl_gaps)

    print('Number of Linear Gaps filled:', len(linear_gaps))

    dataset_LF = linear_fill(wl_dataset,linear_gaps)

    print('Single gaps filled')

    valid_multi_gaps = check_bwl(dataset_LF,multi_gaps)

    print(valid_multi_gaps)

    print('Number of gaps with backup water level:', len(valid_multi_gaps))

    filled_wl_dataset, index_location, gap_length = poly_gap_fill(dataset_LF,valid_multi_gaps)

    print('Gaps filled')

    complete_dataset = adjustment(filled_wl_dataset,index_location,gap_length)

    return filled_wl_dataset , wl_dataset, Wl_gaps

filled_data, orig_data , gap_list = cbi_gapfill(r'C:\Users\mrpro\Documents\Code\CBI\Gap_Filling\P21_2016_gaps.csv')

Reading dataset
Total number of gaps:  38
Number of Linear Gaps filled: 8
Single gaps filled
                 date  gapLength  gapTime(min)
0 2016-06-13 16:30:00         11            66
1 2016-10-02 02:30:00         96           576
2 2016-11-03 21:12:00         16            96
Number of gaps with backup water level: 3
[      bwl surge  pwl surge                date  mwl surge
0         1.104      0.388 2016-06-04 16:30:00   0.078732
1         1.094      0.376 2016-06-04 16:36:00   0.157165
2         1.098      0.381 2016-06-04 16:42:00   0.126206
3         1.106      0.391 2016-06-04 16:48:00   0.062628
4         1.119      0.407 2016-06-04 16:54:00  -0.045511
...         ...        ...                 ...        ...
4326      1.037      0.299 2016-06-22 17:06:00   0.541747
4327      1.045      0.306 2016-06-22 17:12:00   0.493849
4328      1.044      0.302 2016-06-22 17:18:00   0.499940
4329      1.041      0.298 2016-06-22 17:24:00   0.518034
4330      1.043      0.299 2016-06-22 

In [148]:
gap_list

Unnamed: 0,date,gapLength,gapTime(min)
0,2016-01-06 17:30:00,1,6
1,2016-01-06 20:06:00,2,12
2,2016-01-06 20:36:00,3,18
3,2016-01-06 21:06:00,7,42
4,2016-01-06 22:06:00,5,30
5,2016-01-06 22:48:00,1,6
6,2016-01-07 21:48:00,1,6
7,2016-01-07 22:06:00,4,24
8,2016-01-11 23:00:00,27,162
9,2016-01-17 22:30:00,1,6


In [149]:
filled_data.to_clipboard()

In [150]:
filled_data = filled_data.dropna(subset=['mwl surge'])

filled_data

Unnamed: 0,date,mwl surge,pwl,bwl,harmwl,pwl surge,bwl surge
39525,2016-06-13 16:30:00,0.909502,,2.634,1.673,,0.961
39526,2016-06-13 16:36:00,0.848591,,2.65,1.674,,0.976
39527,2016-06-13 16:42:00,0.77293,,2.668,1.675,,0.993
39528,2016-06-13 16:48:00,0.818309,,2.659,1.676,,0.983
39529,2016-06-13 16:54:00,0.881742,,2.645,1.677,,0.968
39530,2016-06-13 17:00:00,0.889791,,2.644,1.678,,0.966
39531,2016-06-13 17:06:00,0.768252,,2.672,1.678,,0.994
39532,2016-06-13 17:12:00,0.75882,,2.674,1.678,,0.996
39533,2016-06-13 17:18:00,0.777582,,2.67,1.678,,0.992
39534,2016-06-13 17:24:00,0.782208,,2.668,1.677,,0.991
