In [1]:
import xarray
import geopandas as gpd
import numpy as np
import pandas as pd
import s3fs
import geoglows

In [2]:
bucket_uri = 's3://geoglows-v2-retrospective/retrospective.zarr'
region_name = 'us-west-2'
s3 = s3fs.S3FileSystem(anon=True, client_kwargs=dict(region_name=region_name))
s3store = s3fs.S3Map(root=bucket_uri, s3=s3, check=False)

ds = xarray.open_zarr(s3store)

In [3]:
results = []

In [4]:
linkno_files = {
    160221792: "/Users/rachel1/Downloads/rachel nile/kenya_1GD03.csv",
    160266239: "/Users/rachel1/Downloads/rwanda_4326.csv",
    160212492: "/Users/rachel1/Downloads/rachel nile/grdc_1270900.csv",
    160168155: "/Users/rachel1/Downloads/rachel nile/grdc_1269200.csv",
    160213625: "/Users/rachel1/Downloads/rachel nile/grdc_1769050.csv",
    160184420: "/Users/rachel1/Downloads/rachel nile/grdc_1769100.csv",
    160191425: "/Users/rachel1/Downloads/rachel nile/grdc_1769200.csv",
    160128354: "/Users/rachel1/Downloads/rachel nile/grdc_1769150.csv",
    160504154: "/Users/rachel1/Downloads/rachel nile/grdc_1563680.csv",
    160528679: "/Users/rachel1/Downloads/rachel nile/grdc_1563700.csv",
    160622096: "/Users/rachel1/Downloads/rachel nile/grdc_1563900.csv",
    160590528: "/Users/rachel1/Downloads/rachel nile/grdc_1563600.csv",
    160536792: "/Users/rachel1/Downloads/rachel nile/grdc_1563450.csv",
    160596343: "/Users/rachel1/Downloads/rachel nile/grdc_1563500.csv",
    160608077: "/Users/rachel1/Downloads/rachel nile/grdc_1563550.csv",
    160553167: "/Users/rachel1/Downloads/rachel nile/grdc_1563050.csv",
}

In [5]:
stdStart = 1991
stdEnd = 2020
def HydroSOS(flowdata):
        flowdata.columns = ['date', 'flow']
        # flowdata['date'] = pd.to_datetime(flowdata['date'], format="%d/%m/%Y")
        flowdata['date'] = pd.to_datetime(flowdata['date'])  # Ensure the 'date' column is in datetime format
        # flowdata['date'] = flowdata['date'].dt.strftime('%d/%m/%Y')

        # check dates are sequential

        diff = pd.date_range(start=flowdata['date'].min(), end=flowdata['date'].max()).difference(flowdata['date'])
        if len(diff) > 0:
            flowdata.set_index('date', inplace=True)
            for md in diff:
                flowdata.loc[md, 'flow'] = pd.NA
            flowdata.reset_index(inplace=True)
        # month and year column
        flowdata['month'] = flowdata['date'].dt.month
        flowdata['year'] = flowdata['date'].dt.year

        # check whether or not there is enough data?
        # print(f"There are {flowdata['year'].max() - flowdata['year'].min()} years of data in this file.")
        # print(f"There are {sum(flowdata['flow'].isnull())} missing data points, which is {np.round(sum(flowdata['flow'].isnull())/len(flowdata) * 100,4)}% of the total data")
        #print("starting step 1")
        """ STEP 1: CALCULATE MEAN MONTHLY FLOWS """

        # calculate percentage completeness for each year/month
        groupBy = (flowdata.groupby(['month', 'year']).count()['flow'] / flowdata.groupby(['month', 'year']).count()[
            'date']) * 100
        groupBy = pd.DataFrame(groupBy)
        groupBy.rename(columns={0: 'monthly%'}, inplace=True)
        # calculate mean flows for each year/month
        groupBy['mean_flow'] = flowdata.groupby(['month', 'year'])['flow'].mean()
        # set the mean flow to NAN if there is less than 50 % data
        # groupBy.loc[groupBy['monthly%'] < 50,'mean_flow'] = pd.NA
        groupBy.reset_index(inplace=True)
        #print("finished step 1")
        """ STEP 2: CALCULATE MEAN MONTHLY FLOWS AS A PERCENTAGE OF AVERAGE """

        # calculate long term average
        LTA = groupBy[(groupBy['year'] >= stdStart) & (groupBy['year'] <= stdEnd)].groupby(['month'])[
            'mean_flow'].mean()
        if LTA.empty:
            #print("Skipping")
            return
        #print("NOT SKIPPED")

        # divide each month by this long term average
        for i in range(1, 13):
            groupBy.loc[groupBy['month'] == i, 'percentile_flow'] = groupBy['mean_flow'][groupBy['month'] == i] / LTA[
                i] * 100
        #print("FINISHED STEP 2")
        """ STEP 3: CALCULATE RANK PERCENTILES """
        # na values automatically set as rank na
        for i in range(1, 13):
            groupBy.loc[groupBy['month'] == i, 'weibell_rank'] = groupBy.loc[
                                                                     groupBy['month'] == i, 'percentile_flow'].rank(
                na_option='keep') / (groupBy.loc[groupBy['month'] == i, 'percentile_flow'].count() + 1)
        #print("FINISHED STEP 3")
        """ STEP 4: ASSIGN STATUS CATEGORIES """


        def flow_status(weibell_rank):
            status = pd.NA
            if weibell_rank <= 0.13:
                status = 1
            elif weibell_rank <= 0.28:
                status = 2
            elif weibell_rank <= 0.71999:
                status = 3
            elif weibell_rank <= 0.86999:
                status = 4
            elif weibell_rank <= 1:
                status = 5
            return status


        for i in groupBy.index:
            groupBy.loc[i, 'flowcat'] = flow_status(groupBy.loc[i, 'weibell_rank'])
        #print("FINISHED STEP 4")
        """ STEP 5: WRITE DATA """
        groupBy['date'] = pd.to_datetime(groupBy[['year', 'month']].assign(DAY=1))
        groupBy['date'] = groupBy['date'].dt.strftime('%Y-%m-%d')
        groupBy['flowcat'] = groupBy['flowcat'].astype('Int64')
        return(groupBy.sort_values(['year', 'month']).filter(['date', 'flowcat']))

In [6]:
for linkno, file_path in linkno_files.items():
    print(linkno)
    df = ds['Qout'].sel(rivid=linkno).to_dataframe()
    df = df.reset_index().set_index('time').pivot(columns='rivid', values='Qout')
    df = df[(df.index > '1950-12-31')]
    df = df[df[linkno] >= 0]
    df.reset_index(inplace=True)
    gauge = pd.read_csv(file_path)
    gauge = gauge[gauge['Streamflow (m3/s)'] >= 0]
    gauge['Datetime'] = pd.to_datetime(gauge['Datetime'], errors='coerce')
    gauge.set_index('Datetime', inplace=True)
    gauge.reset_index(inplace=True)
    geoglows_sos = HydroSOS(df)

    #print("FINISHED GEOGLOWS")
    gauge_sos = HydroSOS(gauge)
    if gauge_sos is not None:
        print("NOT SKIPPED")



160221792
NOT SKIPPED
160266239



Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.



NOT SKIPPED
160212492
160168155
160213625
160184420
160191425
160128354
160504154
160528679
160622096
160590528
160536792
160596343
160608077
160553167


In [13]:
gauge_sos

In [12]:
geoglows_sos

Unnamed: 0,date,flowcat
0,1951-01-01,5
74,1951-02-01,3
148,1951-03-01,3
222,1951-04-01,1
296,1951-05-01,3
...,...,...
591,2024-08-01,1
665,2024-09-01,3
739,2024-10-01,3
813,2024-11-01,1


In [7]:
df.reset_index(inplace=True)

In [8]:
df.columns = ['date', 'flow']

ValueError: Length mismatch: Expected axis has 5 elements, new values have 2 elements

In [9]:
gauge.reset_index(inplace=True)

In [10]:
gauge.columns = ['date', 'flow']

ValueError: Length mismatch: Expected axis has 5 elements, new values have 2 elements

In [None]:
gauge

In [12]:

# STEP 1: CALCULATE MEAN MONTHLY FLOWS """

        # calculate percentage completeness for each year/month
groupBy = (gauge.groupby(['month', 'year']).count()['flow'] / gauge.groupby(['month', 'year']).count()[
            'date']) * 100
groupBy = pd.DataFrame(groupBy)
groupBy.rename(columns={0: 'monthly%'}, inplace=True)
        # calculate mean flows for each year/month
groupBy['mean_flow'] = gauge.groupby(['month', 'year'])['flow'].mean()
        # set the mean flow to NAN if there is less than 50 % data
        # groupBy.loc[groupBy['monthly%'] < 50,'mean_flow'] = pd.NA
groupBy.reset_index(inplace=True)
print("finished step 1")
#STEP 2: CALCULATE MEAN MONTHLY FLOWS AS A PERCENTAGE OF AVERAGE """

        # calculate long term average
LTA = groupBy[(groupBy['year'] >= stdStart) & (groupBy['year'] <= stdEnd)].groupby(['month'])[
            'mean_flow'].mean()
print(LTA)

finished step 1
Series([], Name: mean_flow, dtype: float64)


In [14]:
groupBy

Unnamed: 0,month,year,monthly%,mean_flow
0,1,1978,100.0,4.483871
1,1,1979,100.0,2.548387
2,1,1980,100.0,0.548387
3,2,1978,100.0,4.714286
4,2,1979,100.0,5.928571
5,2,1980,100.0,0.034483
6,3,1978,100.0,18.612903
7,3,1979,100.0,9.129032
8,3,1980,100.0,0.193548
9,4,1978,100.0,23.666667


In [None]:
print("hi")