In [None]:
import pandas as pd
import numpy as np

# Load the Thames Initiative export and show the available column names.
csv_path = '/content/Thames_Initiative_2009-2017.csv'
df = pd.read_csv(csv_path)
df.columns

Index(['Thames Initiative site code', 'Site name', 'Sampling Date',
       'Time of sampling', 'Temperature (oC)', 'Lab pH',
       'Gran alkalinity u eq/L', 'Suspended solids mg/L',
       'Soluble reactive phosphorus (ug/L)',
       'Total dissolved phosphorus (ug/L)', 'Total phosphorus (ug/L)',
       'Dissolved ammonium (NH4) (mg/l)',
       'Dissolved reactive silicon (mg Si/L) ', 'Chlorophyll-a (ug/L)',
       'Dissolved fluoride (mg F/L)', 'Dissolved chloride (mg Cl/L)',
       'Dissolved nitrite (mg NO2/L)', 'Dissolved nitrate (NO3)',
       'Dissolved sulphate (mg SO4/L)', 'Total dissolved nitrogen (mg N/L)',
       'Dissolved organic carbon (mg/L)', 'Field pH', 'Conductivity (uS/cm)',
       'Redox potential (Eh) (mV)', 'Dissolved B (ppb)', 'Dissolved Ca (ppm)',
       'Dissolved Fe (ug/l))', 'Dissolved K (mg/l)', 'Dissolved Mg (mg/l)',
       'Dissolved Na (mg/l)', 'Dissolved Mn (ug/l)', 'Dissolved Zn (ug/l)',
       'Dissolved Cu (ug/l)', 'Total Na (mg/l)', 'Total Ca (mg/l)

In [None]:
# Column names used throughout the pipeline.
site_column, date_column = 'Site name', 'Sampling Date'

# Measurement column to extract. Four concentrations have been tried here:
#   'Dissolved reactive silicon (mg Si/L) '
#   'Dissolved sulphate (mg SO4/L)'
#   'Dissolved nitrate (NO3)'
#   'Dissolved chloride (mg Cl/L)'
silicon_column = 'Dissolved reactive silicon (mg Si/L) '

# Parse day/month/year sampling dates and derive the calendar year.
# Both assignments modify `df` in place.
df[date_column] = pd.to_datetime(df[date_column], format='%d/%m/%Y')
df['Year'] = df[date_column].dt.year

# Keep rows sampled in 2010..2017 (both ends inclusive).
# NOTE(review): `years` below stops at 2016, so 2017 rows only influence
# which names end up in `sites` -- confirm this is intended.
in_study_window = (df['Year'] >= 2010) & (df['Year'] <= 2017)
df_filtered = df[in_study_window]

site_year_groups = df_filtered.groupby([site_column, 'Year'])
sites = df_filtered[site_column].unique()
years = [*range(2010, 2017)]

# Number of values stored per (site, year) series.
fixed_size = 49

def clean_data(data):
    """Convert raw measurement values to a 1-D float array.

    Strings have any '<' characters removed and surrounding whitespace
    stripped before conversion (e.g. '<0.5' becomes 0.5). Every other value
    is passed to ``float`` directly.

    Parameters
    ----------
    data : iterable
        Raw values; may mix numbers, strings and missing markers.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per input value. Entries that cannot be
        converted (unparseable strings, ``None``, other non-numeric objects)
        are ``np.nan``.
    """
    cleaned_data = []
    for val in data:
        try:
            if isinstance(val, str):
                cleaned_val = float(val.replace('<', '').strip())
            else:
                cleaned_val = float(val)
        except (TypeError, ValueError):
            # float(None) and similar raise TypeError rather than ValueError;
            # treat both as "not a usable measurement".
            cleaned_val = np.nan
        cleaned_data.append(cleaned_val)
    # Explicit dtype keeps the result float64 even for empty input.
    return np.array(cleaned_data, dtype=float)

# Build one fixed-length series per (site, year).
#   - no rows for that site/year   -> `fixed_size` NaNs
#   - fewer than `fixed_size` rows -> padded at the end with the series mean
#   - more than `fixed_size` rows  -> truncated to the first `fixed_size`
# A site/year whose readings are all unusable is padded with NaN rather than
# 0, so it is not mistaken for real zero concentrations and is treated the
# same way as a site/year with no rows at all.
result_dict = {}
existing_groups = site_year_groups.groups
for site in sites:
    for year in years:
        if (site, year) not in existing_groups:
            result_dict[(site, year)] = np.full(fixed_size, np.nan)
            continue

        silicon_data = site_year_groups.get_group((site, year))[silicon_column].dropna().values
        cleaned_data = clean_data(silicon_data)

        # Only average when at least one usable value exists; this avoids
        # both the fabricated 0 and numpy's "Mean of empty slice" warning.
        if (~np.isnan(cleaned_data)).any():
            mean_value = np.nanmean(cleaned_data)
        else:
            mean_value = np.nan

        if len(cleaned_data) < fixed_size:
            padded_data = np.pad(
                cleaned_data,
                (0, fixed_size - len(cleaned_data)),
                'constant',
                constant_values=mean_value,
            )
        else:
            padded_data = cleaned_data[:fixed_size]
        result_dict[(site, year)] = padded_data

In [None]:
# Count the distinct site names among the (site, year) keys of result_dict.
# A and B are computed once, so B is always defined (an empty dict gives 0)
# and the set is not rebuilt on every iteration.
A = [key[0] for key in result_dict]
B = list(set(A))
print(len(B))

24


In [None]:
# Move the 2014-2016 series stored under the site name with trailing spaces
# ('Thames at Sonning  ') onto the clean name 'Thames at Sonning'.
# The membership guard makes this cell safe to re-run: once a key has been
# moved, a second run skips it instead of raising KeyError.
# NOTE(review): any existing ('Thames at Sonning', year) entry for these
# years is overwritten -- confirm that entry never holds real data.
for sonning_year in (2014, 2015, 2016):
    old_key = ('Thames at Sonning  ', sonning_year)
    if old_key in result_dict:
        result_dict[('Thames at Sonning', sonning_year)] = result_dict.pop(old_key)

# (site, year) entries removed from result_dict; same keys and order as the
# previous hand-written list.
keys_to_delete = (
    [('Jubilee River at Datchet', y) for y in range(2010, 2017)]
    + [('Wye at Bourne End', y) for y in range(2010, 2018)]
    + [('Thames at Sonning  ', y) for y in range(2010, 2014)]
    + [('Colne at Staines', y) for y in range(2010, 2017)]
)
for key in keys_to_delete:
    # pop with a default: keys that are absent are skipped silently.
    result_dict.pop(key, None)

In [None]:
# Sites to keep, in the order they should appear downstream.
custom_site_order = [
    'Cherwell at Hampton Poyle',
    'Cole at Lynt Bridge',
    'Coln at Whelford',
    'The Cut at Paley Street',
    'Enborne at Brimpton',
    'Evenlode at Cassington Mill',
    'Leach at Lechlade',
    'Lodden at Charvil',
    'Ock at Abingdon',
    'Pang at Tidmarsh',
    'Ray at Islip',
    'Thames at Hannington',
    'Thame at Wheatley',
    'Windrush at Newbridge',
    'Kennet at Woolhampton',
    'Thames at Newbridge',
    'Thames at Swinford',
    'Thames at Wallingford',
    'Thames at Sonning',
    'Thames at Runnymede'
]

# Keep only the listed sites for 2010..2016 and replace every NaN in a
# series with the mean of that series' non-NaN values.
filtered_site_data = {}
for site_year, series in result_dict.items():
    site_name, sample_year = site_year
    if site_name not in custom_site_order:
        continue
    if not (2010 <= sample_year <= 2016):
        continue
    series_mean = np.nanmean(series)
    filtered_site_data[site_year] = np.where(np.isnan(series), series_mean, series)

# Re-insert the entries following custom_site_order, years ascending within
# each site, so iteration order matches the custom site order.
sorted_filtered_data = {}
for site_name in custom_site_order:
    for sample_year in range(2010, 2017):
        site_year = (site_name, sample_year)
        if site_year in filtered_site_data:
            sorted_filtered_data[site_year] = filtered_site_data[site_year]

In [None]:
# Count the distinct site names among the keys of sorted_filtered_data.
# B is computed here unconditionally, so an empty dict prints 0 instead of
# reusing a B value left over from an earlier cell.
A = [key[0] for key in sorted_filtered_data]
B = list(set(A))
print(len(B))

20


In [None]:
# One row per (site, year): the key tuple becomes the 'Site' and 'Year'
# columns and the whole measurement array is stored in 'Values'.
site_year_keys = list(sorted_filtered_data.keys())
df1 = pd.DataFrame(site_year_keys, columns=['Site', 'Year'])
df1['Values'] = list(sorted_filtered_data.values())

In [None]:
import math

# Print, for each row of df1, whether its 'Values' sequence holds a NaN.
# Only elements that are float instances are inspected.
for row_values in df1['Values']:
    contains_nan = False
    for x in row_values:
        if isinstance(x, float) and math.isnan(x):
            contains_nan = True
            break
    print(contains_nan)

In [None]:
# Show the rows of df1 for 2010.
df1.loc[df1['Year'] == 2010]

Unnamed: 0,Site,Year,Values
0,Cherwell at Hampton Poyle,2010,"[3.77, 3.69, 4.0, 4.4, 3.88, 3.51, 3.33, 3.16,..."
7,Cole at Lynt Bridge,2010,"[7.59, 7.78, 6.79, 7.57, 7.99, 7.8, 7.61, 5.58..."
14,Coln at Whelford,2010,"[2.57, 2.62, 2.66, 2.85, 2.7, 2.62, 2.48, 2.4,..."
21,The Cut at Paley Street,2010,"[5.79, 6.12, 5.38, 6.03, 6.14, 5.96, 6.18, 4.0..."
28,Enborne at Brimpton,2010,"[5.88, 6.55, 4.88, 5.65, 6.59, 6.09, 6.5, 4.98..."
35,Evenlode at Cassington Mill,2010,"[2.95, 2.9, 3.28, 3.22, 3.0, 2.79, 2.6, 2.52, ..."
42,Leach at Lechlade,2010,"[2.33, 2.42, 2.26, 2.23, 1.99, 2.25, 2.21, 2.2..."
49,Lodden at Charvil,2010,"[5.62, 5.72, 4.35, 5.66, 5.8, 5.5, 5.52, 4.72,..."
56,Ock at Abingdon,2010,"[6.98, 7.12, 6.87, 7.37, 7.49, 7.27, 7.07, 6.4..."
63,Pang at Tidmarsh,2010,"[6.79, 6.95, 5.98, 6.82, 7.08, 6.96, 6.85, 6.9..."


In [None]:
import numpy as np
import pandas as pd

# Assemble a (time, site, 1) array: each year occupies a contiguous block of
# `weeks_per_year` rows and each site one column.
weeks_per_year = 49

year_values = sorted(int(y) for y in df1['Year'].unique())
site_names = list(df1['Site'].unique())  # order of first appearance in df1
years_range = len(year_values)
num_sites = len(site_names)

# Slots with no matching (site, year) row in df1 keep this initial 0.0.
tensor_data = np.zeros((weeks_per_year * years_range, num_sites, 1))

# Index by the actual year and site instead of by row position, so a site
# that is missing a year cannot shift its later years into the wrong block.
year_to_slot = {year: slot for slot, year in enumerate(year_values)}
site_to_column = {site: column for column, site in enumerate(site_names)}

for site, year, measurements in zip(df1['Site'], df1['Year'], df1['Values']):
    if len(measurements) != weeks_per_year:
        raise ValueError(
            f"Site {site} Year {year} does not have exactly {weeks_per_year} measurements"
        )

    start_index = year_to_slot[int(year)] * weeks_per_year
    tensor_data[start_index:start_index + weeks_per_year, site_to_column[site], 0] = measurements
print(f"Tensor shape: {tensor_data.shape}")


Tensor shape: (343, 20, 1)


In [None]:
# Short codes, one per site (20 entries).
# NOTE(review): presumably listed in the same order as custom_site_order -- confirm.
nodes = [
    'CH', 'CL', 'CN', 'CU', 'EN',
    'EV', 'LE', 'LO', 'OC', 'PA',
    'RA', 'TH', 'TM', 'WI', 'KE',
    'TN', 'TS', 'TW', 'TSO', 'TR',
]


In [None]:
# Show the rows of df1 for 2016.
df1.loc[df1['Year'] == 2016]

Unnamed: 0,Site,Year,Values
6,Cherwell at Hampton Poyle,2016,"[4.59, 4.14, 4.06, 3.91, 3.74, 3.51, 3.72, 3.5..."
13,Cole at Lynt Bridge,2016,"[6.48, 5.81, 8.31, 8.07, 7.87, 6.18, 7.89, 7.6..."
20,Coln at Whelford,2016,"[3.35, 3.07, 2.96, 2.82, 2.71, 2.76, 2.41, 2.5..."
27,The Cut at Paley Street,2016,"[5.42, 4.34, 6.25, 6.38, 6.23, 4.69, 5.93, 6.1..."
34,Enborne at Brimpton,2016,"[5.15, 4.0, 6.62, 6.33, 5.86, 4.39, 6.08, 6.06..."
41,Evenlode at Cassington Mill,2016,"[3.73, 3.37, 3.22, 3.17, 3.16, 3.04, 2.82, 2.6..."
48,Leach at Lechlade,2016,"[3.21, 2.65, 2.51, 2.36, 2.08, 2.23, 1.53, 2.1..."
55,Lodden at Charvil,2016,"[4.96, 5.08, 5.94, 5.9, 5.77, 4.86, 5.38, 5.6,..."
62,Ock at Abingdon,2016,"[7.05, 6.67, 7.31, 7.42, 7.44, 6.37, 7.44, 7.3..."
69,Pang at Tidmarsh,2016,"[6.59, 4.96, 7.3, 7.38, 7.09, 6.39, 7.05, 6.97..."


In [None]:
# First time step of the tensor: one value per site.
tensor_data[0, :, :]

array([[3.77],
       [7.59],
       [2.57],
       [5.79],
       [5.88],
       [2.95],
       [2.33],
       [5.62],
       [6.98],
       [6.79],
       [3.96],
       [2.72],
       [6.29],
       [2.68],
       [7.06],
       [3.17],
       [3.05],
       [4.23],
       [4.86],
       [4.91]])

In [None]:
# Write the assembled tensor to disk in NumPy .npy format.
# NOTE(review): the file name is fixed to 'silicon.npy' even when
# `silicon_column` points at another measurement -- rename when switching.
np.save('silicon.npy', arr=tensor_data)

1 to 100 of 9130 entries
Filter

Thames Initiative site code	Site name	Sampling Date	Time of sampling	Temperature (oC)	Lab pH	Gran alkalinity u eq/L	Suspended solids mg/L	Soluble reactive phosphorus (ug/L)	Total dissolved phosphorus (ug/L)	Total phosphorus (ug/L)	Dissolved ammonium (NH4) (mg/l)	Dissolved reactive silicon (mg Si/L)	Chlorophyll-a (ug/L)	Dissolved fluoride (mg F/L)	Dissolved chloride (mg Cl/L)	Dissolved nitrite (mg NO2/L)	Dissolved nitrate (NO3)	Dissolved sulphate (mg SO4/L)