In [1]:
import pickle as pkl

import pandas as pd
import torch

In [2]:
# Both project CSV files are semicolon-delimited.
diatoms_csv = 'DataDiatomGNN_GTstudentprojectGT/DiatomInventories_GTstudentproject.csv'
pressure_csv = 'DataDiatomGNN_GTstudentprojectGT/PressureStatus_GTstudentproject.csv'

# `diatoms`: taxon inventory rows; `info`: pressure statuses per sampling operation.
diatoms = pd.read_csv(diatoms_csv, sep=';')
info = pd.read_csv(pressure_csv, sep=';')

# Preview the first rows of the pressure-status table.
info.head()


Unnamed: 0,SamplingOperations_code,CodeSite_SamplingOperations,Date_SamplingOperation,Nitrogencompounds_Status1Y,Nitrogencompounds_Status180D,Nitrogencompounds_Status90D,Nitrates_Status1Y,Nitrates_Status180D,Nitrates_Status90D,Phosphorouscompounds_Status1Y,...,OrganicMatter_Status90D,SuspendedMatter_Status1Y,SuspendedMatter_Status180D,SuspendedMatter_Status90D,OrganicMicropollutants_Status1Y,OrganicMicropollutants_Status180D,OrganicMicropollutants_Status90D,MineralMicropollutants_Status1Y,MineralMicropollutants_Status180D,MineralMicropollutants_Status90D
0,S02000008_20170703,S02000008,2017-07-03,Good,Good,Good,Moderate,Moderate,Moderate,Good,...,Bad,High,High,High,Good,Good,Good,Good,Good,Good
1,S02000008_20200708,S02000008,2020-07-08,Good,Good,Good,Moderate,Moderate,Moderate,Moderate,...,Bad,High,High,High,Good,Good,Good,Good,Good,Good
2,S02000010_20070906,S02000010,2007-09-06,Good,Good,Good,Good,Good,Good,High,...,Bad,High,High,High,Moderate,Moderate,Moderate,Good,Good,Good
3,S02000010_20080811,S02000010,2008-08-11,Good,Good,Good,Good,Good,Good,High,...,Bad,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate
4,S02000010_20090721,S02000010,2009-07-21,Good,Good,Good,Good,Good,Good,Good,...,Bad,High,High,High,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate


In [3]:
# Give every distinct TaxonCode an integer id and store it in the 'onehot' column.
# The column holds the category code itself (not a one-hot vector); later cells use it
# as the position of the taxon inside the per-sample vectors.
# NOTE(review): pandas assigns code -1 to missing values - confirm TaxonCode has no NaN.
taxon_categories = diatoms['TaxonCode'].astype('category')
diatoms['onehot'] = taxon_categories.cat.codes

### Create `diatoms_per_sampling_operation`, which maps each sampling operation code to a DataFrame of the integer taxon codes (`onehot`) found in that sample and their abundances (`Abundance_pm`).

In [4]:
# For each sampling operation listed in `info`, collect the ('onehot', 'Abundance_pm')
# rows of `diatoms` that belong to it.
#
# The rows are grouped once instead of filtering the whole `diatoms` frame for every
# sampling operation, which would rescan the full inventory on each iteration.
abundance_columns = ['onehot', 'Abundance_pm']
diatom_abundances = diatoms[abundance_columns]

# Independent copies, so later cells can modify a frame without touching `diatoms`.
# Row order and the original index are preserved inside each group.
rows_by_operation = {
    code: rows.copy()
    for code, rows in diatom_abundances.groupby(diatoms['SamplingOperations_code'], sort=False)
}

# Same columns and dtypes but no rows: used for operations without any diatom record,
# which is what the boolean filter `diatoms[... == sampling_operation]` would produce.
no_rows = diatom_abundances.iloc[:0]

sampling_operations = info['SamplingOperations_code']
diatoms_per_sampling_operation = {
    sampling_operation: (
        rows_by_operation[sampling_operation]
        if sampling_operation in rows_by_operation
        else no_rows.copy()
    )
    for sampling_operation in sampling_operations
}

In [5]:
# Convert 'Abundance_pm' from strings that use a decimal comma to floats, one frame per
# sampling operation, then cache the resulting dictionary on disk.
for key, frame in diatoms_per_sampling_operation.items():
    abundance = frame['Abundance_pm']
    if pd.api.types.is_numeric_dtype(abundance):
        # The cell was already run for this frame: nothing left to convert.
        print(key, 'already processed')
        continue
    # Check the dtype explicitly instead of catching AttributeError: a missing value
    # (a float NaN) inside a string column would also raise AttributeError and leave the
    # whole column unconverted while being reported as "already processed".
    converted = abundance.astype(str).str.replace(',', '.', regex=False).astype(float)
    # Store a new frame rather than assigning a column on a frame that may be a slice
    # of `diatoms`.
    diatoms_per_sampling_operation[key] = frame.assign(Abundance_pm=converted)

with open('diatoms_per_sampling_operation.pkl', 'wb') as f:
    pkl.dump(diatoms_per_sampling_operation, f)


In [6]:
# Reload the cached dictionary written by the previous cell.
# The context manager closes the file handle, which a bare `open(...)` passed to
# `pkl.load` never does.
# NOTE: only unpickle files produced by this notebook; pickle can execute arbitrary
# code when loading untrusted data.
with open('diatoms_per_sampling_operation.pkl', 'rb') as f:
    diatoms_per_sampling_operation = pkl.load(f)

In [7]:
# Sanity check: show the taxon codes and abundances stored for one sampling operation.
example_diatoms = diatoms_per_sampling_operation['S02000008_20170703']
print(example_diatoms)

         onehot  Abundance_pm
32318        69      7.407407
68603        85     88.888889
129441      124      2.469136
194448      176      4.938272
222251      191    187.654321
274455      248      2.469136
292708      294     44.444444
646505      919      4.938272
770249     1137      2.469136
792780     1149      2.469136
837637     1202     32.098765
875360     1203      4.938272
973744     1384    261.728395
1032167    1432    283.950617
1079745    1477     19.753086
1306969    1720      2.469136
1364330    1934      7.407407
1449906    2033      7.407407
1488642    2065     14.814815
1612890    2285     17.283951


### Create `sampling_op_to_tensor`, which maps each sampling operation to torch tensors representing the model inputs (the one-hot encoding scaled by abundance, and the plain one-hot encoding) and the output (the class label for each pressure status).

In [8]:
# Build {sampling operation code -> {column name -> value}} from the pressure-status CSV.
# Only the first row is kept when a sampling operation code appears more than once.
pressures_per_sampling_operation = (
    pd.read_csv('DataDiatomGNN_GTstudentprojectGT/PressureStatus_GTstudentproject.csv', sep=';')
    .drop_duplicates(subset=['SamplingOperations_code'])
    .set_index('SamplingOperations_code')
    .to_dict(orient='index')
)

In [9]:
# Sanity check: show the pressure-status record stored for one sampling operation.
example_pressures = pressures_per_sampling_operation['S02000008_20170703']
print(example_pressures)

{'CodeSite_SamplingOperations': 'S02000008', 'Date_SamplingOperation': '2017-07-03', 'Nitrogencompounds_Status1Y': 'Good', 'Nitrogencompounds_Status180D': 'Good', 'Nitrogencompounds_Status90D': 'Good', 'Nitrates_Status1Y': 'Moderate', 'Nitrates_Status180D': 'Moderate', 'Nitrates_Status90D': 'Moderate', 'Phosphorouscompounds_Status1Y': 'Good', 'Phosphorouscompounds_Status180D': 'Good', 'Phosphorouscompounds_Status90D': 'Moderate', 'Acidification_Status1Y': 'High', 'Acidification_Status180D': 'High', 'Acidification_Status90D': 'High', 'PAH_Status1Y': 'Unassessed', 'PAH_Status180D': 'Unassessed', 'PAH_Status90D': 'Unassessed', 'OrganicMatter_Status1Y': 'Bad', 'OrganicMatter_Status180D': 'Bad', 'OrganicMatter_Status90D': 'Bad', 'SuspendedMatter_Status1Y': 'High', 'SuspendedMatter_Status180D': 'High', 'SuspendedMatter_Status90D': 'High', 'OrganicMicropollutants_Status1Y': 'Good', 'OrganicMicropollutants_Status180D': 'Good', 'OrganicMicropollutants_Status90D': 'Good', 'MineralMicropollutants

In [10]:
# Build, for every sampling operation, the tuple
#   (abundance-scaled taxon vector, binary taxon vector, label tensor)
# and cache the resulting dictionary on disk.

# Target columns, in the order used for the label tensor.
valid_ys = [
    "Nitrogencompounds_Status1Y", "Nitrogencompounds_Status180D", "Nitrogencompounds_Status90D",
    "Nitrates_Status1Y", "Nitrates_Status180D", "Nitrates_Status90D",
    "Phosphorouscompounds_Status1Y", "Phosphorouscompounds_Status180D", "Phosphorouscompounds_Status90D",
    "Acidification_Status1Y", "Acidification_Status180D", "Acidification_Status90D",
    "PAH_Status1Y", "PAH_Status180D", "PAH_Status90D",
    "OrganicMatter_Status1Y", "OrganicMatter_Status180D", "OrganicMatter_Status90D",
    "SuspendedMatter_Status1Y", "SuspendedMatter_Status180D", "SuspendedMatter_Status90D",
    "OrganicMicropollutants_Status1Y", "OrganicMicropollutants_Status180D", "OrganicMicropollutants_Status90D",
    "MineralMicropollutants_Status1Y", "MineralMicropollutants_Status180D", "MineralMicropollutants_Status90D",
]
# Status label -> integer class; "Unassessed" is encoded as -1.
y_map = {"High": 0, "Good": 1, "Moderate": 2, "Poor": 3, "Bad": 4, "Unassessed": -1}

# Length of the taxon vectors. Computed once: it does not depend on the sampling
# operation, so there is no reason to rescan `diatoms` inside the loop.
n_taxa = int(diatoms['onehot'].max()) + 1

sampling_op_to_tensor = {}
for key, frame in diatoms_per_sampling_operation.items():
    # Positions of the taxa present in this sample; an explicit long tensor also
    # covers samples without any taxon (empty index).
    taxon_idx = torch.tensor(frame['onehot'].to_list(), dtype=torch.long)

    scaled_onehot = torch.zeros(n_taxa)
    scaled_onehot[taxon_idx] = torch.tensor(frame['Abundance_pm'].to_list())

    one_hot = torch.zeros(n_taxa)
    one_hot[taxon_idx] = 1

    # One integer class per target column, in `valid_ys` order.
    statuses = pressures_per_sampling_operation[key]
    ys = torch.tensor([y_map[statuses[y]] for y in valid_ys])

    sampling_op_to_tensor[key] = (scaled_onehot, one_hot, ys)

with open('sampling_op_to_tensor.pkl', 'wb') as f:
    pkl.dump(sampling_op_to_tensor, f)