In [None]:
%reload_ext autoreload
%autoreload 2

import pandas as pd
import json
from hashlib import md5
import pickle

import prepare_data
from components import PCOA
#from helpers import PipelineHelper, SMWrapper
from settings import Config, shortnames, target, featurelist
from cv import generate_feature_sets

## Data preparation

In [None]:
#%%capture
# Optional cell magic to suppress output: uncomment the line above to silence this cell,
# keep it commented out to see the output.

# Upstream steps (DB extract and blank procedure) are already done; load the resulting MP data.
mp_pdd = prepare_data.get_pdd()

# Sediment data: only the first two results of get_grainsizes() are used here
# (sediment frequencies per size bin from the master sizer export, per the original note).
grainsize_iow, grainsize_cau = prepare_data.get_grainsizes()[:2]

# First element of the PCOA result for the IOW grain sizes (called with 2 as second argument).
scor_iow = PCOA(grainsize_iow, 2)[0]

# Build the sample domain data in one pass: aggregate the MP data, apply the additional
# merging step, attach the PCOA output on 'Sample', substitute short sample names and
# sort by sample.
sdd_iow = (
    prepare_data.additional_sdd_merging(
        prepare_data.aggregate_SDD(mp_pdd),
        how='outer',
    )
    .merge(scor_iow, right_index=True, left_on='Sample', how='outer')
    .replace({'Sample': shortnames})
    .sort_values(by='Sample')
)

In [None]:
## Split data: rows with a non-missing target value are the samples used for building the model.
## (Only the model subset is created in this cell; a prediction subset is not built here.)

has_target = sdd_iow[target].notna()
model_data = sdd_iow.loc[has_target].set_index('Sample')

# Feature matrix restricted to the columns named in featurelist.
model_X = model_data[featurelist]

In [None]:
# Workaround for creating long candidate lists faster:
# generate the feature sets with a single 'WWTP' placeholder standing in for every
# WWTP-related feature, then expand each set containing the placeholder into one copy
# per actual WWTP feature.

# Every feature whose name contains 'WWTP' is collapsed into the placeholder 'WWTP'.
featurelist_simplified = [name for name in featurelist if 'WWTP' not in name]
featurelist_simplified.append('WWTP')

min_num, max_num = 5, 5
feature_candidates_list_simpl = generate_feature_sets(
    featurelist_simplified,
    Config.mutual_exclusive,
    Config.exclusive_keywords,
    num_feat=(min_num, max_num),
    n_jobs=1,
    save=False,
)

# The real features that the placeholder stands for, in featurelist order.
wwtp_features = [name for name in featurelist if 'WWTP' in name]

# Sets without the placeholder are carried over unchanged and come first in the result.
new_feature_candidates_list = [
    candidate for candidate in feature_candidates_list_simpl if 'WWTP' not in candidate
]

for candidate in feature_candidates_list_simpl:
    if 'WWTP' not in candidate:
        continue
    # Position of the first entry containing 'WWTP'; every expanded copy is edited there.
    slot = next(pos for pos, name in enumerate(candidate) if 'WWTP' in name)
    for wwtp_feature in wwtp_features:
        expanded = candidate.copy()
        expanded[slot] = wwtp_feature
        new_feature_candidates_list.append(expanded)

# Tag the export with the last five hex digits of the MD5 of the JSON-serialised featurelist,
# so the file name records which featurelist the candidates were generated from.
featurelist_json = json.dumps(featurelist, sort_keys=True)
md5_tail = md5(featurelist_json.encode('utf-8')).hexdigest()[-5:]

export_path = f'../data/exports/feature_candidates_list_min{min_num}_max{max_num}_HASH{md5_tail}.pkl'
with open(export_path, 'wb') as f:
    pickle.dump(new_feature_candidates_list, f)

In [None]:
# Display how many candidate feature sets are in the expanded list.
len(new_feature_candidates_list)