K-means clustering

In [4]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from ema_workbench import (Policy, ema_logging, MultiprocessingEvaluator, Samplers)
from problem_formulation import get_model_for_problem_formulation

# Setup logging
ema_logging.log_to_stderr(ema_logging.INFO)

# Load the model
dike_model, planning_steps = get_model_for_problem_formulation(6)

# REPLACE POLICY BELOW WITH OWN POLICY ???
# Define the zero policy
zero_policy = {"DaysToThreat": 0}
zero_policy.update({f"DikeIncrease {n}": 0 for n in planning_steps})
zero_policy.update({f"RfR {n}": 0 for n in planning_steps})

pol0 = {}
for key in dike_model.levers:
    s1, s2 = key.name.split("_")
    pol0.update({key.name: zero_policy[s2]})
policy0 = Policy("Policy 0", **pol0)



# Perform experiments
with MultiprocessingEvaluator(dike_model) as evaluator:
    results = evaluator.perform_experiments(scenarios=1000, policies=policy0, uncertainty_sampling=Samplers.LHS)
exp, outcomes = results

# Convert the experimental results into a DataFrame
exp_df = pd.DataFrame(exp)

# Extract uncertainty variables
uncertainty_vars = [var.name for var in dike_model.uncertainties]
uncertainty_data = exp_df[uncertainty_vars].values

# Normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
normalized_uncertainty_data = scaler.fit_transform(uncertainty_data)

# Define the number of clusters
n_clusters = 5  # Adjust the number of clusters based on your needs

# Perform K-means clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(normalized_uncertainty_data)
labels = kmeans.labels_

# Select a representative scenario from each cluster
representative_scenarios = []
for i in range(n_clusters):
    cluster_indices = np.where(labels == i)[0]
    cluster_center = kmeans.cluster_centers_[i]
    # Find the scenario closest to the cluster center
    distances = np.linalg.norm(normalized_uncertainty_data[cluster_indices] - cluster_center, axis=1)
    representative_index = cluster_indices[np.argmin(distances)]
    representative_scenarios.append(representative_index)

# Print the selected representative scenarios
print("Selected representative scenarios based on uncertainty variables:")
print(representative_scenarios)


[MainProcess/INFO] pool started with 12 workers
[MainProcess/INFO] performing 1000 scenarios * 1 policies * 1 model(s) = 1000 experiments
100%|██████████████████████████████████████| 1000/1000 [01:31<00:00, 10.95it/s]
[MainProcess/INFO] experiments finished
[MainProcess/INFO] terminating pool


Selected representative scenarios based on uncertainty variables:
[119, 667, 882, 37, 872]


In [3]:
exp_df.loc[[37, 60, 5, 28, 36]]

Unnamed: 0,A.0_ID flood wave shape,A.1_Bmax,A.1_Brate,A.1_pfail,A.2_Bmax,A.2_Brate,A.2_pfail,A.3_Bmax,A.3_Brate,A.3_pfail,...,A.3_DikeIncrease 2,A.4_DikeIncrease 0,A.4_DikeIncrease 1,A.4_DikeIncrease 2,A.5_DikeIncrease 0,A.5_DikeIncrease 1,A.5_DikeIncrease 2,scenario,policy,model
37,80,146.814252,1.0,0.664392,172.799984,1.0,0.331798,327.243063,10.0,0.656563,...,0,0,0,0,0,0,0,37,Policy 0,dikesnet
60,48,82.432006,1.5,0.689514,274.825576,1.0,0.52184,303.280548,1.0,0.075431,...,0,0,0,0,0,0,0,60,Policy 0,dikesnet
5,61,272.948683,1.0,0.67105,149.116912,1.5,0.972173,73.335783,10.0,0.487231,...,0,0,0,0,0,0,0,5,Policy 0,dikesnet
28,96,222.784445,10.0,0.892629,262.315396,1.0,0.471709,40.114086,1.5,0.511433,...,0,0,0,0,0,0,0,28,Policy 0,dikesnet
36,47,249.742751,1.5,0.376924,215.018222,1.0,0.957129,170.275372,1.0,0.897431,...,0,0,0,0,0,0,0,36,Policy 0,dikesnet


Eker & Kwakkel --> Distance

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from SALib.analyze import sobol
import warnings
from ema_workbench.analysis import feature_scoring
from ema_workbench.em_framework.salib_samplers import get_SALib_problem
from ema_workbench import (
    Model,
    Policy,
    ema_logging,
    SequentialEvaluator,
    MultiprocessingEvaluator,
    perform_experiments,
    Samplers,
    SequentialEvaluator,
)
import time
import itertools
import multiprocessing
from ema_workbench.em_framework import sample_uncertainties
# from dike_model_function import DikeNetwork 
from problem_formulation import get_model_for_problem_formulation, sum_over, sum_over_time
from ema_workbench.analysis import prim
from ema_workbench.em_framework.parameters import Constant

warnings.filterwarnings("ignore")

In [2]:
def normalize_out_dic(outcomes):
    norm_outcomes = {}
    for ooi in outcomes.keys():
        data = outcomes[ooi]
        mx = max(data)
        mn = min(data)
        if mx == mn:
            norm_outcomes[ooi] = data - mn
        else:
            norm_outcomes[ooi] = (data - mn)/(mx-mn)
    return norm_outcomes

In [3]:
dike_model, planning_steps = get_model_for_problem_formulation(6)

zero_policy = {"DaysToThreat": 0}
zero_policy.update({f"DikeIncrease {n}": 0 for n in planning_steps})
zero_policy.update({f"RfR {n}": 0 for n in planning_steps})

pol0 = {}
for key in dike_model.levers:
    s1, s2 = key.name.split("_")
    pol0.update({key.name: zero_policy[s2]})
policy0 = Policy("Policy 0", **pol0)

with MultiprocessingEvaluator(dike_model) as evaluator:
    results = evaluator.perform_experiments(scenarios=100,policies=policy0,uncertainty_sampling=Samplers.LHS)
exp, outcomes = results   
norm_new_out = normalize_out_dic(outcomes)
oois = list(outcomes.keys())

100%|████████████████████████████████████████| 100/100 [00:09<00:00, 10.28it/s]


In [4]:
def calculate_distance(data, oois, scenarios=None, distance='euclidean'):
    '''data is the outcomes of exploration results,
    scenarios is a list of scenario indices (decision variables), 
    oois is a list of variable names,
    distance is to choose the distance metric. options:
            bray-curtis, canberra, chebyshev, cityblock (manhattan), correlation, 
            cosine, euclidian, mahalanobis, minkowski, seuclidian,
            sqeuclidian, wminkowski
    returns a list of distance values
    '''
    #make a matrix of the data n_scenarios x oois
    scenario_data = np.zeros((len(scenarios), len(oois)))
    for i, s in enumerate(scenarios):
        for j, ooi in enumerate(oois):
            scenario_data[i][j] = data[ooi][s]
                
    distances = pdist(scenario_data, distance)
    return distances


In [5]:
def evaluate_diversity_single(x, data=norm_new_out, oois=oois, weight=0.5, distance='euclidean'):
    '''
    takes the outcomes and selected scenario set (decision variables), 
    returns a single 'diversity' value for the scenario set.
    outcomes : outcomes dictionary of the scenario ensemble
    decision vars : indices of the scenario set
    weight : weight given to the mean in the diversity metric. If 0, only minimum; if 1, only mean
    '''
    distances = calculate_distance(data, oois, list(x), distance)
    minimum = np.min(distances)
    mean = np.mean(distances)
    diversity = (1-weight)*minimum + weight*mean
    
    return [diversity]

In [6]:
def find_maxdiverse_scenarios(combinations):
    diversity = 0.0
    solutions = []
    for sc_set in combinations:
        temp_div = evaluate_diversity_single(list(sc_set))
        if temp_div[0] > diversity:
            diversity = temp_div[0]
            solutions = []
            solutions.append(sc_set)
        elif temp_div[0] == diversity:
            solutions.append(sc_set)
    #print("found diversity ", diversity)
    return diversity, solutions

In [7]:
# n_scen = 5
# indices = range(n_scen)
# set_size = 2
# combinations = itertools.combinations(indices, set_size)
# combinations = list(combinations)
# print(len(combinations))

In [None]:
# n_scenarios = 100000
# scenarios = sample_uncertainties(dike_model, n_scenarios)

n_scen = 3
indices = range(n_scen)
set_size = 2
combinations = itertools.combinations(indices, set_size)
combinations = list(combinations)

print(1)
# import random
# 
# combinations = random.sample(combinations, 100000)

no_workers = multiprocessing.cpu_count()
pool = multiprocessing.Pool(processes=no_workers)
print(2)
with open('output_scenarioselection.txt', 'a') as file:
    print(3)
    start_time = time.time()
    #now, divide this data for each worker
    worker_data = np.array_split(combinations, no_workers)
    print(4) 
    result = pool.imap(find_maxdiverse_scenarios, worker_data)
    print(result)        
    #find the max of these 8 
    max_diversity = 0.0
    for r in result:
        print("result : ", r)
        if r[0] >= max_diversity:
            max_diversity = r[0]
            solutions = []
            solutions.append(r[1])
        elif r[0] == max_diversity:
            solutions.append(r[1])                  

    end_time = time.time()
    file.write("Calculations took {} seconds.\n".format(end_time-start_time))
    print("Calculations took {} seconds.\n".format(end_time-start_time))
    file.write("maximum diversity and solutions: {}, {} \n\n".format(max_diversity, solutions))
    print("maximum diversity and solutions: {}, {} \n\n".format(max_diversity, solutions))


file.close()
    
pool.close()
pool.join()

1
2
3
4
<multiprocessing.pool.IMapIterator object at 0x000002A3776AFD10>
