In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans,DBSCAN,AgglomerativeClustering
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn_extra.cluster import KMedoids

from sklearn.manifold import TSNE
import umap

import sys
import cluster_validation_metrics as cvm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import glob
import seaborn as sns

import sweetviz as sv

import os
import kahypar as kahypar
from scipy.sparse import csr_matrix

## Data
- Onshore wind farm in Scotland: 14 wind turbines with 2 MW rated power
- 5 years of SCADA data with 10 minutes sampling rate


In [None]:
# Folder holding the per-turbine Penmanshiel SCADA and log CSV files.
data_path = '../app/data/Penmanshiel/'

In [None]:
def read_data_penmanshiel(turbine_number, base_path=None):
    """Load the SCADA and log CSV files of one turbine.

    Parameters
    ----------
    turbine_number : int
        Turbine id; it is zero-padded to two digits in the file names
        (``scada_T01.csv`` / ``logs_T01.csv``).
    base_path : str or path-like, optional
        Folder containing the CSV files. When omitted, the notebook-level
        ``data_path`` variable is used (the original behaviour).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_scada`` indexed by its 'Datetime' column (values are left exactly
        as ``pd.read_csv`` produced them; no date parsing is done here) and
        ``df_logs`` as read from disk.
    """
    folder = data_path if base_path is None else base_path

    local_file_scada = os.path.join(folder, f'scada_T{turbine_number:02d}.csv')
    df_scada = pd.read_csv(local_file_scada).set_index('Datetime', drop=True)

    local_file_logs = os.path.join(folder, f'logs_T{turbine_number:02d}.csv')
    df_logs = pd.read_csv(local_file_logs)

    return df_scada, df_logs

In [None]:
# Load every turbine, keep the 2018-2022 period and stack the results.
# The per-turbine frames are collected in a list and concatenated once:
# calling pd.concat inside the loop re-copies all previous rows on every pass.
turbine_frames = []
for t_id in [1,2,4,5,6,7,8,9,10,11,12,13,14,15]:
    print(t_id)
    df_scada, _ = read_data_penmanshiel(t_id)
    # NOTE(review): the index is compared against timestamp strings; this presumably relies on
    # the 'Datetime' values being ISO-formatted strings at this point — confirm.
    in_period = (df_scada.index>="2018-01-01 00:00:00") & (df_scada.index<="2022-12-31 23:50:00")
    df_scada = df_scada[in_period].reset_index()
    df_scada["Turbine"] = f'T{t_id:02d}'
    turbine_frames.append(df_scada)

df_penmanshiel = pd.concat(turbine_frames)

In [None]:
df_penmanshiel['Datetime']=pd.to_datetime(df_penmanshiel['Datetime'])

In [None]:
# df_penmanshiel.to_csv("penmanshiel_preprocessed.csv")

In [None]:
# df_penmanshiel=pd.read_csv("penmanshiel_preprocessed.csv")
# df_penmanshiel['Datetime']=pd.to_datetime(df_penmanshiel['Datetime'])

In [None]:
df_penmanshiel_filtered = df_penmanshiel.copy()

### Feature Engineering
- Motor current = Mean (Motor current axis 1,axis 2,axis 3)
- Motor temperature = Mean (Motor temperature axis 1,axis 2,axis 3)
- Blade angle = Mean (Blade angle A, B, C)

In [None]:
# Motor temperature / current: plain arithmetic mean over the three axes.
df_penmanshiel_filtered["Motor temperature (°C)"] = df_penmanshiel_filtered[['Temperature motor axis 1 (°C)',
                                          'Temperature motor axis 2 (°C)',
                                          'Temperature motor axis 3 (°C)']].mean(axis=1)

df_penmanshiel_filtered["Motor current (A)"] = df_penmanshiel_filtered[['Motor current axis 1 (A)',
                                          'Motor current axis 2 (A)',
                                          'Motor current axis 3 (A)',]].mean(axis=1)

# Blade angle: circular mean of the three blades. Angles are averaged through
# their unit vectors so that values on either side of the 0°/360° wrap do not
# average to a meaningless mid-range angle.
temp = np.radians(df_penmanshiel_filtered[['Blade angle (pitch position) A (°)','Blade angle (pitch position) B (°)','Blade angle (pitch position) C (°)']])
# Convert angular values to Cartesian coordinates (vectorised over the whole
# frame instead of a Python-level call per row)
x = np.cos(temp)
y = np.sin(temp)
# Average the Cartesian coordinates
mean_x = x.mean(axis=1)
mean_y = y.mean(axis=1)
# Convert the average Cartesian coordinates back to an angle in radians
mean_angle_radians = np.arctan2(mean_y, mean_x)
df_penmanshiel_filtered['Blade Angle (pitch position) (°)'] = np.degrees(mean_angle_radians)

In [None]:
df_penmanshiel_filtered[['Temperature motor axis 1 (°C)','Temperature motor axis 2 (°C)','Temperature motor axis 3 (°C)',"Motor temperature (°C)",
                        'Motor current axis 1 (A)','Motor current axis 2 (A)','Motor current axis 3 (A)',"Motor current (A)",
                        'Blade angle (pitch position) A (°)','Blade angle (pitch position) B (°)','Blade angle (pitch position) C (°)','Blade Angle (pitch position) (°)']].hist(figsize=(12,12))
plt.show()

##### Convert angular features to their sine and cosine components

In [None]:
# Encode each angular feature as a (cos, sin) pair so that 0° and 360° coincide.
angular_columns = ['Nacelle position (°)','Vane position 1+2 (°)','Blade Angle (pitch position) (°)']
for col in angular_columns:
    # Column name without the unit suffix; the trailing space is kept, so the
    # new columns are named e.g. 'Nacelle position cos'.
    name_prefix = col.split("(°)")[0]
    angle_rad = np.radians(df_penmanshiel_filtered[col])
    df_penmanshiel_filtered[name_prefix+"cos"] = np.cos(angle_rad)
    df_penmanshiel_filtered[name_prefix+"sin"] = np.sin(angle_rad)

##### Convert Wind direction and wind speed to u and v components

In [None]:
def convert_angles_lengths_to_u_v(angles, lengths, conversion='trigonometric', kind='deg'):
    """Split a direction/magnitude pair into two signed components.

    Computes ``u = -sin(angle) * length`` and ``v = -cos(angle) * length``.

    Parameters
    ----------
    angles : scalar, numpy array or pandas Series
        Directions, in the unit given by ``kind``.
    lengths : scalar, numpy array or pandas Series
        Magnitudes (e.g. wind speed); must broadcast against ``angles``.
    conversion : str, optional
        Accepted for interface compatibility; it is not used by the computation.
    kind : {'deg', 'rad'}, optional
        Unit of ``angles``. Defaults to degrees.

    Returns
    -------
    tuple
        ``(u, v)`` with the same shape/type as the broadcast inputs.

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'deg'`` nor ``'rad'``.
    """
    if kind == 'deg':
        angles_rad = angles * np.pi / 180
    elif kind == 'rad':
        angles_rad = angles
    else:
        raise ValueError(f"kind must be 'deg' or 'rad', got {kind!r}")

    u = -np.sin(angles_rad) * lengths
    v = -np.cos(angles_rad) * lengths
    return u, v

In [None]:
df_penmanshiel_filtered["Wind direction u"],df_penmanshiel_filtered["Wind direction v"] = convert_angles_lengths_to_u_v(df_penmanshiel_filtered['Wind direction (°)'],df_penmanshiel_filtered['Wind speed (m/s)'])

In [None]:
# Columns kept for the analysis: identifiers, engineered wind/angle features,
# drive-train, gearbox, pitch-motor and tower signals, plus the power output.
final_columns = [
    'Datetime', 'Turbine',
    'Long Term Wind (m/s)', 'Wind direction u', 'Wind direction v',
    'Nacelle position cos', 'Nacelle position sin',
    'Vane position 1+2 cos', 'Vane position 1+2 sin',
    'Blade Angle (pitch position) cos', 'Blade Angle (pitch position) sin',
    'Generator bearing rear temperature (°C)', 'Generator bearing front temperature (°C)',
    'Generator RPM (RPM)', 'Rotor bearing temp (°C)', 'Drive train acceleration (mm/ss)',
    'Gear oil temperature (°C)', 'Gear oil inlet temperature (°C)', 'Gear oil pump pressure (bar)',
    'Motor temperature (°C)', 'Motor current (A)',
    'Tower Acceleration X (mm/ss)', 'Tower Acceleration y (mm/ss)', 'Power (kW)',
]
df_penmanshiel_final = df_penmanshiel_filtered[final_columns].copy()

In [None]:
df_penmanshiel_final.drop(["Datetime","Turbine",'Power (kW)'],axis=1).corr().stack().value_counts().sort_index(ascending=False)

Pairwise correlations between features range from -0.88 to 0.82.

In [None]:
df_penmanshiel_final.drop(['Turbine','Datetime'],axis=1).hist(figsize=(20,20))
plt.show()

## Atomic asset behaviour extraction
- Divide data into weeks
- Calculate median per week per turbine for each feature

In [None]:
# Median of every feature per (7-day bin, turbine); the result has a two-level index.
# NOTE(review): freq='7D' yields consecutive 7-day windows, presumably anchored at the
# first day present in the data rather than at calendar weeks — confirm.
df_penmanshiel_per_week_median = df_penmanshiel_final.groupby([pd.Grouper(key='Datetime', freq='7D'),'Turbine']).median()

In [None]:
# Materialise both index levels as columns and derive a "W<week>:<turbine>" label.
bin_start = df_penmanshiel_per_week_median.index.get_level_values(0)
turbine_name = df_penmanshiel_per_week_median.index.get_level_values(1)

df_penmanshiel_per_week_median["Datetime"] = bin_start
# Running week number, starting at 1, in order of first appearance of each bin.
df_penmanshiel_per_week_median["week"] = pd.factorize(bin_start)[0] + 1
df_penmanshiel_per_week_median["Turbine"] = turbine_name

week_label = "W" + df_penmanshiel_per_week_median["week"].astype(str)
df_penmanshiel_per_week_median["week-turbine"] = week_label + ":" + df_penmanshiel_per_week_median["Turbine"]

In [None]:
df_penmanshiel_per_week_median.set_index(["week-turbine"],inplace=True)

In [None]:
df_penmanshiel_per_week_median.drop(['week', 'Turbine','Datetime'],axis=1).hist(figsize=(20,20))
plt.show()

In [None]:
fig=px.line(df_penmanshiel_per_week_median.drop(['Datetime', 'week', 'Turbine','Power (kW)'],axis=1))
fig.show()

## Real time elementary mode detection

Train Test

In [None]:
# NOTE(review): df_week_turbine_plot / df_week_turbine_plot_id are created in the
# "Hyperclusters" cell further down; that cell has to be run before this one.
FIRST_TEST_WEEK = 54  # weeks 1..53 are used for training, the remaining weeks for testing


def _collect_modes(weeks):
    """Return a frame indexed by "W<week>:<turbine>" with the elementary and composite mode of each pair.

    The rows are gathered in a list and turned into a DataFrame once, instead of
    enlarging the frame one `.loc` assignment at a time.
    """
    labels = []
    records = []
    for i in weeks:
        for turb in df_week_turbine_plot_id.columns:
            labels.append("W"+str(i)+":"+turb)
            records.append({
                # list of elementary modes, stored as its string form (e.g. "[0, 1]")
                "Elementary mode": str(df_week_turbine_plot.loc[i][turb]),
                # df_week_turbine_plot_id is indexed by week numbers stored as strings
                "Composite mode": df_week_turbine_plot_id.loc[str(i)][turb],
            })
    return pd.DataFrame(records, index=labels, columns=["Elementary mode", "Composite mode"])


df_train = _collect_modes(range(1, FIRST_TEST_WEEK))
df_test = _collect_modes(range(FIRST_TEST_WEEK, len(df_week_turbine_plot_id)+1))

Each elementary mode can be defined with features that it is composed of

In [None]:
# Features that make up each layer (one layer per turbine sub-system).
layer_feature_dict = {
    "Layer 1": ['Wind direction u', 'Wind direction v'],
    "Layer 2": ['Nacelle position cos', 'Nacelle position sin',
                'Vane position 1+2 cos', 'Vane position 1+2 sin',
                'Blade Angle (pitch position) cos', 'Blade Angle (pitch position) sin'],
    "Layer 3": ['Generator bearing rear temperature (°C)', 'Generator bearing front temperature (°C)',
                'Generator RPM (RPM)', 'Rotor bearing temp (°C)', 'Drive train acceleration (mm/ss)'],
    "Layer 4": ['Gear oil temperature (°C)', 'Gear oil inlet temperature (°C)', 'Gear oil pump pressure (bar)'],
    "Layer 5": ['Motor temperature (°C)', 'Motor current (A)'],
    "Layer 6": ['Tower Acceleration X (mm/ss)', 'Tower Acceleration y (mm/ss)'],
}

In [None]:
# One entry per layer cluster, in layer order: position p holds the name of the
# layer that the p-th cluster belongs to.
layer_cluster_list = [
    "Layer "+str(layer_pos+1)
    for layer_pos, layer_frame in enumerate([Layer1,Layer2,Layer3,Layer4,Layer5,Layer6])
    for _ in range(layer_frame["cluster"].nunique())
]

In [None]:
# For every elementary mode: which layers its member clusters come from, and
# the features of those layers.
dict_clusters_layers={}
print(dict_clusters)
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")

feat_set=[]
for key in dict_clusters.keys():
    layers_of_mode = [layer_cluster_list[i] for i in dict_clusters[key]]
    print("E"+str(key))
    print(layers_of_mode)
    dict_clusters_layers[key]=layers_of_mode

    # Collect the features layer by layer, dropping duplicates but keeping the
    # first-seen order. Going through set() made the feature order depend on
    # string hashing, so it could change between kernel restarts.
    feat=[]
    for lyr in dict.fromkeys(layers_of_mode):
        for feature_name in layer_feature_dict[lyr]:
            if feature_name not in feat:
                feat.append(feature_name)
    feat_set.append(feat)
    print(feat)
    print("-----------------------------------------------------------------")

### Centroid based
Centroid of each elementary mode

In [None]:
from scipy.spatial.distance import cdist

# For each elementary mode: min-max scale its training samples, store their
# centroid and the largest sample-to-centroid distance (used later as the
# acceptance radius), and plot the distance distribution.
fig=go.Figure()
elementary_mode_centroid={"centroid": {}, "dist": {}}
for elem in [0,1,2,3,4]:
    # "Elementary mode" holds stringified lists, so a substring test selects
    # every training sample whose mode list mentions this mode.
    ind=list(df_train[df_train["Elementary mode"].str.contains(str(elem))].index)

    scaler=MinMaxScaler()
    all_data=scaler.fit_transform(df_penmanshiel_per_week_median.loc[ind][feat_set[elem]].values)
    centroid=np.mean(all_data,axis=0)
    elementary_mode_centroid["centroid"][elem]=centroid
    print("Number of samples in training set: ",len(all_data))

    if (len(ind)>1):
        # Fit PCA once and reuse the fitted model for the explained-variance title.
        pca_model = PCA(n_components=2)
        pca = pca_model.fit_transform(all_data)
        explained = np.round(np.sum(pca_model.explained_variance_ratio_),2)
        fig1=px.scatter(x=pca[:, 0],y=pca[:, 1],title="E"+str(elem)+"-PCA (" +str(explained) +")")
        fig1.show()

    dist = cdist(all_data,centroid.reshape(1,-1),metric="euclidean")
    elementary_mode_centroid["dist"][elem]=np.max(dist)

    fig.add_traces(go.Histogram(x=dist.flatten(),name="E"+str(elem)))

# Axis titles do not depend on the mode, so they are set once.
fig.update_xaxes(title="Distance")
fig.update_yaxes(title="Number of instances")
fig.show()

In [None]:
elementary_mode_centroid["dist"]

New datapoint

In [None]:
# Assign to every test sample all elementary modes whose centroid lies within
# that mode's training radius.
df_test["pred_elem"]=None
df_test["dist"]=None
df_test["thres"]=None

# The per-mode scaler only depends on the training data, so it is fitted once
# per mode here rather than once per (test row, mode) pair.
mode_scalers = {}
for elem in [0,1,2,3,4]:
    ind=list(df_train[df_train["Elementary mode"].str.contains(str(elem))].index)
    mode_scalers[elem] = MinMaxScaler().fit(df_penmanshiel_per_week_median.loc[ind][feat_set[elem]].values)

for i in df_test.index:
    pred_elem=[]
    dist_lst=[]
    thresh_dist=[]
    for elem in [0,1,2,3,4]:
        test_vector=df_penmanshiel_per_week_median.loc[i][feat_set[elem]].values
        X_test=mode_scalers[elem].transform(test_vector.reshape(1,-1))

        # cdist returns a 1x1 array here: one centroid against one test sample
        dist=cdist(elementary_mode_centroid["centroid"][elem].reshape(1,-1),X_test)
        if (dist<=elementary_mode_centroid["dist"][elem]):
            pred_elem.append(elem)
            dist_lst.append(dist[0][0])
            thresh_dist.append(elementary_mode_centroid["dist"][elem])
    # The lists are stored in their string form
    df_test.loc[i,"pred_elem"]=str(pred_elem)
    df_test.loc[i,"dist"]=str(dist_lst)
    df_test.loc[i,"thres"]=str(thresh_dist)

In [None]:
df_test

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
# "pred_elem" holds the string form of a list of modes (see str(pred_elem) above), so each
# distinct combination of modes counts as its own class label in this matrix.
# NOTE(review): "Elementary mode" is presumably stored the same way — confirm.
confusion_matrix(df_test["Elementary mode"], df_test["pred_elem"])

In [None]:
report=classification_report(df_test["Elementary mode"], df_test["pred_elem"])
print(report)

In [None]:
df_train["Elementary mode"].value_counts()

### Bounding box

In [None]:
# Per-mode bounding box: feature-wise minimum and maximum over the training
# samples of each elementary mode.
elementary_mode_bb={"feats": {}, "min": {}, "max": {}}

fig=go.Figure()
for elem in [0,1,2,3,4]:
    ind=list(df_train[df_train["Elementary mode"].str.contains(str(elem))].index)

    all_data=df_penmanshiel_per_week_median.loc[ind][feat_set[elem]]
    lower_bound=all_data.min().values
    upper_bound=all_data.max().values
    elementary_mode_bb["feats"][elem]=feat_set[elem]
    elementary_mode_bb["min"][elem]=lower_bound
    elementary_mode_bb["max"][elem]=upper_bound
    fig.add_traces(go.Scatter(x=feat_set[elem],y=lower_bound,name="E"+str(elem)+"-min"))
    fig.add_traces(go.Scatter(x=feat_set[elem],y=upper_bound,name="E"+str(elem)+"-max"))

# Overlay one example test sample. This is done once, after the loop: inside
# the loop it added the identical "Test vector" trace once per mode.
test_vector=df_penmanshiel_per_week_median.loc["W55:T10"][feat_set[4]].values
fig.add_traces(go.Scatter(x=feat_set[4],y=test_vector,name="Test vector"))
fig.show()

In [None]:
test_vector

In [None]:
# NOTE(review): `i` is not defined in this cell; it is whatever value the most recently
# executed loop over `i` left behind, so the displayed row depends on execution history.
df_penmanshiel_per_week_median.loc[i][feat_set[1]].values

In [None]:
elementary_mode_bb

In [None]:
# A test sample is assigned every elementary mode whose bounding box contains
# it on all of that mode's features.
df_test["pred_bb"]=None

for i in df_test.index:
    pred_elem=[]
    for elem in [0,1,2,3,4]:
        test_vector=df_penmanshiel_per_week_median.loc[i][feat_set[elem]].values
        min_ = elementary_mode_bb["min"][elem]
        max_ = elementary_mode_bb["max"][elem]
        if (np.all((test_vector >= min_) & (test_vector <= max_))):
            pred_elem.append(elem)
    # stored as the string form of the list, e.g. "[0, 3]"
    df_test.loc[i,"pred_bb"]=str(pred_elem)

In [None]:
df_test[["Elementary mode","pred_bb"]].head(50)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(df_test["Elementary mode"], df_test["pred_bb"])

In [None]:
report=classification_report(df_test["Elementary mode"], df_test["pred_bb"])
print(report)

### Layer clusters - hyperclusters

In [None]:
# Membership matrix: rows are elementary modes ("E<k>"), columns are layer
# clusters; a cell is 1 when the layer cluster belongs to the mode, else 0.
mode_labels = ['E' + str(i) for i in dict_clusters.keys()]
df_layercls_hypcls=pd.DataFrame(columns=layer_list_temp,index=mode_labels)

for key, member_positions in dict_clusters.items():
    member_columns = [layer_list_temp[i] for i in member_positions]
    print("E"+str(key))
    print(member_columns)
    df_layercls_hypcls.loc["E"+str(key),member_columns]=1

# Cells never set above are still null and become 0.
df_layercls_hypcls=df_layercls_hypcls.notnull().astype(int)

In [None]:
fig=px.imshow(df_layercls_hypcls)
fig.show()

In [None]:
# Inspect selected elementary modes: size and spread over turbines and weeks.
for key in [1]:
    member_labels = list(clus_sol[key])
    print("Cluster: ",key)
    print(str(len(member_labels)) +" turbine-week pairs")
    temp=df_penmanshiel_per_week_median.loc[member_labels]
    print("Number of turbines: ",temp["Turbine"].nunique())
    print("Number of unique weeks: ",temp["week"].nunique())

    week_colours = temp["week"].astype(str).values
    fig=px.bar(temp["Turbine"].values,color=week_colours)
    fig.update_xaxes(title="Turbine")
    fig.update_layout(legend_title_text="Week")
    fig.show()

In [None]:
# Train and test labels stacked, with one extra column per layer holding the
# layer-cluster label of each week-turbine pair.
temp=df_test[["Elementary mode","Composite mode"]].copy()
df_layer_hyper=pd.concat([df_train,temp])

layer_columns = ["Layer1","Layer2","Layer3","Layer4","Layer5","Layer6"]
layer_frames = [Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]
for column_name, layer_frame in zip(layer_columns, layer_frames):
    df_layer_hyper.loc[layer_frame["week-turbine"],column_name]=layer_frame["cluster"].values

In [None]:
df_layer_hyper.loc["W1:T10"]

## Hyperclusters

In [None]:
def generate_final_clusters(final_clusters1, hypergraph1, method, debug=None, node_ids=None):
    """Map clusters of hyperedges to clusters of data objects.

    Parameters
    ----------
    final_clusters1 : dict
        Maps a cluster key to an iterable of hyperedge ids belonging to it.
    hypergraph1 : object
        Must provide ``pins(edge)`` returning the node indices of a hyperedge.
    method : {'donot_inc_key_in_cluster', 'inc_key_in_cluster'}
        With ``'inc_key_in_cluster'`` the pins of the hyperedge whose id equals
        the cluster key are included as well; otherwise only the pins of the
        listed hyperedges are used.
    debug : bool, optional
        Print intermediate results. When omitted, the notebook-level ``Debug``
        variable is used if it exists, otherwise ``False``.
    node_ids : sequence, optional
        ``node_ids[i]`` is the short id of node index ``i``. When omitted, the
        notebook-level ``nodes_hyper`` list is used.

    Returns
    -------
    dict
        Cluster key -> set of short ids of the data objects in that cluster.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported values (previously this
        surfaced as an UnboundLocalError inside the loop).
    """
    if method not in ('donot_inc_key_in_cluster', 'inc_key_in_cluster'):
        raise ValueError(
            "method must be 'donot_inc_key_in_cluster' or 'inc_key_in_cluster', got %r" % (method,)
        )
    if debug is None:
        debug = globals().get("Debug", False)
    if node_ids is None:
        node_ids = nodes_hyper

    # mapping hyperedges to data objects to obtain the clustering solution of data objects
    clustering_nodes = {}
    for key, val in final_clusters1.items():
        pins_center = list(hypergraph1.pins(key)) if method == "inc_key_in_cluster" else []
        for edge in val:
            pins_center.extend(hypergraph1.pins(edge))
        clustering_nodes[key] = set(pins_center)

    if debug:
        print("clustering of data objects", clustering_nodes) # dict, key = center(hyperedge), values = data objects

    # replacing the index of the data object with its short id (sets are not ordered)
    clus_nodes_short_id = {
        key: {node_ids[x] for x in val} for key, val in clustering_nodes.items()
    }

    if debug:
        print("clustering solution, key = center (hyperedge), val = set of short_ids")
        print(clus_nodes_short_id)
    return clus_nodes_short_id

In [None]:
year ="2019"
# Keep the weekly rows whose bin start date contains the chosen year.
year_mask = df_penmanshiel_per_week_median["Datetime"].astype(str).str.contains(year)
df_oneyear=df_penmanshiel_per_week_median[year_mask]

# creating layers: one frame per sub-system, holding the week-turbine label and
# that layer's features; rows with a missing value are dropped per layer.
def _select_layer(feature_columns):
    """Return the week-turbine label plus `feature_columns` of df_oneyear, without incomplete rows."""
    return df_oneyear.reset_index()[['week-turbine'] + feature_columns].dropna().copy()

Layer1 = _select_layer(['Wind direction u','Wind direction v'])
Layer2 = _select_layer(['Nacelle position cos', 'Nacelle position sin','Vane position 1+2 cos', 'Vane position 1+2 sin','Blade Angle (pitch position) cos','Blade Angle (pitch position) sin'])
Layer3 = _select_layer(['Generator bearing rear temperature (°C)','Generator bearing front temperature (°C)','Generator RPM (RPM)','Rotor bearing temp (°C)','Drive train acceleration (mm/ss)'])
Layer4 = _select_layer(['Gear oil temperature (°C)','Gear oil inlet temperature (°C)','Gear oil pump pressure (bar)'])
Layer5 = _select_layer(['Motor temperature (°C)','Motor current (A)'])
Layer6 = _select_layer(['Tower Acceleration X (mm/ss)','Tower Acceleration y (mm/ss)'])

layer_data=[Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]
layer_dict={}

for layer_number, layer_frame in enumerate(layer_data, start=1):
    layer_name = 'Layer '+str(layer_number)
    scaler = MinMaxScaler()
    # Features scaled to [0, 1] per layer; the label column is not scaled.
    layer_data_transformed = scaler.fit_transform(layer_frame.drop(["week-turbine"],axis=1).to_numpy())
    layer_dict[layer_name] = {
        "Layer_data": layer_frame,  # same object as the LayerN variable
        "Data_transformed": layer_data_transformed,
    }

def _cluster_layer(layer_key, eps, n_components=2, title=None, show=True):
    """Cluster one layer's scaled data with DBSCAN and optionally plot its first two PCA components.

    The cluster labels are written to the "cluster" column of the layer's
    "Layer_data" frame. PCA is fitted once and the same fit is used for the
    projection and for the explained-variance figure in the title.

    Returns the fitted DBSCAN model and the PCA projection.
    """
    data = layer_dict[layer_key]["Data_transformed"]
    model = DBSCAN(eps=eps).fit(data)
    pca_model = PCA(n_components=n_components)
    projection = pca_model.fit_transform(data)
    if show:
        if title is None:
            # share of variance captured by the two plotted components
            explained = np.round(np.sum(pca_model.explained_variance_ratio_[:2]),2)
            title = "PCA (" +str(explained) +")"
        layer_fig = px.scatter(x=projection[:, 0],y=projection[:, 1],color=model.labels_.astype(str),title=title)
        layer_fig.show()
    layer_dict[layer_key]["Layer_data"]["cluster"] = model.labels_
    return model, projection


# eps is chosen per layer; label -1 marks DBSCAN noise points.
db, pca_layer1 = _cluster_layer("Layer 1", eps=0.1)
Layer1["cluster"] = db.labels_
dfincdb_layer1 = Layer1.copy()

# Layer 2 keeps all six components in pca_layer2; only the first two are plotted.
db, pca_layer2 = _cluster_layer("Layer 2", eps=0.15, n_components=6)
Layer2["cluster"] = db.labels_
dfincdb_layer2 = Layer2.copy()

db, pca_layer3 = _cluster_layer("Layer 3", eps=0.12)
Layer3["cluster"] = db.labels_
dfincdb_layer3 = Layer3.copy()

db, pca_layer4 = _cluster_layer("Layer 4", eps=0.1)
Layer4["cluster"] = db.labels_
dfincdb_layer4 = Layer4.copy()

db, pca_layer5 = _cluster_layer("Layer 5", eps=0.04, title="PCA(1.0)")
Layer5["cluster"] = db.labels_
dfincdb_layer5 = Layer5.copy()

# The Layer 6 scatter plot was disabled in the original cell and stays disabled.
db, pca_layer6 = _cluster_layer("Layer 6", eps=0.045, show=False)
Layer6["cluster"] = db.labels_
dfincdb_layer6 = Layer6.copy()

# Drop outliers: rows labelled -1 are removed from every view of each layer
# (layer_dict, layer_data and the LayerN variables).
layer_data=[Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]
for l_id, key in enumerate(layer_dict.keys()):
    temp=layer_dict[key]["Layer_data"]
    is_clustered = temp["cluster"]!=-1
    layer_dict[key]["Layer_data"]=temp[is_clustered]
    layer_data[l_id]=temp[is_clustered]

Layer1, Layer2, Layer3, Layer4, Layer5, Layer6 = [
    layer_frame[layer_frame["cluster"]!=-1]
    for layer_frame in (Layer1, Layer2, Layer3, Layer4, Layer5, Layer6)
]
for key, layer_entry in layer_dict.items():
    n_clusters_found = len(np.unique(layer_entry["Layer_data"]["cluster"]))
    print(key+"--->"+str(n_clusters_found)+" clusters")

# Hypergraph
# creating a nested list, where each inner list lists the ids in that cluster.
# Every layer cluster becomes one hyperedge; its pins are the week-turbine
# labels that fall into the cluster.
lst = []
for layer_frame in (Layer1, Layer2, Layer3, Layer4, Layer5, Layer6):
    lst.extend(layer_frame.groupby('cluster')['week-turbine'].apply(list).values)

# Offsets into the flat pin list: hyperedge e owns
# hyperedges[hyperedge_indices[e]:hyperedge_indices[e+1]].
hyperedge_indices = [0]
cnt = 0
for each in lst:
    cnt = cnt+len(each)
    hyperedge_indices.append(cnt)

nodes_hyper = df_oneyear.index.tolist()
hyperedges_1 = [item for sublist in lst for item in sublist]

# Position of each node label, looked up through a dict rather than a list
# scan per pin. setdefault keeps the first position of a label, which is what
# list.index returned.
node_position = {}
for position, node_label in enumerate(nodes_hyper):
    node_position.setdefault(node_label, position)
hyperedges = [node_position[i] for i in hyperedges_1]

num_nodes = len(nodes_hyper)
num_nets = len(hyperedge_indices)-1

k = 2
hypergraph = kahypar.Hypergraph(num_nodes, num_nets, hyperedge_indices, hyperedges, k)
context = kahypar.Context()
context.loadINIconfiguration("cut_kKaHyPar_sea20.ini")

# list containing the incident edges of each node in a sublist.
node_incident_edges = [list(hypergraph.incidentEdges(each_node)) for each_node in hypergraph.nodes()]
cluster_list = [sublist for sublist in lst if len(sublist) > 1]

#Obtaining the neighbourhood of each edge.
total_edges = hypergraph.numEdges()

# neighbourhood[e] = every edge that shares at least one node with edge e
# (e itself included when it has a node). Built in a single pass over the
# nodes instead of rescanning all nodes for every edge.
# NOTE(review): assumes edge ids run from 0 to total_edges-1, as the original
# range(total_edges) loop did — confirm.
neighbourhood = [set() for _ in range(total_edges)] # neighbourhood of each edge is presented in order.
for edges in node_incident_edges:
    for e in edges:
        neighbourhood[e].update(edges)
number_neighbours = [len(edge_neighbours) for edge_neighbours in neighbourhood]
# Display name of every layer cluster, in layer order, numbered from 1 within each layer.
layer_list_temp = [
    "Layer"+str(layer_pos+1)+"- cluster"+str(cluster_pos+1)
    for layer_pos, layer_frame in enumerate([Layer1,Layer2,Layer3,Layer4,Layer5,Layer6])
    for cluster_pos in range(layer_frame["cluster"].nunique())
]

# calculating Nearest Neighbourhood Similarity
# For every unordered pair of hyperedges (i < j): 0 when the two edges are not in each
# other's neighbourhood, otherwise |N(i) ∩ N(j)| / |N(i) ∪ N(j)| of their neighbourhood sets.
NNS = {}
for i in range(total_edges):
    for j in range(i, total_edges):
        if i != j:
            # print("checking intersection of ", i, "and", j, "i.e.,", neighbourhood[i], "and", neighbourhood[j])
            intersection = neighbourhood[i].intersection(neighbourhood[j])
            if (i not in intersection) or (j not in intersection): # or condition is not required. if i is in intersection then automatically j will be in the intersection. 
                # print(i, j, intersection, neighbourhood[i], neighbourhood[j])
                NNS[(i,j)] = 0
            else:
                union = neighbourhood[i].union(neighbourhood[j])
                NNS[(i,j)] = len(intersection)/len(union)
# Similarity table for inspection: only the upper triangle (i < j) is filled, the rest stays empty.
# cnt counts the filled cells.
cnt=0
df_snns=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
for i in np.arange(0,total_edges,1):
    for j in np.arange(0,total_edges,1):
        if (i,j) in NNS:
            cnt=cnt+1
            df_snns.loc[i,j]=NNS[i,j]
# converting the similarity matrix into distance matrix.
NNS_dist = {k: 1-v for k, v in NNS.items()}

# obtaining the distance matrix
# Full symmetric matrix as nested lists: NNS_dist is keyed by (smaller, larger) edge id,
# and the diagonal is 0.
array_dist = []
for i in range(total_edges):
    temp = []
    for j in range(total_edges):
        if i != j:
            temp.append(NNS_dist[(min(i, j), max(i,j))])
        else:
            temp.append(0)
    array_dist.append(temp)

# Rounded (2 decimals) copy of the distance matrix as a DataFrame, and the matching similarity frame.
df_dist=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
for i in np.arange(0,total_edges,1):
    for j in np.arange(0,total_edges,1):
        # if (i<=j):
        df_dist.loc[i,j]=np.round(array_dist[i][j],2)
df_sim=1-df_dist

# Group the hyperedges (layer clusters) into 5 elementary modes with k-medoids
# on the precomputed distance matrix. (A KMeans variant was tried and disabled.)
medoid_model = KMedoids(n_clusters=5, metric='precomputed', method='pam', init='k-medoids++', random_state=0)
kmeans = medoid_model.fit_predict(np.array(array_dist))

# mode label -> positions of the hyperedges assigned to it
dict_clusters = {mode_label: [] for mode_label in range(5)}
for index, each in enumerate(kmeans):
    dict_clusters[each].append(index)

Debug = False
# clus_sol is the final clustering solution: key = cluster number, value = data objects in the cluster
clus_sol = generate_final_clusters(dict_clusters, hypergraph, 'donot_inc_key_in_cluster')
cluster_objects_list=[list(value) for value in clus_sol.values()]
for key, members in clus_sol.items():
    print(str(key) +" : "+str(len(members)))

# Distribution of weekly median power per elementary mode.
fig=go.Figure()
oneyear_flat = df_oneyear.reset_index()
for key, members in clus_sol.items():
    temp=oneyear_flat[oneyear_flat["week-turbine"].isin(list(members))].copy()
    fig.add_trace(go.Box(y=temp['Power (kW)'], name='E'+str(key)))

fig.update_layout(title="Active power")
fig.update_yaxes(title="Power (kW)")
fig.show()

# Boolean incidence table for formal concept analysis: rows are week-turbine
# labels, columns are elementary modes, True when the pair belongs to the mode.
df_fca=pd.DataFrame(index=df_oneyear.index.astype(str),columns=clus_sol.keys())
for ind in df_fca.index:
    for clus in df_fca.columns:
        df_fca.loc[ind,clus] = ind in clus_sol[clus]
# Column labels become "E<k>"
df_fca.columns="E"+df_fca.columns.astype(str)

from fcapy.context import FormalContext
from fcapy.lattice import ConceptLattice
from fcapy.visualizer import LineVizNx

# Build the formal context from the incidence table, derive its concept
# lattice and draw it.
K = FormalContext.from_pandas(pd.DataFrame(df_fca))
L = ConceptLattice.from_context(K)

fig, ax = plt.subplots(figsize=(10, 5))
vsl = LineVizNx()
vsl.draw_concept_lattice(
    L,
    ax=ax,
    flg_node_indices=True,
    flg_new_intent_count_prefix=False,
)
ax.set_title('week-turbine concept lattice')
plt.tight_layout()
plt.show()

# For every week and for every turbine: the elementary modes that contain at
# least one of its week-turbine pairs.
# The membership tests no longer assign to `lst`, which holds the hyperedge
# membership lists built earlier in this cell.
# NOTE(review): weeks are enumerated as 1..n, which presumably assumes the week
# numbers in df_oneyear start at 1 — confirm for years other than the first.
week_cluster={}
for w_id in np.arange(1,df_oneyear["week"].nunique()+1,1):
    week_prefix = "W"+str(w_id)+":"
    week_cluster[w_id]=[
        key for key in clus_sol.keys()
        if any(week_prefix in element for element in clus_sol[key])
    ]


turbine_cluster={}
for turb in df_oneyear["Turbine"].unique():
    turbine_cluster[turb]=[
        key for key in clus_sol.keys()
        if any(turb in element for element in clus_sol[key])
    ]

# "0"/"1" indicator tables (as strings): which elementary modes occur in each
# week and for each turbine.
mode_columns = np.arange(0,len(clus_sol.keys()),1)

df_week_cluster=pd.DataFrame(str(0),index=list(week_cluster.keys()),columns=mode_columns)
for ind, modes_present in week_cluster.items():
    df_week_cluster.loc[ind,modes_present]=str(1)
df_week_cluster.index=df_week_cluster.index.astype(str)
df_week_cluster.columns=df_week_cluster.columns.astype(str)


df_turbine_cluster=pd.DataFrame(str(0),index=list(turbine_cluster.keys()),columns=mode_columns)
for ind, modes_present in turbine_cluster.items():
    df_turbine_cluster.loc[ind,modes_present]=str(1)
df_turbine_cluster.index=df_turbine_cluster.index.astype(str)
df_turbine_cluster.columns=df_turbine_cluster.columns.astype(str)

# Week x turbine table whose cells hold the list of elementary modes containing that
# week-turbine pair; unique_combination collects every distinct list in first-seen order.
df_week_turbine_plot=pd.DataFrame(index=df_week_cluster.index.astype(int),columns=df_turbine_cluster.index)
unique_combination=[]
for w_id in np.arange(1,df_oneyear["week"].nunique()+1,1):
    for t_id in df_turbine_cluster.index:
        week_turb= "W"+str(w_id)+":"+t_id 
        comb = [key for key, values in clus_sol.items() if week_turb in values]
        df_week_turbine_plot.loc[w_id,t_id]=comb
        if comb not in unique_combination:
            unique_combination.append(comb)
# Give each distinct combination an integer id starting at 100 (zip stops at the shorter
# input, so the extra element of the arange is unused), then invert to combination -> id.
unique_dict=dict(zip(np.arange(100,100+len(unique_combination)+1,1), unique_combination))
unique_dict={tuple(value): key for key, value in unique_dict.items()}
unique_dict.keys()

# Same table with each list replaced by its combination id (the "composite mode"),
# stored as strings and indexed by week numbers stored as strings.
df_week_turbine_plot_id=df_week_turbine_plot.copy()
for ind,row in df_week_turbine_plot_id.iterrows():
    for col in df_week_turbine_plot_id.columns:
        df_week_turbine_plot_id.loc[ind,col]=unique_dict[tuple(df_week_turbine_plot_id.loc[ind,col])]
        
df_week_turbine_plot_id.index=df_week_turbine_plot_id.index.astype(str)
df_week_turbine_plot_id=df_week_turbine_plot_id.astype(int).astype(str)
fig = px.imshow(df_week_turbine_plot_id.T,labels=dict(x="Week", y="Turbine"))
fig.update_layout(height=500)
fig.show()
print(unique_dict)

# For the training year, label every week-turbine pair of weeks 1..53 with the
# string form of its list of elementary modes.
if year == "2019":
    df_train = pd.DataFrame()
    df_train["Elementary mode"] = None
    train_pairs = [(i, turb) for i in range(1, 54, 1) for turb in df_week_turbine_plot.columns]
    for i, turb in train_pairs:
        ind = "W" + str(i) + ":" + turb
        df_train.loc[ind, "Elementary mode"] = str(df_week_turbine_plot.loc[i][turb])
            

In [None]:
# Features that make up each layer (one layer per turbine sub-system).
layer_feature_dict = {
    "Layer 1": ['Wind direction u', 'Wind direction v'],
    "Layer 2": ['Nacelle position cos', 'Nacelle position sin',
                'Vane position 1+2 cos', 'Vane position 1+2 sin',
                'Blade Angle (pitch position) cos', 'Blade Angle (pitch position) sin'],
    "Layer 3": ['Generator bearing rear temperature (°C)', 'Generator bearing front temperature (°C)',
                'Generator RPM (RPM)', 'Rotor bearing temp (°C)', 'Drive train acceleration (mm/ss)'],
    "Layer 4": ['Gear oil temperature (°C)', 'Gear oil inlet temperature (°C)', 'Gear oil pump pressure (bar)'],
    "Layer 5": ['Motor temperature (°C)', 'Motor current (A)'],
    "Layer 6": ['Tower Acceleration X (mm/ss)', 'Tower Acceleration y (mm/ss)'],
}

# One entry per layer cluster, in layer order: position p holds the name of the
# layer that the p-th cluster belongs to.
layer_cluster_list = [
    "Layer "+str(layer_pos+1)
    for layer_pos, layer_frame in enumerate([Layer1,Layer2,Layer3,Layer4,Layer5,Layer6])
    for _ in range(layer_frame["cluster"].nunique())
]

# For every elementary mode: which layers its member clusters come from, and
# the features of those layers.
dict_clusters_layers={}
print(dict_clusters)
print("--------------------------------------------------------------------")
print("--------------------------------------------------------------------")

feat_set=[]
for key in dict_clusters.keys():
    layers_of_mode = [layer_cluster_list[i] for i in dict_clusters[key]]
    print("E"+str(key))
    print(layers_of_mode)
    dict_clusters_layers[key]=layers_of_mode

    # Collect the features layer by layer, dropping duplicates but keeping the
    # first-seen order. Going through set() made the feature order depend on
    # string hashing, so it could change between kernel restarts.
    feat=[]
    for lyr in dict.fromkeys(layers_of_mode):
        for feature_name in layer_feature_dict[lyr]:
            if feature_name not in feat:
                feat.append(feature_name)
    feat_set.append(feat)
    print(feat)
    print("-----------------------------------------------------------------")

### Boundingbox

In [None]:
# Empty test frame: one row per week-turbine pair of weeks 54..105, with the
# true elementary mode left unset.
df_test = pd.DataFrame()
df_test["Elementary mode"] = None
test_pairs = [(i, turb) for i in range(54, (54*2)-2, 1) for turb in df_week_turbine_plot_id.columns]
for i, turb in test_pairs:
    ind = "W" + str(i) + ":" + turb
    df_test.loc[ind, "Elementary mode"] = None

In [None]:
# Per-mode bounding box: feature-wise minimum and maximum over the training
# samples of each elementary mode.
elementary_mode_bb={"feats": {}, "min": {}, "max": {}}

fig=go.Figure()
for elem in [0,1,2,3,4]:
    ind=list(df_train[df_train["Elementary mode"].str.contains(str(elem))].index)

    all_data=df_penmanshiel_per_week_median.loc[ind][feat_set[elem]]
    lower_bound=all_data.min().values
    upper_bound=all_data.max().values
    elementary_mode_bb["feats"][elem]=feat_set[elem]
    elementary_mode_bb["min"][elem]=lower_bound
    elementary_mode_bb["max"][elem]=upper_bound
    fig.add_traces(go.Scatter(x=feat_set[elem],y=lower_bound,name="E"+str(elem)+"-min"))
    fig.add_traces(go.Scatter(x=feat_set[elem],y=upper_bound,name="E"+str(elem)+"-max"))

# Overlay one example test sample. This is done once, after the loop: inside
# the loop it added the identical "Test vector" trace once per mode.
test_vector=df_penmanshiel_per_week_median.loc["W55:T10"][feat_set[4]].values
fig.add_traces(go.Scatter(x=feat_set[4],y=test_vector,name="Test vector"))
fig.show()

In [None]:
# A test sample is assigned every elementary mode whose bounding box contains
# it on all of that mode's features.
df_test["pred_bb"]=None

for i in df_test.index:
    pred_elem=[]
    for elem in [0,1,2,3,4]:
        test_vector=df_penmanshiel_per_week_median.loc[i][feat_set[elem]].values
        min_ = elementary_mode_bb["min"][elem]
        max_ = elementary_mode_bb["max"][elem]
        if (np.all((test_vector >= min_) & (test_vector <= max_))):
            pred_elem.append(elem)
    # stored as the string form of the list, e.g. "[0, 3]"
    df_test.loc[i,"pred_bb"]=str(pred_elem)

In [None]:
import json

# Append the test weeks (54..105) to the week x turbine table and fill each cell
# with the predicted list of elementary modes parsed back from df_test["pred_bb"].
df_week_turbine_plot_new = df_week_turbine_plot.copy()
for week in np.arange(54, (54 * 2) - 2, 1):
    df_week_turbine_plot_new.loc[week] = None  # create the empty row for this week
    week_prefix = "W" + str(week) + ":"
    for turb in df_week_turbine_plot_new.columns:
        predicted_modes = df_test.loc[week_prefix + turb]["pred_bb"]
        df_week_turbine_plot_new.loc[week, turb] = json.loads(predicted_modes)

In [None]:
# Collect every distinct combination of elementary modes seen in any
# (week, turbine) cell, in order of first appearance.
unique_combination = []
for week in np.arange(1, (54 * 2) - 2, 1):
    for turb in df_week_turbine_plot_new.columns:
        comb = df_week_turbine_plot_new.loc[week, turb]
        if comb in unique_combination:
            continue
        unique_combination.append(comb)

# Give each combination an integer id starting at 100, keyed by the combination as a tuple.
combination_ids = np.arange(100, 100 + len(unique_combination) + 1, 1)
unique_dict_ = {
    tuple(combination): combination_id
    for combination_id, combination in zip(combination_ids, unique_combination)
}
unique_dict_.keys()

In [None]:
from plotly.subplots import make_subplots

# Replace each cell's combination (list of elementary-mode ids) by its integer id.
df_week_turbine_plot_id = df_week_turbine_plot_new.copy()
for ind, row in df_week_turbine_plot_id.iterrows():
    for col in df_week_turbine_plot_id.columns:
        df_week_turbine_plot_id.loc[ind, col] = unique_dict_[tuple(df_week_turbine_plot_id.loc[ind, col])]

df_week_turbine_plot_id.index = df_week_turbine_plot_id.index.astype(str)
df_week_turbine_plot_id = df_week_turbine_plot_id.astype(int).astype(str)

# First 53 rows are the training year, the following rows the test year.
train_ids = df_week_turbine_plot_id.iloc[0:53]
test_ids = df_week_turbine_plot_id.iloc[53:53 * 2]

# Create subplots (one per year, stacked vertically).
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("2019-Train", "2020-Test"),
    shared_yaxes=True,  # Share y-axis if needed
    horizontal_spacing=0.1,  # NOTE(review): only one column here; vertical_spacing may have been intended
)

for subplot_row, ids in enumerate([train_ids, test_ids], start=1):
    fig.add_trace(
        go.Heatmap(
            z=ids.T.replace({'A': 0}),
            x=ids.index,
            y=df_week_turbine_plot_id.columns,
            coloraxis="coloraxis",
            text=ids.T,
            texttemplate="%{text}",
        ),
        row=subplot_row, col=1,
    )

# Legend for the ids shown in both heatmaps (they were assigned from unique_dict_ above).
print(unique_dict_)

# Update layout to add shared color axis. Axis title fonts use title.font,
# which replaces the deprecated `titlefont` attribute.
fig.update_layout(
    coloraxis_colorbar=dict(title="Shared Color Bar"),
    height=800, width=1100,
    coloraxis_showscale=False,
    xaxis=dict(
        title=dict(text='Week', font=dict(size=18)),
        tickfont=dict(size=12),
    ),
    yaxis=dict(
        title=dict(text='Turbine', font=dict(size=18)),
        tickfont=dict(size=12),
    ),
)
# Show the figure
fig.show()

### Layer clusters - hyperclusters

In [None]:
# Column labels "Layer<k>- cluster<c>" for every cluster of every layer, numbered
# from 1, in layer order. The count per layer is the number of distinct cluster values.
layer_list_temp = [
    "Layer" + str(layer_no) + "- cluster" + str(cluster_no)
    for layer_no, layer_df in enumerate([Layer1, Layer2, Layer3, Layer4, Layer5, Layer6], start=1)
    for cluster_no in range(1, layer_df["cluster"].nunique() + 1)
]

In [None]:
# Binary membership matrix: rows are hyperclusters ("E<k>"), columns are layer
# clusters; a cell is 1 when that layer cluster belongs to the hypercluster.
hypercluster_labels = ['E' + str(i) for i in dict_clusters.keys()]
df_layercls_hypcls = pd.DataFrame(columns=layer_list_temp, index=hypercluster_labels)

for key, members in dict_clusters.items():
    row_label = "E" + str(key)
    member_columns = [layer_list_temp[i] for i in members]
    print(row_label)
    print(member_columns)
    df_layercls_hypcls.loc[row_label, member_columns] = 1
df_layercls_hypcls = df_layercls_hypcls.notnull().astype(int)

In [None]:
# Heatmap of the hypercluster (rows) x layer-cluster (columns) membership matrix.
fig=px.imshow(df_layercls_hypcls)
fig.show()

In [None]:
# Attach each layer's (1-based) cluster id to the training table, one column per
# layer, aligned on the week-turbine label.
df_layer_hyper = df_train.copy()
layer_frames = [Layer1, Layer2, Layer3, Layer4, Layer5, Layer6]
for layer_no, layer_df in enumerate(layer_frames, start=1):
    df_layer_hyper.loc[layer_df["week-turbine"], "Layer" + str(layer_no)] = layer_df["cluster"].values + 1

In [None]:
# Show (up to 50) rows that have a missing value in at least one column.
df_layer_hyper[df_layer_hyper.isna().any(axis=1)].head(50)

### Incremental DBSCAN

In [None]:
# pip install incdbscan
from incdbscan import IncrementalDBSCAN

In [None]:
# DBSCAN = INCDBSCAN
# Insert all Layer 1 points in one go and plot the stored DBSCAN labels next to
# the IncrementalDBSCAN labels on the same 2-D PCA projection.
layer1_points = layer_dict["Layer 1"]["Data_transformed"]

incdb = IncrementalDBSCAN(eps=0.1, min_pts=5)
incdb.insert(layer1_points)

dfincdb_layer1_temp = dfincdb_layer1.copy()
dfincdb_layer1_temp["incdb_cluster"] = incdb.get_cluster_labels(layer1_points)

for label_column in ("cluster", "incdb_cluster"):
    fig = px.scatter(
        x=pca_layer1[:, 0],
        y=pca_layer1[:, 1],
        color=dfincdb_layer1_temp[label_column].values.astype(str),
    )
    fig.show()

In [None]:
from incdbscan import IncrementalDBSCAN

# Two-batch insertion: rows 0..399 first, then the remainder. Labels for each
# batch are read right after that batch is inserted.
split_at = 400
incdb = IncrementalDBSCAN(eps=0.1, min_pts=5)
dfincdb_layer1_temp = dfincdb_layer1.copy()
temp = layer_dict["Layer 1"]["Data_transformed"].copy()
first_batch = temp[0:split_at]
second_batch = temp[split_at::]

incdb.insert(first_batch)  # 1st batch
dfincdb_layer1_temp.loc[0:split_at - 1, "incdb_cluster"] = incdb.get_cluster_labels(first_batch)

incdb.insert(second_batch)  # 2nd batch
dfincdb_layer1_temp.loc[split_at::, "incdb_cluster"] = incdb.get_cluster_labels(second_batch)  # labels for 2nd batch only

for label_column, plot_title in (("cluster", "DBScan"), ("incdb_cluster", "IncDBScan")):
    fig = px.scatter(
        x=pca_layer1[:, 0],
        y=pca_layer1[:, 1],
        color=dfincdb_layer1_temp[label_column].values.astype(str),
        title=plot_title,
    )
    fig.show()

In [None]:
# Insert only the first n Layer 1 points and compare, on those n points, the
# stored DBSCAN labels with the IncrementalDBSCAN labels.
n = 100
incdb = IncrementalDBSCAN(eps=0.1, min_pts=5)
temp = layer_dict["Layer 1"]["Data_transformed"].copy()
dfincdb_layer1_temp = dfincdb_layer1.copy()
dfincdb_layer1_temp['incdb_cluster'] = None

first_batch = temp[0:n]
incdb.insert(first_batch)  # 1st batch
dfincdb_layer1_temp.loc[0:n - 1, "incdb_cluster"] = list(incdb.get_cluster_labels(first_batch))

for label_column, plot_title in (("cluster", "DBScan"), ("incdb_cluster", "IncDBScan")):
    fig = px.scatter(
        x=pca_layer1[0:n, 0],
        y=pca_layer1[0:n, 1],
        color=dfincdb_layer1_temp.iloc[0:n][label_column].values.astype(str),
        title=plot_title,
    )
    fig.show()

In [None]:
# Insert the remaining points (2nd batch), then re-read the labels of ALL points
# so that labels of the first batch reflect the updated clustering.
remaining_points = temp[n::]
incdb.insert(remaining_points)
dfincdb_layer1_temp["incdb_cluster"] = incdb.get_cluster_labels(temp)  # get labels for all

for label_column, plot_title in (("cluster", "DBScan"), ("incdb_cluster", "IncDBScan")):
    fig = px.scatter(
        x=pca_layer1[:, 0],
        y=pca_layer1[:, 1],
        color=dfincdb_layer1_temp[label_column].values.astype(str),
        title=plot_title,
    )
    fig.show()

In [None]:
# Feed Layer 1 to IncrementalDBSCAN in consecutive chunks of 56 rows and, after
# each chunk, plot the labels of everything inserted so far.
incdb = IncrementalDBSCAN(eps=0.1, min_pts=5)
dfincdb_layer1_temp = dfincdb_layer1.copy()
dfincdb_layer1_temp['incdb_cluster'] = None
temp = layer_dict["Layer 1"]["Data_transformed"].copy()

batch_size = 56
batch_ends = np.arange(batch_size, len(temp), batch_size)
for month, i in enumerate(batch_ends, start=1):
    j = i - batch_size  # start of the chunk that ends (exclusive) at i
    incdb.insert(temp[j:i])
    dfincdb_layer1_temp.loc[0:i - 1, "incdb_cluster"] = list(incdb.get_cluster_labels(temp[0:i]))
    labels_so_far = dfincdb_layer1_temp.iloc[0:i]["incdb_cluster"].values.astype(str)
    fig = px.scatter(
        x=pca_layer1[0:i, 0],
        y=pca_layer1[0:i, 1],
        color=labels_so_far,
        title="Month-" + str(month),
    )
    fig.show()

In [None]:
# Build the six feature layers for one calendar year, scale each layer to [0, 1],
# and feed Layer 1 to IncrementalDBSCAN chunk by chunk, plotting after each chunk.
year = "2019"
df_oneyear = df_penmanshiel_per_week_median[df_penmanshiel_per_week_median["Datetime"].astype(str).str.contains(year)]

# creating layers (each keeps the week-turbine label and drops rows with missing features)
Layer1 = df_oneyear.reset_index()[['week-turbine','Wind direction u','Wind direction v']].dropna().copy()
Layer2 = df_oneyear.reset_index()[['week-turbine','Nacelle position cos', 'Nacelle position sin','Vane position 1+2 cos', 'Vane position 1+2 sin','Blade Angle (pitch position) cos','Blade Angle (pitch position) sin']].dropna().copy()
Layer3 = df_oneyear.reset_index()[['week-turbine','Generator bearing rear temperature (°C)','Generator bearing front temperature (°C)','Generator RPM (RPM)','Rotor bearing temp (°C)','Drive train acceleration (mm/ss)']].dropna().copy()
Layer4 = df_oneyear.reset_index()[['week-turbine','Gear oil temperature (°C)','Gear oil inlet temperature (°C)','Gear oil pump pressure (bar)']].dropna().copy()
Layer5 = df_oneyear.reset_index()[['week-turbine','Motor temperature (°C)','Motor current (A)']].dropna().copy()
Layer6 = df_oneyear.reset_index()[['week-turbine','Tower Acceleration X (mm/ss)','Tower Acceleration y (mm/ss)']].dropna().copy()

layer_data = [Layer1, Layer2, Layer3, Layer4, Layer5, Layer6]
layer_dict = {}

for l_id, layer_frame in enumerate(layer_data):
    layer_key = 'Layer ' + str(l_id + 1)
    scaler = MinMaxScaler()
    layer_data_transformed = scaler.fit_transform(layer_frame.drop(["week-turbine"], axis=1).to_numpy())
    layer_dict[layer_key] = {
        "Layer_data": layer_frame,
        "Data_transformed": layer_data_transformed,
    }

# One incremental model per layer; only incdb_1 is exercised in this cell.
incdb_1 = IncrementalDBSCAN(eps=0.1, min_pts=5)
incdb_2 = IncrementalDBSCAN(eps=0.15, min_pts=5)
incdb_3 = IncrementalDBSCAN(eps=0.12, min_pts=5)
incdb_4 = IncrementalDBSCAN(eps=0.1, min_pts=5)
incdb_5 = IncrementalDBSCAN(eps=0.04, min_pts=5)
incdb_6 = IncrementalDBSCAN(eps=0.045, min_pts=5)

temp = layer_dict["Layer 1"]["Data_transformed"].copy()
j = 0
month = 1
for i in np.arange(56, len(temp), 56):
    # Insert only the new chunk [j, i); j is advanced below so rows already in
    # the model are not inserted again on the next pass.
    incdb_1.insert(temp[j:i])
    # np.asarray so that .astype works whatever sequence type the labels come back as.
    labels_so_far = np.asarray(incdb_1.get_cluster_labels(temp[0:i]))
    # NOTE(review): pca_layer1 comes from another cell; confirm it was fitted on
    # this same Layer 1 data so that rows line up with the labels.
    fig = px.scatter(
        x=pca_layer1[0:i, 0],
        y=pca_layer1[0:i, 1],
        color=labels_so_far.astype(str),
        title="Month-" + str(month),
    )
    fig.show()
    j = i
    month = month + 1

## Incremental hypergraph clustering

In [None]:
# Row counts of the weekly-median table for 2018-2020 and for 2018 alone.
# Only the value of the last expression is displayed by the notebook.
len(df_penmanshiel_per_week_median[df_penmanshiel_per_week_median["Datetime"].astype(str).str.contains('2018|2019|2020')])
len(df_penmanshiel_per_week_median[df_penmanshiel_per_week_median["Datetime"].astype(str).str.contains('2018')])

In [None]:
def generate_final_clusters(final_clusters1, hypergraph1, method):
    """Turn clusters of hyperedges into clusters of data objects.

    Parameters
    ----------
    final_clusters1 : dict
        Maps a cluster key to an iterable of hyperedge ids belonging to it.
    hypergraph1 : object
        Must expose ``pins(edge_id)`` returning the node indices of a hyperedge.
    method : str
        ``'donot_inc_key_in_cluster'`` uses only the pins of the listed
        hyperedges; ``'inc_key_in_cluster'`` additionally includes the pins of
        the hyperedge whose id equals the cluster key.

    Returns
    -------
    dict
        Cluster key -> set of short ids (``nodes_hyper[node_index]``).

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported values.

    Notes
    -----
    Reads the notebook-level names ``nodes_hyper`` (node index -> short id) and
    ``Debug`` (enables diagnostic prints).
    """
    valid_methods = ('donot_inc_key_in_cluster', 'inc_key_in_cluster')
    if method not in valid_methods:
        raise ValueError(
            "method must be one of " + str(valid_methods) + ", got " + repr(method)
        )

    # mapping hyperedges to data objects to obtain the clustering solution of data objects
    clustering_nodes = {}
    for key, val in final_clusters1.items():
        # Start from the key hyperedge's own pins only when asked to.
        pins_center = set(hypergraph1.pins(key)) if method == 'inc_key_in_cluster' else set()
        for edge in val:
            pins_center.update(hypergraph1.pins(edge))
        clustering_nodes[key] = pins_center

    if Debug:
        print("clustering of data objects", clustering_nodes) # dict, key = center(hyperedge), values = data objects

    # replacing the index of the data object with its short id (sets are unordered)
    clus_nodes_short_id = {
        key: {nodes_hyper[x] for x in val} for key, val in clustering_nodes.items()
    }

    if Debug:
        print("clustering solution, key = center (hyperedge), val = set of short_ids")
        print(clus_nodes_short_id)
    return clus_nodes_short_id

In [None]:
def merge_dataframes(base_df, new_df, new_suffix):
    """Right-join ``base_df`` onto ``new_df`` by the 'week-turbine' column.

    Every row of ``new_df`` is kept. Columns of ``new_df`` whose names clash
    with ``base_df`` get ``new_suffix`` appended; ``base_df`` columns keep
    their names.
    """
    return base_df.merge(
        new_df,
        how='right',
        on='week-turbine',
        suffixes=('', new_suffix),
    )

In [None]:
# year ="2019"
# df_oneyear=df_penmanshiel_per_week_median[df_penmanshiel_per_week_median["Datetime"].astype(str).str.contains(year)]

# show=True enables all plots and diagnostic prints inside the batch loop below.
show=True
# for b in range(13):
for b in [10]:
    batch=b
    print(batch)
    # Growing window: the first 742 rows plus 56 further rows per batch.
    # NOTE(review): presumably 742 = 53 weeks x 14 turbines and 56 = 4 weeks x 14 turbines — confirm.
    df_oneyear=df_penmanshiel_per_week_median.iloc[0:(742+(56*batch))].copy()
    # creating layers: each layer keeps the week-turbine label plus one group of
    # features, and drops rows with any missing value in that group
    Layer1 = df_oneyear.reset_index()[['week-turbine','Wind direction u','Wind direction v']].dropna().copy()
    Layer2 = df_oneyear.reset_index()[['week-turbine','Nacelle position cos', 'Nacelle position sin','Vane position 1+2 cos', 'Vane position 1+2 sin','Blade Angle (pitch position) cos','Blade Angle (pitch position) sin']].dropna().copy()
    Layer3 = df_oneyear.reset_index()[['week-turbine','Generator bearing rear temperature (°C)','Generator bearing front temperature (°C)','Generator RPM (RPM)','Rotor bearing temp (°C)','Drive train acceleration (mm/ss)']].dropna().copy()
    Layer4 = df_oneyear.reset_index()[['week-turbine','Gear oil temperature (°C)','Gear oil inlet temperature (°C)','Gear oil pump pressure (bar)']].dropna().copy()
    Layer5 = df_oneyear.reset_index()[['week-turbine','Motor temperature (°C)','Motor current (A)']].dropna().copy()
    Layer6 = df_oneyear.reset_index()[['week-turbine','Tower Acceleration X (mm/ss)','Tower Acceleration y (mm/ss)']].dropna().copy()
    
    layer_data=[Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]
    layer_dict={}
    
    # For every layer store the raw frame ("Layer_data") and its min-max scaled
    # feature matrix ("Data_transformed"); the scaler is refit per layer and per batch.
    for l_id in range(len(layer_data)): # Specify number of layers
        layer_dict['Layer '+str(l_id+1)] = {}
        layer_dict['Layer '+str(l_id+1)]["Layer_data"] = layer_data[l_id]
        
        scaler = MinMaxScaler()
        layer_data_transformed = scaler.fit_transform(layer_data[l_id].drop(["week-turbine"],axis=1).to_numpy())
        layer_dict['Layer '+str(l_id+1)]["Data_transformed"] = layer_data_transformed
    
    # Per layer: DBSCAN on the scaled features, a PCA projection for plotting,
    # and the labels written into both LayerN and layer_dict (same frame object).
    # A copy including the labels is kept as dfincdb_layerN.

    # Layer 1
    db = DBSCAN(eps=0.1).fit(layer_dict["Layer 1"]["Data_transformed"])
    pca_layer1 = PCA(n_components=2).fit_transform(layer_dict["Layer 1"]["Data_transformed"])
    if (show):
        # The title reports the variance explained by the first two components (PCA is refit for that).
        fig=px.scatter(x=pca_layer1[:, 0],y=pca_layer1[:, 1],color=db.labels_.astype(str),title="PCA (" +str(np.round(np.sum(PCA(n_components=2).fit(layer_dict["Layer 1"]["Data_transformed"]).explained_variance_ratio_),2)) +")")
        fig.show()
    Layer1["cluster"]=layer_dict["Layer 1"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer1=Layer1.copy()
    
    # Layer 2
    # NOTE(review): this projection keeps 6 components while the other layers keep 2;
    # only the first two are plotted — confirm whether this is intentional.
    db = DBSCAN(eps=0.12).fit(layer_dict["Layer 2"]["Data_transformed"])
    pca_layer2= PCA(n_components=6).fit_transform(layer_dict["Layer 2"]["Data_transformed"])
    if (show):
        fig=px.scatter(x=pca_layer2[:, 0],y=pca_layer2[:, 1],color=db.labels_.astype(str),title="PCA (" +str(np.round(np.sum(PCA(n_components=2).fit(layer_dict["Layer 2"]["Data_transformed"]).explained_variance_ratio_),2)) +")")
        fig.show()
    Layer2["cluster"]=layer_dict["Layer 2"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer2=Layer2.copy()
    
    # Layer 3
    db = DBSCAN(eps=0.1).fit(layer_dict["Layer 3"]["Data_transformed"])
    pca_layer3= PCA(n_components=2).fit_transform(layer_dict["Layer 3"]["Data_transformed"])
    if (show):
        fig=px.scatter(x=pca_layer3[:, 0],y=pca_layer3[:, 1],color=db.labels_.astype(str),title="PCA (" +str(np.round(np.sum(PCA(n_components=2).fit(layer_dict["Layer 3"]["Data_transformed"]).explained_variance_ratio_),2)) +")")
        fig.show()
    Layer3["cluster"]=layer_dict["Layer 3"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer3=Layer3.copy()
    
    # Layer 4
    db = DBSCAN(eps=0.1).fit(layer_dict["Layer 4"]["Data_transformed"])
    pca_layer4= PCA(n_components=2).fit_transform(layer_dict["Layer 4"]["Data_transformed"])
    if (show):
        fig=px.scatter(x=pca_layer4[:, 0],y=pca_layer4[:, 1],color=db.labels_.astype(str),title="PCA (" +str(np.round(np.sum(PCA(n_components=2).fit(layer_dict["Layer 4"]["Data_transformed"]).explained_variance_ratio_),2)) +")")
        fig.show()
    Layer4["cluster"]=layer_dict["Layer 4"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer4=Layer4.copy()
    
    # Layer 5 (two features, so the plot title is a fixed string rather than a computed ratio)
    db = DBSCAN(eps=0.05).fit(layer_dict["Layer 5"]["Data_transformed"])
    pca_layer5 = PCA(n_components=2).fit_transform(layer_dict["Layer 5"]["Data_transformed"])
    if (show):
        fig=px.scatter(x=pca_layer5[:, 0],y=pca_layer5[:, 1],color=db.labels_.astype(str),title="PCA(1.0)")
        fig.show()
    Layer5["cluster"]=layer_dict["Layer 5"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer5=Layer5.copy()
    
    # Layer 6 (two features, fixed title as for Layer 5)
    db = DBSCAN(eps=0.05).fit(layer_dict["Layer 6"]["Data_transformed"])
    pca_layer6 = PCA(n_components=2).fit_transform(layer_dict["Layer 6"]["Data_transformed"])
    if (show):
        fig=px.scatter(x=pca_layer6[:, 0],y=pca_layer6[:, 1],color=db.labels_.astype(str),title="PCA (1.0)")
        fig.show()
    Layer6["cluster"]=layer_dict["Layer 6"]["Layer_data"]["cluster"]=db.labels_
    dfincdb_layer6=Layer6.copy()
    
    # Drop outliers: rows labelled -1 are removed from layer_dict, from the
    # layer_data list, and from the LayerN frames themselves.
    layer_data=[Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]
    for l_id, key in enumerate(layer_dict.keys()):
        temp=layer_dict[key]["Layer_data"]
        layer_dict[key]["Layer_data"]=temp[temp["cluster"]!=-1]
        layer_data[l_id]=temp[temp["cluster"]!=-1]
    
    Layer1=Layer1[Layer1["cluster"]!=-1]
    Layer2=Layer2[Layer2["cluster"]!=-1]
    Layer3=Layer3[Layer3["cluster"]!=-1]
    Layer4=Layer4[Layer4["cluster"]!=-1]
    Layer5=Layer5[Layer5["cluster"]!=-1]
    Layer6=Layer6[Layer6["cluster"]!=-1]
    if (show):
        # Number of remaining clusters per layer.
        for key in layer_dict.keys():
            print(key+"--->"+str(len(np.unique(layer_dict[key]["Layer_data"]["cluster"])))+" clusters")
    
    # Hypergraph: nodes are the week-turbine rows of df_oneyear, and every
    # layer cluster becomes one hyperedge containing its member rows.
    # creating a nested list, where each inner list lists the ids in that cluster. 
    lst = [v for v in Layer1.groupby('cluster')['week-turbine'].apply(list).values]
    lst = lst + [v for v in Layer2.groupby('cluster')['week-turbine'].apply(list).values]
    lst = lst + [v for v in Layer3.groupby('cluster')['week-turbine'].apply(list).values]
    lst = lst + [v for v in Layer4.groupby('cluster')['week-turbine'].apply(list).values]
    lst = lst + [v for v in Layer5.groupby('cluster')['week-turbine'].apply(list).values]
    lst = lst + [v for v in Layer6.groupby('cluster')['week-turbine'].apply(list).values]
    # hyperedge_indices holds the cumulative start offset of each hyperedge
    # within the flat `hyperedges` pin list (plus the final end offset).
    hyperedge_indices = []
    cnt = 0
    hyperedge_indices.append(cnt)
    for each in lst:
        cnt = cnt+len(each)
        hyperedge_indices.append(cnt)
    nodes_hyper = df_oneyear.index.tolist()
    hyperedges_1 = [item for sublist in lst for item in sublist]
    # Translate week-turbine labels to node positions (linear search per pin).
    hyperedges = [nodes_hyper.index(i) for i in hyperedges_1]
    num_nodes = len(nodes_hyper)
    num_nets = len(hyperedge_indices)-1
    
    k = 2
    hypergraph = kahypar.Hypergraph(num_nodes, num_nets, hyperedge_indices, hyperedges, k)
    # The configuration is loaded but `context` is not used further in this cell.
    context = kahypar.Context()
    context.loadINIconfiguration("cut_kKaHyPar_sea20.ini")
    
    node_incident_edges = [] # list containg the incident edges of each node in sublist.
    for each_node in hypergraph.nodes():
        ie = []
        for incident_edge in hypergraph.incidentEdges(each_node):
            ie.append(incident_edge)
        node_incident_edges.append(ie)
    cluster_list = [sublist for sublist in lst if len(sublist) > 1]
    
    #Obtaining the neighbourhood of each edge.
    total_edges = hypergraph.numEdges()
    
    # neighbourhood[i] = set of all edges that share at least one node with edge i
    # (edge i itself is included whenever it has a node).
    neighbourhood = [] # neighbourhood of each edge is presented in order.
    number_neighbours = []
    for i in range(total_edges):
        temp = []
        for node, edges in enumerate(node_incident_edges):
            if i in edges:
                for e in edges:
                    temp.append(e)
        neighbourhood.append(set(temp))
        number_neighbours.append(len(set(temp)))
    # Human-readable label per hyperedge, in the same layer/cluster order as `lst`.
    layer_list_temp=[]
    for ind,key in enumerate([Layer1,Layer2,Layer3,Layer4,Layer5,Layer6]):
        for cls in range(key["cluster"].nunique()):
            layer_list_temp.append("Layer"+str(ind+1)+"- cluster"+str(cls+1))
    
    # calculating Nearest Neighbourhood Similarity
    # For every pair i < j: 0 when the two edges are not in each other's
    # neighbourhood, otherwise the Jaccard index of their neighbourhoods.
    NNS = {}
    for i in range(total_edges):
        for j in range(i, total_edges):
            if i != j:
                # print("checking intersection of ", i, "and", j, "i.e.,", neighbourhood[i], "and", neighbourhood[j])
                intersection = neighbourhood[i].intersection(neighbourhood[j])
                if (i not in intersection) or (j not in intersection): # or condition is not required. if i is in intersection then automatically j will be in the intersection. 
                    # print(i, j, intersection, neighbourhood[i], neighbourhood[j])
                    NNS[(i,j)] = 0
                else:
                    union = neighbourhood[i].union(neighbourhood[j])
                    NNS[(i,j)] = len(intersection)/len(union)
    # Upper-triangular similarity table (only i < j cells are filled; the rest stay NaN).
    cnt=0
    df_snns=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
    for i in np.arange(0,total_edges,1):
        for j in np.arange(0,total_edges,1):
            if (i,j) in NNS:
                cnt=cnt+1
                df_snns.loc[i,j]=NNS[i,j]
    # converting the similarity matrix into distance matrix.
    NNS_dist = {k: 1-v for k, v in NNS.items()}
    
    # obtaining the distance matrix (full symmetric nested list, zero diagonal)
    array_dist = []
    for i in range(total_edges):
        temp = []
        for j in range(total_edges):
            if i != j:
                temp.append(NNS_dist[(min(i, j), max(i,j))])
            else:
                temp.append(0)
        array_dist.append(temp)
    
    # Same distances as a DataFrame, rounded to 2 decimals, and the matching similarity table.
    df_dist=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
    for i in np.arange(0,total_edges,1):
        for j in np.arange(0,total_edges,1):
            # if (i<=j):
            df_dist.loc[i,j]=np.round(array_dist[i][j],2)
    df_sim=1-df_dist
    
    #KMedoids
    # Clustering using KMedoids, and calculated distance matrix based on NNS
    from sklearn_extra.cluster import KMedoids
    
    # Sweep the number of clusters from 2 to total_edges-1 and record the
    # silhouette score and labels of each run.
    # (`silhouette_score` here is a local list; the scorer is metrics.silhouette_score.)
    silhouette_score = []
    labels_kmedoids = []
    for num_clusters in range(2, total_edges):
        kmedoids_ = KMedoids(n_clusters=num_clusters, metric='precomputed', method='pam', init='k-medoids++', random_state=0).fit_predict(np.array(array_dist))
        # print(kmedoids_)
        # Hyperedge indices ordered by cluster label, used to order the heat map below.
        hy_sorted_cluster = []
        for _ in range(num_clusters):
            hy_sorted_cluster.extend(np.where(kmedoids_ == _)[0].tolist())
    
        heat_map = []
        for i in hy_sorted_cluster:
            temp = []
            for j in hy_sorted_cluster:
                if i != j:
                    temp.append(NNS_dist[(min(i, j), max(i,j))])
                else:
                    temp.append(0)
            heat_map.append(temp)
            
    
        silhouette_score.append(metrics.silhouette_score(array_dist, kmedoids_, metric="precomputed"))
        labels_kmedoids.append(kmedoids_)
    
        # Overwritten on every pass: after the loop only the heat map of the
        # last num_clusters value remains.
        index_heatmap = kmedoids_.copy()
        index_heatmap.sort()
        dataframe_heat_map = pd.DataFrame(heat_map, index=index_heatmap, columns=index_heatmap)
    if (show):
        # Print and plot silhouette score against number of clusters (starting at 2).
        i_ = 2
        for each in silhouette_score:
            print(i_, each)
            i_ = i_+1

        plt.plot(range(2,total_edges), silhouette_score)
        plt.ylabel("silhouette score")
        plt.xlabel("number of clusters")
        
    if (show):
        # NOTE(review): cvm.find_optimal_number_of_clusters is a project helper not visible
        # here; it receives the distance matrix as its data argument — confirm how it treats it.
        # KMeans
        cvm.find_optimal_number_of_clusters(array_dist, algorithm=KMeans, display= True, seed= 0, col_wrap=4 ,score_metrics = ['silhouette_score',
                                     'calinski_harabasz_score',
                                     'davies_bouldin_score',
                                     'connectivity_score'], max_number_clusters=total_edges-1)
        
        # Agglomerative
        cvm.find_optimal_number_of_clusters(array_dist, algorithm=AgglomerativeClustering, display= True, seed= 0, col_wrap=4 ,score_metrics = ['silhouette_score',
                                     'calinski_harabasz_score',
                                     'davies_bouldin_score',
                                     'connectivity_score'], max_number_clusters=total_edges-1)
    
    # Final grouping of hyperedges into 6 hyperclusters ("elementary modes").
    # kmeans=KMeans(n_clusters=5).fit_predict(np.array(array_dist))
    # kmeans = KMedoids(n_clusters=7, metric='precomputed', method='pam', init='k-medoids++', random_state=0).fit_predict(np.array(array_dist))
    # NOTE(review): despite its name, `kmeans` holds AgglomerativeClustering labels, and the
    # estimator is called with default settings on the distance matrix rows (no
    # precomputed-metric argument is passed) — confirm this is intended.
    kmeans=AgglomerativeClustering(n_clusters=6).fit_predict(np.array(array_dist))
    # Hypercluster id -> list of hyperedge indices; the keys must match n_clusters above.
    dict_clusters = {0: [], 1:[], 2:[],3:[],4:[],5:[]}
    for index, each in enumerate(kmeans):
        dict_clusters[each].append(index)
    
    # Debug is read as a global by generate_final_clusters.
    Debug = False
    clus_sol = generate_final_clusters(dict_clusters, hypergraph, 'donot_inc_key_in_cluster')
    # print(clus_sol) # clus_sol is the final clustering solution based on k-medoids based method. key: cluster number, value: data objects in cluster
    cluster_objects_list=[list(value) for value in clus_sol.values()]
    if (show):
        # Size of each hypercluster (number of week-turbine objects).
        for key in clus_sol.keys():
            print(str(key) +" : "+str(len(list(clus_sol[key]))))

        # Box plot of 'Power (kW)' per hypercluster.
        fig=go.Figure()
        for key in clus_sol.keys():
            temp=df_oneyear.reset_index().copy()
            temp=temp[temp["week-turbine"].isin(list(clus_sol[key]))].copy()
        
            fig.add_trace(go.Box(y=temp['Power (kW)'], name='E'+str(key)))
        
        fig.update_layout(title="Active power")
        # fig.update_xaxes(title="Cluster")
        fig.update_yaxes(title="Power (kW)")
        fig.show()
    
    # Boolean object x hypercluster table: df_fca.loc[obj, "E<k>"] is True when
    # the week-turbine object belongs to hypercluster k (objects may belong to several).
    df_fca=pd.DataFrame(index=df_oneyear.index.astype(str),columns=clus_sol.keys())
    for ind,row in df_fca.iterrows():
        for clus in row.index:
            if (ind in list(clus_sol[clus])):
                df_fca.loc[ind,clus]=True
            else:
                df_fca.loc[ind,clus]=False
    df_fca.columns=df_fca.columns.astype(str)
    df_fca.columns="E"+df_fca.columns
    
    # Formal concept analysis on that table: build the context and its concept lattice.
    from fcapy.context import FormalContext
    K = FormalContext.from_pandas(pd.DataFrame(df_fca))
    
    from fcapy.lattice import ConceptLattice
    L = ConceptLattice.from_context(K)
    if (show):
        from fcapy.visualizer import LineVizNx
        fig, ax = plt.subplots(figsize=(10, 5))
        vsl = LineVizNx()
        vsl.draw_concept_lattice(L, ax=ax, flg_node_indices=True,flg_new_intent_count_prefix=False)
        ax.set_title('week-turbine concept lattice')
        plt.tight_layout()
        plt.show()
    
    # week_cluster: week number -> hyperclusters that contain at least one
    # object of that week (matched on the "W<week>:" substring of the label).
    # NOTE(review): assumes weeks are numbered 1..nunique() without gaps — confirm.
    week_cluster={}
    for w_id in np.arange(1,df_oneyear["week"].nunique()+1,1):
        clst_lst=[]
        for key in clus_sol.keys():
            lst=[element for element in list(clus_sol[key]) if "W"+str(w_id)+":" in element]
            if (len(lst)!=0):
                clst_lst.append(key)
        week_cluster[w_id]=clst_lst
    
    
    # turbine_cluster: turbine -> hyperclusters containing at least one of its objects.
    turbine_cluster={}
    for turb in df_oneyear["Turbine"].unique():
        clst_lst=[]
        for key in clus_sol.keys():
            lst=[element for element in list(clus_sol[key]) if turb in element]
            if (len(lst)!=0):
                clst_lst.append(key)
        turbine_cluster[turb]=clst_lst
    
    # "0"/"1" string indicator tables (week x hypercluster and turbine x hypercluster).
    df_week_cluster=pd.DataFrame(str(0),index=list(week_cluster.keys()),columns=np.arange(0,len(clus_sol.keys()),1))
    for ind,row in df_week_cluster.iterrows():
        df_week_cluster.loc[ind,week_cluster[ind]]=str(1)
    df_week_cluster.index=df_week_cluster.index.astype(str)
    df_week_cluster.columns=df_week_cluster.columns.astype(str)
    
    
    df_turbine_cluster=pd.DataFrame(str(0),index=list(turbine_cluster.keys()),columns=np.arange(0,len(clus_sol.keys()),1))
    for ind,row in df_turbine_cluster.iterrows():
        df_turbine_cluster.loc[ind,turbine_cluster[ind]]=str(1)
    df_turbine_cluster.index=df_turbine_cluster.index.astype(str)
    df_turbine_cluster.columns=df_turbine_cluster.columns.astype(str)
    
    # week x turbine table whose cells hold the list of hyperclusters the
    # "W<week>:<turbine>" object belongs to; distinct lists are collected on the way.
    df_week_turbine_plot=pd.DataFrame(index=df_week_cluster.index.astype(int),columns=df_turbine_cluster.index)
    unique_combination=[]
    for w_id in np.arange(1,df_oneyear["week"].nunique()+1,1):
        for t_id in df_turbine_cluster.index:
            week_turb= "W"+str(w_id)+":"+t_id 
            comb = [key for key, values in clus_sol.items() if week_turb in values]
            df_week_turbine_plot.loc[w_id,t_id]=comb
            if comb not in unique_combination:
                unique_combination.append(comb)
    # Give each distinct combination an integer id starting at 100 (zip stops at
    # the shorter sequence, so the extra id from the +1 is unused), keyed by tuple.
    unique_dict=dict(zip(np.arange(100,100+len(unique_combination)+1,1), unique_combination))
    unique_dict={tuple(value): key for key, value in unique_dict.items()}
    unique_dict.keys()
    
    # Same table with every combination replaced by its id (as strings).
    df_week_turbine_plot_id=df_week_turbine_plot.copy()
    for ind,row in df_week_turbine_plot_id.iterrows():
        for col in df_week_turbine_plot_id.columns:
            df_week_turbine_plot_id.loc[ind,col]=unique_dict[tuple(df_week_turbine_plot_id.loc[ind,col])]
            
    df_week_turbine_plot_id.index=df_week_turbine_plot_id.index.astype(str)
    df_week_turbine_plot_id=df_week_turbine_plot_id.astype(int).astype(str)
    if (show):
        fig = px.imshow(df_week_turbine_plot_id.T,labels=dict(x="Week", y="Turbine"))
        fig.update_layout(height=700)
        fig.show()
        print(unique_dict)
    
    # Track each layer's cluster labels across batches: batch 0 starts the
    # LayerN_merging tables; every later batch right-joins its labels as a new
    # column suffixed with "_<batch>".
    # NOTE(review): the else branch needs LayerN_merging to exist already. With the
    # loop restricted to a single non-zero batch, that only holds if batch 0 was run
    # earlier in the same kernel session.
    if (batch==0): # Initial batch
        Layer1_merging=Layer1[["week-turbine","cluster"]].copy()
        Layer2_merging=Layer2[["week-turbine","cluster"]].copy()
        Layer3_merging=Layer3[["week-turbine","cluster"]].copy()
        Layer4_merging=Layer4[["week-turbine","cluster"]].copy()
        Layer5_merging=Layer5[["week-turbine","cluster"]].copy()
        Layer6_merging=Layer6[["week-turbine","cluster"]].copy()
    
    else:
        Layer1_merging=merge_dataframes(Layer1_merging,Layer1[["week-turbine","cluster"]], '_'+str(batch))
        Layer2_merging=merge_dataframes(Layer2_merging,Layer2[["week-turbine","cluster"]], '_'+str(batch))
        Layer3_merging=merge_dataframes(Layer3_merging,Layer3[["week-turbine","cluster"]], '_'+str(batch))
        Layer4_merging=merge_dataframes(Layer4_merging,Layer4[["week-turbine","cluster"]], '_'+str(batch))
        Layer5_merging=merge_dataframes(Layer5_merging,Layer5[["week-turbine","cluster"]], '_'+str(batch))
        Layer6_merging=merge_dataframes(Layer6_merging,Layer6[["week-turbine","cluster"]], '_'+str(batch))

In [None]:
# Number of distinct cluster labels per batch column, one line per layer.
merging_frames = [Layer1_merging, Layer2_merging, Layer3_merging,
                  Layer4_merging, Layer5_merging, Layer6_merging]

fig = go.Figure()
for layer_no, merged in enumerate(merging_frames, start=1):
    cluster_counts = merged.nunique().drop('week-turbine').dropna()
    fig.add_traces(
        go.Scatter(x=cluster_counts.index, y=cluster_counts.values, name="Layer " + str(layer_no))
    )

# fig.write_html("Layer_clusters_initial_period_1year.html")
fig.show()

In [None]:
# Imported here because this cell comes before the cell that imports the
# scorer; without it a top-to-bottom run fails with NameError.
from sklearn.metrics import adjusted_mutual_info_score

# Agreement between Layer 2 labels of the initial batch and of batch 1, on the
# rows that have a label in both.
# NOTE(review): "cluster_1" only exists when batch 1 was merged into Layer2_merging.
temp = Layer2_merging[["cluster", "cluster_1"]].dropna()
adjusted_mutual_info_score(temp["cluster"], temp["cluster_1"])

In [None]:
# Line plot of Layer 6 cluster labels per row, one line per batch column.
fig=px.line(Layer6_merging.drop("week-turbine",axis=1))
fig.show()

In [None]:
# Distinct Layer 6 labels in the batch-4 column (requires batch 4 to have been merged).
Layer6_merging["cluster_4"].unique()

In [None]:
# Columns accumulated so far: the label column plus one cluster column per merged batch.
Layer1_merging.columns

In [None]:
# Tracking Layer clusters 
# Empty layer x batch table; columns are the batch columns of Layer1_merging
# after its first two columns ('week-turbine' and the initial 'cluster').
df_layer_clusters_ami=pd.DataFrame(index=["Layer1","Layer2","Layer3","Layer4","Layer5","Layer6"],columns=Layer1_merging.columns[2::])

In [None]:
# Display the (still empty) table.
df_layer_clusters_ami

In [None]:
from sklearn.metrics import adjusted_mutual_info_score

# For each layer, score every batch column against the column before it
# (starting from the initial 'cluster' column) with adjusted mutual information,
# using only rows labelled in both columns.
merging_frames = [Layer1_merging, Layer2_merging, Layer3_merging,
                  Layer4_merging, Layer5_merging, Layer6_merging]
for l_id, merged in enumerate(merging_frames):
    print(l_id)
    row_label = "Layer" + str(l_id + 1)
    previous_col = 'cluster'
    for current_col in df_layer_clusters_ami.columns:
        paired = merged[[previous_col, current_col]].dropna()
        df_layer_clusters_ami.loc[row_label, current_col] = adjusted_mutual_info_score(
            paired[previous_col], paired[current_col]
        )
        previous_col = current_col

In [None]:
# Display the filled layer x batch AMI table.
df_layer_clusters_ami

In [None]:
# AMI between consecutive batches, one line per layer (batches on the x axis).
fig=px.line(df_layer_clusters_ami.T)
# fig.write_html("Layer_clusters_initial_period_1year_ami.html")
fig.show()

In [None]:
# Line plot of Layer 6 cluster labels per row, one line per batch column.
fig=px.line(Layer6_merging.drop("week-turbine",axis=1))
fig.show()

In [None]:
# Line plot of Layer 2 cluster labels per row, one line per batch column.
fig=px.line(Layer2_merging.drop("week-turbine",axis=1))
fig.show()

In [None]:
# Agreement between Layer 6 labels of batches 3 and 4, on rows labelled in both
# (requires both batch columns to exist in Layer6_merging).
temp=Layer6_merging[["cluster_3","cluster_4"]].dropna()
adjusted_mutual_info_score(temp["cluster_3"],temp["cluster_4"])

In [None]:
# Number of distinct cluster labels per batch column, one line per layer.
merging_frames = [Layer1_merging, Layer2_merging, Layer3_merging,
                  Layer4_merging, Layer5_merging, Layer6_merging]

fig = go.Figure()
for layer_no, merged in enumerate(merging_frames, start=1):
    cluster_counts = merged.nunique().drop('week-turbine').dropna()
    fig.add_traces(
        go.Scatter(x=cluster_counts.index, y=cluster_counts.values, name="Layer " + str(layer_no))
    )

# fig.write_html("Layer_clusters_initial_period_1year.html")
fig.show()

#### Assign clusters using historical data

In [None]:
# Hypercluster ("E<k>") x layer-cluster membership matrix, plus a dict mapping
# each hypercluster label to the layer-cluster labels it contains.
hypercluster_labels = ['E' + str(i) for i in dict_clusters.keys()]
df_layercls_hypcls = pd.DataFrame(columns=layer_list_temp, index=hypercluster_labels)
elem_modes_historical = {}
for key, members in dict_clusters.items():
    row_label = "E" + str(key)
    member_columns = [layer_list_temp[i] for i in members]
    print(row_label)
    print(member_columns)

    elem_modes_historical[row_label] = member_columns
    df_layercls_hypcls.loc[row_label, member_columns] = 1
df_layercls_hypcls = df_layercls_hypcls.notnull().astype(int)

In [None]:
# Heatmap of the hypercluster (rows) x layer-cluster (columns) membership matrix.
fig=px.imshow(df_layercls_hypcls)
fig.show()

In [None]:
from functools import reduce

# One frame per layer holding the week-turbine label and that layer's cluster id
# in a column named after the layer, outer-joined into a single table.
layer_frames = {
    'Layer1': Layer1, 'Layer2': Layer2, 'Layer3': Layer3,
    'Layer4': Layer4, 'Layer5': Layer5, 'Layer6': Layer6,
}
dfs = [
    frame.rename(columns={'cluster': layer_name})[['week-turbine', layer_name]]
    for layer_name, frame in layer_frames.items()
]
df_layercluster = reduce(lambda left, right: pd.merge(left, right, on='week-turbine', how='outer'), dfs)

# Reorder dataframe so that rows follow the order of df_oneyear's index
df_layercluster["week-turbine"] = pd.Categorical(
    df_layercluster['week-turbine'], categories=df_oneyear.index, ordered=True
)
df_layercluster = df_layercluster.sort_values("week-turbine").reset_index(drop=True)

In [None]:
testing_batch = 9

# Rows after the first 742 + 56*testing_batch form the "new" batch. Their numeric
# layer cluster ids are rewritten as "Layer<k>- cluster<id+1>" labels; missing
# values are left untouched.
new_batch_start = 742 + (56 * testing_batch)
df_layercluster_newbatch = df_layercluster[new_batch_start::].copy()
layer_columns = df_layercluster_newbatch.columns[1::]
for ind, row in df_layercluster_newbatch.iterrows():
    for col in layer_columns:
        if pd.isna(row[col]):
            continue
        df_layercluster_newbatch.loc[ind, col] = col + "- " + "cluster" + str(int(row[col]) + 1)

In [None]:
# Two predictions per new-batch row:
#  - "historical": every elementary mode that shares at least one layer-cluster
#    label with the row;
#  - "hypergraph": the modes flagged True for that week-turbine in df_fca.
df_layercluster_newbatch["Elementary_modes_prediction_historical"] = None
df_layercluster_newbatch["Elementary_modes_prediction_hypergraph"] = None
for ind, row in df_layercluster_newbatch.iterrows():
    row_labels = set(row[1::].values)
    elemmode = [
        elem
        for elem, mode_layer_clusters in elem_modes_historical.items()
        if len(list(row_labels & set(mode_layer_clusters))) != 0
    ]
    df_layercluster_newbatch.at[ind, "Elementary_modes_prediction_historical"] = elemmode

    membership = df_fca.loc[row["week-turbine"]]
    df_layercluster_newbatch.at[ind, "Elementary_modes_prediction_hypergraph"] = membership[membership].index.tolist()

In [None]:
# Copy week and turbine of the new-batch rows from df_oneyear (positional alignment).
new_batch_rows = df_oneyear[(742 + (56 * testing_batch))::]
for column_name in ("week", "Turbine"):
    df_layercluster_newbatch[column_name] = new_batch_rows[column_name].values

In [None]:
from sklearn.metrics import classification_report
# Compare the two predictions as whole-list strings: the hypergraph prediction is
# passed first (y_true position), the historical prediction second (y_pred position).
print(classification_report(df_layercluster_newbatch["Elementary_modes_prediction_hypergraph"].astype(str),
                            df_layercluster_newbatch["Elementary_modes_prediction_historical"].astype(str)))