In [4]:
%load_ext autoreload
%autoreload 2

TRAIN_OPERATIONAL_PATH = 'train_operational_readouts.csv'
TRAIN_REPAIR_PATH = 'train_tte.csv'
TRAIN_SPECIFICATIONS = 'train_specifications.csv' 

import pandas as pd
import matplotlib.pyplot as plt
import kahypar
import plotly.express as px
import seaborn as sns
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from conscious_engie_icare import hypergraph_clustering
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler
import matplotlib as mpl
import matplotlib.ticker as ticker
from sklearn.decomposition import PCA
import math
import warnings
import networkx as nx
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
df_ts = pd.read_csv(TRAIN_OPERATIONAL_PATH)
df_ts.head()

FileNotFoundError: [Errno 2] No such file or directory: 'train_operational_readouts.csv'

Timeseries are monotonically increasing:

In [None]:
df_ = df_ts[df_ts.vehicle_id == 0]
ax = df_['171_0'].plot();
ax.set_title('171_0')

In [None]:
df_ = df_ts[df_ts.vehicle_id == 2]
df_['171_0'].plot()

# Timeseries data

In [None]:
unique_vehicle_ids = len(df_ts.vehicle_id.unique())
print(f'Number of unique vehicles: {unique_vehicle_ids}')

In [None]:
ax = df_ts.vehicle_id.value_counts().plot.hist(bins=100)
ax.set_title('Number of samples per vehicle')
ax.set_xlabel('Samples');

# Specifications data

In [None]:
df_specifications = pd.read_csv(TRAIN_SPECIFICATIONS, index_col='vehicle_id')
df_specifications.iloc[5000:5005]

Rename categories to avoid confusion:

In [None]:
def add_column_prefix(df):
    """Prefix every value with the name of the column it lives in.

    Each cell ``x`` of column ``col`` becomes the string ``f'{col}_{x}'``, so
    that equally named categories of different columns stay distinguishable.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose values should be prefixed. It is left untouched.

    Returns
    -------
    pd.DataFrame
        New frame with the same index and columns as ``df`` holding the
        prefixed string values.
    """
    # Work on a copy so that the frame passed in by the caller is not
    # silently rewritten as a side effect of calling this function.
    prefixed = df.copy()
    for col in prefixed.columns:
        # `col=col` binds the current column name to the lambda explicitly.
        prefixed[col] = df[col].map(lambda x, col=col: f'{col}_{x}')
    return prefixed

df_specifications = add_column_prefix(df_specifications)

The number of unique categories differs between specifications.
Below, we illustrate the distribution of categories.
Note that most specifications are highly imbalanced, with a single category covering the majority of vehicles.

In [None]:
fig, axes = plt.subplots(figsize=(20, 5), nrows=2, ncols=4, sharey=True)
for spec, ax in zip(range(8), axes.flatten()):
    df_specifications[f'Spec_{spec}'].value_counts().plot.bar(ax=ax)
fig.tight_layout()
fig.savefig('distribution_specifications.png')

In [None]:
df_specifications['Spec_1'].value_counts().plot.bar();

In [None]:
df_specifications.head()

In [None]:
len(df_specifications['Spec_0'].unique())

In [None]:
len(df_specifications['Spec_1'].unique())

In [None]:
len(df_specifications['Spec_2'].unique())

In [None]:
len(df_specifications['Spec_3'].unique())

In [None]:
len(df_specifications['Spec_4'].unique())

# Repair data

In [None]:
df_repair = pd.read_csv(TRAIN_REPAIR_PATH)
df_repair.head()

In [None]:
df_ = df_repair.copy()
df_['in_study_repair'] = df_repair['in_study_repair'].replace({0: 'False', 1: 'True'})
ax = df_.in_study_repair.value_counts().plot.bar()
ax.set_title('vehicle repairs')
ax.set_ylabel('vehicles')
ax.set_xlabel('Was repaired?')

# Methods: Explorations

## Formal concept analysis (FCA)

The following cell, which constructs a concept lattice, takes too long to run: there are 3607 unique configurations and hence at least 3607 concepts over 23550 objects. It is therefore disabled with `if False:`.

In [None]:
if False:
    df_transformed = pd.get_dummies(df_specifications, columns=df_specifications.columns, drop_first=False)
    print(f'original shape = {df_specifications.shape}')
    print(f'transformed shape = {df_transformed.shape}')
    cluster_table = df_transformed.to_numpy()
    formal_context = FormalContext(cluster_table)
    display(df_transformed.head())

    n_unique_configurations = len(set([tuple(row) for row in df_transformed.to_numpy()]))
    print("Number of unique combinations:", n_unique_configurations)

    L = ConceptLattice.from_context(formal_context)
    print("# concepts:", len(L))

## Hypergraph clustering

In [None]:
from collections import OrderedDict

nodes_hyper = df_specifications.index.tolist()
layers = df_specifications.to_dict(orient='series')
for k, v in layers.items():
    v.name = 'spec'

# Create an *ordered* dictionary by iterating over each DataFrame, with
# {cat_idx: [vehicle_ids]}
category_to_vehicle_ids = OrderedDict()
for spec_idx, df in layers.items():
    for cat_idx, group in df.reset_index().groupby('spec', sort=False):
        category_to_vehicle_ids[cat_idx] = group['vehicle_id'].tolist()
"""
category_to_vehicle_ids = {cat_idx: group['vehicle_id'].tolist()
                           for spec_idx, df in layers.items()
                           for cat_idx, group in df.reset_index().groupby('spec')}
"""

print("Number of categories in total", len(category_to_vehicle_ids))
print("----------------------------------------------------")

# Create the list of hyperedge offsets: hyperedge_indices[i] and
# hyperedge_indices[i + 1] mark where the trucks of the i-th category start
# and stop inside the flat `hyperedges` list built below.
hyperedge_indices = []
cnt = 0
hyperedge_indices.append(cnt)
for each in category_to_vehicle_ids.values():
    cnt = cnt+len(each)
    hyperedge_indices.append(cnt)

# Convert category_to_vehicle_ids to
# (1) flat_list_ = flat list of vehicle ids (as originally given in df_specifications), and
# (2) hyperedges = the row position of each of those vehicle ids in nodes_hyper.
flat_list_ = [vehicle_id for vehicle_id_list in category_to_vehicle_ids.values() for vehicle_id in vehicle_id_list]

# Look every position up in a dict built once. Calling nodes_hyper.index() per
# entry scans the whole list each time (quadratic in the number of vehicles).
# setdefault keeps the FIRST position of an id, exactly like list.index().
node_position = {}
for position, vehicle_id in enumerate(nodes_hyper):
    node_position.setdefault(vehicle_id, position)
hyperedges = [node_position[vehicle_id] for vehicle_id in flat_list_]

print("Number of category-truck combinations: ", len(hyperedges))
print("-----------------------------------------------------")
num_nodes = len(nodes_hyper)
num_nets = len(hyperedge_indices)-1

k = 2
# Edge and node weights are not passed, as they are not relevant in our case.
# The value of k has no significance here, because the hypergraph is only used
# as a data structure and is not partitioned with the KaHyPar algorithm.
# The numbers in the trailing comments are the sizes observed in the original run.
hypergraph = kahypar.Hypergraph(
    num_nodes,                      # 23550
    num_nets,                       # 90
    index_vector=hyperedge_indices, # 91 ...
    edge_vector=hyperedges,         # 188400
    k=k                             # 2, no significance
)
# NOTE(review): `context` is not used anywhere in the visible part of the notebook.
context = kahypar.Context()
print("number of edges of hyper-graph", hypergraph.numEdges())
# Clusters list:
# keep only the categories with more than one truck; a category with a single
# truck is not considered a hyperedge.
cluster_list = [cat_idx for cat_idx, sublist in category_to_vehicle_ids.items() if len(sublist) > 1]
print("Unique categories with respect to edges: ", len(cluster_list))
print("number of nodes of hyper-graph", hypergraph.numNodes())
print("number of pins of hyper-graph", hypergraph.numPins())

Create mapping between naming convention of KaHyPar and categories.

In [None]:
# The zip below pairs the i-th hyperedge with the i-th multi-truck category, so
# both sequences must have the same length for the mapping to be complete.
assert len(list(hypergraph.edges())) == len(cluster_list)
edge_id_to_category_id = {edge_id: category_id for edge_id, category_id in zip(hypergraph.edges(), cluster_list)}

# Check if the renaming is correct:
# For every truck that has exactly 8 incident hyperedges, the loop asserts that
# (1) the order of nodes in the hypergraph (0, 1, 2, ...) corresponds to the order of trucks in df_specifications,
#     i.e., first row of df_specifications corresponds to first row of nodes
# (2) The renaming introduced in edge_id_to_category_id is correct.
# Trucks with a different number of incident hyperedges are only reported, not
# checked, so the guarantee does not extend to them.
for i, (truck_id, specifications) in enumerate(df_specifications.iterrows()):
    incindent_edges_ = list(hypergraph.incidentEdges(i))
    specifications_ = specifications.tolist()
    # Translate KaHyPar edge ids back to category names for the comparison.
    renamed_indicent_edges_ = [edge_id_to_category_id[e] for e in incindent_edges_]
    msg = f"Specification names don't match in line {i}\n" + \
          f"renamed_indicent_edges: {renamed_indicent_edges_}\n" + \
          f"specifications: {specifications_}"
    # 8 = one incident hyperedge per specification column (Spec_0 ... Spec_7).
    if len(renamed_indicent_edges_) == 8:
        assert renamed_indicent_edges_ == specifications_, msg
    else:
        print(f"Not considering truck {i}, since it has unique specification category.")

In [None]:
"""
for i, (truck_id, specifications) in enumerate(df_specifications.iterrows()):
    incindent_edges_ = list(hypergraph.incidentEdges(i))
    renamed_indicent_edges = [edge_id_to_category_id[e] for e in incindent_edges_]
    assert renamed_indicent_edges == specifications.tolist(), f"Specification names don't match in line {i}"

renamed_indicent_edges
"""
pass

Obtaining the neighbourhood of each edge:

For each hyperedge $n_i$, identify its set of neighbours. A hyperedge is considered a neighbour of another hyperedge when the two have at least one sample in common.

In [None]:
# Materialise the pin set of every hyperedge once. The pairwise test below is
# evaluated for every ordered pair of hyperedges and would otherwise rebuild
# both sets on each call.
pins_by_edge = {edge_id: set(hypergraph.pins(edge_id)) for edge_id in hypergraph.edges()}


def check_if_intersection_non_empty(x, y):
    """Return True when hyperedges ``x`` and ``y`` share at least one pin (sample)."""
    return not pins_by_edge[x].isdisjoint(pins_by_edge[y])


# edge id -> set of edge ids sharing at least one sample with it. A non-empty
# hyperedge is always part of its own neighbourhood.
neighbourhood_ = {ni: {nj for nj in pins_by_edge if check_if_intersection_non_empty(ni, nj)}
                  for ni in pins_by_edge}
# rename neighbourhood and number_neighbours according to category names
neighbourhood = {edge_id_to_category_id[k]: {edge_id_to_category_id[vi] for vi in v} for k, v in neighbourhood_.items()}
number_neighbours = {ni: len(neighbours) for ni, neighbours in neighbourhood.items()}
#print("number of neighbours", number_neighbours)
fig, ax = plt.subplots(figsize=(15,5))
pd.Series(number_neighbours).plot.bar(ax=ax);
ax.set_ylabel('Number of neighbours');

Calculating Nearest Neighbourhood Similarity:

In [None]:
verbose = False

# NNS[(i, j)] is the Jaccard overlap of the neighbourhoods of categories i and j
# when the two categories are neighbours of each other, and 0 otherwise.
# Pairs (i, i) are deliberately not stored.
NNS = {}
for i_cat_idx, i_neighbourhood_edges in neighbourhood.items():
    for j_cat_idx, j_neighbourhood_edges in neighbourhood.items():
        if i_cat_idx == j_cat_idx:
            continue
        if verbose:
            print("checking intersection of ", i_cat_idx, "and", j_cat_idx,
                  "i.e.,", i_neighbourhood_edges, "and", j_neighbourhood_edges)
        intersection = i_neighbourhood_edges.intersection(j_neighbourhood_edges)
        # Both categories are in the shared neighbourhood only if they are
        # direct neighbours of each other.
        if i_cat_idx in intersection and j_cat_idx in intersection:
            union = i_neighbourhood_edges.union(j_neighbourhood_edges)
            NNS[(i_cat_idx, j_cat_idx)] = len(intersection)/len(union)
            if verbose:
                print(union)
        else:
            if verbose:
                print(i_cat_idx, j_cat_idx, intersection, i_neighbourhood_edges, j_neighbourhood_edges)
            NNS[(i_cat_idx, j_cat_idx)] = 0

print(len(NNS))

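- Similarity score (Jaccard overlap of the two neighbourhoods)
    - 0 means the two hyperedges are not neighbours of each other, i.e. they share no sample
    - 1 means the two hyperedges have exactly the same set of neighbours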

In [None]:
# Unpack the {(row category, column category): similarity} dictionary into three
# parallel lists, i.e. a long-format table.
index = []
columns = []
values = []
for (idx, col), val in NNS.items():
    index.append(idx)
    columns.append(col)
    values.append(val)
df = pd.DataFrame({'Index': index, 'Column': columns, 'Value': values})
# Long -> wide: one row and one column per category. NNS holds no (i, i)
# pairs, so the diagonal of df_snns is NaN.
df_snns = df.pivot(index='Index', columns='Column', values='Value')

fig, ax = plt.subplots(figsize=(20, 16))
# fillna(1): for plotting, a category is shown as fully similar to itself.
sns.heatmap(df_snns.fillna(1), cmap='viridis')
ax.set_title('Nearest neighbor similarity (NNS)')
fig.savefig('v2_nns.png')

In [None]:
# Create the heatmap
fig = px.imshow(df_snns.fillna(1), 
                color_continuous_scale='viridis', 
                title='Nearest neighbor similarity (NNS)',
                labels=dict(x="Columns", y="Rows", color="Similarity"))

# Update layout for better appearance
fig.update_layout(
    width=1000, 
    height=900,
    title_x=0.5
)

# Show the heatmap
fig.show()

In [None]:
fig = px.imshow(df_snns.fillna(0) > 0.5, 
                color_continuous_scale='viridis', 
                title='Adjacency matrix',
                labels=dict(x="Columns", y="Rows", color="Similarity"))

# Update layout for better appearance
fig.update_layout(
    width=1000, 
    height=900,
    title_x=0.5
)

# Show the heatmap
fig.show()

In [None]:
df_

In [None]:
df_ = df_snns.fillna(0) > 0.5
G = nx.Graph(df_)
options = {
    "font_size": 6,
    "font_color": 'grey',
    "node_size": 12,
    "with_labels": True,
    "node_color": "white",
    "edgecolors": "blue",
    #"linewidths": 5,
    #"width": 5,
}
pos = nx.spring_layout(G, k=0.2, seed=648)
#pos = nx.multipartite_layout(G, subset_key=layers) # TODO
fig, ax = plt.subplots(figsize=(21,21))
nx.draw(G, pos, ax=ax, **options)
fig.savefig('graph_0.5.png')

In [None]:
'Spec_7_Cat4' in G

In [None]:
df_ = df_snns.fillna(0) > 0.6
G = nx.Graph(df_)
options = {
    "font_size": 6,
    "font_color": 'grey',
    "node_size": 12,
    "with_labels": True,
    "node_color": "white",
    "edgecolors": "blue",
    #"linewidths": 5,
    #"width": 5,
}
pos = nx.spring_layout(G, k=0.2, seed=648)
#pos = nx.multipartite_layout(G, subset_key=layers) # TODO
fig, ax = plt.subplots(figsize=(21,21))
nx.draw(G, pos, ax=ax, **options)
fig.savefig('graph_0.6.png')

In [None]:
df_ = df_snns.fillna(0) > 0.7
G = nx.Graph(df_)
options = {
    "font_size": 6,
    "font_color": 'grey',
    "node_size": 12,
    "with_labels": True,
    "node_color": "white",
    "edgecolors": "blue",
    #"linewidths": 5,
    #"width": 5,
}
pos = nx.spring_layout(G, k=0.2, seed=648)
#pos = nx.multipartite_layout(G, subset_key=layers) # TODO
fig, ax = plt.subplots(figsize=(21,21))
nx.draw(G, pos, ax=ax, **options)
fig.savefig('graph_0.7.png')

In [None]:
df_ = df_snns.fillna(0) > 0.8
G = nx.Graph(df_)
options = {
    "font_size": 6,
    "font_color": 'grey',
    "node_size": 12,
    "with_labels": True,
    "node_color": "white",
    "edgecolors": "blue",
    #"linewidths": 5,
    #"width": 5,
}
pos = nx.spring_layout(G, k=0.2, seed=648)
#pos = nx.multipartite_layout(G, subset_key=layers) # TODO
fig, ax = plt.subplots(figsize=(21,21))
nx.draw(G, pos, ax=ax, **options)
fig.savefig('graph_0.8.png')

In [None]:
# Stand-alone demo on a networkx example graph.
# NOTE: this rebinds `G`, which previously held the thresholded similarity graph.
G = nx.davis_southern_women_graph()  # Example graph
communities = nx.community.greedy_modularity_communities(G)

# Compute positions for the node clusters as if they were themselves nodes in a
# supergraph using a larger scale factor. The layout has to be computed on the
# supergraph (one node per community), not on G itself.
supergraph = nx.cycle_graph(len(communities))
superpos = nx.spring_layout(supergraph, scale=50, seed=429)

# Use the "supernode" positions as the center of each node cluster
centers = list(superpos.values())
pos = {}
for center, comm in zip(centers, communities):
    pos.update(nx.spring_layout(nx.subgraph(G, comm), center=center, seed=1430))

# Nodes colored by cluster (only the first three communities get a colour)
for nodes, clr in zip(communities, ("tab:blue", "tab:orange", "tab:green")):
    nx.draw_networkx_nodes(G, pos=pos, nodelist=nodes, node_color=clr, node_size=100)
nx.draw_networkx_edges(G, pos=pos)

plt.tight_layout()
plt.show()

In [None]:
# Create the heatmap
fig = px.imshow(df_snns.fillna(1), 
                color_continuous_scale='viridis', 
                title='Nearest neighbor similarity (NNS)',
                labels=dict(x="Columns", y="Rows", color="Similarity"))

# Update layout for better appearance
fig.update_layout(
    width=1000, 
    height=900,
    title_x=0.5
)

# Show the heatmap
fig.show()

In [None]:
df_snns.fillna(1)

In [None]:
# old version
"""
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

cnt=0
#df_snns=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
df_snns=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
for i in np.arange(0,total_edges,1):
    for j in np.arange(0,total_edges,1):
        if (i,j) in NNS:
            cnt=cnt+1
            df_snns.loc[i,j]=NNS[i,j]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df_snns.fillna(1), cmap='viridis')
ax.set_title('Nearest neighbor similarity (NNS)');
"""
pass

Converting the similarity matrix into distance matrix.

In [None]:
NNS_dist = {k: 1-v for k, v in NNS.items()}

Obtaining distance matrix.

- More similar (more overlapping) means less distant
- Distance metric: distance 1 means no overlap; distance 0 means full overlap

In [None]:
# The diagonal is addressed by position below, so rows and columns must list
# the same categories in the same order.
assert list(df_snns.index) == list(df_snns.columns), "similarity matrix is not square/aligned"

# distance = 1 - similarity; pairs without a similarity value count as
# maximally distant (1).
dist_values = (1 - df_snns).fillna(1).to_numpy(dtype=float, copy=True)
# df_snns has no (i, i) entries, so fillna(1) alone would give every category
# distance 1 to itself. A category has distance 0 to itself.
np.fill_diagonal(dist_values, 0)
df_dist = pd.DataFrame(dist_values, index=df_snns.index, columns=df_snns.columns)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df_dist, cmap='viridis', ax=ax)
ax.set_title('Distance matrix');

In [None]:
# df_dist is ordered by category *name* (pivot sorts its labels), but all
# downstream cells interpret row i of the distance matrix as hyperedge i
# (edge_id_to_category_id[i], hypergraph.pins(i)). Reorder rows and columns to
# hyperedge order so that both views agree.
# NOTE(review): assumes hyperedge ids are 0..n-1 without gaps - confirm.
edge_order = [edge_id_to_category_id[edge_id] for edge_id in sorted(edge_id_to_category_id)]
array_dist = df_dist.loc[edge_order, edge_order].to_numpy()

In [None]:
# old code replaced with code above
"""
array_dist = []
for i in range(total_edges):
    temp = []
    for j in range(total_edges):
        if i != j:
            temp.append(NNS_dist[(min(i, j), max(i,j))])
        else:
            temp.append(0)
    array_dist.append(temp)

df_dist=pd.DataFrame(index=np.arange(0,total_edges,1),columns=np.arange(0,total_edges,1))
for i in np.arange(0,total_edges,1):
    for j in np.arange(0,total_edges,1):
        # if (i<=j):
        df_dist.loc[i,j]=array_dist[i][j]

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df_dist.fillna(1), cmap='viridis')
ax.set_title('Distance matrix');
"""
pass

# Agglomerative clustering

In [None]:
linkage_method = 'complete'
distance_metric = 'precomputed'

X = np.array(array_dist)
# distance_threshold=0 together with n_clusters=None is used here to obtain the
# merge tree for the dendrogram rather than a flat clustering.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None,  metric=distance_metric, linkage=linkage_method)
model = model.fit(X)
fig, ax = plt.subplots(figsize=(15,5))
hypergraph_clustering.plot_dendrogram(model, truncate_mode="level", p=50, ax=ax)
ax.set_title(f"Hierarchical Clustering Dendrogram ({linkage_method} linkage)")
ax.set_xlabel("Unique specification Category")
# The y axis shows the linkage distance of the chosen linkage method
# (the metric name 'precomputed' is not meaningful as an axis label).
ax.set_ylabel(f"{linkage_method} linkage distance")

# Replace the tick labels: the dendrogram labels its leaves with the row number
# of X, which is translated to the category name here.
new_labels = [edge_id_to_category_id[int(label.get_text())] for label in ax.get_xticklabels()]
ax.set_xticklabels(new_labels)
fig.tight_layout()
fig.savefig('v2_clustering_dendrogram.png')

Performing clustering.

In [None]:
# Cut the tree: clusters are not merged once their linkage distance reaches 0.7.
model = AgglomerativeClustering(distance_threshold=0.7, n_clusters=None,  metric=distance_metric, linkage=linkage_method)
y_agglomerative = model.fit_predict(X)
# cluster label -> row numbers of X assigned to that cluster
dict_clusters_ = {
    cluster_label: np.where(y_agglomerative == cluster_label)[0].tolist()
    for cluster_label in np.unique(y_agglomerative)
}
# Same grouping with readable names: 'HG_<label>' -> list of category names
dict_clusters = {
    f'HG_{cluster_label}': [edge_id_to_category_id[row] for row in rows]
    for cluster_label, rows in dict_clusters_.items()
}
print(dict_clusters)

In [None]:
def generate_final_clusters(final_clusters1, hypergraph1, method, Debug=False, node_ids=None):
    """Map clusters of hyperedges to the data objects (nodes) they contain.

    Parameters
    ----------
    final_clusters1 : dict
        ``{cluster key: iterable of hyperedge ids}``.
    hypergraph1 : object
        Hypergraph exposing ``pins(edge_id)``, which yields the node indices
        belonging to a hyperedge.
    method : str
        ``'donot_inc_key_in_cluster'`` collects only the pins of the member
        hyperedges. ``'inc_key_in_cluster'`` additionally treats the cluster
        key itself as a hyperedge id and adds its pins.
    Debug : bool, default False
        Print the intermediate and the final mapping.
    node_ids : sequence, optional
        Lookup from node index to node identifier. Defaults to the
        notebook-level ``nodes_hyper`` list.

    Returns
    -------
    dict
        ``{cluster key: set of node identifiers}``. A node can appear in more
        than one cluster.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported values.
    """
    valid_methods = ('donot_inc_key_in_cluster', 'inc_key_in_cluster')
    if method not in valid_methods:
        # Previously an unknown method surfaced as an UnboundLocalError.
        raise ValueError(f"method must be one of {valid_methods}, got {method!r}")
    if node_ids is None:
        node_ids = nodes_hyper

    # cluster key -> set of node indices
    clustering_nodes = {}
    for key, member_edges in final_clusters1.items():
        pins = set(hypergraph1.pins(key)) if method == 'inc_key_in_cluster' else set()
        for edge_id in member_edges:
            pins.update(hypergraph1.pins(edge_id))
        clustering_nodes[key] = pins

    if Debug:
        print("clustering of data objects", clustering_nodes) # dict, key = center(hyperedge), values = data objects

    # replacing the index of the data object with its short id
    # (note that sets are not ordered)
    clus_nodes_short_id = {key: {node_ids[x] for x in pins} for key, pins in clustering_nodes.items()}

    if Debug:
        print("clustering solution, key = center (hyperedge), val = set of short_ids")
        print(clus_nodes_short_id)
    return clus_nodes_short_id

clus_sol = generate_final_clusters(dict_clusters_, hypergraph, 'donot_inc_key_in_cluster', Debug=False)
len(clus_sol)

In [None]:
n_ = {y: len(clus_sol[y]) for y in clus_sol} # WHY MORE THAN BEFORE? --> check generate_final_clusters
fig, ax = plt.subplots()
pd.Series(n_).plot.barh(ax=ax)
ax.set_title('Clustering of data objects (data objects can be labelled as multiple objects)')
ax.set_xlabel('n')
ax.set_ylabel('cluster label')
fig.savefig('n_cluster_labels.png')

Per hypergraph cluster, plot the distribution of the labels present.

In [None]:
print(dict_clusters)

In [None]:
def reverse_dict_of_sets(input_dict):
    """Invert a ``{key: collection of values}`` mapping.

    Returns a dict ``{value: [keys]}`` listing, for every value, the keys whose
    collection contains it. Keys are appended in the iteration order of
    ``input_dict``.
    """
    inverted = {}
    for key, members in input_dict.items():
        for member in members:
            inverted.setdefault(member, []).append(key)
    return inverted

reversed_dict = reverse_dict_of_sets(clus_sol)

In [None]:
def count_list_occurrences(reversed_dict):
    """Count how often each list of keys occurs among the values of a dict.

    Every value of ``reversed_dict`` is turned into a tuple (so it can serve as
    a dictionary key); the result maps each distinct tuple to the number of
    entries that carry it.
    """
    occurrences = {}
    for keys in reversed_dict.values():
        combination = tuple(keys)
        occurrences[combination] = occurrences.get(combination, 0) + 1
    return occurrences

list_counts = count_list_occurrences(reversed_dict)
list_counts_with_str_key = {str(k): v for k, v in list_counts.items()}
fig, ax = plt.subplots(figsize=(8, 120))
pd.Series(list_counts_with_str_key).sort_values().plot.barh(ax=ax)
ax.set_title('Clustering of data objects (data objects can be labelled in multiple clusters)')
ax.set_xlabel('n')
ax.set_ylabel('cluster label combinations');
fig.savefig('n_unqiue_cluster_label_combinations.png')

In [None]:
s = pd.Series(list_counts_with_str_key)
s.sort_values().plot()

# Integrated clustering analysis

In [None]:
df_trucks = df_specifications.copy()
df_ = df_repair.copy()
df_.index = df_['vehicle_id']
df_trucks = pd.merge(df_trucks, df_, left_index=True, right_index=True)
df_trucks = df_trucks.drop(columns=['vehicle_id'])
df_trucks.head()

In [None]:
def construct_clustering_table(dict_):
    """Build a boolean membership table from a ``{row key: list of elements}`` dict.

    Parameters
    ----------
    dict_ : dict
        Maps each row key to the elements (e.g. cluster ids) it belongs to.

    Returns
    -------
    pd.DataFrame
        One row per key of ``dict_`` (in dict order) and one column per distinct
        element (sorted). A cell is True when the element is in the key's list.
        All columns have dtype ``bool``.
    """
    # Step 1: all distinct elements, sorted to ensure consistent column ordering.
    unique_elements = sorted(set().union(*dict_.values()))
    column_position = {element: position for position, element in enumerate(unique_elements)}

    # Step 2: fill the whole membership matrix before creating the DataFrame.
    # Assigning row by row with .loc is slow and yields object-dtype columns.
    membership = np.zeros((len(dict_), len(unique_elements)), dtype=bool)
    for row, value_list in enumerate(dict_.values()):
        for element in value_list:
            membership[row, column_position[element]] = True

    # Step 3: wrap the matrix with its row and column labels.
    return pd.DataFrame(membership, index=list(dict_.keys()), columns=unique_elements)

df_clustering_solution = construct_clustering_table(reversed_dict)
df_clustering_solution.head()

In [None]:
df_trucks_clustering1 = df_trucks.copy()
df_trucks_clustering1 = pd.merge(df_trucks_clustering1, df_clustering_solution, left_index=True, right_index=True)
df_trucks_clustering1.head()

In [None]:
HGC = 0
df_ = df_trucks_clustering1[df_trucks_clustering1[HGC]]
fig, axes = plt.subplots(figsize=(15, 10), ncols=4, nrows=2)
specs = ['Spec_0', 'Spec_1', 'Spec_2', 'Spec_3', 'Spec_4', 'Spec_5', 'Spec_6', 'Spec_7']
for spec, ax in zip(specs, axes.flat):
    df_[spec].value_counts().plot.bar(ax=ax)
fig.suptitle(f'Hypergraph cluster {HGC}')
fig.tight_layout()

In [None]:
HGC = 1
df_ = df_trucks_clustering1[df_trucks_clustering1[HGC]]
fig, axes = plt.subplots(figsize=(15, 10), ncols=4, nrows=2)
specs = ['Spec_0', 'Spec_1', 'Spec_2', 'Spec_3', 'Spec_4', 'Spec_5', 'Spec_6', 'Spec_7']
for spec, ax in zip(specs, axes.flat):
    value_counts = df_[spec].value_counts()
    colors = ['red' if value in dict_clusters[f'HG_{HGC}'] else 'blue' for value in value_counts.index]
    value_counts.plot.bar(ax=ax, color=colors)
    ax.set_title(spec)
fig.suptitle(f'Hypergraph cluster {HGC}')
fig.tight_layout()

In [None]:
def construct_kpi_df(df_trucks_clustering_):
    """Summarise size and repair rate of every cluster column.

    A column counts as a cluster column when its label is an integer or a
    string consisting of digits only. Such a column is read as a membership
    mask (truthy = vehicle belongs to the cluster).

    Parameters
    ----------
    df_trucks_clustering_ : pd.DataFrame
        Must contain the column ``'in_study_repair'`` plus the cluster columns.

    Returns
    -------
    pd.DataFrame
        Columns ``'cluster id'`` (int), ``'percentage'`` (mean of
        ``in_study_repair`` among the cluster members times 100, 0 for an empty
        cluster) and ``'count'`` (number of members).
    """
    def _is_cluster_column(column):
        # Labels of any other type (float, tuple, ...) have no .isdigit() and
        # are simply not cluster columns.
        if isinstance(column, (int, np.integer)):
            return True
        return isinstance(column, str) and column.isdigit()

    numbers = []
    percentages = []
    counts = []

    for column in df_trucks_clustering_.columns:
        if not _is_cluster_column(column):
            continue
        # Cast to a real boolean mask: .loc would interpret an integer or
        # object column as row labels rather than as a mask.
        member_mask = df_trucks_clustering_[column].astype(bool)
        total_count = member_mask.sum()
        if total_count > 0:
            true_percentage = df_trucks_clustering_.loc[member_mask, 'in_study_repair'].mean() * 100
        else:
            true_percentage = 0
        numbers.append(int(column))
        percentages.append(true_percentage)
        counts.append(total_count)

    return pd.DataFrame({
        'cluster id': numbers,
        'percentage': percentages,
        'count': counts
    })

results_df_clustering1 = construct_kpi_df(df_trucks_clustering1)
results_df_clustering1

In [None]:
# Calculate the average percentage of True in in_study_repair
overall_average_percentage = df_trucks_clustering1['in_study_repair'].mean() * 100

In [None]:
# Sort the DataFrame by the percentage column
df_plot = results_df_clustering1.sort_values(by='percentage')
df_plot['cluster id'] = df_plot['cluster id'].astype(str)

# Create the bubble plot
fig = px.scatter(df_plot, x='cluster id', y='percentage', size='count', 
                 title='Bubble plot of cluster ID vs. percentage of repairs',
                 labels={'number': 'Number', 'percentage': '% True in Study Repair'},
                 size_max=60)
fig.update_traces(marker_sizemin=2, selector=dict(type='scatter'))

# Add a horizontal line for the average percentage
fig.add_shape(
    type="line",
    x0=min(df_plot['cluster id']),
    x1=len(df_plot['cluster id']),
    y0=overall_average_percentage,
    y1=overall_average_percentage,
    line=dict(color="Red", width=2),
)

# Add annotation for the average line
fig.add_annotation(
    x=len(df_plot['cluster id'].unique()),
    y=overall_average_percentage,
    text=f"Average: {overall_average_percentage:.2f}%",
    showarrow=False,
    yshift=10,
    font=dict(color="Red")
)

# Show the plot
fig.show()

In [None]:
df_trucks_clustering1

# SNN

In [None]:
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.cluster import DBSCAN
from sklearn.neighbors import kneighbors_graph


def snn(X, neighbor_num, min_shared_neighbor_num):
    """Perform Shared Nearest Neighbor (SNN) clustering.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features)
        A feature array.
    neighbor_num : int
        K, the number of nearest neighbors considered per data point.
    min_shared_neighbor_num : int
        Passed to DBSCAN as ``min_samples``.

    Returns
    -------
    core_sample_indices : ndarray
        Indices of the core samples found by DBSCAN.
    labels : ndarray of shape (n_samples,)
        Cluster label per sample as reported by DBSCAN.
    """

    # for each data point, find their set of K nearest neighbors; row i of the
    # connectivity matrix holds a 1 in the columns of i's K nearest neighbors
    knn_graph = kneighbors_graph(X, n_neighbors=neighbor_num, include_self=False)
    print('identified neighbours for each data point')

    # Entry (i, j) of knn_graph @ knn_graph.T is the number of neighbors shared
    # by i and j. Every row has exactly neighbor_num neighbors, so dividing by
    # neighbor_num gives the same value as get_snn_similarity on the two
    # neighbor sets, without a Python loop over all pairs and without calling
    # len() on X (which fails for sparse input).
    shared_neighbor_counts = (knn_graph @ knn_graph.T).toarray()

    # the distance matrix is computed as the complementary of the proportion of shared neighbors between each pair of data points
    snn_distance_matrix = 1 - shared_neighbor_counts / neighbor_num
    print(f'retrieved distance matrix {snn_distance_matrix.shape}')

    # perform DBSCAN with the shared-neighbor distance criteria for density estimation
    dbscan = DBSCAN(min_samples=min_shared_neighbor_num, metric="precomputed")
    dbscan = dbscan.fit(snn_distance_matrix)
    return dbscan.core_sample_indices_, dbscan.labels_


def get_snn_similarity(x0, x1):
    """Return the share of the neighbors in ``x0`` that also occur in ``x1``.

    ``x0`` is a set of nearest neighbors; the number of common neighbors is
    normalized by the size of ``x0``.
    """
    shared_neighbors = x0.intersection(x1)
    return len(shared_neighbors) / len(x0)


def get_snn_distance(x0, x1):
    """Return the shared-neighbor distance of two neighbor sets.

    This is the complement of ``get_snn_similarity(x0, x1)``: 0 when every
    neighbor in ``x0`` is also in ``x1``, 1 when none is.
    """
    similarity = get_snn_similarity(x0, x1)
    return 1 - similarity


class SNN(BaseEstimator, ClusterMixin):
    """Class for performing the Shared Nearest Neighbor (SNN) clustering algorithm.

    Parameters
    ----------
    neighbor_num : int
        K number of neighbors to consider for shared nearest neighbor similarity

    min_shared_neighbor_proportion : float [0, 1]
        Proportion of the K nearest neighbors that need to share two data points to be considered part of the same cluster

    Attributes
    ----------
    min_shared_neighbor_num : int
        ``round(neighbor_num * min_shared_neighbor_proportion)`` (read-only,
        derived from the two parameters).
    core_sample_indices_, labels_, components_
        Set by ``fit``.

    Note: Naming conventions for attributes are based on the analogous ones of DBSCAN
    """

    def __init__(self, neighbor_num, min_shared_neighbor_proportion):
        """Store the parameters unchanged.

        Both constructor arguments are kept under their own name, because
        ``BaseEstimator.get_params`` (and therefore ``clone`` and ``repr``)
        reads every ``__init__`` argument back from the instance.
        """
        self.neighbor_num = neighbor_num
        self.min_shared_neighbor_proportion = min_shared_neighbor_proportion

    @property
    def min_shared_neighbor_num(self):
        """Absolute number of neighbors derived from the proportion."""
        return round(self.neighbor_num * self.min_shared_neighbor_proportion)

    def fit(self, X, y=None):
        """Perform SNN clustering from features.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features)
            A feature array
        y : Ignored
            Present for compatibility with the scikit-learn estimator API.

        Returns
        -------
        self
        """

        clusters = snn(X, neighbor_num=self.neighbor_num, min_shared_neighbor_num=self.min_shared_neighbor_num)
        self.core_sample_indices_, self.labels_ = clusters
        if len(self.core_sample_indices_):
            # fix for scipy sparse indexing issue
            self.components_ = X[self.core_sample_indices_].copy()
        else:
            # no core samples
            self.components_ = np.empty((0, X.shape[1]))
        return self

    def fit_predict(self, X, y=None, sample_weight=None):
        """Performs clustering on X and returns cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features)
            A feature array.
        y : Ignored
        sample_weight : Ignored
            Accepted for signature compatibility only; it is not used.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            cluster labels
        """
        self.fit(X)
        return self.labels_

In [None]:
#X_sample_ = X_[np.random.choice(X_.shape[0], 1000, replace=False), :]

if False:
    X_ = df_clustering_solution.astype(int).to_numpy()
    print(X_.shape)
    clustering_algo = SNN(10, 0.5)
    y_ = clustering_algo.fit_predict(X_)
    print(pd.Series(y_).value_counts())
    fig, ax = plt.subplots(figsize=(10, 40))
    pd.Series(y_).value_counts().plot.barh(ax=ax);
    fig.savefig('snn_barplot.png')

In [None]:
from tqdm import tqdm

unique_vehicle_ids = df_ts.vehicle_id.unique()

# Keep the first 50 readouts of every vehicle. A single groupby pass replaces
# one boolean scan of the whole frame per vehicle. sort=False keeps the
# vehicles in order of first appearance, i.e. the order of unique_vehicle_ids.
sol = {}
for id_, df_ in tqdm(df_ts.groupby('vehicle_id', sort=False), total=len(unique_vehicle_ids)):
    sol[id_] = df_.iloc[:50]

In [None]:
exp = sol[2]
exp.iloc[:, 2:].plot(legend=False)

In [None]:
exp = sol[3]
exp.iloc[:, 2:].plot(legend=False)

### FCA on hypergraph clustering

In [None]:
"""
cluster_table = df_kpi_clustering.filter(regex='^\d+$').to_numpy()
formal_context = FormalContext(cluster_table)
formal_context.to_pandas()
"""
pass

# Contextual Fingerprints

In [None]:
ATTRIBUTE_COLUMNS = df_ts.columns.str.extract(r'(\d+_\d+)')[0]
ATTRIBUTE_COLUMNS = ATTRIBUTE_COLUMNS[~ATTRIBUTE_COLUMNS.isna()].tolist()
print(ATTRIBUTE_COLUMNS)

## (1) Train-test split

- TEST SET = all repaired trucks & equal amount of healthy trucks
- TRAIN SET = rest of healthy trucks

In [None]:
# Fixed seed so that the same trucks end up in the test set on every run.
RANDOM_SEED = 42
split_rng = np.random.default_rng(RANDOM_SEED)

repaired_truck_ids = df_trucks_clustering1[df_trucks_clustering1.in_study_repair == 1].index.to_numpy()
healthy_truck_ids = df_trucks_clustering1[df_trucks_clustering1.in_study_repair == 0].index.to_numpy()

# TEST = all repaired trucks + an equally sized random sample of healthy trucks
sampled_healthy_truck_ids = split_rng.choice(healthy_truck_ids, len(repaired_truck_ids), replace=False)
ALL_TRUCK_IDS_TEST = np.concatenate((repaired_truck_ids, sampled_healthy_truck_ids))
# TRAIN = the remaining healthy trucks; setdiff1d returns them sorted, so the
# order is deterministic as well.
ALL_TRUCK_IDS_TRAIN = np.setdiff1d(healthy_truck_ids, ALL_TRUCK_IDS_TEST)
print(f"Length of ALL_TRUCK_IDS_TEST={len(ALL_TRUCK_IDS_TEST)}")
print(f"Length of ALL_TRUCK_IDS_TRAIN={len(ALL_TRUCK_IDS_TRAIN)}")

In [None]:
def plot_cumulative_timeseries(df, vehicle_ids):
    """Plot the readouts of every sensor over time, one subplot row per vehicle.

    Sensors are identified by the part of each name in the global
    ``ATTRIBUTE_COLUMNS`` that precedes the first underscore. Every subplot
    shows all columns of one sensor for one vehicle, indexed by ``time_step``.

    :param df: Dataframe with the columns 'vehicle_id' and 'time_step' plus the sensor columns.
    :param vehicle_ids: Sequence of vehicle ids; each id gets its own subplot row.
    :return: Tuple ``(fig, axes)`` exactly as returned by ``plt.subplots``.
    """
    # Sorted by numeric sensor id so the subplot columns have a stable order
    # (iterating a set gives no ordering guarantee).
    feature_prefixes = sorted({col.split('_')[0] for col in ATTRIBUTE_COLUMNS}, key=int)
    n_rows, n_cols = len(vehicle_ids), len(feature_prefixes)
    fig, axes = plt.subplots(figsize=(6*n_cols, 4*n_rows),
                             nrows=n_rows, ncols=n_cols, sharex=True)
    # plt.subplots squeezes singleton dimensions (1-D array or a bare Axes);
    # a 2-D view lets a single loop handle every grid shape.
    axes_grid = np.asarray(axes, dtype=object).reshape(n_rows, n_cols)
    for vehicle_id, axes_row in zip(vehicle_ids, axes_grid):
        df_ = df[df.vehicle_id == vehicle_id].set_index('time_step')
        for prefix, ax in zip(feature_prefixes, axes_row):
            # Prefix match on '<sensor>_': a plain substring test could also
            # pick up columns that merely contain the same digits.
            df_plot_ = df_[df_.columns[df_.columns.str.startswith(f'{prefix}_')]]
            ax.plot(df_plot_, marker='o')
            ax.set_xlabel('abstract timestamp unit')
            ax.set_ylabel('cumulative count')
            ax.set_title(int(prefix))
    fig.tight_layout()
    return fig, axes

vehicle_ids = [2]
fig, axes = plot_cumulative_timeseries(df_ts, vehicle_ids)
fig.savefig('cumulative_timeseries.png')

## `df_ts_filled`: Handling missing data

In [None]:
# Share of missing values per attribute column, largest first.
missing_percentage = (
    df_ts[ATTRIBUTE_COLUMNS]
    .isna()
    .mean()
    .mul(100)
    .sort_values(ascending=False)
)
fig, ax = plt.subplots(figsize=(15, 5))
missing_percentage.plot.bar(ax=ax)
ax.set_xlabel('Attribute')
ax.set_ylabel('Percentage of Missing Values [%]');

In [None]:
# Forward fill missing values for each vehicle_id based on time_step
df_ts_filled = df_ts.sort_values(by=['vehicle_id', 'time_step'])
tqdm.pandas()  # registers `progress_apply`, which later cells rely on
# The built-in grouped ffill keeps the original row index, so the result lines
# up with df_ts_filled directly. No group-key index level has to be stripped,
# which behaved differently across pandas versions with apply().
filled_attributes_ = df_ts_filled.groupby('vehicle_id')[ATTRIBUTE_COLUMNS].ffill()
# Values that are still missing have no earlier readout of the same vehicle to
# copy from; they are set to 0.
df_ts_filled[ATTRIBUTE_COLUMNS] = filled_attributes_.fillna(0)
del filled_attributes_
print(f'Shape of df_ts_filled: {df_ts_filled.shape}')
df_ts_filled.head()

In [None]:
fig, axes = plot_cumulative_timeseries(df_ts_filled, vehicle_ids)

## `df_ts_limited`: Limiting amount of sampled data (only required for some decomposition matrices)

In [None]:
# Keep only the first N_READOUTS_LIMIT rows of every vehicle.
N_READOUTS_LIMIT = 25
# groupby.head keeps all columns (including vehicle_id), the row order and the
# original row index. apply() over the grouping column is deprecated, and its
# resulting index layout depends on the pandas version.
df_ts_limited = df_ts_filled.groupby('vehicle_id').head(N_READOUTS_LIMIT).copy()
print(df_ts_limited.shape)
df_ts_limited.head()

In [None]:
vehicle_ids = [0, 2]
fig, axes = plot_cumulative_timeseries(df_ts_limited, vehicle_ids)

In [None]:
df_ts_limited.shape

## `df_ts_differentiated`: Differentiating dataset (only required for some decomposition matrices)

In [None]:
def differentiate(group):
    """ Return the row-to-row differences of `group`; the first row keeps its original values. """
    deltas = group.diff()
    # diff() leaves the first row undefined, so restore it from the input
    first_row = group.iloc[0]
    deltas.iloc[0] = first_row
    return deltas

df_ts_differentiated = df_ts_filled.copy()
# group_keys=False makes the result carry the original row index, so it can be
# assigned back directly. Stripping a group-key index level afterwards only
# works when pandas actually added one, which depends on the pandas version.
increments_ = (
    df_ts_differentiated
    .groupby('vehicle_id', group_keys=False)[ATTRIBUTE_COLUMNS]
    .progress_apply(differentiate)
)
df_ts_differentiated[ATTRIBUTE_COLUMNS] = increments_
del increments_
print(f'Shape of df_ts_differentiated: {df_ts_differentiated.shape}')
df_ts_differentiated.head()

In [None]:
vehicle_ids = [0, 2, 100, 33641, 33643]
fig, axes = plot_cumulative_timeseries(df_ts_differentiated, vehicle_ids)

## `df_ts_normalized_v1`: Normalize by time interval

Idea: divide by difference between timesteps, to take out effect of irregular sampling.

In [None]:
df_ = df_ts_differentiated.copy()
# Difference to the previous time_step of the same vehicle; a vehicle's first
# row has no predecessor and uses its time_step itself.
df_['time_diff'] = df_.groupby('vehicle_id')['time_step'].diff()
df_['time_diff'] = df_['time_diff'].fillna(df_['time_step'])
meta_cols = ['vehicle_id', 'time_step', 'time_diff']

# Divide every non-meta column by time_diff, row by row, in one vectorised call.
# The former chunk loop re-copied the whole frame on every iteration and grew
# the result with pd.concat, so it saved no memory and was quadratic in time.
# NOTE(review): a time_diff of 0 produces inf/NaN here; confirm whether that can occur.
df_ts_normalized_v1 = df_.drop(columns=meta_cols).div(df_['time_diff'], axis=0)

df_ts_normalized_v1[meta_cols] = df_[meta_cols]
df_ts_normalized_v1.head()

In [None]:
vehicle_ids = [0, 2, 100, 33641, 33643]
fig, axes = plot_cumulative_timeseries(df_ts_normalized_v1, vehicle_ids)

## `df_ts_normalized_v2`: Normalize by maximum

In [None]:
pass

## v3: Normalizing dataset

(1) per bin over all trucks

In [None]:
# Min-max scale every numeric column of df_ts_limited except the identifier
# columns. NOTE: the scaled values overwrite the columns of df_ts_limited.
id_columns_ = ('vehicle_id', 'time_step')
numeric_columns = df_ts_limited.select_dtypes(include=['float64', 'int64']).columns
columns_to_normalize = [col for col in numeric_columns if col not in id_columns_]

scaler = MinMaxScaler()
scaled_values_ = scaler.fit_transform(df_ts_limited[columns_to_normalize])
df_ts_limited[columns_to_normalize] = scaled_values_

# Show the first rows of the scaled frame
print(df_ts_limited.head())

In [None]:
vehicle_ids = [0, 2]
# Sorted so the subplot columns have a stable, ascending sensor order.
unique_features = sorted(set(int(col.split('_')[0]) for col in ATTRIBUTE_COLUMNS))
fig, axes = plt.subplots(figsize=(3*len(unique_features), 2*len(vehicle_ids)),
                         nrows=len(vehicle_ids), ncols=len(unique_features), sharex=True)
# `axes_row` instead of `axes` as loop variable: rebinding `axes` while looping
# left it pointing at the last row only.
for vehicle_id, axes_row in zip(vehicle_ids, axes):
    df_ = df_ts_limited[df_ts_limited.vehicle_id == vehicle_id]
    df_ = df_.set_index('time_step')
    for feature, ax in zip(unique_features, axes_row):
        # Prefix match on '<sensor>_': a plain substring test could also pick
        # up columns that merely contain the same digits.
        df_plot_ = df_[df_.columns[df_.columns.str.startswith(f'{feature}_')]]
        ax.plot(df_plot_, marker='o')
        ax.set_xlabel('timestamp')
        ax.set_ylabel('Cumulative count')
        ax.set_title(feature)
fig.tight_layout()

In [None]:
df_ts_limited[df_ts_limited.vehicle_id == 4].head()

(2) per feature

In [None]:
pass

## Building decomposition matrix

### Approach 1

In [None]:
def flatten_group(group):
    """Flatten one vehicle's frame into a single Series.

    Every value gets the key '<column>_<row label>'. All rows of the first
    column come first, then all rows of the second column, and so on. The
    'vehicle_id' column is left out of the values; its first entry becomes the
    name of the returned Series.
    """
    flattened = {}
    for col in group.columns:
        if col == 'vehicle_id':
            continue
        for i in group.index:
            flattened[f"{col}_{i}"] = group.at[i, col]
    return pd.Series(data=flattened, name=group['vehicle_id'].iloc[0])


flattened_df = pd.DataFrame([flatten_group(group.reset_index(drop=True)) 
                             for name, group in tqdm(df_ts_limited[['vehicle_id'] + ATTRIBUTE_COLUMNS].groupby('vehicle_id'))])

In [None]:
# Workaround: fill the missing entries of the flattened matrix.
# NOTE(review): ffill() without an axis argument fills down the rows, i.e. a
# missing value is copied from the row above, which belongs to a different
# vehicle. Confirm that this is intended.
flattened_df = flattened_df.ffill() # !!! workaround
flattened_df.head()

In [None]:
feature_space_train = flattened_df.loc[ALL_TRUCK_IDS_TRAIN]

# plt.get_cmap works across matplotlib versions; mpl.cm.get_cmap was deprecated
# and later removed.
cmap = plt.get_cmap("Blues")
V = feature_space_train.to_numpy()
fig, ax = plt.subplots(figsize=(10, 20))
nrows = V.shape[0]
ncols = V.shape[1]
title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
ax.set_title(title_, fontsize=None)
# V.columns = BAND_COLUMNS
im = ax.imshow(
    V,
    cmap=cmap,
    aspect='auto',
    interpolation='nearest',
    # A logarithmic norm cannot take a lower limit <= 0, so the padded limits
    # are applied on a linear colour scale instead.
    norm=mpl.colors.Normalize(vmin=-0.01, vmax=1.01),
    # extent=[0.25,30.25,nrows,0]
)
ax.tick_params(axis='y', labelrotation=90)
# ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))

In [None]:
# Heatmap of the rows of df_ts_limited selected via TRUCK_IDS_TRAIN.
# NOTE(review): TRUCK_IDS_TRAIN is only assigned in a later cell of this
# notebook, and .loc selects by row label rather than by the vehicle_id
# column. Confirm what this cell is meant to select.
feature_space_train = df_ts_limited.loc[TRUCK_IDS_TRAIN]

cmap = mpl.cm.get_cmap("Blues")
V = feature_space_train.to_numpy()
fig, ax = plt.subplots(figsize=(10, 20))
nrows = V.shape[0]
ncols = V.shape[1]
title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
ax.set_title(title_, fontsize=None)
# V.columns = BAND_COLUMNS
im = ax.imshow(
    V,
    cmap=cmap,
    aspect='auto',
    interpolation='nearest',
    # NOTE(review): vmin=0 is not a valid lower limit for a logarithmic norm;
    # confirm the intended colour scale.
    norm=mpl.colors.LogNorm(vmin=0, vmax=1),
    # extent=[0.25,30.25,nrows,0]
)
ax.tick_params(axis='y', labelrotation=90)
# ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))

### Approach 2

Take as input `df_ts_differentiated`. Start off with single sensor `397`. Construct decomposition per truck.

- Normalize per sensor?

In [None]:
# ATTRIBUTE_COLUMNS_167 = ['167_0', '167_1', '167_2', '167_3', '167_4', '167_5', '167_6', '167_7', '167_8', '167_9']
# Restrict the differentiated data to the columns whose name contains '397'
# and split the rows by membership in the train / test truck ids.
ATTRIBUTE_COLUMNS_397 = list(filter(lambda c: '397' in c, ATTRIBUTE_COLUMNS))
approach2_columns_ = ['vehicle_id', 'time_step'] + ATTRIBUTE_COLUMNS_397
df_ts_approach2 = df_ts_differentiated[approach2_columns_]
in_train_ = df_ts_approach2.vehicle_id.isin(ALL_TRUCK_IDS_TRAIN)
in_test_ = df_ts_approach2.vehicle_id.isin(ALL_TRUCK_IDS_TEST)
df_ts_approach2_train = df_ts_approach2[in_train_]
df_ts_approach2_test = df_ts_approach2[in_test_]
df_ts_approach2_train.head()

In [None]:
V_train = df_ts_approach2_train[ATTRIBUTE_COLUMNS_397].to_numpy()
V_test = df_ts_approach2_test[ATTRIBUTE_COLUMNS_397].to_numpy()
set_sizes_ = pd.Series({'train': len(V_train), 'test': len(V_test)})
ax = set_sizes_.plot.bar()
ax.set_ylabel('Number of timestamps')
ax.set_title('Size of training and test set')

# Write each bar's height just above its top edge
for p in ax.patches:
    bar_height_ = p.get_height()
    bar_center_ = p.get_x() + p.get_width() / 2.
    ax.annotate(
        str(bar_height_),
        (bar_center_, bar_height_),
        ha='center',
        va='center',
        xytext=(0, 10),
        textcoords='offset points',
    )

In [None]:
def plot_V(V):
    """ Show matrix V as a heatmap on a logarithmic colour scale.

    :param V: 2-dimensional array-like with the values to show.
    :return: Tuple (fig, ax) of the created figure and axes.
    """
    # plt.get_cmap works across matplotlib versions (mpl.cm.get_cmap was removed)
    cmap = plt.get_cmap("Blues")
    values = np.asarray(V)
    fig, ax = plt.subplots(figsize=(6, 8))
    nrows = values.shape[0]
    ncols = values.shape[1]
    title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
    ax.set_title(title_, fontsize=20)
    # A logarithmic norm needs strictly positive limits, so they are taken from
    # the positive entries only. Without any positive entry, fall back to
    # matplotlib's default (linear) scaling.
    positive_values = values[values > 0]
    if positive_values.size:
        norm = mpl.colors.LogNorm(vmin=positive_values.min(), vmax=positive_values.max())
    else:
        norm = None
    # V.columns = BAND_COLUMNS
    im = ax.imshow(
        values,
        cmap=cmap,
        aspect='auto',
        interpolation='nearest',
        norm=norm,
        # extent=[0.25,30.25,nrows,0]
    )
    ax.tick_params(axis='y', labelrotation=90)
    return fig, ax

fig, ax = plot_V(V_train)
ax.set_ylabel('Readout', size=24)
ax.set_xlabel('Sensor Bin', size=24);

In [None]:
import os
from sklearn.decomposition import NMF
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
import pickle


def extract_NMF(V, n_components=60, timestamps=None):
    """  Fit a non-negative matrix factorisation (NMF) on V and collect model statistics.

    NMF approximates the non-negative matrix V by the product of two matrices W and H, V = W x H.

    :param V: Numpy array with (normalized) decomposition matrix V.
    :param n_components: Integer with the number of components of the fitted NMF.
    :param timestamps: Accepted for interface compatibility; not used by this function.
    :return: Dictionary with the fitted pipeline, H and the reconstruction metrics.
    """
    pipeline = Pipeline([
        ('nmf', NMF(n_components=n_components, init='nndsvd', max_iter=1000, random_state=42)),  # TODO: compare solvers
    ])
    W = pipeline.fit_transform(V)
    fitted_nmf = pipeline['nmf']
    V_hat = fitted_nmf.inverse_transform(W)

    # V, W, the timestamps and the reconstruction itself are not stored in the result.
    return {
        # number of components
        'n_components': n_components,
        # trained NMF-model (wrapped in a single-step pipeline)
        'nmf': pipeline,
        # component matrix H
        'H': pd.DataFrame(fitted_nmf.components_),
        # coefficient of determination (average)
        'R2_mean': r2_score(V, V_hat, multioutput='uniform_average'),
        # coefficient of determination (per sample)
        'R2': r2_score(V.T, V_hat.T, multioutput='raw_values'),
        # coefficient of determination (per feature)
        'R2_feature': r2_score(V, V_hat, multioutput='raw_values'),
        # mean squared error (average)
        'MSE_mean': mean_squared_error(V, V_hat),
        # mean squared error (per sample)
        'MSE': mean_squared_error(V.T, V_hat.T, multioutput='raw_values'),
        # reconstruction error expressed as Frobenius norm
        'reconstruction_error': fitted_nmf.reconstruction_err_,
    }


def extract_nmf_per_number_of_component(df_V, n_components=60, timestamps=None, verbose=True):
    """ Perform NMF with a varying number of components.

    One model is fitted for every number of components in ``range(1, n_components)``,
    i.e. from 1 up to and including ``n_components - 1``.

    :param df_V: Dataframe with (normalized) decomposition matrix V.
    :param n_components: Integer upper bound (exclusive) on the number of components.
    :param timestamps: Optional pandas object with timestamps corresponding to the rows of V;
        may be None. It is converted to a numpy array and handed to extract_NMF.
    :param verbose: If False, the progress bar is disabled.
    :return: Dataframe where each row corresponds to a different number of components and the
        columns contain the entries of the dictionary returned by extract_NMF.
    """
    V = df_V.to_numpy()
    # Convert once, and tolerate the default None (which used to raise AttributeError).
    timestamps_array = None if timestamps is None else timestamps.to_numpy()
    range_components = range(1, n_components)
    tqdm_description = 'Fitting NMF with varying number of components'
    tqdm_range_components = tqdm(range_components, desc=tqdm_description, disable=(not verbose))
    # `k` instead of `n_components`, so the loop variable does not shadow the parameter
    list_models = [extract_NMF(V, n_components=k, timestamps=timestamps_array) for k in tqdm_range_components]
    df_models = pd.DataFrame(list_models)
    return df_models


fpath = os.path.join('cache', 'df_nmf_models')
RECOMPUTE = False         # only set to True, if NMF should be recomputed
MAX_N_COMPONENTS = 15     # passed as n_components when the models are recomputed
df_V_train_ = pd.DataFrame(V_train)

if not os.path.exists(fpath) or RECOMPUTE:
    df_nmf_models_ = extract_nmf_per_number_of_component(
        df_V_train_, n_components=MAX_N_COMPONENTS, timestamps=df_V_train_.index, verbose=True
    )
    #print(fpath)
    os.makedirs(os.path.dirname(fpath), exist_ok=True)
    # context manager so the file is flushed and closed right after writing
    with open(fpath, 'wb') as cache_file:
        pickle.dump(df_nmf_models_, cache_file)

# load data from disk
# NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook.
with open(fpath, 'rb') as cache_file:
    df_nmf_models = pickle.load(cache_file)
df_nmf_models

In [None]:
def illustrate_nmf_components_for_paper(V, explained_variance_ratio, df_nmf_models,
                                        vmin=0.001, vmax=0.1,
                                        xlims=(-1, 31), plot_x_ticks=7,
                                        n_components=5):
    """ Illustrate NMF components together with hyperparameter tuning and decomposition matrix.

    Improved version of illustrate_nmf_components for paper. The figure has three columns: the
    matrix V as a log-scaled heatmap, the explained-variance curve above the NMF reconstruction
    error, and one line plot per row of H for the model with `n_components` components.

    :param V: Decomposition matrix as it is fed to NMF.
    :param explained_variance_ratio: Numpy array with explained variance ratios (fractions) extracted
        with PCA. It is plotted as the cumulative curve, so cumulative values are expected.
    :param df_nmf_models: Dataframe with NMF-models for different number of components; the columns
        'n_components', 'reconstruction_error' and 'H' are read.
    :param vmin: Lower colour limit of the heatmap. A value <= 0 cannot be used on a logarithmic
        scale and is replaced by the smallest positive entry of V, if there is one.
    :param vmax: Upper colour limit of the heatmap.
    :param xlims: Currently unused.
    :param plot_x_ticks: Currently unused.
    :param n_components: Number of components of the model whose H is plotted.
    :return: Tuple (fig, ax_row). ax_row[1] and ax_row[2] have been removed from the figure and
        replaced by subfigures.
    """
    def get_n_components(x, threshold):
        """ Number of components needed to exceed `threshold`, or 'n/a' if it is never exceeded. """
        exceeds = x > threshold
        # argmax of an all-False array is 0, which would wrongly report one component
        return exceeds.argmax() + 1 if exceeds.any() else 'n/a'

    # plt.get_cmap works across matplotlib versions (mpl.cm.get_cmap was removed)
    cmap = plt.get_cmap("Blues")
    # mpl.rcParams['text.usetex'] = True
    fig, ax_row = plt.subplots(figsize=(8, 7), ncols=3, nrows=1, sharex=False, sharey=False, constrained_layout=True)

    # 1st column: show performance matrix V
    ax = ax_row[0]
    nrows = V.shape[0]
    ncols = V.shape[1]
    title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
    ax.set_title(title_, fontsize=None)
    if vmin is not None and vmin <= 0:
        # a logarithmic norm needs a strictly positive lower limit
        values = np.asarray(V)
        positive_values = values[values > 0]
        if positive_values.size:
            vmin = positive_values.min()
    im = ax.imshow(
        V,
        cmap=cmap,
        aspect='auto',
        interpolation='nearest',
        norm=mpl.colors.LogNorm(vmin=vmin, vmax=vmax)
    )
    ax.tick_params(axis='y', labelrotation=90)
    ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    ax.set_xlabel('Sensor Bin')

    # 2nd column: selection of number of components in two rows
    ax = ax_row[1]
    gridspec = ax.get_subplotspec().get_gridspec()
    ax.remove()
    subfig = fig.add_subfigure(gridspec[1])
    sub_axes = subfig.subplots(2, 1, sharex=False, sharey=False)

    # PCA
    ax = sub_axes[0]
    ax.set_ylim([30, 100])
    explained_variance = 100 * explained_variance_ratio
    ax.plot(pd.Series(dict(enumerate(explained_variance, start=1))), marker='o', markersize=4)
    threshold = 95
    n_components_95 = get_n_components(explained_variance, threshold)
    ax.axhline(threshold, color='green', linestyle='dotted', label=f'> {threshold}% --> {n_components_95}')
    threshold = 90
    n_components_90 = get_n_components(explained_variance, threshold)
    ax.axhline(threshold, color='red', linestyle='dashed', label=f'> {threshold}% --> {n_components_90}')
    threshold = 75
    n_components_75 = get_n_components(explained_variance, threshold)
    ax.axhline(threshold, color='blue', linestyle='dotted', label=f'> {threshold}% --> {n_components_75}')
    ax.set_ylabel('cumulative explained variance [%]')
    ax.set_xlabel('number of components')
    ax.set_title('Cumulative explained variance')
    ax.set_yscale('log')
    ax.legend()

    # NMF
    ax = sub_axes[1]
    x = df_nmf_models['n_components']
    y = df_nmf_models['reconstruction_error']
    ax.plot(pd.Series(dict(zip(x, y))), marker='o', markersize=4)
    ax.set_title('Reconstruction error of NMF')
    ax.set_xlabel('Number of components')
    ax.set_ylabel('Frobenius norm')
    ax.set_yscale('log')
    # ax.set_ylim([0.00005, 0.0005])

    # 3rd column
    ax = ax_row[2]
    gridspec = ax.get_subplotspec().get_gridspec()
    ax.remove()
    subfig = fig.add_subfigure(gridspec[2])
    #n_components = get_n_components(explained_variance, min_explained_variance)
    # squeeze=False always returns a 2-D array, so n_components=1 works as well
    sub_axes = subfig.subplots(n_components, 1, sharex=True, sharey=True, squeeze=False)[:, 0]
    subfig.suptitle('components H')
    selected_model = df_nmf_models[df_nmf_models.n_components == n_components].iloc[0]
    H = selected_model.H
    y_labels = list(range(len(H)))

    for (component_idx, component), ax, y_label in zip(H.iterrows(), sub_axes, y_labels):
        component.plot(label=component_idx, ax=ax, markersize=3)
        # ax.set_title(y_label)
        ax.set_xlabel('Sensor Bin')
        ax.set_ylabel(y_label)
        ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))
    # ax.set_xlim(xlims)
    return fig, ax_row

# calculate explained variance
print('- calculating explained variance...')
pca = PCA(n_components=V_train.shape[1], random_state=42)
pca.fit(V_train)
# cumulative explained variance ratio (the variable name is kept as it was)
explained_variance_ratio = pca.explained_variance_ratio_.cumsum()

# The heatmap uses a logarithmic colour scale, which needs a strictly positive
# lower limit: take the smallest positive entry of V_train instead of its minimum.
positive_entries_ = V_train[V_train > 0]
vmin_ = positive_entries_.min() if positive_entries_.size else None

fig, _ = illustrate_nmf_components_for_paper(
    V_train, explained_variance_ratio=explained_variance_ratio, df_nmf_models=df_nmf_models,
    vmin=vmin_, vmax=V_train.max(), xlims=(-1,101), n_components=5
)
fig.savefig('V_v1.png')

Train model

In [None]:
# Fit a 5-component NMF (wrapped in a single-step pipeline) on V_train.
# W_train holds the transformed training rows, H_train the fitted components,
# and V_reconstructed the inverse transform of W_train.
nmf = NMF(n_components=5, init='nndsvd', max_iter=2000, random_state=42)
model = Pipeline([('nmf', nmf)])
W_train = model.fit_transform(V_train)
H_train = pd.DataFrame(model['nmf'].components_)
V_reconstructed = model['nmf'].inverse_transform(W_train)

- Single fingerprint over all trucks
- Assign weights by truck
- Assign weights by hypergraph cluster
- Assign weights by timerange

> The weights are highly skewed!

In [None]:
pd.DataFrame(W_train).describe()

In [None]:
fingerprints = {0: W_train.mean(axis=0)}
fig, ax = plt.subplots(figsize=(1,5))
sns.heatmap(pd.DataFrame(fingerprints[0]), annot=True, fmt=".3f", ax=ax, cmap='Blues', vmin=0, vmax=22, cbar=False)
ax.set_ylabel('Component')
ax.set_title('Fingerprint')

In [None]:
trucks_train = df_ts_approach2_train.vehicle_id
trucks_test = df_ts_approach2_test.vehicle_id
trucks_train

## Validation

In [None]:
# vehicle_id -> in_study_repair
replacement_dict = dict(zip(df_repair['vehicle_id'], df_repair['in_study_repair']))
# Series.map does one dictionary lookup per row. replace() with one dictionary
# entry per vehicle is far slower on a long series.
# Vehicle ids without an entry become NaN instead of keeping their id.
was_repaired = trucks_test.map(replacement_dict)
ax = was_repaired.value_counts().plot.bar()
ax.set_xlabel('Repaired?')

## Validation 1: No grouping; Decomposition matrix v1

In [None]:
W_test = model.transform(V_test)
W_test.shape

In [None]:
# Pair every distance with its repair flag and compare both groups in a boxplot.
# NOTE(review): `dist` is not assigned anywhere in the visible part of this
# notebook; confirm where it is supposed to come from.
df = pd.DataFrame(list(zip(dist, was_repaired)), columns=['distance', 'was_repaired']).astype({'was_repaired': bool})
df.head()
sns.boxplot(df, y='distance', x='was_repaired')

In [None]:
# Same boxplot as in the previous cell, with the y-axis limited to [0, 50].
# NOTE(review): `dist` is not assigned anywhere in the visible part of this
# notebook; confirm where it is supposed to come from.
df = pd.DataFrame(list(zip(dist, was_repaired)), columns=['distance', 'was_repaired']).astype({'was_repaired': bool})
df.head()
ax = sns.boxplot(df, y='distance', x='was_repaired')
ax.set_ylim(0, 50)

## Validation 2: Hypergraph clustering, Decomposition matrix v1

1. Differentiate
2. Forward-fill missing values
3. Construct fingerprints per group

no normalisation

> GROUPS ARE CONSTRUCTED ON THE WHOLE TRAINING SET (NOT ONLY ON HEALTHY DATA)

### Validation 2 (Construct fingerprints per group)

In [None]:
# select clustering solutions for training set: one row per entry of
# trucks_train, in the same order. A single vectorised .loc replaces the
# per-row lookup loop and keeps the column dtypes of df_clustering_solution.
df_clustering_solution_train = df_clustering_solution.loc[trucks_train.to_numpy()]

# select all rows from V and W for the corresponding operating mode;
# explicit boolean numpy masks make sure the arrays are filtered row-wise
# (an integer-typed column would otherwise be used as positions)
membership_masks_ = {
    om: df_clustering_solution_train[om].to_numpy(dtype=bool)
    for om in df_clustering_solution_train.columns
}
V_train_per_om = {om: V_train[mask_] for om, mask_ in membership_masks_.items()}
W_train_per_om = {om: W_train[mask_] for om, mask_ in membership_masks_.items()}

# construct fingerprints per operating mode: mean weight vector of its rows
fingerprints = {om: W_.mean(axis=0) for om, W_ in W_train_per_om.items()}
df_fingerprints = pd.DataFrame(fingerprints).T
df_fingerprints.head()

In [None]:
fig, ax = plt.subplots(figsize=(3, 4))
ax = sns.heatmap(df_fingerprints, cmap='Blues', ax=ax)
ax.set_title('Fingerprints')
ax.set_xlabel('component')
ax.set_ylabel('Fingerprint');

In [None]:
# One row per entry of trucks_test, in the same order; a single vectorised
# .loc replaces the per-row lookup loop.
df_clustering_solution_test = df_clustering_solution.loc[trucks_test.to_numpy()]
W_test = model.transform(V_test)
W_test.shape

In [None]:
fig, ax = plt.subplots(figsize=(3, 12))
ax = sns.heatmap(W_test, cmap='Blues', ax=ax, vmax=50)
ax.set_title('$W_{test}$')
ax.set_xlabel('component')
ax.set_ylabel('Fingerprint');

In [None]:
# Euclidean distance of every row of W_test to every fingerprint: one
# vectorised norm per fingerprint instead of a Python loop over all rows.
# `distances_per_fingerprints` is now a dict (fingerprint key -> distance array).
distances_per_fingerprints = {
    om: np.linalg.norm(W_test - f, axis=1) for om, f in fingerprints.items()
}

df_distances_per_fingerprints = pd.DataFrame(distances_per_fingerprints)
df_distances_per_fingerprints.head()

In [None]:
ncols = 6
nrows = math.ceil(len(df_clustering_solution_test.columns) / ncols)
fig, axes = plt.subplots(figsize=(15,15), nrows=nrows, ncols=ncols, sharex=True, sharey=False)
# Positional numpy masks: was_repaired keeps the row labels of trucks_test,
# while df_distances_per_fingerprints has a fresh 0..n-1 index. Combining them
# as pandas Series would align on labels instead of on row position.
is_healthy_ = ~was_repaired.astype(bool).to_numpy()
for col, ax in zip(df_clustering_solution_test.columns, axes.flat):
    in_operating_mode_ = df_clustering_solution_test[col].to_numpy(dtype=bool)
    df_ = df_distances_per_fingerprints[col][in_operating_mode_ & is_healthy_]
    with warnings.catch_warnings(action="ignore"):
        sns.histplot(df_, bins=np.arange(0, 160, 2), ax=ax)
        median = df_.median()
        ax.axvline(median, label='median distance', color='red', linestyle='dashed')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.9, s=f'median={round(median, 2)}', color='red')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.8, s=f'p99={round(df_.quantile(0.99), 2)}', color='red')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.7, s=f'p95={round(df_.quantile(0.95), 2)}', color='red')
    ax.set_xlim([0, 160])
    ax.set_xlabel('Distance')
    ax.set_title(col)
fig.suptitle('Distribution of distances per operating mode (healthy)', size=18, color='blue')
fig.tight_layout()
fig.savefig('distribution_healthy.png')

In [None]:
ncols = 6
nrows = math.ceil(len(df_clustering_solution_test.columns) / ncols)
fig, axes = plt.subplots(figsize=(15,15), nrows=nrows, ncols=ncols, sharex=True, sharey=False)
# Positional numpy masks: was_repaired keeps the row labels of trucks_test,
# while df_distances_per_fingerprints has a fresh 0..n-1 index. Combining them
# as pandas Series would align on labels instead of on row position.
is_repaired_ = was_repaired.astype(bool).to_numpy()
for col, ax in zip(df_clustering_solution_test.columns, axes.flat):
    in_operating_mode_ = df_clustering_solution_test[col].to_numpy(dtype=bool)
    df_ = df_distances_per_fingerprints[col][in_operating_mode_ & is_repaired_]
    with warnings.catch_warnings(action="ignore"):
        sns.histplot(df_, bins=np.arange(0, 160, 2), ax=ax)
        median = df_.median()
        ax.axvline(median, label='median distance', color='red', linestyle='dashed')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.9, s=f'median={round(median, 2)}', color='red')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.8, s=f'p99={round(df_.quantile(0.99), 2)}', color='red')
        ax.text(x=median + 0.05 * median, y=ax.get_ylim()[1] * 0.7, s=f'p95={round(df_.quantile(0.95), 2)}', color='red')
    ax.set_xlim([0, 160])
    ax.set_xlabel('Distance')
    ax.set_title(col)
fig.suptitle('Distribution of distances per operating mode (repaired)', size=18, color='blue')
fig.tight_layout()
fig.savefig('distribution_repaired.png')

In [None]:
n_components = 3  # TODO: replace with elbow method

# construct fingerprints per operating mode
# NOTE(review): V_test_per_om is not assigned in the visible part of this
# notebook. The loop only instantiates unfitted pipelines and rebinds the
# globals `nmf` and `model`, replacing the model fitted in the "Train model"
# cell. This looks unfinished; confirm before running.
for om, V_test_ in V_test_per_om.items():
    nmf = NMF(n_components=n_components, init='nndsvd', max_iter=1000, random_state=42)
    model = Pipeline([('nmf', nmf)])

In [None]:
V_test.shape

In [None]:
pd.DataFrame(df_)

In [None]:
# FIXME: the statement below ended in an empty subscript
# (`df_clustering_solution.loc[]`), which is a syntax error. It is commented
# out until the intended row selection is known.
# replacement_dict = dict(zip(df_repair['vehicle_id'], df_clustering_solution.loc[]))
#df_spec_clustering_per_timestamp = df_clustering_solution

---

In [None]:
def get_maximum_number_of_timestamps(group, max_timestamps=25):
    """ Return at most the first `max_timestamps` rows of `group`.

    :param group: Pandas object that supports positional slicing via `.iloc`.
    :param max_timestamps: Maximum number of leading rows to keep (default 25).
    :return: The first `max_timestamps` rows, or all rows if there are fewer.
    """
    return group.iloc[:max_timestamps]

In [None]:
THROWN_AWAY_IDS = []
sol = {}
unique_vehicle_ids = df_ts.vehicle_id.unique()
LEN_TS = 25
# One groupby pass instead of a full-frame boolean filter per vehicle;
# sort=False keeps the vehicles in order of first appearance, like `unique()`.
for id_, df_ in tqdm(df_ts.groupby('vehicle_id', sort=False), total=len(unique_vehicle_ids)):
    matrix = df_[ATTRIBUTE_COLUMNS].iloc[:LEN_TS]
    # A vehicle with fewer than LEN_TS rows cannot fill a complete feature row
    # of len(ATTRIBUTE_COLUMNS) * LEN_TS values, so it is left out.
    if len(matrix) < LEN_TS:
        THROWN_AWAY_IDS.append(id_)
        continue
    # scale every column of this vehicle's matrix on its own, then flatten row by row
    normalized_matrix = MinMaxScaler().fit_transform(matrix)
    sol[id_] = pd.Series(normalized_matrix.flatten())

n_thrown_away = len(THROWN_AWAY_IDS)
print(f'Threw away {n_thrown_away} trucks')

feature_space = pd.DataFrame(sol).T
feature_space

In [None]:
# Remove the trucks that were thrown away from both id sets.
thrown_away_ = set(THROWN_AWAY_IDS)
kept_train_ = set(ALL_TRUCK_IDS_TRAIN) - thrown_away_
kept_test_ = set(ALL_TRUCK_IDS_TEST) - thrown_away_
TRUCK_IDS_TRAIN = np.array(list(kept_train_))
TRUCK_IDS_TEST = np.array(list(kept_test_))

print(f"Length of TRUCK_IDS_TRAIN={len(TRUCK_IDS_TRAIN)}")
print(f"Length of TRUCK_IDS_TEST={len(TRUCK_IDS_TEST)}")

In [None]:
feature_space_train = feature_space.loc[TRUCK_IDS_TRAIN]
feature_space_test = feature_space.loc[TRUCK_IDS_TEST]
feature_space_train

In [None]:
import matplotlib as mpl
import matplotlib.ticker as ticker

# plt.get_cmap works across matplotlib versions; mpl.cm.get_cmap was deprecated
# and later removed.
cmap = plt.get_cmap("Blues")
V = feature_space_train.to_numpy()
fig, ax = plt.subplots(figsize=(10, 20))
nrows = V.shape[0]
ncols = V.shape[1]
title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
ax.set_title(title_, fontsize=None)
# V.columns = BAND_COLUMNS
im = ax.imshow(
    V,
    cmap=cmap,
    aspect='auto',
    interpolation='nearest',
    # A logarithmic norm cannot start at 0, so the same limits are applied on
    # a linear colour scale instead.
    norm=mpl.colors.Normalize(vmin=0, vmax=1),
    # extent=[0.25,30.25,nrows,0]
)
ax.tick_params(axis='y', labelrotation=90)
# ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))

In [None]:
import matplotlib as mpl
import matplotlib.ticker as ticker

# plt.get_cmap works across matplotlib versions; mpl.cm.get_cmap was deprecated
# and later removed.
cmap = plt.get_cmap("Blues")
V = feature_space_train.to_numpy()
fig, ax = plt.subplots(figsize=(10, 20))
nrows = V.shape[0]
ncols = V.shape[1]
title_ = "Performance matrix V" + f" ({nrows} x {ncols})"
ax.set_title(title_, fontsize=None)
# V.columns = BAND_COLUMNS
im = ax.imshow(
    V,
    cmap=cmap,
    aspect='auto',
    interpolation='nearest',
    # A logarithmic norm cannot take a lower limit <= 0, so the padded limits
    # are applied on a linear colour scale instead.
    norm=mpl.colors.Normalize(vmin=-0.01, vmax=1.01),
    # extent=[0.25,30.25,nrows,0]
)
ax.tick_params(axis='y', labelrotation=90)
# ax.xaxis.set_minor_locator(ticker.MultipleLocator(1))

In [None]:
first_grouping = df_trucks_clustering1[0]
first_grouping = first_grouping[first_grouping].index
first_grouping

In [None]:
trucks = feature_space[feature_space.index.isin(first_grouping)]
trucks

In [None]:
# Split the trucks of the first grouping by their in_study_repair flag.
# NOTE: despite its name, `idx_healthy_trucks` holds the index of the trucks
# with in_study_repair == 1; trucks in it go to the "unhealthy" frame and all
# others to the "healthy" frame.
idx_healthy_trucks = df_trucks_clustering1['in_study_repair'][df_trucks_clustering1['in_study_repair'] == 1].index
unhealthy_trucks_first_grouping = trucks[trucks.index.isin(idx_healthy_trucks)]
healthy_trucks_first_grouping = trucks[~trucks.index.isin(idx_healthy_trucks)]
healthy_trucks_first_grouping

In [None]:
unhealthy_trucks_first_grouping

Issues to handle (see slides):

- Handling of very short read-outs (e.g. less than 25 time stamps)
- Handling of missing values
- Differencing of time-series
- Normalization of values