In [None]:
# Switch the kernel's working directory to the parent of the current directory.
# NOTE(review): presumably this makes the project-level modules imported in the
# following cells (`repro`, `data`, `finetunes`) resolvable — confirm against the repo layout.
# Re-running this cell moves one more level up, so run it once per kernel session.
%cd ./../

In [None]:
import logging
import os

from dotenv import load_dotenv

# Load variables from the `.env` file first, then apply the fixed overrides
# used by this notebook (environment values must be strings).
load_dotenv()
os.environ.update({
	'INDEX': str(0),
	'DATA_DEL_SPLITS': str(False),
	'DATA_DEL_ALL': str(False),
})

# TODO: share the code with the `logs.py` file.
# Tab-separated records using `{}`-style formatting; root level is DEBUG.
logging.basicConfig(
	level=logging.DEBUG,
	style='{',
	format="{name}\t{asctime}\t{levelname}\t{message}\t",
)

# With the root logger at DEBUG, switch off these third-party loggers entirely.
for silenced_logger in ('PIL.PngImagePlugin', 'matplotlib', 'matplotlib.font_manager'):
	logging.getLogger(silenced_logger).disabled = True

In [None]:
import repro  # Imported for its side effects.

In [None]:
import data

In [None]:
import finetunes.kkk as kkk
import plotly.express as px

In [None]:
import numpy as np

def _label_sort_key(label):
	"""Sort key that orders integer-like label strings numerically.

	Labels are stored as strings below, so a plain `sorted` would yield
	'0', '1', '10', '11', ..., '2' once there are more than ten classes and the
	column positions of the count matrix would no longer follow the label ids.
	Labels that do not parse as integers sort lexicographically after the
	integer-like ones.
	"""
	try:
		return 0, int(label), label
	except ValueError:
		return 1, 0, label


# Sample indices of the splits selected by `data.data_dist_nodes_indices`,
# and the target label of every sample in each of those splits.
splits = [data.train_splits[i] for i in data.data_dist_nodes_indices]
labels_splits = [[data.train_data_all.targets[idx] for idx in split] for split in splits]

# Per split: the distinct labels with their occurrence counts, converted to
# plain `str` labels and `int` counts, then folded into a label -> count dict.
np_unique_splits = [np.unique(labels_split, return_counts=True) for labels_split in labels_splits]
np_unique_splits = [
	([str(label) for label in unique_labels_split], [int(count) for count in unique_labels_count_split])
	for unique_labels_split, unique_labels_count_split in np_unique_splits
]
labels_counts_splits = [dict(zip(*np_unique_split)) for np_unique_split in np_unique_splits]

# Count matrix: one row per split, one column per label (numeric label order);
# a label that is absent from a split counts as 0.
unique_labels = sorted({label for d in labels_counts_splits for label in d}, key=_label_sort_key)
labels_splits_vectors = np.array([[d.get(label, 0) for label in unique_labels] for d in labels_counts_splits], dtype=int)

# Row-normalised counts, i.e. the empirical label distribution of each split.
# NOTE: a split with no samples has a zero row sum and therefore yields NaN entries here.
labels_splits_probs_vectors = labels_splits_vectors / labels_splits_vectors.sum(axis=1, keepdims=True)

labels_splits_vectors

In [None]:
# Two stacked-bar views of the same count matrix: rows as nodes, then (transposed) rows as labels.
bar_chart_specs = (
	(
		labels_splits_vectors,
		'Train Data Distribution - Labels per Node',
		{'index': 'Node Index', 'value': 'Count', 'variable': 'Label'},
	),
	(
		labels_splits_vectors.T,
		'Train Data Distribution - Nodes per Label',
		{'index': 'Label', 'value': 'Count', 'variable': 'Node Index'},
	),
)

for chart_matrix, chart_title, chart_labels in bar_chart_specs:
	px.bar(chart_matrix, title=chart_title, labels=chart_labels).show()

In [None]:
# Cosine Similarity

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of the label-count matrix.
cosines = cosine_similarity(labels_splits_vectors)

cosine_fig = px.imshow(
	cosines,
	labels=dict(x='Node Index', y='Node Index', color='Cosine Sim.'),
	text_auto=True,  # Print each cell's value on the heatmap.
	title="Cosine Sim. between Labels' Dist. of Nodes' Train Datasets",
)
cosine_fig.show()

In [None]:
# Jensen-Shannon Distance

from scipy.spatial.distance import jensenshannon, pdist, squareform

# Pairwise Jensen-Shannon distance (base-2 logarithm) between the rows of the
# label-probability matrix. `pdist` returns the condensed distance vector and
# `squareform` expands it into the full symmetric matrix shown below.
jss = squareform(pdist(labels_splits_probs_vectors, metric=lambda u, v: jensenshannon(u, v, base=2)))

px.imshow(
	jss,
	title='Jensen-Shannon Dist. between Labels\' Dist. of Nodes\' Train Datasets',
	# Colorbar label spelling fixed ("Shanon" -> "Shannon") to match the title.
	text_auto=True, labels={'x': 'Node Index', 'y': 'Node Index', 'color': 'Jensen-Shannon Distance'},
	color_continuous_scale=px.colors.sequential.Plasma_r
).show()

In [None]:
# Total Variation Distance

from scipy.spatial.distance import pdist, squareform

def _total_variation(p, q):
	"""Return half of the summed absolute element-wise difference between `p` and `q`."""
	return 0.5 * np.sum(np.abs(p - q))


# Condensed pairwise distances between the probability rows, expanded to a square matrix.
tvs = squareform(pdist(labels_splits_probs_vectors, metric=_total_variation))

tv_fig = px.imshow(
	tvs,
	color_continuous_scale=px.colors.sequential.Plasma_r,
	labels=dict(x='Node Index', y='Node Index', color='Total Variation Distance'),
	text_auto=True,
	title="Total Variation Dist. between Labels' Dist. of Nodes' Train Datasets",
)
tv_fig.show()

In [None]:
# Thresholds passed to the selection criteria below: a lower bound for the
# cosine similarity, upper bounds for the two distances.
min_cosine = 0.5
max_js = 0.3
max_tv = 0.5


def _entry_lengths(indices):
	"""Return the length of every entry of `indices` as an integer array."""
	return np.array([len(entry) for entry in indices], dtype=int)


# NOTE(review): `kkk.mat_to_indices` is defined outside this notebook; it is
# assumed to return one collection of selected indices per matrix row — confirm.
cosine_indices = kkk.mat_to_indices(cosines, criteria=lambda value: value >= min_cosine)
js_indices = kkk.mat_to_indices(jss, criteria=lambda value: value <= max_js)
tv_indices = kkk.mat_to_indices(tvs, criteria=lambda value: value <= max_tv)

# K = how many indices were selected for each entry, per criterion.
cosine_ks = _entry_lengths(cosine_indices)
js_ks = _entry_lengths(js_indices)
tv_ks = _entry_lengths(tv_indices)

In [None]:
import pandas as pd

# One wide record per split: its label-count vector plus, for every criterion,
# the K value and the selected indices.
node_records = [
	{
		'i': node,
		'v': labels_splits_vectors[node],

		'cosine_k': cosine_ks[node],
		'js_k': js_ks[node],
		'tv_k': tv_ks[node],

		'cosine_peers': cosine_indices[node],
		'js_peers': js_indices[node],
		'tv_peers': tv_indices[node],
	}
	for node in range(len(splits))
]
wide_df = pd.DataFrame(node_records)

# Criterion name -> (K column, peers column) of the wide table.
melt_mapping = dict(
	cosine=('cosine_k', 'cosine_peers'),
	js=('js_k', 'js_peers'),
	tv=('tv_k', 'tv_peers'),
)

# Wide -> long: one frame per criterion with the shared column names 'k' and
# 'peers', tagged with the criterion in 'type'.
dfs = [
	wide_df[['i', 'v', k_name, peers_name]]
	.rename(columns={k_name: 'k', peers_name: 'peers'})
	.assign(type=kind)
	for kind, (k_name, peers_name) in melt_mapping.items()
]

df = pd.concat(dfs, ignore_index=True)
df

In [None]:
# Grouped bars: one group per node, one bar per criterion.
ks_fig = px.bar(
	df,
	x='i',
	y='k',
	color='type',
	barmode='group',
	labels=dict(k='K', i='Node Index', type='Criteria'),
	title='K Values per Nodes',
)

# Human-readable legend entry for every criterion, including its threshold.
type_criteria_mapping = dict(
	cosine=f"Cosine ≥ {min_cosine}",
	js=f"JS ≤ {max_js}",
	tv=f"TV ≤ {max_tv}",
)
for trace_name, legend_name in type_criteria_mapping.items():
	ks_fig.update_traces({'name': legend_name}, selector={'name': trace_name})

ks_fig.show()

# Min / mean / max / std of K per criterion, indexed by the readable criterion text.
df_summ = (
	df.groupby('type')
	.agg(**{f'k_{stat}': ('k', stat) for stat in ('min', 'mean', 'max', 'std')})
	.reset_index()
	.assign(criteria=lambda frame: frame['type'].apply(lambda kind: type_criteria_mapping[kind]))
	.drop(columns='type')
	.set_index('criteria')
)

df_summ