In [2]:
import os
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

def read_csv_file(fn_csv):
    """Load a single CSV from the ``prune_bsol`` directory.

    Parameters
    ----------
    fn_csv : str
        File name (not a full path) of a CSV located inside ``prune_bsol/``.

    Returns
    -------
    pd.DataFrame
        The parsed table. The ``bsol`` column is read as strings rather than
        being parsed as a number.
    """
    csv_path = f'prune_bsol/{fn_csv}'
    column_types = {'bsol': 'str'}
    return pd.read_csv(csv_path, dtype=column_types)

def flip_bsol(bsol:str) -> str:
    """Normalise a bit string so that it never starts with ``'1'``.

    When the first character is ``'1'`` every character is inverted
    (``'1'`` becomes ``'0'`` and any other character becomes ``'1'``), so a
    string and its bitwise complement map to the same representative.
    Strings that do not start with ``'1'`` -- including the empty string --
    are returned unchanged.

    Parameters
    ----------
    bsol : str
        The bit string to normalise.

    Returns
    -------
    str
        ``bsol`` itself, or its character-wise inversion.
    """
    # startswith() is safe on '' whereas bsol[0] raises IndexError.
    if not bsol.startswith('1'):
        return bsol
    return ''.join('0' if c == '1' else '1' for c in bsol)

def read_prune_bsol() -> pd.DataFrame:
    """Load every CSV in ``prune_bsol/`` into one normalised DataFrame.

    The files are parsed in parallel worker processes with ``read_csv_file``,
    concatenated, and then post-processed:

    * ``bsol`` is passed through ``flip_bsol``;
    * ``len_bsol`` is added, holding the string length of ``bsol``.

    Returns
    -------
    pd.DataFrame
        All rows from all CSV files, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``prune_bsol/`` contains no ``.csv`` files.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    csv_files = sorted(fn for fn in os.listdir('prune_bsol') if fn.endswith('.csv'))
    if not csv_files:
        raise ValueError("no .csv files found in 'prune_bsol'")

    # Each file is a tiny task, so hand the workers several files at a time to
    # cut inter-process overhead, while keeping enough chunks to balance load.
    n_workers = os.cpu_count() or 1
    chunksize = max(1, min(64, len(csv_files) // (4 * n_workers)))

    with ProcessPoolExecutor() as executor:
        data = list(tqdm(executor.map(read_csv_file, csv_files, chunksize=chunksize), total=len(csv_files)))

    df = pd.concat(data, ignore_index=True)
    df['bsol'] = df['bsol'].apply(flip_bsol)

    df['len_bsol'] = df['bsol'].str.len()
    return df

In [3]:
# Load all CSVs once; the plotting cells below reuse `df`.
df = read_prune_bsol()
# Show the column index of the loaded frame as the cell output.
df.columns

100%|██████████| 73647/73647 [01:10<00:00, 1042.24it/s]


In [11]:
import plotly.express as px

# scatter, x=len_bsol, y=count, y axis log scale
def plot_len_bsol(df:pd.DataFrame):
    """Scatter-plot the number of rows found for each ``len_bsol`` value.

    Rows are grouped by ``len_bsol`` and the non-null ``dij`` entries of each
    group are counted. The counts are drawn against ``len_bsol`` on a
    log-scaled y axis in an 800x400 figure, which is shown immediately.
    Nothing is returned.
    """
    per_length = (
        df[['len_bsol', 'dij']]
        .groupby('len_bsol')
        .count()
        .reset_index()
    )
    per_length.columns = ['len_bsol', 'count']

    fig = px.scatter(
        per_length,
        x='len_bsol',
        y='count',
        title='Length of BSOL vs Count',
        log_y=True,
    )
    fig.update_layout(width=800, height=400).show()

# Render the count-per-length scatter for the loaded data.
plot_len_bsol(df)

In [41]:
import numpy as np

def _kl_from_uniform(probs, length):
    """Return KL(p || q) in nats, with q uniform over 2**(length - 1) outcomes."""
    p = np.asarray(probs, dtype=float)
    # Convention: a term with p(x) == 0 contributes nothing to the sum.
    p = p[p > 0]
    # log(p / q) with q = 2**(1 - length) equals log(p) + (length - 1) * log(2).
    # Staying in log space avoids raising an integer to a negative power,
    # which NumPy integer scalars reject.
    return float(np.sum(p * (np.log(p) + (length - 1) * np.log(2))))


def plot_KL(df:pd.DataFrame, max_len:int=20):
    """Plot, per ``len_bsol``, the KL divergence of the observed BSOLs from uniform.

    For every length up to ``max_len`` the empirical probability of each
    distinct ``bsol`` is its share of the non-null ``dij`` count for that
    length. The divergence is

        KL(p || q) = sum(p(x) * log(p(x) / q(x))),  q(x) = 1 / 2**(len_bsol - 1)

    where q is the uniform distribution over all BSOLs of that length; the
    minus 1 is because the first bit is fixed to 0.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``len_bsol``, ``bsol`` and ``dij`` columns.
    max_len : int, default 20
        Only rows with ``len_bsol <= max_len`` are considered.

    Returns
    -------
    pd.DataFrame
        One row per ``len_bsol`` with columns ``len_bsol``, ``prob`` (list of
        the probabilities of the distinct BSOLs of that length) and ``KL``.
        The scatter of ``KL`` against ``len_bsol`` is shown as a side effect.
    """
    counts = (
        df.loc[df['len_bsol'] <= max_len, ['len_bsol', 'bsol', 'dij']]
        .groupby(['len_bsol', 'bsol'])
        .count()
        .reset_index()
        .rename(columns={'dij': 'count'})
    )
    # Broadcast each length's total back onto its rows (no merge needed).
    counts['count_total'] = counts.groupby('len_bsol')['count'].transform('sum')
    counts['prob'] = counts['count'] / counts['count_total']

    result = counts[['len_bsol', 'prob']].groupby('len_bsol').agg(list).reset_index()
    result['KL'] = pd.Series(
        [_kl_from_uniform(probs, length)
         for probs, length in zip(result['prob'], result['len_bsol'])],
        index=result.index,
        dtype=float,
    )

    fig = px.scatter(result, x='len_bsol', y='KL', title='Length of BSOL vs KL Divergence')
    # set size
    fig.update_layout(width=800, height=400)
    fig.show()

    return result

# Plot the KL divergence per length and keep the per-length table in `df_KL`.
df_KL = plot_KL(df)

In [40]:
# For each BSOL length 2..6, draw the per-BSOL probabilities stored in df_KL
# as a bar chart, largest first, with values rounded to 3 decimals.
for len_bsol in range(2, 7):
    PROB = df_KL.loc[df_KL['len_bsol'] == len_bsol, 'prob'].values
    if len(PROB) > 0:
        # The matching row holds the list of probabilities for this length.
        PROB = sorted((round(p, 3) for p in PROB[0]), reverse=True)
        # bar, x=index, y=prob
        fig = px.bar(
            x=range(len(PROB)),
            y=PROB,
            title=f'Probability of BSOL of length {len_bsol}',
            labels={'x': 'Index', 'y': 'Probability'},
            text=PROB,
        )
        # set size
        fig.update_layout(width=800, height=400)
        fig.show()

0.08112390508146683