# Randomly shuffle only the markdown cells to get a real benchmark

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm

# Widen pandas' console rendering so long cell sources stay readable.
pd.set_option('display.width', 180)
pd.set_option('display.max_colwidth', 120)

# Root folder that the notebook JSON files and the orders CSV are read from.
data_dir = Path('../input/AI4Code')

In [None]:
# Upper bound on how many training notebook files are loaded.
NUM_TRAIN = 10000
def read_notebook(path):
    """Load one notebook JSON file into a DataFrame.

    The file is parsed with ``pd.read_json``; ``cell_type`` is read as a
    categorical column and ``source`` as strings. An ``id`` column holding
    the file name without its extension is added, and the index is named
    ``cell_id``.
    """
    column_dtypes = {'cell_type': 'category', 'source': 'str'}
    cells = pd.read_json(path, dtype=column_dtypes)
    cells = cells.assign(id=path.stem)
    return cells.rename_axis('cell_id')


# Path.glob yields entries in arbitrary, filesystem-dependent order, so sort
# before truncating to keep the NUM_TRAIN sample reproducible across runs.
paths_train = sorted((data_dir / 'train').glob('*.json'))[:NUM_TRAIN]
notebooks_train = [
    read_notebook(path) for path in tqdm(paths_train, desc='Train NBs')
]
# Stack all notebooks into one frame indexed by (id, cell_id). The sort is
# on the 'id' level only (sort_remaining=False), so cell_id is not a sort key.
df = (
    pd.concat(notebooks_train)
    .set_index('id', append=True)
    .swaplevel()
    .sort_index(level='id', sort_remaining=False)
)

In [None]:
# Preview the first rows of the combined training frame.
df.head()

In [None]:
# `read_csv(squeeze=True)` was deprecated in pandas 1.4 and removed in 2.0;
# squeezing the single remaining column afterwards yields the same Series.
df_orders = (
    pd.read_csv(
        data_dir / 'train_orders.csv',
        index_col='id',
    )
    .squeeze('columns')
    .str.split()  # Split the string representation of cell_ids into a list
)

In [None]:
# Display the Series of split cell-id lists, indexed by notebook id.
df_orders

In [None]:
def get_ranks(base, derived):
    """Return, for each item of `derived`, its position in `base`.

    Positions come from ``base.index``, so the first occurrence wins and a
    missing item raises whatever ``base.index`` raises.
    """
    return list(map(base.index, derived))

In [None]:
# Pair each notebook's true cell order with its cell ids in the order they
# appear in df; the right join keeps exactly the notebooks present in df.
cells_as_loaded = df.reset_index('cell_id').groupby('id')['cell_id'].apply(list)
df_orders_ = df_orders.to_frame().join(cells_as_loaded, how='right')

# For every notebook, store its cell ids next to their positions in the
# true order.
ranks = {
    id_: {'cell_id': cell_id, 'rank': get_ranks(cell_order, cell_id)}
    for id_, cell_order, cell_id in df_orders_.itertuples()
}

# Explode the per-notebook lists into one row per (id, cell_id).
df_ranks = pd.DataFrame.from_dict(ranks, orient='index')
df_ranks = df_ranks.rename_axis('id').apply(pd.Series.explode)
df_ranks = df_ranks.set_index('cell_id', append=True)

In [None]:
# Display the per-cell rank table indexed by (id, cell_id).
df_ranks

In [None]:
# Attach the rank column by joining on the shared (id, cell_id) index.
df = df.merge(df_ranks, left_index=True, right_index=True)
df.head()

In [None]:
# get pct_rank  rank: 1 2 3 -> 0.25 0.5 0.75
# The within-group position is computed with cumcount/transform instead of
# groupby.apply: since pandas 2.0 a transform-like apply prepends the group
# keys to the result index, which then no longer aligns with df on assignment.
rank_groups = df.groupby(["id", "cell_type"])["rank"]
df['pct_rank'] = (rank_groups.cumcount() + 1) / (rank_groups.transform('size') + 1)
# Every cell starts with a uniformly random predicted position.
df['pred'] = np.random.uniform(size=df.shape[0])
# Keep the code cells in their existing relative order by overwriting their
# random prediction with pct_rank.
is_code = df['cell_type'] == 'code'
df.loc[is_code, "pred"] = df.loc[is_code, "pct_rank"]
df.head()

In [None]:
from bisect import bisect

def count_inversions(a):
    """Count the pairs of elements of `a` that appear in descending order.

    Elements are inserted one at a time into a sorted list. Every earlier
    element strictly greater than the new one lies to the right of its
    insertion point and contributes one inversion.
    """
    total = 0
    seen = []  # elements visited so far, kept sorted
    for value in a:
        slot = bisect(seen, value)  # insertion point after any equal elements
        total += len(seen) - slot
        seen.insert(slot, value)
    return total


def kendall_tau(ground_truth, predictions):
    """Score predicted orderings against the true ones, pooled over instances.

    `ground_truth` and `predictions` are iterated in lockstep. Each
    predicted order is translated into positions within its true order and
    the inversions of that sequence are counted. The result is
    ``1 - 4 * total_inversions / sum(n * (n - 1))`` with ``n`` the length of
    each true order.
    """
    inversion_sum = 0
    pair_budget = 0  # twice the maximum possible inversions, summed over instances
    for true_order, predicted_order in zip(ground_truth, predictions):
        # Express the predicted order as positions in the true order.
        positions = list(map(true_order.index, predicted_order))
        inversion_sum += count_inversions(positions)
        size = len(true_order)
        pair_budget += size * (size - 1)
    return 1 - 4 * inversion_sum / pair_budget

In [None]:
# Perfect ranking: ordering cells by their true rank should score 1.
y_dummy = (
    df.reset_index('cell_id')
    .sort_values("rank")
    .groupby('id')['cell_id']
    .apply(list)
)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

In [None]:
# Benchmark: order by "pred", where only the markdown cells are random.
y_dummy = (
    df.reset_index('cell_id')
    .sort_values("pred")
    .groupby('id')['cell_id']
    .apply(list)
)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

In [None]:
# Fully random ordering of every cell: the score should be close to 0.
df['all_random'] = np.random.uniform(size=df.shape[0])
y_dummy = (
    df.reset_index('cell_id')
    .sort_values("all_random")
    .groupby('id')['cell_id']
    .apply(list)
)
kendall_tau(df_orders.loc[y_dummy.index], y_dummy)

# Submit

In [None]:
# Load every test notebook and stack them into one frame indexed by
# (id, cell_id), sorted on the 'id' level only.
paths_test = [*(data_dir / 'test').glob('*.json')]
notebooks_test = [read_notebook(path) for path in tqdm(paths_test, desc='Test NBs')]
df_test = pd.concat(notebooks_test).set_index('id', append=True).swaplevel()
df_test = df_test.sort_index(level='id', sort_remaining=False)

In [None]:
# Turn the (id, cell_id) index levels back into ordinary columns.
df_test = df_test.reset_index()
df_test.head()

In [None]:
# Use the current row position as the rank of every test cell.
df_test['rank'] = np.arange(len(df_test))
# The within-group position is computed with cumcount/transform instead of
# groupby.apply: since pandas 2.0 a transform-like apply prepends the group
# keys to the result index, so assigning its result back to df_test fails.
rank_groups = df_test.groupby(["id", "cell_type"])["rank"]
df_test['pct_rank'] = (rank_groups.cumcount() + 1) / (rank_groups.transform('size') + 1)
# Random position for every cell, then overwrite the code cells with
# pct_rank so they keep their existing relative order.
df_test["pred"] = np.random.uniform(size=df_test.shape[0])
is_code = df_test['cell_type'] == 'code'
df_test.loc[is_code, "pred"] = df_test.loc[is_code, "pct_rank"]

In [None]:
# Build the submission table: for each notebook, its cell ids sorted by
# predicted position and joined into one space-separated string.
sub_df = (
    df_test.sort_values("pred")
    .groupby("id")["cell_id"]
    .apply(" ".join)
    .reset_index()
    .rename(columns={"cell_id": "cell_order"})
)
sub_df.head()

In [None]:
# Write the submission table to submission.csv without the index column.
sub_df.to_csv("submission.csv", index=False)