In [1]:
import pandas as pd

from visiontext.pandatools import full_pandas_display

# Example: de-duplicating logged metrics with pandas.
#
# Each dataset holds one "loss" value per ("epoch", "step") key, with every key
# written twice:
#   * data1: the first key is an exact repeat, the other two keys carry
#     conflicting loss values.
#   * data2: every repeated key carries the same loss (exact duplicates only).
data1 = {
    "epoch": [1, 1, 2, 2, 3, 3],
    "step": [10, 10, 20, 20, 30, 30],
    "loss": [0.1, 0.1, 0.3, 0.4, 0.5, 0.6],
}
data2 = {
    "epoch": [1, 1, 2, 2, 3, 3],
    "step": [10, 10, 20, 20, 30, 30],
    "loss": [0.1, 0.1, 0.3, 0.3, 0.5, 0.5],
}

# Columns that together identify one logged entry. Kept in a single place so
# the sort, group, index and de-duplication steps below cannot drift apart.
key_columns = ["epoch", "step"]

for i, data in enumerate([data1, data2]):
    print("=" * 70, f"Run {i}")
    df = pd.DataFrame(data)

    # Step 1: drop rows that are identical in every column. Any key that still
    # occurs more than once afterwards has conflicting values.
    df_nodup = df.drop_duplicates()
    df_final_sorted = df_nodup.sort_values(by=key_columns)
    # Drop columns that contain only NaN (a no-op for the sample data above).
    df_final_nona = df_final_sorted.dropna(axis=1, how="all")

    print("Dataframe:")
    with full_pandas_display():
        print(df_final_nona)
        print()

    # Step 2: count the remaining rows per key; a count above one marks a key
    # whose rows disagree.
    grouped = df_final_nona.groupby(key_columns).size()
    duplicates = grouped[grouped > 1]
    duplicated_index = duplicates.index

    # Move the key columns into the index so that .loc can select every row
    # belonging to one of the conflicting keys.
    df_final_indexed = df_final_nona.set_index(key_columns)
    df_dups = df_final_indexed.loc[duplicated_index]
    if len(df_dups) > 0:
        print("Mismatching duplicates:")
        with full_pandas_display():
            print(df_dups)
            print()

        # Step 3: resolve each conflict by keeping the first row per key.
        # reset_index() renumbers the rows before de-duplication, so the
        # surviving labels are positions in the sorted frame rather than the
        # original row labels.
        df_final_clean = df_final_indexed.reset_index().drop_duplicates(
            subset=key_columns, keep="first"
        )

        print("Clean")
        with full_pandas_display():
            print(df_final_clean)
            print()
    else:
        # Only exact duplicates existed and step 1 already removed them.
        print("No duplicates found")
        df_final_clean = df_final_nona

Dataframe:
   epoch  step  loss
0      1    10   0.1
2      2    20   0.3
3      2    20   0.4
4      3    30   0.5
5      3    30   0.6

Mismatching duplicates:
            loss
epoch step      
2     20     0.3
      20     0.4
3     30     0.5
      30     0.6

Clean
   epoch  step  loss
0      1    10   0.1
1      2    20   0.3
3      3    30   0.5

Dataframe:
   epoch  step  loss
0      1    10   0.1
2      2    20   0.3
4      3    30   0.5

No duplicates found
