In [None]:
import h5py
import pandas as pd


def analyze_hdf(filename, output_file):
    """Write a CSV summarising the shape of every protein's data node.

    For each root-level key of the HDF5 file, the node at
    ``<key>/pybel/processed/pdbbind/data`` is looked up and one CSV row is
    produced with the columns ``protein``, ``d_shape_len`` (number of
    dimensions), ``rows`` (size of axis 0) and ``columns`` (size of axis 1).
    Sizes of axes the node does not have are left empty. When the node cannot
    be located or measured, a message is printed and the row is written with
    all shape fields empty.

    Args:
        filename: Path of the HDF5 file to inspect; opened read-only.
        output_file: Path of the CSV report to write.
    """
    # numpy is not imported at the top of this file, so import it here, once
    # per call instead of once per protein.
    import numpy as np

    report_columns = ["protein", "d_shape_len", "rows", "columns"]
    analysis = []

    with h5py.File(filename, "r") as pdb:
        # Root-level keys are treated as protein identifiers.
        for protein in pdb.keys():
            try:
                node = pdb[protein]['pybel']['processed']['pdbbind']['data']
                # np.shape uses the node's own ``.shape`` when it has one, so
                # the data need not be loaded just to be measured.
                shape = tuple(np.shape(node))
                record = {
                    "protein": protein,
                    "d_shape_len": len(shape),
                    "rows": shape[0] if len(shape) > 0 else None,
                    "columns": shape[1] if len(shape) > 1 else None,
                }
            except Exception as exc:
                # Only ordinary errors are tolerated; KeyboardInterrupt and
                # SystemExit still propagate so a long run can be aborted.
                print(f"Protein: {protein} error ({type(exc).__name__}: {exc})")
                record = {
                    "protein": protein,
                    "d_shape_len": None,
                    "rows": None,
                    "columns": None,
                }
            analysis.append(record)

    # Passing the column list keeps the header present even when the file has
    # no root-level keys, so the report can always be read back by column name.
    pd.DataFrame(analysis, columns=report_columns).to_csv(output_file, index=False)

In [None]:
# Shape report for the demo HDF5 file.
analyze_hdf(
    filename="data/demo_2021/pdb_fast_with_demo_hydro.hdf",
    output_file="data/demo_2021/protein_data_shapes.csv",
)

In [None]:
# Shape report for the FAST core-test HDF5 file.
analyze_hdf(
    filename="data/fast/core_test.hdf",
    output_file="data/fast/fast_data_shapes.csv",
)

In [None]:
# Shape report for the ingenni core-test HDF5 file.
analyze_hdf(
    filename="data/ingenni/pdbbind2016_core_test.hdf",
    output_file="data/ingenni/ingenni_data_shapes.csv",
)


## Analyse shapes

In [None]:
import pandas as pd

# Read the three shape reports written by analyze_hdf, one frame per source.
fast_shapes, ingenni_shapes, demo_shapes = (
    pd.read_csv(report_path)
    for report_path in (
        "data/fast/fast_data_shapes.csv",
        "data/ingenni/ingenni_data_shapes.csv",
        "data/demo_2021/protein_data_shapes.csv",
    )
)



In [None]:
import pandas as pd
from functools import reduce

# Shape reports to compare, each paired with the label used in its column name
csv_files = [
    ("data/fast/fast_data_shapes.csv", "fast"),
    ("data/ingenni/ingenni_data_shapes.csv", "ingenni"),
    ("data/demo_2021/protein_data_shapes.csv", "demo"),
]

# One Series per report: its 'rows' values keyed by protein, named rows_<label>
dfs = [
    pd.read_csv(path).set_index("protein")["rows"].rename(f"rows_{name}")
    for path, name in csv_files
]

# Outer-join the Series one after another on 'protein' so that proteins
# missing from some reports are still kept
merged_df = dfs[0]
for other_rows in dfs[1:]:
    merged_df = pd.merge(merged_df, other_rows, on='protein', how='outer')

# Row-wise check used to flag proteins whose row counts differ between reports
def rows_disagree(row):
    """Return True when the non-null values in ``row`` are not all equal.

    Null entries (as judged by ``pd.notnull``) are ignored, so a value that is
    missing from some positions never counts as a disagreement by itself.
    """
    distinct = set()
    for value in row:
        if pd.notnull(value):
            distinct.add(value)
    return len(distinct) > 1

# Flag each protein whose row counts differ between the reports
row_flags = merged_df.apply(rows_disagree, axis=1)
merged_df["rows_diff"] = row_flags

# Persist the complete comparison, flag column included
merged_df.to_csv("data/protein_data_points_comparison.csv")

# Report only the flagged proteins
discrepancies = merged_df.loc[merged_df["rows_diff"]]
print("Proteins with different row counts across files:", discrepancies, sep="\n")


In [None]:
import pandas as pd
from functools import reduce

# Shape reports to compare (replace with your real files)
csv_files = [
    "data/fast/fast_data_shapes.csv",
    "data/ingenni/ingenni_data_shapes.csv",
    "data/demo_2021/protein_data_shapes.csv",
]

# Load each report indexed by protein and tag every column with _f<N>, where
# N is the 1-based position of the report in csv_files
dfs = [
    pd.read_csv(csv_path).set_index("protein").add_suffix(f"_f{file_number}")
    for file_number, csv_path in enumerate(csv_files, start=1)
]


def _outer_join_on_protein(left, right):
    """Outer-join two report frames on their shared 'protein' index."""
    return pd.merge(left, right, on='protein', how='outer')


# Fold all reports into a single table keyed by protein
merged_df = reduce(_outer_join_on_protein, dfs)

# Compare d_shape_len, rows, and columns across files
def highlight_discrepancies(row, col_prefix):
    """Return True when the values of one field differ between files in a row.

    The values compared are those whose label in ``row`` starts with
    ``col_prefix``; labels ending in ``_diff`` are skipped so that flag columns
    already added to the table are never compared against the data.

    Missing values are all treated as one and the same "missing" value. A field
    that is missing in every file is therefore not a discrepancy, while a field
    present in one file and missing in another still is.

    Args:
        row: One row of the merged comparison table (label -> value).
        col_prefix: Field name shared by the per-file columns, e.g. ``"rows"``.

    Returns:
        bool: True if more than one distinct value was found.
    """
    distinct = set()
    for label, value in row.items():
        label = str(label)
        if not label.startswith(col_prefix) or label.endswith("_diff"):
            continue
        # NaN never compares equal to NaN, so raw NaNs would each count as a
        # separate value in the set; collapse them to a single sentinel.
        distinct.add(None if pd.isna(value) else value)
    return len(distinct) > 1

# Add one boolean flag column per compared field, in this order
flag_columns = {
    "shape_len_diff": "d_shape_len",
    "rows_diff": "rows",
    "columns_diff": "columns",
}
for flag_name, field_prefix in flag_columns.items():
    merged_df[flag_name] = merged_df.apply(
        highlight_discrepancies, axis=1, args=(field_prefix,)
    )

# Optional: keep the complete comparison on disk
merged_df.to_csv("data/protein_comparison_result.csv")

# Show only the proteins with at least one flag set
any_flag_set = merged_df[list(flag_columns)].any(axis=1)
print("Discrepancies:")
print(merged_df[any_flag_set])
