In [114]:
import pandas as pd

In [115]:
# Folder containing the semicolon-separated random-walk CSV exports.
# NOTE(review): this is an absolute, machine-specific path; it now lives in one
# place so it can be changed once when the notebook is run elsewhere.
WALK_DIR = "/Users/sam/Documents/Wesleyan/Thayer_Lab/2025/summer/bharat/random_walk"

# Filename infix selecting which set of walk files is loaded:
#   "allos->lig" -> <state>_allos->lig_walks.csv
#   "trans_abs"  -> <state>_trans_abs_walks.csv
WALK_KIND = "allos->lig"


def load_walks(state: str, kind: str = WALK_KIND, directory: str = WALK_DIR) -> pd.DataFrame:
    """Read ``<directory>/<state>_<kind>_walks.csv`` into a DataFrame.

    Args:
        state: Filename prefix of the file to read ("P", "PL", "AP" or "APL").
        kind: Filename infix; defaults to ``WALK_KIND``.
        directory: Folder holding the CSV files; defaults to ``WALK_DIR``.

    Returns:
        The file contents parsed with ``;`` as the field separator.
    """
    return pd.read_csv(f"{directory}/{state}_{kind}_walks.csv", sep=";")


P = load_walks("P")
PL = load_walks("PL")
AP = load_walks("AP")
APL = load_walks("APL")

In [116]:
def find_successful_paths(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows whose ``termination_code`` equals 1, ordered by ``residue``.

    The input frame is not modified; the result keeps the original index labels.
    """
    is_success = df["termination_code"] == 1
    return df.loc[is_success].sort_values(["residue"])

In [117]:
# Filter each data set down to its termination_code == 1 rows, in the order P, PL, AP, APL.
successful_P, successful_PL, successful_AP, successful_APL = (
    find_successful_paths(walks) for walks in (P, PL, AP, APL)
)

In [118]:
from collections import Counter

In [119]:
def count_nodes(df: pd.DataFrame):
    """Count how many times each node id appears across all ``path`` strings.

    Missing paths are skipped. Every remaining path is split on ``","`` and
    each piece is converted to ``int``, so every occurrence is counted
    (including repeats inside a single path).

    Returns:
        A Series of occurrence counts indexed by node id, sorted by node id.
    """
    paths = df["path"].dropna()
    node_lists = paths.str.split(",")
    # One row per visited node, converted from text to integer ids.
    node_ids = node_lists.explode().astype(int)
    tally = node_ids.value_counts()
    return tally.sort_index()

In [120]:
# Per-node occurrence counts for each filtered data set, in the order P, PL, AP, APL.
counts_P, counts_PL, counts_AP, counts_APL = (
    count_nodes(walks)
    for walks in (successful_P, successful_PL, successful_AP, successful_APL)
)

In [121]:
import matplotlib.pyplot as plt
import plotly.express as px

In [122]:
def graph_counts(counts: pd.Series, label: str, top_n: int | None = None):
    """Show an interactive bar chart of node frequencies.

    Args:
        counts: Series whose index holds the nodes and whose values hold
            the frequencies.
        label: Text placed in parentheses in the chart title.
        top_n: When given, only the ``top_n`` largest counts are plotted;
            ``None`` plots every entry.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    shown = counts if top_n is None else counts.nlargest(top_n)
    title = (
        f"Node frequency ({label})"
        if top_n is None
        else f"Top {top_n} nodes ({label})"
    )

    # Name the values and the index so that reset_index() yields exactly the
    # "node" and "frequency" columns that px.bar is asked for below.
    named = shown.rename("frequency")
    table = named.rename_axis("node").reset_index()

    axis_labels = {"node": "Node number", "frequency": "Frequency"}
    fig = px.bar(table, x="node", y="frequency", title=title, labels=axis_labels)
    fig.update_layout(xaxis_tickangle=-45, bargap=0.2, template="plotly_white")
    fig.show()

In [123]:
# One chart per data set, drawn in the order P, PL, AP, APL.
for _counts, _label in (
    (counts_P, "P"),
    (counts_PL, "PL"),
    (counts_AP, "AP"),
    (counts_APL, "APL"),
):
    graph_counts(_counts, _label)

In [124]:
import plotly.graph_objects as go

In [125]:
def overlay_graph_counts(
    counts_dict: dict[str, pd.Series],
    top_n: int | None = None,
    title: str = "Node frequency comparison",
    opacity: float = 0.6,
):
    """Show several node-frequency series as overlaid bars in one figure.

    Args:
        counts_dict: Mapping from a legend label to a Series of frequencies
            indexed by node.
        top_n: When given, keep only the ``top_n`` nodes with the largest
            frequency summed over all series; ``None`` keeps every node.
        title: Figure title.
        opacity: Opacity applied to every bar trace. With
            ``barmode="overlay"`` the traces are drawn on top of each other,
            so fully opaque bars would hide any earlier trace at the same
            node; a value below 1 keeps all series visible.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Raises:
        ValueError: Propagated from ``pd.concat`` when ``counts_dict`` is empty.
    """
    # One column per label, one row per node; a node absent from a series
    # is treated as frequency 0.
    df = pd.concat(counts_dict, axis=1).fillna(0)

    # Optionally restrict to the top N nodes by total frequency.
    if top_n is not None:
        top_nodes = df.sum(axis=1).nlargest(top_n).index
        df = df.loc[top_nodes]

    fig = go.Figure()

    for label in df.columns:
        fig.add_bar(
            x=df.index,
            y=df[label],
            name=label,
            opacity=opacity,
        )

    fig.update_layout(
        title=title,
        xaxis_title="Node number",
        yaxis_title="Frequency",
        barmode="overlay",  # bars share the same x slot and are drawn on top of each other
        bargap=0.15,
        template="plotly_white",
    )

    fig.show()

In [129]:
# Legend label -> node-frequency series for each of the four data sets.
counts_dict = dict(P=counts_P, PL=counts_PL, AP=counts_AP, APL=counts_APL)

# Overlay the four series, limited to 20 nodes via top_n.
overlay_graph_counts(counts_dict, top_n=20)

In [127]:
# Separator between consecutive entries inside a "path" string.
PATH_DELIM = ","


def parse_path(p: str) -> list[str]:
    """Split a path string on ``PATH_DELIM`` and return the pieces.

    The pieces are returned as strings in their original order; no
    whitespace stripping or numeric conversion is applied.
    """
    tokens = p.split(PATH_DELIM)
    return tokens


def identify_breakpoints(df: pd.DataFrame):
    """Print the ten most frequently visited interior nodes of the connecting walks.

    Rows with ``termination_code == 1`` are treated as connecting walks. Each
    walk's ``path`` is split with ``parse_path``; its first and last entries
    are skipped and every remaining entry is tallied, so a node visited
    several times within one walk is counted each time.

    Prints the number of connecting walks followed by the ten rows with the
    highest counts. Returns None.
    """
    connecting = df[df["termination_code"] == 1]
    print(f"Connecting walks: {len(connecting)}")

    walks = connecting["path"].apply(parse_path)  # pyright: ignore[reportAttributeAccessIssue]

    # Tally only the interior of each walk: slice off the first and last entry.
    visits = Counter(node for walk in walks for node in walk[1:-1])

    ranked = pd.DataFrame(visits.items(), columns=["node", "count"]).sort_values(
        "count", ascending=False
    )

    print(ranked.head(10))
    

In [128]:
# Report the most-visited interior nodes for each data set, in the order P, PL, AP, APL.
for _walks in (P, PL, AP, APL):
    identify_breakpoints(_walks)

Connecting walks: 1000
    node  count
11   593    646
35   571    357
10   598    352
7    586    276
9    591    264
34   264    247
25   587    237
37   613    236
12   614    221
113  585    219
Connecting walks: 177244
    node  count
66   231  10084
111  278   9697
58    98   9686
41    97   9597
153   99   9468
265  143   9398
16    86   9372
34    87   9286
89    85   8958
239  138   8877
Connecting walks: 51941
     node  count
26    428   7038
44    420   6236
158   418   6074
59    422   5943
136   421   5747
84   1041   5612
25    429   5537
18    430   5512
55    416   5422
70    417   5065
Connecting walks: 7699
   node  count
5   428   4864
15  429   1859
40  468   1842
38  435   1342
10  251   1218
39  452   1133
4   234   1119
46  399   1084
14  403   1049
32  434   1027
