## The .pt files in `all_st_cpc` folder

In [2]:
import os
import torch
from torch_geometric.data import Data

directory = "data/TCGA_GBMLGG/all_st_cpc"

correct_edge_index_count = 0

# iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".pt"):
        filepath = os.path.join(directory, filename)

        data = torch.load(filepath)

        num_nodes = data.num_nodes
        max_edge_index = data.edge_index.max().item()

        if max_edge_index >= num_nodes:
            print(
                f"In file {filename}, edge_index max ({max_edge_index}) is greater than or equal to the number of nodes ({num_nodes})."
            )
        else:
            correct_edge_index_count += 1
            print(f"In file {filename}, edge_index is correctly defined")

print(f"Correct edge_index count: {correct_edge_index_count}")

In file TCGA-HT-7481-01Z-00-DX4.C8D1B102-5254-431F-9A10-BB1DF1DFDF40_1.pt, edge_index is correctly defined
In file TCGA-06-0154-01Z-00-DX2.5d908d2e-08ba-4ae0-bb0e-34472a5ed4b3_2.pt, edge_index is correctly defined
In file TCGA-06-0147-01Z-00-DX2.57e3c7a3-b271-4ab3-b6a0-aac29f5d52a6_2.pt, edge_index is correctly defined
In file TCGA-FG-A60K-01Z-00-DX1.A5997F67-B39A-477A-BE71-BF72FD9596F8_2.pt, edge_index is correctly defined
In file TCGA-06-5859-01Z-00-DX1.ab40e405-f25a-4fd2-95a2-ed86f8a2007f_1.pt, edge_index is correctly defined
In file TCGA-CS-6290-01Z-00-DX1.BA4C9FC2-DD06-45F4-A7D2-5B3230DF027C_1.pt, edge_index is correctly defined
In file TCGA-HW-7491-01Z-00-DX1.0648acf5-5dbd-47ad-96fd-52d765398718_2.pt, edge_index is correctly defined
In file TCGA-14-1037-01Z-00-DX1.4a7c2db6-d389-4194-ac24-8a739c68c7f3_2.pt, edge_index is correctly defined
In file TCGA-14-1453-01Z-00-DX4.023ea4a4-364c-4785-af1c-16e580cda290_1.pt, edge_index is correctly defined
In file TCGA-19-2619-01Z-00-DX1.ed7e1

## The .pt files in `all_st_patches_512_cpc` folder, which comes from nine times duplication from `all_st_cpc`

In [4]:
import os
import torch
from torch_geometric.data import Data

directory = "data/TCGA_GBMLGG/all_st_patches_512_cpc"

correct_edge_index_count = 0

# iterate through each file in the directory
for filename in os.listdir(directory):
    if filename.endswith(".pt"):
        filepath = os.path.join(directory, filename)

        data = torch.load(filepath)

        num_nodes = data.num_nodes
        max_edge_index = data.edge_index.max().item()

        if max_edge_index >= num_nodes:
            print(
                f"In file {filename}, edge_index max ({max_edge_index}) is greater than or equal to the number of nodes ({num_nodes})."
            )
        else:
            correct_edge_index_count += 1

print(f"Correct edge_index count: {correct_edge_index_count}")

Correct edge_index count: 13545


## The .pt files in the orifinal `all_st_cpc_img` folder

In [7]:
import os
import torch
import torch_geometric
from torch_geometric.data import Data

# the pt_bi files from the original all_st_cpc_img folder
directory = "/Users/shizhexu/Downloads/TCGA_GBMLGG/all_st_cpc_img/pt_bi"

correct_edge_index_count = 0
incorrect_edge_index_count = 0

for filename in os.listdir(directory):
    if filename.endswith(".pt"):
        filepath = os.path.join(directory, filename)

        old_data = torch.load(filepath)

        new_data = torch_geometric.data.data.Data.from_dict(old_data.__dict__)

        # check if edge_index.max() is greater than the number of nodes
        num_nodes = new_data.num_nodes
        max_edge_index = new_data.edge_index.max().item()

        if max_edge_index >= num_nodes:
            print(
                f"In file {filename}, edge_index max ({max_edge_index}) is greater than or equal to the number of nodes ({num_nodes})."
            )
            incorrect_edge_index_count += 1
        else:
            correct_edge_index_count += 1

print(f"Correct edge_index count: {correct_edge_index_count}")
print(f"Incorrect edge_index count: {incorrect_edge_index_count}")

In file TCGA-HW-7493-01Z-00-DX1.5cb8de89-ec52-43fc-b977-7135e2b09397_2.pt, edge_index max (209) is greater than or equal to the number of nodes (205).
In file TCGA-06-0876-01Z-00-DX1.4268ceff-f46b-4d72-8606-45e79d935325_2.pt, edge_index max (760) is greater than or equal to the number of nodes (752).
In file TCGA-19-1386-01Z-00-DX1.07c678e9-131c-4548-a70f-bcff22967210_1.pt, edge_index max (804) is greater than or equal to the number of nodes (798).
In file TCGA-06-0176-01Z-00-DX4.98c6f166-0c65-47c5-9d26-c4e524b25577_1.pt, edge_index max (691) is greater than or equal to the number of nodes (687).
In file TCGA-QH-A6X5-01Z-00-DX1.A5F05E37-81B8-4390-8C0B-4DD1FDBC3853_1.pt, edge_index max (500) is greater than or equal to the number of nodes (494).
In file TCGA-06-0156-01Z-00-DX1.1303eb2c-6cf6-4173-804c-d52a111263b5_1.pt, edge_index max (333) is greater than or equal to the number of nodes (329).
In file TCGA-HT-7690-01Z-00-DX1.C810C130-7583-4700-B6B4-EE48175A1BE4_1.pt, edge_index max (874

## The example .pt files in `cell_graph_reconstruction` examples

In [9]:
import os
import torch
import torch_geometric
from torch_geometric.data import Data

# the pt_bi files from the original all_st_cpc_img folder
directory = "cell_graph_reconstruction/example_data/pts"

correct_edge_index_count = 0
incorrect_edge_index_count = 0

for filename in os.listdir(directory):
    if filename.endswith(".pt"):
        filepath = os.path.join(directory, filename)

        old_data = torch.load(filepath)

        new_data = torch_geometric.data.data.Data.from_dict(old_data.__dict__)

        # check if edge_index.max() is greater than the number of nodes
        num_nodes = new_data.num_nodes
        max_edge_index = new_data.edge_index.max().item()

        if max_edge_index >= num_nodes:
            print(
                f"In file {filename}, edge_index max ({max_edge_index}) is greater than or equal to the number of nodes ({num_nodes})."
            )
            incorrect_edge_index_count += 1
        else:
            correct_edge_index_count += 1

print(f"Correct edge_index count: {correct_edge_index_count}")
print(f"Incorrect edge_index count: {incorrect_edge_index_count}")

Correct edge_index count: 3
Incorrect edge_index count: 0


## how to preprocess the invalid edge indices

In [3]:
import torch
import torch_geometric

old_data = torch.load(
    "/Users/shizhexu/Downloads/TCGA_GBMLGG/all_st_cpc_img/pt_bi/TCGA-HW-7493-01Z-00-DX1.5cb8de89-ec52-43fc-b977-7135e2b09397_2.pt"
)
new_data = torch_geometric.data.data.Data.from_dict(old_data.__dict__)

print(new_data)
print(new_data.edge_index.max())
print(new_data.edge_index)

num_nodes = new_data.num_nodes
edge_index = new_data.edge_index

valid_edges_mask = (edge_index[0] < num_nodes) & (edge_index[1] < num_nodes)

filtered_edge_index = edge_index[:, valid_edges_mask]

new_data.edge_index = filtered_edge_index

print(new_data)
print(new_data.edge_index.max())
print(new_data.edge_index)

# processed_file_path = os.path.join(directory, 'processed_data.pt')
# torch.save(data, processed_file_path)

Data(x=[205, 1036], edge_index=[2, 1014], edge_attr=[1014, 1], centroid=[205, 2])
tensor(209)
tensor([[  0,   0,   0,  ..., 209, 209, 209],
        [  6,  13,  15,  ..., 195, 202, 204]])
Data(x=[205, 1036], edge_index=[2, 976], edge_attr=[1014, 1], centroid=[205, 2])
tensor(204)
tensor([[  0,   0,   0,  ..., 204, 204, 204],
        [  6,  13,  15,  ..., 187, 189, 202]])


In [None]:
import os
import torch
import torch_geometric
from torch_geometric.data import Data

# the pt_bi files from the original all_st_cpc_img folder
directory = "data/TCGA_GBMLGG/all_st_cpc_img/pt_bi"

correct_edge_index_count = 0

for filename in os.listdir(directory):
    if filename.endswith(".pt"):
        filepath = os.path.join(directory, filename)

        old_data = torch.load(filepath)

        new_data = torch_geometric.data.data.Data.from_dict(old_data.__dict__)

        # check if edge_index.max() is greater than the number of nodes
        num_nodes = new_data.num_nodes
        max_edge_index = new_data.edge_index.max().item()
        edge_index = new_data.edge_index

        if max_edge_index >= num_nodes:
            print(
                f"In file {filename}, edge_index max ({max_edge_index}) is greater than or equal to the number of nodes ({num_nodes})."
            )
            valid_edges_mask = (edge_index[0] < num_nodes) & (edge_index[1] < num_nodes)
            filtered_edge_index = edge_index[:, valid_edges_mask]
            new_data.edge_index = filtered_edge_index
            torch.save(new_data, filepath)
            print(f"The file {filename} has been fixed and saved.")
        else:
            correct_edge_index_count += 1

print(f"Correct edge_index count: {correct_edge_index_count}")
print(f"Incorrect edge_index count: {incorrect_edge_index_count}")