In [2]:
import geojson
import geopandas as gpd
import json
import numpy as np
import pandas as pd
import ast
import os

In [71]:
# Example file structure: one detection file and one segmentation file for slide 865247.
# NOTE(review): these are absolute paths into a single user's home directory, while the
# annotation and listing cells use paths relative to the working directory — consider
# deriving both from one configurable base directory so the notebook runs elsewhere.
example_detection_file = "/home/sakthi01/Downloads/classifier_training_project/cellvit_pp_outputs/batch2/865247_cell_detection.geojson"
example_seg_file = "/home/sakthi01/Downloads/classifier_training_project/cellvit_pp_outputs/batch2/865247_cells.geojson"

In [72]:
# Parse the example detection GeoJSON into memory.
with open(example_detection_file, "r") as detection_fh:
    detection_data = geojson.load(detection_fh)

In [73]:
# Parse the example segmentation GeoJSON into memory.
with open(example_seg_file, "r") as seg_fh:
    seg_data = geojson.load(seg_fh)

In [78]:
# Peek at the properties stored on the first detection feature.
detection_data[0]['properties']

{'objectType': 'annotation',
 'classification': {'name': 'Neoplastic', 'color': [255, 0, 0]}}

In [67]:
# Load the ROI annotations that were drawn for the same slide.
example_annotation_file = "annotations/batch2/865247_good15_TS2_TEI6.geojson"
with open(example_annotation_file, "r") as annotation_fh:
    annotation_data = geojson.load(annotation_fh)

In [68]:
# Label (classification name) of the first annotated ROI.
annotation_data['features'][0]['properties']['classification']['name']

'good'

In [69]:
# Summarise every annotated ROI as its axis-aligned bounding box plus its label.
columns = ['start_x', 'end_x', 'start_y', 'end_y', 'label']
rois = {}
for roi_id, feature in enumerate(annotation_data['features']):
    # Only the first ring of the polygon is used to derive the bounds.
    ring = np.array(feature['geometry']['coordinates'][0])
    lower = ring.min(axis=0)
    upper = ring.max(axis=0)
    roi_label = feature['properties']['classification']['name']
    rois[roi_id] = [lower[0], upper[0], lower[1], upper[1], roi_label]

rois_df = pd.DataFrame.from_dict(rois, orient='index', columns=columns)
rois_df

Unnamed: 0,start_x,end_x,start_y,end_y,label
0,9200,10224,16531,17555,good
1,41813,42837,28770,29794,good
2,35945,36969,29121,30145,good
3,3255,4279,13299,14323,tum_to_str
4,32307,33331,24773,25797,good
5,34895,35919,28118,29142,good
6,35940,36964,28065,29089,good
7,16492,17516,31464,32488,tum_to_str
8,11728,12752,26879,27903,good
9,11200,12224,31474,32498,good


In [83]:
# Class name stored on the first detection feature.
detection_data[0]['properties']['classification']['name']

'Neoplastic'

In [79]:
def change_classification(current_class, label):
    """Return the class a nucleus should carry given the label of its ROI.

    Parameters
    ----------
    current_class : str
        Class currently assigned to the nucleus.
    label : str
        Annotation label of the ROI containing the nucleus:
        ``'good'`` keeps ``current_class`` unchanged,
        ``'tum_to_str'`` overrides it with ``'Connective'``,
        ``'tumepi_to_imm'`` overrides it with ``'Inflammatory'``.

    Returns
    -------
    str
        The (possibly overridden) class name.

    Raises
    ------
    ValueError
        If ``label`` is not one of the recognised ROI labels. Previously such
        labels fell through and silently produced ``None``, which would end up
        as missing values in the classification column.
    """
    if label == 'good':
        return current_class

    elif label == 'tum_to_str':
        return 'Connective'

    elif label == 'tumepi_to_imm':
        return 'Inflammatory'

    # Fail loudly rather than emit an unlabeled nucleus for a typo or a new label.
    raise ValueError(f"Unrecognised ROI label: {label!r}")

In [93]:
# Sixth coordinate entry of the first detection feature; each entry is an [x, y] pair.
detection_data[0]['geometry']['coordinates'][5]

[38796.0, 1336.0]

In [95]:
# Keep only the nuclei whose detection point lies inside an annotated ROI, attach the
# matching segmentation contour, and relabel each nucleus according to the ROI's label.
annotated_nuclei = {}
columns = ['detection_x', 'detection_y', 'contour', 'Classification']
index = 0

# The ROI bounds never change while scanning points, so pull them out of the DataFrame
# once as plain tuples instead of building a pandas Series per (point, ROI) pair.
roi_bounds = list(
    rois_df[['start_x', 'end_x', 'start_y', 'end_y', 'label']].itertuples(index=False, name=None)
)

for i, current_points in enumerate(detection_data):
    current_class = current_points['properties']['classification']['name']
    for j, current_point in enumerate(current_points['geometry']['coordinates']):
        current_x = current_point[0]
        current_y = current_point[1]
        for start_x, end_x, start_y, end_y, roi_label in roi_bounds:
            # Bounds are inclusive on all four sides.
            if start_x <= current_x <= end_x and start_y <= current_y <= end_y:
                new_class = change_classification(current_class, roi_label)
                # NOTE(review): assumes entry j of feature i in the segmentation file is the
                # same nucleus as entry j of feature i in the detection file — confirm.
                annotated_nuclei[index] = [current_x, current_y, seg_data[i]['geometry']['coordinates'][j], new_class]
                index += 1
                # A point is recorded at most once: the first matching ROI wins.
                break

annotated_nuclei_df = pd.DataFrame.from_dict(annotated_nuclei, orient='index', columns=columns)
annotated_nuclei_df

Unnamed: 0,detection_x,detection_y,contour,Classification
0,39283.5,4882.5,"[[[39282, 4868], [39282, 4868], [39278, 4868],...",Inflammatory
1,41779.0,12141.0,"[[[41781, 12135], [41780, 12136], [41780, 1213...",Inflammatory
2,41763.5,12474.0,"[[[41764, 12470], [41764, 12471], [41764, 1247...",Inflammatory
3,41733.5,12912.0,"[[[41734, 12906], [41734, 12906], [41732, 1290...",Inflammatory
4,41727.5,12938.5,"[[[41726, 12936], [41725, 12936], [41725, 1293...",Inflammatory
...,...,...,...,...
34765,12282.5,27647.0,"[[[12282, 27645], [12282, 27646], [12281, 2764...",Epithelial
34766,12018.5,27658.0,"[[[12016, 27654], [12015, 27656], [12015, 2765...",Epithelial
34767,12281.5,27682.5,"[[[12280, 27678], [12280, 27679], [12279, 2767...",Epithelial
34768,12191.0,27786.5,"[[[12192, 27783], [12191, 27784], [12190, 2778...",Epithelial


In [96]:
# Number of retained nuclei per final class label.
annotated_nuclei_df['Classification'].value_counts()

Classification
Inflammatory    18595
Connective       8233
Neoplastic       7933
Epithelial          9
Name: count, dtype: int64

In [97]:
# List the annotation files available for batch2.
os.listdir("annotations/batch2/")

['865234_good30_TEI1.geojson',
 '865235_25good_TEI1.geojson',
 '854498_good25_TS10.geojson',
 '865247_good15_TS2_TEI6.geojson',
 '875227_good8_TS10.geojson',
 '866599_good15_TS15.geojson']

In [98]:
# Collect, in sorted order, the batch2 outputs whose names end with 'cells.geojson'.
segmentation_files = [
    file_name
    for file_name in sorted(os.listdir("cellvit_pp_outputs/batch2"))
    if file_name.endswith('cells.geojson')
]
print(f"Found {len(segmentation_files)} segmentation files.")

Found 6 segmentation files.


In [101]:
def extract_tiles(annotation_data, tile_size=256):
    """Split every annotated ROI into a grid of square tiles.

    For each feature, the bounding box of its first polygon ring is walked in
    steps of ``tile_size`` along x (outer loop) and y (inner loop). Tiles are
    numbered consecutively from 1 across all features.

    Parameters
    ----------
    annotation_data : mapping
        GeoJSON-like object with a ``'features'`` list; each feature must
        provide ``['geometry']['coordinates'][0]`` as a sequence of vertices
        whose first two components are x and y.
    tile_size : int, default 256
        Edge length of each square tile, in the same units as the coordinates.

    Returns
    -------
    pandas.DataFrame
        Columns ``tile_index, start_x, end_x, start_y, end_y``; the DataFrame
        index equals ``tile_index``. When an ROI extent is not a multiple of
        ``tile_size`` the last tile in that direction extends past the ROI.

    Raises
    ------
    ValueError
        If ``tile_size`` is not positive.
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size!r}")

    columns = ['tile_index', 'start_x', 'end_x', 'start_y', 'end_y']
    tiles = {}

    tile_counter = 0
    for feature in annotation_data['features']:
        ring = np.asarray(feature['geometry']['coordinates'][0])
        # GeoJSON vertices may be floats, which range() rejects; snap the box
        # outwards to whole numbers (a no-op for integer coordinates).
        lower = np.floor(ring.min(axis=0))
        upper = np.ceil(ring.max(axis=0))
        start_x, start_y = int(lower[0]), int(lower[1])
        end_x, end_y = int(upper[0]), int(upper[1])

        for x in range(start_x, end_x, tile_size):
            for y in range(start_y, end_y, tile_size):
                tile_counter += 1
                tiles[tile_counter] = [tile_counter, x, x + tile_size, y, y + tile_size]

    tiles_df = pd.DataFrame.from_dict(tiles, orient='index', columns=columns)
    return tiles_df

In [104]:
# Tile grid for every ROI in the example annotation file.
tiles_df = extract_tiles(annotation_data)
tiles_df

Unnamed: 0,tile_index,start_x,end_x,start_y,end_y
1,1,9200,9456,16531,16787
2,2,9200,9456,16787,17043
3,3,9200,9456,17043,17299
4,4,9200,9456,17299,17555
5,5,9456,9712,16531,16787
...,...,...,...,...,...
364,364,5961,6217,14328,14584
365,365,6217,6473,13560,13816
366,366,6217,6473,13816,14072
367,367,6217,6473,14072,14328


In [105]:
def set_tile_index(row, tiles_df):
    """Find the tile that contains a nucleus.

    Parameters
    ----------
    row : mapping
        Must provide ``'detection_x'`` and ``'detection_y'``.
    tiles_df : pandas.DataFrame
        Tile table with columns ``tile_index, start_x, end_x, start_y, end_y``.

    Returns
    -------
    The ``tile_index`` of the first tile (in ``tiles_df`` row order) whose
    bounds contain the point, or ``numpy.nan`` when no tile does. Bounds are
    inclusive on all sides, so a point on an edge shared by two tiles is
    assigned to whichever of them comes first in ``tiles_df``.
    """
    detection_x = row['detection_x']
    detection_y = row['detection_y']

    # Test all tiles at once instead of materialising one row Series per tile;
    # this function is called once per nucleus, so the scan is the hot path.
    inside = (
        (tiles_df['start_x'] <= detection_x)
        & (tiles_df['end_x'] >= detection_x)
        & (tiles_df['start_y'] <= detection_y)
        & (tiles_df['end_y'] >= detection_y)
    ).to_numpy()

    hits = np.flatnonzero(inside)
    if hits.size == 0:
        return np.nan

    # Positional lookup keeps the "first match in row order" behaviour.
    return tiles_df['tile_index'].iloc[hits[0]]

In [106]:
# Look up the containing tile for every annotated nucleus.
# NOTE(review): the result is only displayed here, not stored on annotated_nuclei_df.
annotated_nuclei_df.apply(lambda x: set_tile_index(x, tiles_df), axis = 1)

0        254
1        201
2        202
3        204
4        204
        ... 
34765    139
34766    136
34767    140
34768    136
34769    136
Length: 34770, dtype: int64