In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pprint

## Save Centroid Location Info

In [None]:
# Split the per-TMA nuclei table into one CSV of centroid coordinates per patch.
PATH_TO_PATCHES = '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/cellprofiler_out/stardist'
TMA = 6

PATCH_NUM = 4
PATH_TO_DF = os.path.join(PATH_TO_PATCHES, f'tma_{TMA}/patches/patch_num={PATCH_NUM}/nuclei_with_patch_ids.csv')
PATH_TO_OUTPUT_PATCHES = '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/spatial/stardist_cell_pos'

# Only the centroid coordinates and the grouping key are needed downstream.
patch_df = pd.read_csv(PATH_TO_DF)[['Location_Center_X', 'Location_Center_Y', 'group_id']]
group_ids = set(patch_df['group_id'])
grouped = patch_df.groupby(patch_df['group_id'])

# to_csv does not create missing parent directories, so make sure the target exists.
output_dir = os.path.join(PATH_TO_OUTPUT_PATCHES, f'tma_{TMA}/patches/patch_num={PATCH_NUM}')
os.makedirs(output_dir, exist_ok=True)

# Iterating the groupby yields (key, rows) pairs directly; rows whose group_id is
# missing are skipped by groupby instead of failing on `.split`.
for group_id, group_rows in grouped:
    # The output file is named after the part of group_id before its first '.'.
    patch_id = group_id.split('.')[0]
    group_rows.to_csv(os.path.join(output_dir, f'{patch_id}.csv'))
    

## Compute Ripley K Function Values in R

## Concatenating R Output to Other H&E Features

### Read in all spatial info

In [None]:
patch_num = 4
K_DIR = '/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/spatial/K'
# TMA ids whose K output is read; note that 7 is not among them.
TMA_IDS = (1, 2, 3, 4, 5, 6, 8)


def _read_k_rows(path):
    """Return the non-empty lines of ``path`` as a one-column DataFrame (column label 0).

    Each line is kept verbatim as a single string. The file is read directly
    rather than through ``pd.read_csv(..., sep='\n')`` because recent pandas
    releases reject a newline separator with a ValueError. No CSV quote handling
    is applied, so any quote characters stay in the text.
    """
    with open(path) as handle:
        rows = [line.rstrip('\n') for line in handle]
    return pd.DataFrame({0: pd.Series([row for row in rows if row], dtype=object)})


# One row per line of R output, across all TMAs, with a clean 0..n-1 index.
df = pd.concat(
    [_read_k_rows(os.path.join(K_DIR, f'tma_{tma}_patch_num={patch_num}.csv')) for tma in TMA_IDS],
    ignore_index=True,
)

# Split every row once and reuse the pieces.
fields_per_row = [row.split(',') for row in df[0]]
# The last comma-separated field identifies the patch; drop surrounding '/' and '"'.
patch_id = [fields[-1].strip('/"') for fields in fields_per_row]
# Number of comma-separated fields in each row (rows have varying length).
spat_len = [len(fields) for fields in fields_per_row]

# Wide table: one column per field position (integer labels 0, 1, 2, ...).
spat = df[0].str.split(',', expand=True)

# Any cell containing 'tma' is replaced by 0 so such strings do not remain
# among the value columns.
spat = spat.replace(regex=r'^.*tma.*$', value=0)
spat['Image'] = patch_id

# Keep the first occurrence of each patch id.
spat = spat.drop_duplicates(subset=['Image'])

# Shorter rows are padded with None by the split; use 0 instead.
spat = spat.fillna(0)


### Observe Data Distribution

In [None]:
# Histogram of how many comma-separated fields each row of the R output has.
fig, ax = plt.subplots()
ax.hist(spat_len)
ax.set(
    xlabel='Comma-separated fields per row',
    ylabel='Number of rows',
    title='Row length of the spatial K output',
)
plt.show()

### Data Processing and Concatenation

In [None]:
# Directory holding the existing train/val/test feature splits for this patch_num.
DIR = f'/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/data_splits/custom_splits/cellprofiler/stardist_patch_num={patch_num}'

# Input split files, all located directly under DIR.
PATH_TO_TRAIN, PATH_TO_VAL, PATH_TO_TEST = (
    os.path.join(DIR, split_file) for split_file in ('train.csv', 'val.csv', 'test.csv')
)

# Root directory under which the joined splits are written.
PATH_TO_OUTPUT = f'/deep/group/aihc-bootcamp-fall2021/lymphoma/processed/data_splits/custom_splits/K/stardist_K_patch_num={patch_num}'

#### Save Training Set

In [None]:
# Join the existing training features with the spatial columns on the patch id.
old_train = pd.read_csv(PATH_TO_TRAIN)
new_train = old_train.set_index('Image').join(spat.set_index('Image'), how="inner")
# Row counts before and after the inner join (rows without a spatial match are dropped).
print(len(old_train), len(new_train))

# 'Image' became the index above, so it must not be requested as a column.
old_features = [col for col in old_train.columns if col != 'Image']

# Save the full data (all spatial columns).
full_dir = os.path.join(PATH_TO_OUTPUT, 'full')
os.makedirs(full_dir, exist_ok=True)  # to_csv does not create missing directories
new_train.to_csv(os.path.join(full_dir, 'train.csv'))

# Save truncated variants: the old features plus the first d spatial columns
# (integer labels 0..d-1). Each folder name matches the number of columns kept.
for d in (500, 300, 250, 200, 100):
    out_dir = os.path.join(PATH_TO_OUTPUT, f'd={d}')
    os.makedirs(out_dir, exist_ok=True)
    new_train[old_features + list(range(d))].to_csv(os.path.join(out_dir, 'train.csv'))

#### Save Validation Set

In [None]:
# Join the existing validation features with the spatial columns on the patch id.
old_val = pd.read_csv(PATH_TO_VAL)
new_val = old_val.set_index('Image').join(spat.set_index('Image'), how="inner")
# Row counts before and after the inner join.
print(len(old_val), len(new_val))

# Every column except the first one of the original table.
old_features = old_val.columns.tolist()[1:]

# Full data: all joined columns.
new_val.to_csv(os.path.join(PATH_TO_OUTPUT, 'full', 'val.csv'))

# Truncated variants: old features plus the first n_spatial integer-labelled columns.
for n_spatial in (500, 300, 250, 200, 100):
    selected = old_features + list(range(n_spatial))
    new_val[selected].to_csv(os.path.join(PATH_TO_OUTPUT, f'd={n_spatial}', 'val.csv'))

#### Save Test Set

In [None]:
# Join the existing test features with the spatial columns on the patch id.
old_test = pd.read_csv(PATH_TO_TEST)
new_test = old_test.set_index('Image').join(spat.set_index('Image'), how="inner")
# Row counts before and after the inner join (rows without a spatial match are dropped).
print(len(old_test), len(new_test))

# 'Image' became the index above; selecting it as a column would raise KeyError,
# so it is excluded from the feature list.
old_features = [col for col in old_test.columns if col != 'Image']

# Save the full data (all spatial columns).
full_dir = os.path.join(PATH_TO_OUTPUT, 'full')
os.makedirs(full_dir, exist_ok=True)  # to_csv does not create missing directories
new_test.to_csv(os.path.join(full_dir, 'test.csv'))

# Save truncated variants: the old features plus the first d spatial columns
# (integer labels 0..d-1).
for d in (500, 300, 250, 200, 100):
    out_dir = os.path.join(PATH_TO_OUTPUT, f'd={d}')
    os.makedirs(out_dir, exist_ok=True)
    new_test[old_features + list(range(d))].to_csv(os.path.join(out_dir, 'test.csv'))