In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd

from pathlib import Path

In [3]:
# Raw string literal: the backslashes of the Windows path are kept verbatim
# instead of being parsed as (invalid) escape sequences such as '\d' and '\R'.
train_dir = Path(r'E:\data\RSNA2024')

In [4]:
class CFG:
    """Static notebook configuration: seed, image size and dataset paths."""

    random_seed = 42

    # Interpolated into the name of the PNG directory below.
    image_size = 256

    # Base folders; both are derived from the notebook-level `train_dir`.
    ROOT_FOLDER = train_dir.joinpath('original')
    DEST_FOLDER = train_dir

    # Image locations.
    PNG_DIR = DEST_FOLDER.joinpath(f'pngs_{image_size}')
    IMAGES_DIR = ROOT_FOLDER.joinpath('train_images')

    # CSV inputs, all located under ROOT_FOLDER.
    TRAIN_CSV = ROOT_FOLDER.joinpath('train.csv')
    FILES_CSV = ROOT_FOLDER.joinpath('train_files.csv')
    TRAIN_DESC_CSV = ROOT_FOLDER.joinpath('train_series_descriptions.csv')
    COORDS_CSV = ROOT_FOLDER.joinpath('train_label_coordinates.csv')

### train_df

In [5]:
# Read the study-level label table and the series description table.
train_df, train_desc_df = (
    pd.read_csv(csv_path) for csv_path in (CFG.TRAIN_CSV, CFG.TRAIN_DESC_CSV)
)

(train_df.shape, train_desc_df.shape)

((1975, 26), (6294, 3))

In [6]:
train_df.head()

Unnamed: 0,study_id,spinal_canal_stenosis_l1_l2,spinal_canal_stenosis_l2_l3,spinal_canal_stenosis_l3_l4,spinal_canal_stenosis_l4_l5,spinal_canal_stenosis_l5_s1,left_neural_foraminal_narrowing_l1_l2,left_neural_foraminal_narrowing_l2_l3,left_neural_foraminal_narrowing_l3_l4,left_neural_foraminal_narrowing_l4_l5,...,left_subarticular_stenosis_l1_l2,left_subarticular_stenosis_l2_l3,left_subarticular_stenosis_l3_l4,left_subarticular_stenosis_l4_l5,left_subarticular_stenosis_l5_s1,right_subarticular_stenosis_l1_l2,right_subarticular_stenosis_l2_l3,right_subarticular_stenosis_l3_l4,right_subarticular_stenosis_l4_l5,right_subarticular_stenosis_l5_s1
0,4003253,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Moderate,...,Normal/Mild,Normal/Mild,Normal/Mild,Moderate,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild
1,4646740,Normal/Mild,Normal/Mild,Moderate,Severe,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Moderate,...,Normal/Mild,Normal/Mild,Normal/Mild,Severe,Normal/Mild,Normal/Mild,Moderate,Moderate,Moderate,Normal/Mild
2,7143189,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,...,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild
3,8785691,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Moderate,...,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild
4,10728036,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,...,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Moderate,Normal/Mild


In [7]:
def _abbreviate(column_name):
    """Shorten a label column name, e.g. 'spinal_canal_stenosis_l1_l2' -> 'SCSL1L2'.

    The name is split on '_'; tokens longer than two characters are reduced to
    their first character, shorter tokens are kept whole, and the joined result
    is upper-cased.
    """
    pieces = []
    for token in column_name.split('_'):
        pieces.append(token[0] if len(token) > 2 else token)
    return ''.join(pieces).upper()


# The first column is not abbreviated; it is given the fixed name 'study_id'.
cols = ['study_id'] + [_abbreviate(name) for name in train_df.columns[1:]]

cols[:5]

['study_id', 'SCSL1L2', 'SCSL2L3', 'SCSL3L4', 'SCSL4L5']

In [8]:
dict(zip(train_df.columns, cols))

{'study_id': 'study_id',
 'spinal_canal_stenosis_l1_l2': 'SCSL1L2',
 'spinal_canal_stenosis_l2_l3': 'SCSL2L3',
 'spinal_canal_stenosis_l3_l4': 'SCSL3L4',
 'spinal_canal_stenosis_l4_l5': 'SCSL4L5',
 'spinal_canal_stenosis_l5_s1': 'SCSL5S1',
 'left_neural_foraminal_narrowing_l1_l2': 'LNFNL1L2',
 'left_neural_foraminal_narrowing_l2_l3': 'LNFNL2L3',
 'left_neural_foraminal_narrowing_l3_l4': 'LNFNL3L4',
 'left_neural_foraminal_narrowing_l4_l5': 'LNFNL4L5',
 'left_neural_foraminal_narrowing_l5_s1': 'LNFNL5S1',
 'right_neural_foraminal_narrowing_l1_l2': 'RNFNL1L2',
 'right_neural_foraminal_narrowing_l2_l3': 'RNFNL2L3',
 'right_neural_foraminal_narrowing_l3_l4': 'RNFNL3L4',
 'right_neural_foraminal_narrowing_l4_l5': 'RNFNL4L5',
 'right_neural_foraminal_narrowing_l5_s1': 'RNFNL5S1',
 'left_subarticular_stenosis_l1_l2': 'LSSL1L2',
 'left_subarticular_stenosis_l2_l3': 'LSSL2L3',
 'left_subarticular_stenosis_l3_l4': 'LSSL3L4',
 'left_subarticular_stenosis_l4_l5': 'LSSL4L5',
 'left_subarticular_ste

In [9]:
# Pair every current column name with its abbreviation by position, then rename in place.
short_names = {old: new for old, new in zip(train_df.columns, cols)}
train_df.rename(columns=short_names, inplace=True)
train_df.shape

(1975, 26)

In [10]:
train_df.iloc[0]

study_id        4003253
SCSL1L2     Normal/Mild
SCSL2L3     Normal/Mild
SCSL3L4     Normal/Mild
SCSL4L5     Normal/Mild
SCSL5S1     Normal/Mild
LNFNL1L2    Normal/Mild
LNFNL2L3    Normal/Mild
LNFNL3L4    Normal/Mild
LNFNL4L5       Moderate
LNFNL5S1    Normal/Mild
RNFNL1L2    Normal/Mild
RNFNL2L3    Normal/Mild
RNFNL3L4       Moderate
RNFNL4L5       Moderate
RNFNL5S1    Normal/Mild
LSSL1L2     Normal/Mild
LSSL2L3     Normal/Mild
LSSL3L4     Normal/Mild
LSSL4L5        Moderate
LSSL5S1     Normal/Mild
RSSL1L2     Normal/Mild
RSSL2L3     Normal/Mild
RSSL3L4     Normal/Mild
RSSL4L5     Normal/Mild
RSSL5S1     Normal/Mild
Name: 0, dtype: object

In [11]:
train_df.study_id.nunique()

1975

In [12]:
# Single-letter code for each severity label found in the label columns.
vals = dict([
    ('Normal/Mild', 'N'),
    ('Moderate', 'M'),
    ('Severe', 'S'),
])
vals

{'Normal/Mild': 'N', 'Moderate': 'M', 'Severe': 'S'}

In [13]:
# Every column after the first one holds severity labels; swap them for their codes.
label_cols = cols[1:]
train_df[label_cols] = train_df[label_cols].replace(vals)

In [14]:
train_df.sample(2)

Unnamed: 0,study_id,SCSL1L2,SCSL2L3,SCSL3L4,SCSL4L5,SCSL5S1,LNFNL1L2,LNFNL2L3,LNFNL3L4,LNFNL4L5,...,LSSL1L2,LSSL2L3,LSSL3L4,LSSL4L5,LSSL5S1,RSSL1L2,RSSL2L3,RSSL3L4,RSSL4L5,RSSL5S1
156,331605392,N,N,N,S,N,N,N,N,S,...,N,N,M,S,S,N,N,M,S,S
1583,3449872775,N,N,N,N,N,N,N,N,N,...,N,N,N,M,M,N,N,N,N,N


### coordinates_df

In [15]:
# Read the label coordinate table and the per-file metadata table.
coords_df, files_df = (
    pd.read_csv(csv_path) for csv_path in (CFG.COORDS_CSV, CFG.FILES_CSV)
)

(coords_df.shape, files_df.shape)

((48692, 7), (147218, 10))

In [16]:
# Later cells refer to this column as 'instance'.
column_aliases = {'instance_number': 'instance'}
coords_df.rename(columns=column_aliases, inplace=True)

In [17]:
coords_df.study_id.nunique(), coords_df.condition.nunique(), coords_df.level.nunique()

(1974, 5, 5)

In [18]:
coords_df.condition.unique(), coords_df.level.unique()

(array(['Spinal Canal Stenosis', 'Right Neural Foraminal Narrowing',
        'Left Neural Foraminal Narrowing', 'Left Subarticular Stenosis',
        'Right Subarticular Stenosis'], dtype=object),
 array(['L1/L2', 'L2/L3', 'L3/L4', 'L4/L5', 'L5/S1'], dtype=object))

In [19]:
coords_df.series_id.nunique()

6291

In [20]:
def _join_ids(frame, *columns):
    """Return a string Series made of the given columns of `frame` joined by '_'.

    Each column is converted with ``astype(str)`` and concatenated column-wise,
    which yields the same text as formatting every row with an f-string but
    without a Python-level call per row.
    """
    parts = [frame[column].astype(str) for column in columns]
    joined = parts[0]
    for part in parts[1:]:
        joined = joined + '_' + part
    return joined


# '<study_id>_<series_id>' identifies a series, adding '_<instance>' identifies one slice.
coords_df['ss_id'] = _join_ids(coords_df, 'study_id', 'series_id')
coords_df['instance_id'] = _join_ids(coords_df, 'study_id', 'series_id', 'instance')

train_desc_df['ss_id'] = _join_ids(train_desc_df, 'study_id', 'series_id')

In [21]:
train_desc_df.sample()

Unnamed: 0,study_id,series_id,series_description,ss_id
5164,3507369254,1753331996,Sagittal T1,3507369254_1753331996


In [22]:
coords_df.sample(2)

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id
42695,3759970625,3224960046,17,Right Subarticular Stenosis,L1/L2,216.71219,251.273866,3759970625_3224960046,3759970625_3224960046_17
21629,1935865758,956889897,15,Left Neural Foraminal Narrowing,L4/L5,313.509585,378.001663,1935865758_956889897,1935865758_956889897_15


In [23]:
# rename condition: keep the first letter of every space-separated word,
# e.g. 'Spinal Canal Stenosis' -> 'SCS'.
# Only the 'condition' column is needed, so map over that Series instead of
# building a row object for every record with DataFrame.apply(axis=1).
# NOTE: not idempotent - running this cell twice would shorten 'SCS' to 'S'.
coords_df['condition'] = coords_df['condition'].map(
    lambda name: ''.join(word[0] for word in name.split(' '))
)

In [24]:
# rename level: drop the slash, e.g. 'L1/L2' -> 'L1L2'
coords_df['level'] = coords_df['level'].map(lambda level_name: level_name.replace('/', ''))

In [25]:
# Combined condition + level code, e.g. 'SCS' + 'L1L2' -> 'SCSL1L2'.
coords_df['cl'] = coords_df['condition'].add(coords_df['level'])

In [26]:
coords_df.condition.nunique()

5

In [27]:
coords_df.head(10)

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl
0,4003253,702807833,8,SCS,L1L2,322.831858,227.964602,4003253_702807833,4003253_702807833_8,SCSL1L2
1,4003253,702807833,8,SCS,L2L3,320.571429,295.714286,4003253_702807833,4003253_702807833_8,SCSL2L3
2,4003253,702807833,8,SCS,L3L4,323.030303,371.818182,4003253_702807833,4003253_702807833_8,SCSL3L4
3,4003253,702807833,8,SCS,L4L5,335.292035,427.327434,4003253_702807833,4003253_702807833_8,SCSL4L5
4,4003253,702807833,8,SCS,L5S1,353.415929,483.964602,4003253_702807833,4003253_702807833_8,SCSL5S1
5,4003253,1054713880,4,RNFN,L4L5,187.961759,251.839388,4003253_1054713880,4003253_1054713880_4,RNFNL4L5
6,4003253,1054713880,4,RNFN,L5S1,198.240918,285.613767,4003253_1054713880,4003253_1054713880_4,RNFNL5S1
7,4003253,1054713880,5,RNFN,L3L4,187.227533,210.722753,4003253_1054713880,4003253_1054713880_5,RNFNL3L4
8,4003253,1054713880,6,RNFN,L1L2,194.56979,127.755258,4003253_1054713880,4003253_1054713880_6,RNFNL1L2
9,4003253,1054713880,6,RNFN,L2L3,191.632887,165.93499,4003253_1054713880,4003253_1054713880_6,RNFNL2L3


In [28]:
train_desc_df.series_description.unique()

array(['Sagittal T2/STIR', 'Sagittal T1', 'Axial T2'], dtype=object)

In [29]:
# train_desc_df[train_desc_df['ss_id'] == '4003253702807833'].series_description.values[0]

In [30]:
# Look up the series description of every coordinate row through its ss_id key.
# A single indexed lookup replaces the previous per-row boolean scan over the
# whole of train_desc_df.
plane_by_series = (
    train_desc_df
    .drop_duplicates(subset='ss_id')  # first match wins, as `.values[0]` did
    .set_index('ss_id')['series_description']
)

# Fail with a readable message (rather than an IndexError from `.values[0]`)
# when a coordinate row points at a series that has no description.
unknown_series = ~coords_df['ss_id'].isin(plane_by_series.index)
if unknown_series.any():
    raise KeyError(
        f'{int(unknown_series.sum())} coordinate rows reference a series absent from train_desc_df'
    )

coords_df['plane'] = coords_df['ss_id'].map(plane_by_series)

In [31]:
coords_df.sample(5)

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl,plane
39537,3485457199,2652956268,2,LSS,L1L2,214.440406,203.847844,3485457199_2652956268,3485457199_2652956268_2,LSSL1L2,Axial T2
15498,1387631768,1355036926,15,LNFN,L1L2,388.704784,185.087492,1387631768_1355036926,1387631768_1355036926_15,LNFNL1L2,Sagittal T1
42912,3781188430,2968472419,15,SCS,L3L4,272.154357,284.79483,3781188430_2968472419,3781188430_2968472419_15,SCSL3L4,Sagittal T2/STIR
31445,2780918205,3870989148,4,RSS,L1L2,147.392704,175.336195,2780918205_3870989148,2780918205_3870989148_4,RSSL1L2,Axial T2
26401,2372647393,3353386436,4,RNFN,L3L4,193.681334,210.381553,2372647393_3353386436,2372647393_3353386436_4,RNFNL3L4,Sagittal T1


In [32]:
# check that canal stenosis is not only in the axial plane
coords_df[(coords_df.condition == 'SCS') & (coords_df.plane != 'Axial T2')].sample()

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl,plane
6452,594735110,408469134,10,SCS,L3L4,396.057482,486.08885,594735110_408469134,594735110_408469134_10,SCSL3L4,Sagittal T2/STIR


In [33]:
# get the positive slices
coords_df.groupby(['study_id','series_id']).instance.unique()

study_id    series_id 
4003253     702807833                              [8]
            1054713880               [4, 5, 6, 11, 12]
            2448190387          [3, 4, 11, 19, 28, 35]
4646740     3201256954    [15, 16, 22, 28, 29, 34, 40]
            3486248476           [5, 6, 7, 15, 16, 17]
                                      ...             
4287160193  1507070277                             [8]
            1820446240          [4, 9, 10, 16, 22, 28]
4290709089  3274612423                             [9]
            3390218084    [2, 3, 5, 6, 10, 15, 20, 21]
            4237840455                  [4, 5, 11, 12]
Name: instance, Length: 6291, dtype: object

In [34]:
coords_df.ss_id.nunique(), coords_df.instance_id.nunique()

(6291, 24546)

### files_df

In [35]:
# NOTE(review): files_df was already read from this same CSV in the coordinates
# section; reading it again rebinds files_df to a fresh copy of the raw file.
files_df = pd.read_csv(CFG.FILES_CSV)

In [36]:
files_df.sample(5)

Unnamed: 0,study_id,series_id,image,instancenumber,rows,columns,slicethickness,spacingbetweenslices,patientposition,seriesdescription
19371,1524089207,2330513520,9,9,320,320,4.0,4.4,HFS,T1
62812,2657109031,2592276134,14,14,540,384,3.0,3.3,HFS,
102956,3707028884,3637147606,13,13,760,640,3.0,3.6,HFS,T1
79561,3098945529,4008721212,6,6,640,640,4.0,5.6,HFS,
122997,4222937692,4268316582,1,1,384,384,4.0,4.8,HFS,


In [37]:
# files_df.rename(columns={'patient': 'study_id', 'series': 'series_id', 'image': 'instance'}, inplace=True)

In [38]:
# Build the '<study_id>_<series_id>' and '<study_id>_<series_id>_<instancenumber>'
# keys column-wise; this gives the same strings as formatting each row but
# avoids a Python call per row on this large table.
files_df['ss_id'] = files_df['study_id'].astype(str) + '_' + files_df['series_id'].astype(str)
files_df['instance_id'] = files_df['ss_id'] + '_' + files_df['instancenumber'].astype(str)

In [39]:
source_dir = CFG.PNG_DIR

# PNG file name of every row: '<study_id>_<series_id>_<image>.png', built column-wise.
png_names = (
    files_df['study_id'].astype(str) + '_'
    + files_df['series_id'].astype(str) + '_'
    + files_df['image'].astype(str) + '.png'
)

# Join with the Path object so the platform's own separator is used instead of
# a hard-coded backslash (on Windows the resulting strings are unchanged).
files_df['filename'] = [str(source_dir / name) for name in png_names]

In [40]:
files_df.sample()

Unnamed: 0,study_id,series_id,image,instancenumber,rows,columns,slicethickness,spacingbetweenslices,patientposition,seriesdescription,ss_id,instance_id,filename
72583,2906351586,159538394,7,7,512,512,4.0,5.0,HFS,T1,2906351586_159538394,2906351586_159538394_7,E:\data\RSNA2024\pngs_256\2906351586_159538394...


In [41]:
train_desc_df.sample()

Unnamed: 0,study_id,series_id,series_description,ss_id
2860,1983333974,3990785743,Sagittal T2/STIR,1983333974_3990785743


In [42]:
# Attach the image dimensions and PNG path of the matching file to every
# coordinate row, using the shared 'instance_id' key.
file_cols = ['rows', 'columns', 'filename']
n_coords_before = len(coords_df)

coords_df = pd.merge(
    # Dropping columns left over from an earlier run keeps the cell re-runnable
    # (otherwise a second run would produce rows_x / rows_y etc.).
    coords_df.drop(columns=file_cols, errors='ignore'),
    files_df[['instance_id'] + file_cols],
    on='instance_id',
    how='inner',
    # Each instance_id must identify a single file row; duplicates would
    # silently multiply the coordinate rows.
    validate='many_to_one',
)

# An inner join silently drops coordinate rows without a matching file.
assert len(coords_df) == n_coords_before, (
    f'{n_coords_before - len(coords_df)} coordinate rows have no matching row in files_df'
)

In [43]:
coords_df.sample()

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl,plane,rows,columns,filename
33571,2966328820,356353555,10,RNFN,L1L2,278.835555,214.678518,2966328820_356353555,2966328820_356353555_10,RNFNL1L2,Sagittal T1,512,512,E:\data\RSNA2024\pngs_256\2966328820_356353555...


In [44]:
# TODO: make sure we match coords correctly
# Scale each point by the 'columns' (for x) and 'rows' (for y) values merged in from files_df.
coords_df['x_perc'] = coords_df['x'].div(coords_df['columns'])
coords_df['y_perc'] = coords_df['y'].div(coords_df['rows'])

In [45]:
# Partition the coordinate rows by whether their series is described as 'Axial T2'.
is_axial = coords_df['plane'].eq('Axial T2')
ax = coords_df[is_axial]
non_ax = coords_df[~is_axial]

ax.shape, non_ax.shape, coords_df.shape

((19220, 16), (29472, 16), (48692, 16))

In [46]:
coords_df.y.min()

2.063097514340344

In [47]:
# Print the minimum and maximum relative x / y position for each subset.
for subset in (ax, non_ax):
    x_frac, y_frac = subset['x_perc'], subset['y_perc']
    print(x_frac.min(), y_frac.min())
    print(x_frac.max(), y_frac.max())
    print('/////////////')

0.2849557522123894 0.3092621664050236
0.6631964653474212 0.8078096961632665
/////////////
0.0048828125 0.0040294873326959845
0.7684669901065448 0.9117647058823529
/////////////


In [48]:
coords_df.shape

(48692, 16)

In [49]:
files_df.sample(5)

Unnamed: 0,study_id,series_id,image,instancenumber,rows,columns,slicethickness,spacingbetweenslices,patientposition,seriesdescription,ss_id,instance_id,filename
46580,2249525087,1236584656,25,25,320,320,4.0,5.0,FFS,T2,2249525087_1236584656,2249525087_1236584656_25,E:\data\RSNA2024\pngs_256\2249525087_123658465...
67748,2790742955,3179679992,14,14,320,320,4.0,4.6,HFS,,2790742955_3179679992,2790742955_3179679992_14,E:\data\RSNA2024\pngs_256\2790742955_317967999...
24960,1670838975,3213388014,2,2,768,768,4.0,4.4,FFS,T1,1670838975_3213388014,1670838975_3213388014_2,E:\data\RSNA2024\pngs_256\1670838975_321338801...
2243,10728036,3491739931,5,5,512,512,4.0,5.0,FFS,T2,10728036_3491739931,10728036_3491739931_5,E:\data\RSNA2024\pngs_256\10728036_3491739931_...
56182,2493610993,391977977,11,11,512,512,4.0,5.0,HFS,T1,2493610993_391977977,2493610993_391977977_11,E:\data\RSNA2024\pngs_256\2493610993_391977977...


### Save results

In [50]:
# Write the relabelled study table into DEST_FOLDER, without the index column.
train_out = CFG.DEST_FOLDER.joinpath('train.csv')
train_df.to_csv(train_out, index=False)

In [51]:
# Write the enriched coordinate table into DEST_FOLDER, without the index column.
coords_out = CFG.DEST_FOLDER.joinpath('train_label_coordinates.csv')
coords_df.to_csv(coords_out, index=False)

In [52]:
# Write the file table (with the added id and filename columns) into DEST_FOLDER.
files_out = CFG.DEST_FOLDER.joinpath('train_files.csv')
files_df.to_csv(files_out, index=False)