In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd

from pathlib import Path

In [3]:
# Root folder of the dataset. A raw string is used so the Windows backslashes
# are taken literally instead of relying on invalid escapes (`\d`, `\R`),
# which newer Python versions warn about.
train_dir = Path(r'E:\data\RSNA2024')

In [4]:
class CFG:
    """Notebook-wide settings: a random seed value and the dataset file locations."""

    random_seed = 42

    # Every dataset path below is resolved relative to the root folder.
    ROOT_FOLDER = train_dir
    IMAGES_DIR = ROOT_FOLDER.joinpath('train_images')
    TRAIN_CSV = ROOT_FOLDER.joinpath('train.csv')
    FILES_CSV = ROOT_FOLDER.joinpath('train_files.csv')
    TRAIN_DESC_CSV = ROOT_FOLDER.joinpath('train_series_descriptions.csv')
    COORDS_CSV = ROOT_FOLDER.joinpath('train_label_coordinates.csv')

### Train data

In [5]:
# Read the train CSV and the series-description CSV configured in CFG.
train_df, train_desc_df = (
    pd.read_csv(csv_path) for csv_path in (CFG.TRAIN_CSV, CFG.TRAIN_DESC_CSV)
)

# (rows, columns) of both frames.
tuple(frame.shape for frame in (train_df, train_desc_df))

((1975, 26), (6294, 3))

In [6]:
# Preview the first five rows of the train table.
train_df.head(n=5)

Unnamed: 0,study_id,SCSL1L2,SCSL2L3,SCSL3L4,SCSL4L5,SCSL5S1,LNFNL1L2,LNFNL2L3,LNFNL3L4,LNFNL4L5,...,LSSL1L2,LSSL2L3,LSSL3L4,LSSL4L5,LSSL5S1,RSSL1L2,RSSL2L3,RSSL3L4,RSSL4L5,RSSL5S1
0,4003253,N,N,N,N,N,N,N,N,M,...,N,N,N,M,N,N,N,N,N,N
1,4646740,N,N,M,S,N,N,N,N,M,...,N,N,N,S,N,N,M,M,M,N
2,7143189,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
3,8785691,N,N,N,N,N,N,N,N,M,...,N,N,N,N,N,N,N,N,N,N
4,10728036,N,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,M,N


In [7]:
# Full record of the second row, shown vertically so no column is elided.
train_df.iloc[1, :]

study_id    4646740
SCSL1L2           N
SCSL2L3           N
SCSL3L4           M
SCSL4L5           S
SCSL5S1           N
LNFNL1L2          N
LNFNL2L3          N
LNFNL3L4          N
LNFNL4L5          M
LNFNL5S1          M
RNFNL1L2          N
RNFNL2L3          N
RNFNL3L4          M
RNFNL4L5          M
RNFNL5S1          N
LSSL1L2           N
LSSL2L3           N
LSSL3L4           N
LSSL4L5           S
LSSL5S1           N
RSSL1L2           N
RSSL2L3           M
RSSL3L4           M
RSSL4L5           M
RSSL5S1           N
Name: 1, dtype: object

In [8]:
# Number of distinct studies in the train table.
train_df['study_id'].nunique()

1975

### Coordinates

In [9]:
# Read the label-coordinates CSV and the files CSV configured in CFG.
coords_df, files_df = (
    pd.read_csv(csv_path) for csv_path in (CFG.COORDS_CSV, CFG.FILES_CSV)
)

# (rows, columns) of both frames.
tuple(frame.shape for frame in (coords_df, files_df))

((48692, 16), (147218, 13))

In [25]:
# How many distinct conditions and levels appear in the coordinates table.
tuple(coords_df[col].nunique() for col in ('condition', 'level'))

(5, 5)

In [11]:
# The distinct condition and level codes themselves.
tuple(coords_df[col].unique() for col in ('condition', 'level'))

(array(['SCS', 'RNFN', 'LNFN', 'LSS', 'RSS'], dtype=object),
 array(['L1L2', 'L2L3', 'L3L4', 'L4L5', 'L5S1'], dtype=object))

In [24]:
# Distinct studies and distinct series that have at least one coordinate row.
tuple(coords_df[col].nunique() for col in ('study_id', 'series_id'))

(1974, 6291)

In [13]:
# Build a per-series key by concatenating study_id and series_id as strings.
# Vectorised column arithmetic replaces the row-wise `apply(axis=1)`, which ran
# a Python lambda once per row. The key deliberately has no separator, so the
# lookup by the literal key '4003253702807833' further down keeps working.
train_desc_df['id'] = (
    train_desc_df['study_id'].astype(str) + train_desc_df['series_id'].astype(str)
)

In [14]:
# Peek at two random coordinate rows; seeded with CFG.random_seed so a
# Restart & Run All shows the same rows every time.
coords_df.sample(2, random_state=CFG.random_seed)

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl,plane,rows,columns,filename,x_perc,y_perc
15024,1344807906,308892876,10,SCS,L3L4,185.949821,166.738351,1344807906_308892876,1344807906_308892876_10,SCSL3L4,Sagittal T2/STIR,320,320,E:\data\RSNA2024\original\pngs_256\1344807906_...,0.581093,0.521057
36037,3188843711,1039315139,18,RSS,L3L4,152.334827,165.273547,3188843711_1039315139,3188843711_1039315139_18,RSSL3L4,Axial T2,320,320,E:\data\RSNA2024\original\pngs_256\3188843711_...,0.476046,0.51648


In [15]:
# Series description of the first row whose concatenated study+series key matches.
train_desc_df.loc[train_desc_df['id'] == '4003253702807833', 'series_description'].iloc[0]

'Sagittal T2/STIR'

In [17]:
# Check that canal stenosis (SCS) is not only annotated on the axial plane:
# draw one SCS row whose plane differs from 'Axial T2'. Seeded with
# CFG.random_seed so the displayed row is reproducible.
coords_df[(coords_df.condition == 'SCS') & (coords_df.plane != 'Axial T2')].sample(
    random_state=CFG.random_seed
)

Unnamed: 0,study_id,series_id,instance,condition,level,x,y,ss_id,instance_id,cl,plane,rows,columns,filename,x_perc,y_perc
22077,1985516615,950509856,11,SCS,L5S1,415.088235,563.708061,1985516615_950509856,1985516615_950509856_11,SCSL5S1,Sagittal T2/STIR,836,704,E:\data\RSNA2024\original\pngs_256\1985516615_...,0.589614,0.674292


In [19]:
# Distinct annotated instance numbers for every (study, series) pair.
coords_df.groupby(['study_id', 'series_id'])['instance'].unique()

study_id    series_id 
4003253     702807833                              [8]
            1054713880               [4, 5, 6, 11, 12]
            2448190387          [3, 4, 11, 19, 28, 35]
4646740     3201256954    [15, 16, 22, 28, 29, 34, 40]
            3486248476           [5, 6, 7, 15, 16, 17]
                                      ...             
4287160193  1507070277                             [8]
            1820446240          [4, 9, 10, 16, 22, 28]
4290709089  3274612423                             [9]
            3390218084    [2, 3, 5, 6, 10, 15, 20, 21]
            4237840455                  [4, 5, 11, 12]
Name: instance, Length: 6291, dtype: object

In [22]:
# Total number of distinct annotated images (unique instance_id values).
coords_df['instance_id'].nunique()

24546

In [23]:
# Long table with one row per distinct (study, series, annotated slice):
# collect the unique instances per series, then explode the lists into rows.
pos_slices = (
    coords_df
    .groupby(['study_id', 'series_id'])['instance']
    .unique()
    .apply(list)
    .reset_index(name='slice')
    .explode('slice')
)
pos_slices.shape

(24546, 3)

In [22]:
# coords_df[coords_df.instance > 100]

In [23]:
# Display the exploded (study_id, series_id, slice) table built earlier.
pos_slices

Unnamed: 0,study_id,series_id,slice
0,4003253,702807833,8
1,4003253,1054713880,4
1,4003253,1054713880,5
1,4003253,1054713880,6
1,4003253,1054713880,11
...,...,...,...
6289,4290709089,3390218084,21
6290,4290709089,4237840455,4
6290,4290709089,4237840455,5
6290,4290709089,4237840455,11


In [24]:
# Mean number of distinct slice numbers per study, divided by 3.
# NOTE(review): the divisor presumably stands for three series per study, and
# nunique() merges equal slice numbers coming from different series — confirm
# this is the intended statistic.
pos_slices.groupby('study_id')['slice'].nunique().mean() / 3

3.6653157716987503

### Label distribution (coords_df)

In [25]:
# Frequency of every condition and every level in the coordinates table,
# each followed by a 50-dash rule and a blank line.
for col in ('condition', 'level'):
    print(coords_df[col].value_counts())
    print('-' * 50, end='\n\n')

condition
LNFN    9860
RNFN    9859
SCS     9753
RSS     9612
LSS     9608
Name: count, dtype: int64
--------------------------------------------------

level
L3L4    9858
L4L5    9858
L5S1    9845
L2L3    9661
L1L2    9470
Name: count, dtype: int64
--------------------------------------------------



In [26]:
# Coordinate-row counts for every condition x level combination.
pd.crosstab(index=coords_df['condition'], columns=coords_df['level'])

level,L1L2,L2L3,L3L4,L4L5,L5S1
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LNFN,1972,1972,1972,1972,1972
LSS,1810,1892,1971,1971,1964
RNFN,1972,1972,1971,1972,1972
RSS,1812,1891,1971,1971,1967
SCS,1904,1934,1973,1972,1970


### Files

In [79]:
# Preview the first three rows of the files table.
files_df.head(n=3)

Unnamed: 0,patient,series,image,rows,columns
0,100206310,1012284084,1,320,320
1,100206310,1012284084,10,320,320
2,100206310,1012284084,11,320,320


In [28]:
# Smallest and largest values of the `rows` and `columns` fields.
(
    files_df['rows'].min(),
    files_df['rows'].max(),
    files_df['columns'].min(),
    files_df['columns'].max(),
)

(192, 1024, 224, 1024)

In [29]:
# files_df.image.max(), files_df.image.mean()

In [30]:
# Image numbers taken from file names are not a running count: this row
# carries image number 5049.
files_df.loc[files_df['image'] == 5049]

Unnamed: 0,patient,series,image,rows,columns
59719,2581283971,2683794967,5049,320,320


In [31]:
# Max and mean number of images per (patient, series). The grouping is by
# series, not by patient alone. The group-by is evaluated once and reused
# instead of being recomputed for each statistic.
images_per_series = files_df.groupby(['patient', 'series'])['image'].count()
images_per_series.max(), images_per_series.mean()

(192, 23.390212901175722)

In [34]:
# Average count of distinct annotated instances per (study, series).
coords_df.groupby(['study_id', 'series_id'])['instance'].nunique().mean()

3.9017644253695756

In [35]:
# Number of file rows for every (patient, series) pair.
files_df.groupby(['patient', 'series'])['series'].count()

patient     series    
4003253     702807833     15
            1054713880    15
            2448190387    43
4646740     3201256954    54
            3486248476    17
                          ..
4287160193  1507070277    15
            1820446240    42
4290709089  3274612423    15
            3390218084    23
            4237840455    15
Name: series, Length: 6294, dtype: int64

### Analyze one example

In [36]:
# Study used as the worked example in the following cells.
patient = 4003253

# First train row belonging to that study.
train_df.loc[train_df['study_id'] == patient].iloc[0]

study_id    4003253
SCSL1L2           N
SCSL2L3           N
SCSL3L4           N
SCSL4L5           N
SCSL5S1           N
LNFNL1L2          N
LNFNL2L3          N
LNFNL3L4          N
LNFNL4L5          M
LNFNL5S1          N
RNFNL1L2          N
RNFNL2L3          N
RNFNL3L4          M
RNFNL4L5          M
RNFNL5S1          N
LSSL1L2           N
LSSL2L3           N
LSSL3L4           N
LSSL4L5           M
LSSL5S1           N
RSSL1L2           N
RSSL2L3           N
RSSL3L4           N
RSSL4L5           N
RSSL5S1           N
Name: 0, dtype: object

In [49]:
# Conditions that have coordinate rows for the example study.
coords_df.loc[coords_df['study_id'] == patient, 'condition'].unique()

array(['SCS', 'RNFN', 'LNFN', 'LSS', 'RSS'], dtype=object)

In [50]:
# Instance number, combined label column and condition of each coordinate row
# for the example study.
coords_df.loc[coords_df['study_id'] == patient, ['instance', 'cl', 'condition']]

Unnamed: 0,instance,cl,condition
0,8,SCSL1L2,SCS
1,8,SCSL2L3,SCS
2,8,SCSL3L4,SCS
3,8,SCSL4L5,SCS
4,8,SCSL5S1,SCS
5,4,RNFNL4L5,RNFN
6,4,RNFNL5S1,RNFN
7,5,RNFNL3L4,RNFN
8,6,RNFNL1L2,RNFN
9,6,RNFNL2L3,RNFN


In [None]:
# Train row(s) of the example study.
# TODO: the original cell ended in an unfinished second filter (`& train_df[]`),
# which is a SyntaxError and stops Restart & Run All. When adding the second
# condition, wrap each comparison in its own parentheses, because `&` binds
# tighter than `==`.
train_df[train_df['study_id'] == patient]