In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np 
import pandas as pd

from pathlib import Path

In [3]:
train_dir = Path('E:\data\RSNA2024')

In [4]:
class CFG:
    random_seed = 42
    
    ROOT_FOLDER = train_dir
    IMAGES_DIR = ROOT_FOLDER / 'train_images'
    TRAIN_CSV = ROOT_FOLDER / 'train.csv'
    FILES_CSV = ROOT_FOLDER / 'train_files.csv'
    TRAIN_DESC_CSV = ROOT_FOLDER / 'train_series_descriptions.csv'
    COORDS_CSV = ROOT_FOLDER / 'train_label_coordinates.csv'

### Train data

In [5]:
train_df = pd.read_csv(CFG.TRAIN_CSV)
train_desc_df = pd.read_csv(CFG.TRAIN_DESC_CSV)

train_df.shape, train_desc_df.shape

((1975, 27), (6294, 3))

In [6]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,study_id,SCSL1L2,SCSL2L3,SCSL3L4,SCSL4L5,SCSL5S1,LNFNL1L2,LNFNL2L3,LNFNL3L4,...,LSSL1L2,LSSL2L3,LSSL3L4,LSSL4L5,LSSL5S1,RSSL1L2,RSSL2L3,RSSL3L4,RSSL4L5,RSSL5S1
0,0,4003253,N,N,N,N,N,N,N,N,...,N,N,N,M,N,N,N,N,N,N
1,1,4646740,N,N,M,S,N,N,N,N,...,N,N,N,S,N,N,M,M,M,N
2,2,7143189,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
3,3,8785691,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,N,N
4,4,10728036,N,N,N,N,N,N,N,N,...,N,N,N,N,N,N,N,N,M,N


In [7]:
train_df.iloc[1]

Unnamed: 0          1
study_id      4646740
SCSL1L2             N
SCSL2L3             N
SCSL3L4             M
SCSL4L5             S
SCSL5S1             N
LNFNL1L2            N
LNFNL2L3            N
LNFNL3L4            N
LNFNL4L5            M
LNFNL5S1            M
RNFNL1L2            N
RNFNL2L3            N
RNFNL3L4            M
RNFNL4L5            M
RNFNL5S1            N
LSSL1L2             N
LSSL2L3             N
LSSL3L4             N
LSSL4L5             S
LSSL5S1             N
RSSL1L2             N
RSSL2L3             M
RSSL3L4             M
RSSL4L5             M
RSSL5S1             N
Name: 1, dtype: object

In [8]:
train_df.study_id.nunique()

1975

### Coordinates

In [9]:
coords_df = pd.read_csv(CFG.COORDS_CSV)
files_df = pd.read_csv(CFG.FILES_CSV)

coords_df.shape, files_df.shape

((48692, 10), (147218, 5))

In [10]:
coords_df.study_id.nunique(), coords_df.condition.nunique(), coords_df.level.nunique()

(1974, 5, 5)

In [11]:
coords_df.condition.unique(), coords_df.level.unique()

(array(['SCS', 'RNFN', 'LNFN', 'LSS', 'RSS'], dtype=object),
 array(['L1L2', 'L2L3', 'L3L4', 'L4L5', 'L5S1'], dtype=object))

In [12]:
coords_df.series_id.nunique()

6291

In [13]:
coords_df['id'] = coords_df.apply(lambda row: str(row['study_id']) + str(row['series_id']), axis=1)
train_desc_df['id'] = train_desc_df.apply(lambda row: str(row['study_id']) + str(row['series_id']), axis=1)

In [14]:
coords_df.sample(2)

Unnamed: 0.1,Unnamed: 0,study_id,series_id,instance,condition,level,x,y,id,plane
34669,34669,3068678959,4216292987,35,LSS,L4L5,174.872215,147.208388,30686789594216292987,Axial T2
9874,9874,886995462,3051597267,15,SCS,L5S1,162.318841,248.26087,8869954623051597267,Sagittal T2/STIR


In [15]:
train_desc_df[train_desc_df['id'] == '4003253702807833'].series_description.values[0]

'Sagittal T2/STIR'

In [16]:
coords_df.sample(5)

Unnamed: 0.1,Unnamed: 0,study_id,series_id,instance,condition,level,x,y,id,plane
23257,23257,2091088734,2849580758,6,LNFN,L3L4,125.031963,123.908676,20910887342849580758,Sagittal T1
6067,6067,527598501,3800352180,39,LSS,L1L2,277.295575,273.897345,5275985013800352180,Axial T2
33223,33223,2937655882,1192868048,21,RSS,L4L5,115.707323,127.351916,29376558821192868048,Axial T2
41126,41126,3617698707,1507720438,19,LSS,L4L5,211.473589,211.284111,36176987071507720438,Axial T2
34127,34127,3018238997,3975706150,10,SCS,L5S1,255.778403,362.183712,30182389973975706150,Sagittal T2/STIR


In [18]:
# check canal stenosis is noy only in axial plane
coords_df[(coords_df.condition == 'SCS') & (coords_df.plane != 'Axial T2')].sample()

Unnamed: 0.1,Unnamed: 0,study_id,series_id,instance,condition,level,x,y,id,plane
34276,34276,3030943727,304128505,10,SCS,L2L3,190.080483,169.295014,3030943727304128505,Sagittal T2/STIR


In [19]:
# get the positive slices
coords_df.groupby(['study_id','series_id']).instance.unique()

study_id    series_id 
4003253     702807833                              [8]
            1054713880               [4, 5, 6, 11, 12]
            2448190387          [3, 4, 11, 19, 28, 35]
4646740     3201256954    [15, 16, 22, 28, 29, 34, 40]
            3486248476           [5, 6, 7, 15, 16, 17]
                                      ...             
4287160193  1507070277                             [8]
            1820446240          [4, 9, 10, 16, 22, 28]
4290709089  3274612423                             [9]
            3390218084    [2, 3, 5, 6, 10, 15, 20, 21]
            4237840455                  [4, 5, 11, 12]
Name: instance, Length: 6291, dtype: object

In [20]:
coords_df.id.nunique()

6291

In [21]:
pos_slices = coords_df.groupby(['study_id','series_id']).instance_number.unique().apply(list).reset_index(name='slice').explode('slice')
pos_slices.shape

(24546, 3)

In [22]:
# coords_df[coords_df.instance_number > 100]

In [23]:
pos_slices

Unnamed: 0,study_id,series_id,slice
0,4003253,702807833,8
1,4003253,1054713880,4
1,4003253,1054713880,5
1,4003253,1054713880,6
1,4003253,1054713880,11
...,...,...,...
6289,4290709089,3390218084,21
6290,4290709089,4237840455,4
6290,4290709089,4237840455,5
6290,4290709089,4237840455,11


In [24]:
pos_slices.groupby('study_id').slice.nunique().mean()/3

3.6653157716987503

### train_df

In [25]:
# look at categories
for f in ['condition','level']:
    print(coords_df[f].value_counts())
    print('-'*50);print();

condition
Left Neural Foraminal Narrowing     9860
Right Neural Foraminal Narrowing    9859
Spinal Canal Stenosis               9753
Right Subarticular Stenosis         9612
Left Subarticular Stenosis          9608
Name: count, dtype: int64
--------------------------------------------------

level
L3/L4    9858
L4/L5    9858
L5/S1    9845
L2/L3    9661
L1/L2    9470
Name: count, dtype: int64
--------------------------------------------------



In [26]:
pd.crosstab(coords_df.condition, coords_df.level)

level,L1/L2,L2/L3,L3/L4,L4/L5,L5/S1
condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Left Neural Foraminal Narrowing,1972,1972,1972,1972,1972
Left Subarticular Stenosis,1810,1892,1971,1971,1964
Right Neural Foraminal Narrowing,1972,1972,1971,1972,1972
Right Subarticular Stenosis,1812,1891,1971,1971,1967
Spinal Canal Stenosis,1904,1934,1973,1972,1970


### Files

In [33]:
files_df.head(3)

Unnamed: 0,patient,series,image,rows,columns
0,100206310,1012284084,1,320,320
1,100206310,1012284084,10,320,320
2,100206310,1012284084,11,320,320


In [28]:
files_df.rows.min(), files_df.rows.max(), files_df['columns'].min(), files_df['columns'].max(), 

(192, 1024, 224, 1024)

In [34]:
# files_df.image.max(), files_df.image.mean()

In [30]:
# file names do not correspond to file count
files_df[files_df.image == 5049]

Unnamed: 0,patient,series,image,rows,columns
59719,2581283971,2683794967,5049,320,320


In [31]:
# max/mean images per patient
files_df.groupby(['patient','series']).image.count().max(), files_df.groupby(['patient','series']).image.count().mean()

(192, 23.390212901175722)

In [32]:
# mean positive imgs per series
coords_df.groupby(['study_id','series_id']).instance_number.nunique().mean()

3.9017644253695756

In [37]:
files_df.groupby(['patient','series']).series.count()

patient     series    
4003253     702807833     15
            1054713880    15
            2448190387    43
4646740     3201256954    54
            3486248476    17
                          ..
4287160193  1507070277    15
            1820446240    42
4290709089  3274612423    15
            3390218084    23
            4237840455    15
Name: series, Length: 6294, dtype: int64

### Analyze one example

In [46]:
patient = 4003253
train_df[train_df['study_id'] == patient].iloc[0]

study_id                                      4003253
spinal_canal_stenosis_l1_l2               Normal/Mild
spinal_canal_stenosis_l2_l3               Normal/Mild
spinal_canal_stenosis_l3_l4               Normal/Mild
spinal_canal_stenosis_l4_l5               Normal/Mild
spinal_canal_stenosis_l5_s1               Normal/Mild
left_neural_foraminal_narrowing_l1_l2     Normal/Mild
left_neural_foraminal_narrowing_l2_l3     Normal/Mild
left_neural_foraminal_narrowing_l3_l4     Normal/Mild
left_neural_foraminal_narrowing_l4_l5        Moderate
left_neural_foraminal_narrowing_l5_s1     Normal/Mild
right_neural_foraminal_narrowing_l1_l2    Normal/Mild
right_neural_foraminal_narrowing_l2_l3    Normal/Mild
right_neural_foraminal_narrowing_l3_l4       Moderate
right_neural_foraminal_narrowing_l4_l5       Moderate
right_neural_foraminal_narrowing_l5_s1    Normal/Mild
left_subarticular_stenosis_l1_l2          Normal/Mild
left_subarticular_stenosis_l2_l3          Normal/Mild
left_subarticular_stenosis_l

In [45]:
coords_df[coords_df['study_id'] == patient]

Unnamed: 0,study_id,series_id,instance_number,condition,level,x,y,id,plane
0,4003253,702807833,8,Spinal Canal Stenosis,L1/L2,322.831858,227.964602,4003253702807833,Sagittal T2/STIR
1,4003253,702807833,8,Spinal Canal Stenosis,L2/L3,320.571429,295.714286,4003253702807833,Sagittal T2/STIR
2,4003253,702807833,8,Spinal Canal Stenosis,L3/L4,323.030303,371.818182,4003253702807833,Sagittal T2/STIR
3,4003253,702807833,8,Spinal Canal Stenosis,L4/L5,335.292035,427.327434,4003253702807833,Sagittal T2/STIR
4,4003253,702807833,8,Spinal Canal Stenosis,L5/S1,353.415929,483.964602,4003253702807833,Sagittal T2/STIR
5,4003253,1054713880,4,Right Neural Foraminal Narrowing,L4/L5,187.961759,251.839388,40032531054713880,Sagittal T1
6,4003253,1054713880,4,Right Neural Foraminal Narrowing,L5/S1,198.240918,285.613767,40032531054713880,Sagittal T1
7,4003253,1054713880,5,Right Neural Foraminal Narrowing,L3/L4,187.227533,210.722753,40032531054713880,Sagittal T1
8,4003253,1054713880,6,Right Neural Foraminal Narrowing,L1/L2,194.56979,127.755258,40032531054713880,Sagittal T1
9,4003253,1054713880,6,Right Neural Foraminal Narrowing,L2/L3,191.632887,165.93499,40032531054713880,Sagittal T1
