In [2]:
import os
os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'
import sys
import cv2
import timm
import torch
import wandb
import pydicom
import torch.nn as nn
import numpy as np
import pandas as pd
import albumentations as A
import torch.optim as optim
import segmentation_models_pytorch as smp
import torch.nn.functional as F

import core.utils as utils
import core.models as models
import core.datasets as datasets
import core.training as training
import core.project_paths as project_paths
import core.keypoints as keypoints
import core.losses as losses
from core.fold_for_all import fold_for_all

from glob import glob
from tqdm import tqdm
from os import environ
from einops import rearrange
from collections import defaultdict
from accelerate.utils import set_seed
from accelerate import Accelerator
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from transformers import get_cosine_schedule_with_warmup
from accelerate import DistributedDataParallelKwargs

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
def get_df_series(filter='T'):
    df_series = pd.read_csv(f'{project_paths.base_path}/train_series_descriptions.csv')
    df_series = df_series[df_series['series_description'].str.contains(filter)].reset_index(drop=True)
    def count_files(folder):
        files = glob(f'{folder}/*.dcm')
        return len(files)
    df_series['filepath'] = df_series.apply(lambda x: f"{project_paths.base_path}/train_images/{x['study_id']}/{x['series_id']}", axis=1)
    return df_series
df = get_df_series()
df

Unnamed: 0,study_id,series_id,series_description,filepath
0,4003253,702807833,Sagittal T2/STIR,/media/workspace/RSNA2024_input/rsna-2024-lumb...
1,4003253,1054713880,Sagittal T1,/media/workspace/RSNA2024_input/rsna-2024-lumb...
2,4003253,2448190387,Axial T2,/media/workspace/RSNA2024_input/rsna-2024-lumb...
3,4646740,3201256954,Axial T2,/media/workspace/RSNA2024_input/rsna-2024-lumb...
4,4646740,3486248476,Sagittal T1,/media/workspace/RSNA2024_input/rsna-2024-lumb...
...,...,...,...,...
6289,4287160193,1507070277,Sagittal T2/STIR,/media/workspace/RSNA2024_input/rsna-2024-lumb...
6290,4287160193,1820446240,Axial T2,/media/workspace/RSNA2024_input/rsna-2024-lumb...
6291,4290709089,3274612423,Sagittal T2/STIR,/media/workspace/RSNA2024_input/rsna-2024-lumb...
6292,4290709089,3390218084,Axial T2,/media/workspace/RSNA2024_input/rsna-2024-lumb...


In [26]:
def get_dicom_metadata(filename):
    dcm = pydicom.dcmread(filename)
    result = {}
    for element in dcm:
        if element.name == 'Pixel Data': continue
        result[element.name] = element.value
    return result
[get_dicom_metadata(f'{df.iloc[1]["filepath"]}/{i}.dcm')['Content Time'] for i in range(1, 15)]

['223713.595069',
 '223716.281743',
 '223715.195741',
 '223716.212079',
 '223715.558088',
 '223714.109437',
 '223715.267722',
 '223715.636375',
 '223715.416051',
 '223714.186226',
 '223714.268076',
 '223715.486975',
 '223715.337148',
 '223716.142397']

In [27]:
[get_dicom_metadata(f'{df.iloc[0]["filepath"]}/{i}.dcm')['Content Time'] for i in range(1, 15)]

['223718.007977',
 '223717.688392',
 '223713.504434',
 '223716.904185',
 '223717.463262',
 '223713.244718',
 '223714.614509',
 '223714.695927',
 '223713.422388',
 '223717.848046',
 '223717.930734',
 '223713.330007',
 '223717.766757',
 '223715.714071']

In [28]:
[get_dicom_metadata(f'{df.iloc[2]["filepath"]}/{i}.dcm')['Content Time'] for i in range(1, 15)]

['223716.487580',
 '223717.050790',
 '223714.923139',
 '223717.619346',
 '223713.860122',
 '223717.257015',
 '223718.497043',
 '223714.012896',
 '223715.928825',
 '223718.635218',
 '223717.323970',
 '223718.287857',
 '223715.059080',
 '223714.535204']

In [37]:
def is_matching(series1, series2):
    try:
        meta1 = get_dicom_metadata(f'{series1}/1.dcm')
        meta2 = get_dicom_metadata(f'{series2}/1.dcm')
    except: # some series dont have 1.dcm
        return False
    if abs(int(meta1['Content Time'].split('.')[0]) - int(meta2['Content Time'].split('.')[0])) <= 10:
        return True
    return False

study_ids = [[], []]
for study_id, sub_df in df.groupby('study_id'):
    matched = np.all([is_matching(sub_df.iloc[i]['filepath'], sub_df.iloc[i + 1]['filepath']) for i in range(len(sub_df) - 1)])
    study_ids[int(matched)].append(study_id)

In [38]:
print(len(study_ids[0]), len(study_ids[1]))

199 1776
