In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc, sys, copy, pickle
from pathlib import Path
import glob
import joblib

import warnings
warnings.filterwarnings("ignore")

# Initial Cleaning

In [None]:
# Load the four competition CSV files this notebook works with.
path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/'

train_df, train_coordinates_df, train_desc_df, test_desc_df = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'train.csv',
        'train_label_coordinates.csv',
        'train_series_descriptions.csv',
        'test_series_descriptions.csv',
    )
)

In [None]:
# Print the shape of every dataframe loaded above.
frames_by_name = {
    "train_df": train_df,
    "train_coordinates_df": train_coordinates_df,
    "train_desc_df": train_desc_df,
    "test_desc_df": test_desc_df,
}
df_names = list(frames_by_name)

for name, df in frames_by_name.items():
    print(f"{name}: {df.shape}")

In [None]:
# Reshape train.csv from one column per "<condition>_<level>" into long form:
# one row per (study_id, condition, level) with its severity.
# The last 5 characters of each original column name are the level
# (e.g. "l1_l2"); everything before the separating underscore is the condition.
melted_df = (
    train_df
    .melt(id_vars='study_id', var_name='condition', value_name='severity')
    .assign(
        # `level` is derived first, while `condition` still holds the full column name.
        level=lambda frame: frame['condition'].str[-5:].str.replace("_", "/"),
        condition=lambda frame: frame['condition'].str[:-6].str.replace("_", " "),
    )
)
melted_df.head(10)

In [None]:
# Lower-case the join keys of the coordinate table so they match melted_df.
train_coordinates_df[['condition', 'level']] = (
    train_coordinates_df[['condition', 'level']].apply(lambda col: col.str.lower())
)

# Attach the severity labels to each coordinate row, then the series description.
training_df = (
    train_coordinates_df
    .merge(melted_df, on=['study_id', 'condition', 'level'])
    .merge(train_desc_df, on=['study_id', 'series_id'])
)

training_df.sample(5)

In [None]:
# Split the dataframe by imaging plane.
# Explicit copies: later cells add and drop columns on axial_df in place, which
# would otherwise operate on a filtered slice of training_df (SettingWithCopy).
sagittal_df = training_df[training_df['series_description'].isin(['Sagittal T2/STIR', 'Sagittal T1'])].copy()
axial_df = training_df[training_df['series_description'] == 'Axial T2'].copy()
print(sagittal_df.shape)
print(axial_df.shape)

# Prep Sagittal Data

In [None]:
# Lists the DICOM files in one series folder
def get_file_list(study_id, series_id,
                  data_dir='/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/'):
    """Return the `.dcm` files of one series as `<study_id>/<series_id>/<file>` paths.

    The returned paths are relative to `data_dir` (they do not include it).
    Files are ordered by the integer in their name (e.g. `2.dcm` before
    `10.dcm`); names without such a number come last, alphabetically.
    `os.listdir` returns entries in arbitrary order, so without this sort any
    caller that keeps only the first N files would get a non-reproducible subset.

    Args:
        study_id: Study identifier; converted with `str()` to build the folder name.
        series_id: Series identifier; converted with `str()` to build the folder name.
        data_dir: Root folder containing `<study_id>/<series_id>/` directories.

    Returns:
        list[str]: Relative file paths, sorted as described above.

    Raises:
        FileNotFoundError: If the series folder does not exist.
    """
    def _slice_order(file_name):
        # Same stem parsing as get_file_paths_dict: text before the first '.',
        # after the last '_'.
        stem = file_name.split('.')[0].split('_')[-1]
        if stem.isdecimal():
            return (0, int(stem), file_name)
        return (1, 0, file_name)

    series_dir = os.path.join(data_dir, str(study_id), str(series_id))
    dicom_files = sorted(
        (file for file in os.listdir(series_dir) if file.endswith('.dcm')),
        key=_slice_order,
    )
    return [os.path.join(str(study_id), str(series_id), file) for file in dicom_files]

In [None]:
def expand_dataframe(df, max_files=12):
    """Append `file_path_1 .. file_path_<max_files>` columns to each row of `df`.

    For every row, the files of its (study_id, series_id) folder are fetched
    with `get_file_list`. If the folder holds fewer than `max_files` files the
    list is repeated cyclically until it is long enough; if it holds more, only
    the first `max_files` are kept. A folder with no files yields `None` in
    every path column (previously this raised ZeroDivisionError).

    Args:
        df: DataFrame with `study_id` and `series_id` columns.
        max_files: Number of `file_path_*` columns to create.

    Returns:
        pd.DataFrame: `df` with its index reset and the path columns appended.
    """
    path_columns = [f'file_path_{i+1}' for i in range(max_files)]
    path_rows = []

    # Iterate the two id columns directly: this keeps their original dtype,
    # whereas iterrows() may upcast values when building a per-row Series.
    for study_id, series_id in zip(df['study_id'], df['series_id']):
        file_list = get_file_list(study_id, series_id)

        if file_list:
            # Ceil-divide so the repeated list is at least max_files long.
            repeats = -(-max_files // len(file_list))
            path_rows.append((file_list * repeats)[:max_files])
        else:
            path_rows.append([None] * max_files)

    new_columns_df = pd.DataFrame(path_rows, columns=path_columns)
    expanded_df = pd.concat([df.reset_index(drop=True), new_columns_df], axis=1)

    return expanded_df

In [None]:
def combine_file_paths(df, max_files=12):
    """Collapse the per-series rows of `df` into one row of file paths per study.

    Each input row contributes its non-null `file_path_1 .. file_path_<max_files>`
    values, appended in row order to the study's list. A study holds at most
    `max_files * 2` paths; extra paths are ignored and unused slots stay `None`.

    Args:
        df: DataFrame with `study_id` and `file_path_1 .. file_path_<max_files>`.
        max_files: Number of path columns per input row.

    Returns:
        pd.DataFrame: Columns `study_id`, `file_path_1 .. file_path_<max_files * 2>`.
    """
    slot_count = max_files * 2
    slot_names = [f'file_path_{i+1}' for i in range(slot_count)]
    source_names = slot_names[:max_files]

    paths_by_study = {}
    for _, row in df.iterrows():
        collected = paths_by_study.setdefault(row['study_id'], [])
        for source_name in source_names:
            value = row[source_name]
            # Skip missing paths and stop adding once every slot is taken.
            if pd.notna(value) and len(collected) < slot_count:
                collected.append(value)

    # Pad every study to the full slot count so the frame is rectangular.
    padded = {
        study_id: paths + [None] * (slot_count - len(paths))
        for study_id, paths in paths_by_study.items()
    }

    combined_df = pd.DataFrame.from_dict(padded, orient='index', columns=slot_names)
    # Move study_id from the index into a regular column.
    combined_df = combined_df.reset_index().rename(columns={'index': 'study_id'})

    return combined_df

In [None]:
# Reuses a row's existing files, in order, to fill its empty path columns
def fill_empty_paths(df):
    """Fill missing `file_path_<n>` cells of each row by cycling over its existing paths.

    The path columns are discovered from `df` (any column named `file_path_<n>`)
    rather than assumed to be exactly 1..24. Within a row, the k-th empty column
    receives the k-th non-null path, wrapping around when there are more empty
    columns than available paths. Rows with no path at all are left unchanged.

    `df` is modified in place and also returned.

    Args:
        df: DataFrame containing `file_path_<n>` columns.

    Returns:
        pd.DataFrame: The same `df` object with empty path cells filled.
    """
    prefix = 'file_path_'
    path_cols = sorted(
        (col for col in df.columns
         if str(col).startswith(prefix) and str(col)[len(prefix):].isdecimal()),
        key=lambda col: int(str(col)[len(prefix):]),
    )

    for index, row in df.iterrows():
        # `row` is a snapshot, so these lists are unaffected by the writes below.
        available = [row[col] for col in path_cols if pd.notna(row[col])]
        if not available:
            continue
        empty_cols = [col for col in path_cols if pd.isna(row[col])]
        for position, col_name in enumerate(empty_cols):
            df.at[index, col_name] = available[position % len(available)]

    return df

In [None]:
# Collect the image paths of every sagittal series and build one row per study.
# Keep the first row of each (study_id, series_id) pair.
drop_df = sagittal_df[~sagittal_df.duplicated(subset=['study_id', 'series_id'])]
expanded_df = expand_dataframe(drop_df, max_files=12)
combine_df = combine_file_paths(expanded_df, max_files=12)
# NOTE: from here on `sagittal_df` holds the per-study path table, not the
# coordinate rows it held before this cell.
sagittal_df = fill_empty_paths(combine_df)
sagittal_df

In [None]:
# Count the remaining missing values per column.
sagittal_df.isna().sum()

In [None]:
# Map the severity labels to integer class ids.
label2id = dict(zip(['Normal/Mild', 'Moderate', 'Severe'], range(3)))
train_df = train_df.replace(to_replace=label2id)

# Attach the label columns to the sagittal path table.
# NOTE(review): fillna(0) also turns missing severity labels into 0, the id
# used for 'Normal/Mild' — confirm this is intended.
sag_merge_df = sagittal_df.merge(train_df, on=['study_id']).fillna(0)


In [None]:
# Display the merged sagittal table (file paths plus label columns).
sag_merge_df

# Prep Axial Data

In [None]:
# Maps the instance number of every file in a series folder to its path
def get_file_paths_dict(study_id, series_id,
                        data_dir='/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/train_images/'):
    """Return `{instance_number: file_path}` for the `.dcm` files of one series.

    The instance number is parsed from the file name: the text before the first
    '.', after the last '_', converted with `int()`.

    Args:
        study_id: Study identifier; converted with `str()` to build the folder name.
        series_id: Series identifier; converted with `str()` to build the folder name.
        data_dir: Root folder containing `<study_id>/<series_id>/` directories.

    Returns:
        dict[int, str]: Instance number mapped to the file path (including `data_dir`).

    Raises:
        FileNotFoundError: If the series folder does not exist.
        ValueError: If a `.dcm` file name does not contain an integer where expected.
    """
    dir_path = os.path.join(data_dir, str(study_id), str(series_id))
    file_paths_dict = {}

    for file in os.listdir(dir_path):
        if not file.endswith('.dcm'):
            continue
        instance_number = int(file.split('.')[0].split('_')[-1])
        file_paths_dict[instance_number] = os.path.join(dir_path, file)
    return file_paths_dict

In [None]:
# Builds the file paths for a row's instance number and its adjacent files
def get_adjacent_file_paths(row, file_paths_dict):
    """Return the paths of the previous, current and next instance for `row`.

    A neighbour that is missing from `file_paths_dict` falls back to the path
    of the current instance.

    Args:
        row: Mapping/Series with an `instance_number` entry.
        file_paths_dict: `{instance_number: file_path}` for the row's series.

    Returns:
        pd.Series: Indexed `file_path_1` (previous), `file_path_2` (current),
        `file_path_3` (next).

    Raises:
        KeyError: If the row's own instance number is not in `file_paths_dict`.
    """
    center = row['instance_number']
    current_path = file_paths_dict[center]
    neighbour_paths = [
        file_paths_dict.get(center + offset, current_path)
        for offset in (-1, 0, 1)
    ]
    return pd.Series(neighbour_paths,
                     index=['file_path_1', 'file_path_2', 'file_path_3'])

In [None]:
def consolidate_file_paths(df, levels):
    """Build one row per study with 15 file paths ordered by spinal level.

    For each study, one row per level is kept (the first encountered), the rows
    are ordered according to `levels`, and their `file_path_1..3` values are
    concatenated. If fewer than 15 paths result, the list is repeated
    cyclically; if more, only the first 15 are kept.

    Level names are compared case-insensitively with '_' and '/' treated as the
    same separator, so `levels` given as 'l1_l2' matches frame values such as
    'l1/l2'. Without this, the ordering silently did nothing because no frame
    value matched any category. Rows whose level is not in `levels` sort last.

    Args:
        df: DataFrame with `study_id`, `level` and `file_path_1..3` columns.
        levels: Level names in the desired order.

    Returns:
        pd.DataFrame: Columns `study_id`, `file_path_1 .. file_path_15`.
    """
    path_columns = ['file_path_1', 'file_path_2', 'file_path_3']
    n_slots = 15

    def _canonical(level):
        return str(level).lower().replace('_', '/')

    level_rank = {_canonical(level): rank for rank, level in enumerate(levels)}
    unknown_rank = len(level_rank)

    results = []
    for study_id, group in df.groupby('study_id'):
        level_key = group['level'].map(_canonical)
        # assign() returns a new frame, so the groupby slice is never mutated.
        ordered = (
            group
            .assign(
                _level_key=level_key,
                _level_rank=level_key.map(lambda key: level_rank.get(key, unknown_rank)),
            )
            # One row per level, otherwise repeated rows for a level crowd the
            # later levels out of the 15 slots.
            .drop_duplicates(subset='_level_key', keep='first')
            .sort_values('_level_rank', kind='stable')
        )

        file_paths = ordered[path_columns].values.flatten().tolist()

        # Take 15 paths, wrapping around when fewer are available.
        consolidated_paths = [file_paths[i % len(file_paths)] for i in range(n_slots)]
        results.append([study_id] + consolidated_paths)

    columns = ['study_id'] + [f'file_path_{i+1}' for i in range(n_slots)]
    consolidated_df = pd.DataFrame(results, columns=columns)

    return consolidated_df

In [None]:
axial_df[['file_path_1', 'file_path_2', 'file_path_3']] = None

# Fill the three path columns row by row.
# The directory listing depends only on (study_id, series_id), so it is built
# once per series and reused for every row of that series.
series_file_paths = {}
for index, row in axial_df.iterrows():
    series_key = (row['study_id'], row['series_id'])
    if series_key not in series_file_paths:
        series_file_paths[series_key] = get_file_paths_dict(*series_key)
    axial_df.loc[index, ['file_path_1', 'file_path_2', 'file_path_3']] = get_adjacent_file_paths(
        row, series_file_paths[series_key]
    )

In [None]:
# Display the axial rows with their three file-path columns filled in.
axial_df

In [None]:
# Keep one row per (study_id, series_id, level) to remove duplicates in the data.
# The result must be assigned back: groupby() returns a new frame, so the
# unassigned call left axial_df unchanged.
axial_df = axial_df.groupby(['study_id', 'series_id', 'level']).first().reset_index()
# Drop the columns that are no longer needed.
columns_to_drop = ['series_id',
                   'instance_number',
                   'condition',
                   'x',
                   'y',
                   'severity',
                   'series_description'
                  ]
axial_df = axial_df.drop(columns=columns_to_drop)

In [None]:
# Spinal levels in order, in the underscore form that remove_coditions
# joins onto condition names to build label column names.
levels = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']


In [1]:
# Combine the file paths of every level into one row per study.
axial_combine_df = consolidate_file_paths(df=axial_df, levels=levels)
axial_combine_df.head()

NameError: name 'consolidate_file_paths' is not defined

In [None]:
# Attach the label columns to the axial path table.
# NOTE(review): fillna(0) also turns missing labels into 0 — confirm this is intended.
axial_merge_df = axial_combine_df.merge(train_df, on=['study_id']).fillna(0)

axial_merge_df.shape

# Finalize Both Data Frames

In [None]:
# Conditions whose label columns are removed from each plane's dataframe.
sagittal_conditions_drop = ['left_subarticular_stenosis', 'right_subarticular_stenosis']

axial_conditions_drop = ['spinal_canal_stenosis'] + [
    side + '_neural_foraminal_narrowing' for side in ('left', 'right')
]

In [None]:
# Removes the condition columns that aren't used in each plane
def remove_coditions(df, conditions, levels):
    """Return a copy of `df` without the `<condition>_<level>` columns.

    Args:
        df: Source DataFrame; it is not modified.
        conditions: Condition name prefixes.
        levels: Level name suffixes.

    Returns:
        pd.DataFrame: New frame with every `<condition>_<level>` column removed.

    Raises:
        KeyError: If any of the generated column names is missing from `df`.
    """
    column_drop = [
        condition + '_' + level
        for condition in conditions
        for level in levels
    ]
    return df.drop(columns=column_drop)

In [None]:
# Drop the label columns that do not apply to each plane.
sagittal_cleaned = remove_coditions(
    df=sag_merge_df, conditions=sagittal_conditions_drop, levels=levels
)
axial_cleaned = remove_coditions(
    df=axial_merge_df, conditions=axial_conditions_drop, levels=levels
)

In [None]:
# Shapes after removing the unused label columns.
print(sagittal_cleaned.shape, axial_cleaned.shape, sep="\n")

In [None]:
# One-hot encodes the trailing columns of a dataframe
def one_hot_encode(df, num_encode):
    """One-hot encode the last `num_encode` columns of `df`.

    Args:
        df: Source DataFrame.
        num_encode: How many trailing columns to encode.

    Returns:
        pd.DataFrame: `df` with those columns replaced by their dummy columns.
    """
    target_columns = df.columns[-num_encode:].tolist()
    encoded = pd.get_dummies(data=df, columns=target_columns)
    return encoded

In [None]:
# One-hot encode the trailing label columns of each plane's dataframe.
# NOTE(review): 15 and 10 presumably equal (conditions kept) x len(levels),
# i.e. 3 x 5 and 2 x 5 — confirm against the column layout.
sagittal_final = one_hot_encode(df=sagittal_cleaned, num_encode=15)
axial_final = one_hot_encode(df=axial_cleaned, num_encode=10)

In [None]:
# Shapes after one-hot encoding.
print(sagittal_final.shape, axial_final.shape, sep="\n")

In [None]:
# Write both final dataframes to CSV without the index column.
# NOTE(review): 'sagittal__df.csv' has a double underscore, unlike
# 'axial_df.csv' — kept as is; confirm what downstream readers expect.
sagittal_final.to_csv(path_or_buf='sagittal__df.csv', index=False)
axial_final.to_csv(path_or_buf='axial_df.csv', index=False)