In [None]:
# Import the required packages
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import os, gc, sys, copy, pickle
from pathlib import Path
import glob
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import cv2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools
import pydicom

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition CSV files needed for this notebook
path = '/kaggle/input/rsna-2024-lumbar-spine-degenerative-classification/'

# Read the four tables in one pass; each file name is appended to `path`.
train_df, train_coordinates_df, train_desc_df, test_desc_df = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'train.csv',
        'train_label_coordinates.csv',
        'train_series_descriptions.csv',
        'test_series_descriptions.csv',
    )
)

In [None]:
# Print the shape (rows, columns) of each loaded dataframe
df_names = ["train_df", "train_coordinates_df", "train_desc_df", "test_desc_df"]
loaded_frames = (train_df, train_coordinates_df, train_desc_df, test_desc_df)

# Pair every label with its dataframe explicitly
for name, df in zip(df_names, loaded_frames):
    print(f"{name}: {df.shape}")

In [None]:
# Rearrange the train CSV into long form (study_id, condition, severity),
# which is more convenient for training
melted_df = train_df.melt(id_vars='study_id', var_name='condition', value_name='severity')

# Split each melted column name into its condition part and its level part:
# the last 5 characters are the level, and the character before them is a separator.
condition_and_level = melted_df['condition']
melted_df['level'] = condition_and_level.str[-5:].str.replace("_", "/")
melted_df['condition'] = condition_and_level.str[:-6].str.replace("_", " ")
melted_df.sample(10)

In [None]:
# Lower-case the text keys of the coordinate labels so they match melted_df
for key_column in ('condition', 'level'):
    train_coordinates_df[key_column] = train_coordinates_df[key_column].str.lower()

# Join the coordinates with the melted severity labels ...
training_df = train_coordinates_df.merge(melted_df, on=['study_id', 'condition', 'level'])

# ... and then with the series descriptions
training_df = training_df.merge(train_desc_df, on=['study_id', 'series_id'])

# Build the relative file name of each image: <study_id>/<series_id>/<instance_number>.dcm
id_parts = training_df[['study_id', 'series_id', 'instance_number']].astype(str)
training_df['file_path'] = (
    id_parts['study_id'] + '/' + id_parts['series_id'] + '/' + id_parts['instance_number'] + '.dcm'
)

# Drop incomplete rows, rows whose coordinates are both zero, and Axial T2 series
training_df = training_df.dropna()
has_coordinates = (training_df['x'] != 0) | (training_df['y'] != 0)
is_not_axial_t2 = training_df['series_description'] != 'Axial T2'
training_df = training_df[has_coordinates & is_not_axial_t2]
training_df.shape

In [None]:
# Function to obtain metadata from images
def get_shape(image_path):
    """Read geometry metadata from the header of one DICOM file.

    Only header elements are used, so the file is read with
    ``stop_before_pixels=True``; the pixel array is never loaded, which makes
    each call considerably cheaper.

    Parameters
    ----------
    image_path : str or path-like
        Location of the ``.dcm`` file to read.

    Returns
    -------
    tuple
        ``(height, width, x_image_position, y_image_position,
        z_image_position, x_pixel_spacing, y_pixel_spacing)`` where height and
        width come from ``Rows`` and ``Columns``, the positions are the three
        components of ``ImagePositionPatient`` and the spacings are the two
        values of ``PixelSpacing`` in the order stored in the file.
    """
    dicom = pydicom.dcmread(image_path, stop_before_pixels=True)
    height = dicom.Rows
    width = dicom.Columns
    # NOTE(review): the DICOM standard orders PixelSpacing as (row spacing,
    # column spacing); confirm the x/y naming below is what is intended. The
    # names are left unchanged because downstream feature columns use them.
    x_pixel_spacing, y_pixel_spacing = dicom.PixelSpacing
    x_image_position, y_image_position, z_image_position = dicom.ImagePositionPatient

    return (
        height,
        width,
        x_image_position,
        y_image_position,
        z_image_position,
        x_pixel_spacing,
        y_pixel_spacing,
    )

In [None]:
# Run the metadata function for every labelled image and add the results to the dataframe
image_path = path + 'train_images/'
metadata_columns = ['height', 'width',
                    'x_image_position', 'y_image_position', 'z_image_position',
                    'x_pixel_spacing', 'y_pixel_spacing']

# Label rows can share a DICOM file, so read each distinct file only once
# instead of once per row.
metadata_by_file = {
    file_path: get_shape(os.path.join(image_path, file_path))
    for file_path in training_df['file_path'].unique()
}

# One row per file, indexed by file_path; columns follow get_shape's return order.
metadata_df = pd.DataFrame.from_dict(metadata_by_file, orient='index', columns=metadata_columns)

# Left-join on file_path so training_df keeps its rows, order and index.
# Dropping any metadata columns left by a previous run makes the cell safe to re-run.
training_df = (
    training_df
    .drop(columns=metadata_columns, errors='ignore')
    .join(metadata_df, on='file_path')
)

In [None]:
# Prepare the data for training: keep only the model inputs and the target (y)
feature_and_target_cols = [
    'series_description', 'level',
    'height', 'width',
    'x_image_position', 'y_image_position', 'z_image_position',
    'x_pixel_spacing', 'y_pixel_spacing',
    'y',
]

# Integer codes that replace the two text columns
level_values = {'l1/l2': 1, 'l2/l3': 2, 'l3/l4': 3, 'l4/l5': 4, 'l5/s1': 5}
series_values = {'Sagittal T2/STIR': 1, 'Sagittal T1': 2}

training_df = training_df[feature_and_target_cols].assign(
    series_description=lambda frame: frame['series_description'].replace(series_values),
    level=lambda frame: frame['level'].replace(level_values),
)

# Convert every column to a numeric dtype; values that cannot be parsed become NaN
training_df = training_df.apply(pd.to_numeric, errors='coerce')


training_df

In [None]:
# Split the data into the target (y) and the feature dataframe:
# pop() returns the 'y' column and removes it from training_df in place
y_coor = training_df.pop('y')

In [None]:
# Split the data for training and testing (10% held out for testing).
# A fixed random_state makes the split, and therefore the metrics reported
# below, reproducible across kernel restarts.
ycoor_xtrain, ycoor_xtest, ycoor_ytrain, ycoor_ytest = train_test_split(
    training_df, y_coor, test_size=0.10, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns to standardise
numerical_cols = [
    'series_description', 'level',
    'height', 'width',
    'x_image_position', 'y_image_position', 'z_image_position',
    'x_pixel_spacing', 'y_pixel_spacing',
]

# Fit the scaler on the training split only
scaler = StandardScaler()
scaler.fit(ycoor_xtrain[numerical_cols])

# Apply the same fitted transformation to both splits
ycoor_xtrain[numerical_cols] = scaler.transform(ycoor_xtrain[numerical_cols])
ycoor_xtest[numerical_cols] = scaler.transform(ycoor_xtest[numerical_cols])

In [None]:
import xgboost as xgb

# Hyper-parameters of the XGBoost regressor for the y coordinate
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.2,
    max_depth=8,
)
xgb_reg = xgb.XGBRegressor(**xgb_params)

# Train the model; the value returned by fit() is kept as the trained model
y_coordinate_model = xgb_reg.fit(ycoor_xtrain, ycoor_ytrain)

In [None]:
# Predict the y coordinate for the test split produced by train_test_split
ycoor_pred = y_coordinate_model.predict(ycoor_xtest)

In [None]:
# Check model performance on the test split.
# RMSE is computed as sqrt(MSE) because the `squared=False` argument of
# mean_squared_error is deprecated and no longer accepted by recent
# scikit-learn releases.
y_rmse = float(np.sqrt(mean_squared_error(ycoor_ytest, ycoor_pred)))
y_mae = mean_absolute_error(ycoor_ytest, ycoor_pred)
print('y_rmse: ' + str(y_rmse))
print('y_mae: ' + str(y_mae))

In [None]:
# Absolute error of every test prediction
signed_y_errors = ycoor_pred - ycoor_ytest
y_errors = np.abs(signed_y_errors)

# Report the largest absolute error
max_y_error = y_errors.max()
print(f"Max Error (Y): {max_y_error}")

In [None]:
# Plot the distribution of the absolute errors
fig, ax = plt.subplots(figsize=(12, 6))

ax.hist(y_errors, bins=100, edgecolor='black')
ax.set_title('Distribution of Y Errors')
ax.set_xlabel('Error')
ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Persist the fitted scaler as a pickle file
scaler_path = Path('y_coordinate_scaler.pkl')
with scaler_path.open('wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Persist the trained model with joblib
y_model_path = 'y_coordinate_model.pkl'
joblib.dump(y_coordinate_model, y_model_path)