In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import FileLink
from PIL import Image
from pydicom import dcmread
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils import data

## Preparing labels

In [None]:
# Load only the patient identifier and the binary target from the labels file.
# `usecols` raises if either column is missing instead of silently dropping it,
# and the trailing `[columns]` pins the column order that later cells index by
# position (row[0] = patientId, row[1] = Target).
columns = ['patientId', 'Target']

label_data = (
    pd.read_csv(
        '../input/rsna-pneumonia-detection-challenge/stage_2_train_labels.csv',
        usecols=columns,
    )[columns]
    # The challenge's labels file is documented as one row per bounding box, so
    # a patient with several boxes repeats once the box columns are gone.
    # Keeping the repeats would load the same image several times and could put
    # one patient in both the train and validation split (leakage).
    # NOTE(review): no-op if the file already has one row per patient.
    .drop_duplicates()
    .reset_index(drop=True)
)
label_data.head(5)

## Splitting labels into training and validation sets

In [None]:
# Hold out 20% of the (patientId, Target) rows for validation.
# - random_state fixes the shuffle so a fresh "Restart & Run All" reproduces the
#   same split (and therefore the same exported CSV files).
# - stratify keeps the Target class ratio the same in both splits.
train_labels, val_labels = train_test_split(
    label_data.values,
    test_size=0.2,
    random_state=42,
    stratify=label_data['Target'],
)
print(train_labels.shape)
print(val_labels.shape)

In [None]:
# Peek at the first training row: column 0 is the patient id, column 1 the target.
first_row = train_labels[0]
print(f'patientId: {first_row[0]}, Target: {first_row[1]}')

## Preparing train and validation image paths

In [None]:
# Image directories of the challenge dataset.
train_f = '../input/rsna-pneumonia-detection-challenge/stage_2_train_images'
test_f = '../input/rsna-pneumonia-detection-challenge/stage_2_test_images'

# Both splits come from the training labels, so both point into `train_f`.
# The paths carry no file extension; '.dcm' is appended where the files are read.
train_paths = [os.path.join(train_f, patient_id) for patient_id in train_labels[:, 0]]
val_paths = [os.path.join(train_f, patient_id) for patient_id in val_labels[:, 0]]

# One count per line: training paths first, then validation paths.
print(len(train_paths), len(val_paths), sep='\n')

## Transforming and compiling data

In [None]:
# Preprocessing applied to every DICOM pixel array before flattening:
#   1. ToPILImage - wrap the numpy pixel array in a PIL image so Resize accepts it.
#   2. Resize     - force an exact 128x128 output.
# An explicit (height, width) is required here: Resize(128) with a bare int only
# scales the *shorter* edge to 128 and keeps the aspect ratio, so a non-square
# image would not flatten to the 128 * 128 = 16384 values the pixel matrices
# in the later cells are allocated for.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((128, 128))])

In [None]:
# Number of training image paths (one per row of train_labels).
len(train_paths)

In [None]:
# Sanity check on a single image: read the first training DICOM, run it through
# `transform`, and flatten the result into a 1-D pixel vector.
sample_file = f'{train_paths[0]}.dcm'
img_np = dcmread(sample_file).pixel_array
resized = transform(img_np)
arr = np.array(resized).reshape(-1)
arr

In [None]:
# Build the training matrices: one row of flattened pixels per image, plus a
# matching single-column label array.
# 128 * 128 = 16384 is the flattened length of one resized image
# (assumes `transform` yields 128x128 single-channel images - a row of any other
# length makes the assignment below raise).
train_data_pixels = np.zeros([len(train_paths), 128 * 128])
train_data_labels = np.zeros([len(train_paths), 1])
# tqdm renders a single progress bar instead of printing one line per image.
for i, path in enumerate(tqdm(train_paths, desc='Loading train images')):
    img_np = dcmread(f'{path}.dcm').pixel_array
    train_data_pixels[i] = np.array(transform(img_np)).flatten()
    train_data_labels[i] = train_labels[i][1]

In [None]:
# Combine pixels and labels into one table: columns 0..N-1 hold the flattened
# pixel values and the final 'label' column holds the target.
df_train_data = pd.DataFrame(train_data_pixels)
# A 1-D array is assigned positionally, so no explicit Series/index is needed.
df_train_data['label'] = train_data_labels.ravel()
# pandas' default also writes the row index as the first (unnamed) CSV column.
df_train_data.to_csv('train_data_and_labels2.csv')

# Show a download link for the exported file as the cell output.
FileLink(r'train_data_and_labels2.csv')

In [None]:
# Build the validation matrices: one row of flattened pixels per image, plus a
# matching single-column label array.
# 128 * 128 = 16384 is the flattened length of one resized image
# (assumes `transform` yields 128x128 single-channel images - a row of any other
# length makes the assignment below raise).
val_data_pixels = np.zeros([len(val_paths), 128 * 128])
val_data_labels = np.zeros([len(val_paths), 1])
# tqdm renders a single progress bar instead of printing one line per image.
for i, path in enumerate(tqdm(val_paths, desc='Loading validation images')):
    img_np = dcmread(f'{path}.dcm').pixel_array
    val_data_pixels[i] = np.array(transform(img_np)).flatten()
    val_data_labels[i] = val_labels[i][1]

In [None]:
# Combine pixels and labels into one table: columns 0..N-1 hold the flattened
# pixel values and the final 'label' column holds the target.
df_val_data = pd.DataFrame(val_data_pixels)
# A 1-D array is assigned positionally, so no explicit Series/index is needed.
df_val_data['label'] = val_data_labels.ravel()
# pandas' default also writes the row index as the first (unnamed) CSV column.
df_val_data.to_csv('val_data_and_labels2.csv')

# Show a download link for the exported file as the cell output.
FileLink(r'val_data_and_labels2.csv')