#  Bercystan Central Committee Master Notebook

**Import Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import tensorflow as tf
import seaborn as sn
import matplotlib.pyplot as plt
from keras import layers
from tqdm import tqdm


**View Competition Input Files**

In [None]:
# Show which files and folders ship with the competition dataset.
competition_entries = os.listdir("../input/siim-isic-melanoma-classification")
print(competition_entries)

**Read training and testing .csv files**

In [None]:
# Load the train and test metadata tables in one pass over their paths.
training, testing = map(
    pd.read_csv,
    (
        '../input/siim-isic-melanoma-classification/train.csv',
        '../input/siim-isic-melanoma-classification/test.csv',
    ),
)
# Keep the preview as the final expression so the notebook renders it.
training.head(n=10)

In [None]:
# Preview the first ten rows of the test metadata table.
testing.head(n=10)

In [None]:
# Ten-bin histograms of the approximate patient age, one figure per split.
# Each title call follows its histogram so it lands on that histogram's axes.
training.hist(bins=10, column="age_approx")
plt.title(label="Training Ages")
testing.hist(bins=10, column="age_approx")
plt.title(label="Testing Ages")

In [None]:
# Enlarge seaborn's font scale before drawing.
sn.set(font_scale=1.2)
# Count rows per value of the 'sex' column, then draw the counts as bars.
training_sex_counts = training['sex'].value_counts()
training_sex_counts.plot(kind='bar', figsize=(7, 6), rot=0)
# The trailing semicolon keeps the notebook from echoing the title object.
plt.title("Training Dataset by Gender", y=1.02);

In [None]:
# Count rows per value of the 'sex' column in the test table and draw the counts as bars.
testing_sex_counts = testing['sex'].value_counts()
testing_sex_counts.plot(kind='bar', figsize=(7, 6), rot=0)
# The trailing semicolon keeps the notebook from echoing the title object.
plt.title("Testing Dataset by Gender", y=1.02);

In [None]:
# Print the legend for the label values, then tally how often each one occurs.
print("0 = Benign, 1 = Melanoma")
target_column = training['target']
target_column.value_counts()

**Reading Images**

In [None]:
# Read one training JPEG exactly as stored and report its array shape.
sample_path = "../input/siim-isic-melanoma-classification/jpeg/train/ISIC_4131810.jpg"
img = cv2.imread(sample_path, cv2.IMREAD_UNCHANGED)
# cv2.imread signals failure by returning None instead of raising, which
# would otherwise surface as a confusing AttributeError on `.shape`.
if img is None:
    raise FileNotFoundError(f"cv2.imread could not read {sample_path}")
print(img.shape)

Kevin here. Our JPEG images are RGB at 4k by 6k resolution, which crashes memory if we load all of them at once. We need to either resize them, process them in batches and save the results, or consider an alternative file format (DICOM, etc.); the competition provides the exact same images in several file formats. https://www.kaggle.com/parulpandey/melanoma-classification-eda-starter#2.-Reading-the-Image-datasets is a good place to start. For saving NumPy arrays, this is the code I had for my own project:

Note that I significantly resized my images to 150x150 (I only had 8 gigabytes of memory). Kaggle gives you 16, so you can afford a bit more resolution.

# read images from disk
def upload(img_size=(150, 150), dir="rawdata"):
    """Lazily yield ``(image, class_name)`` for each image in ``files/<dir>/<class>/``.

    Args:
        img_size: Target size handed straight to ``cv2.resize``.
        dir: Sub-folder of ``files/`` that holds one folder per class. The
            name shadows the builtin but is kept because it is part of the
            public signature.

    Yields:
        Tuples of the resized image array and the class name whose folder it
        was read from. Classes are visited in the order ``get_classes()``
        returns them, so items of the same class are contiguous.
    """
    import warnings

    print("Loading Images...")
    for landuse in tqdm(list(get_classes())):
        class_dir = f"files/{dir}/{landuse}"
        # os.listdir gives no ordering guarantee; sorting makes runs reproducible.
        for name in sorted(os.listdir(class_dir)):
            path = f"{class_dir}/{name}"
            img = cv2.imread(path, cv2.IMREAD_UNCHANGED)
            if img is None:
                # imread returns None for files it cannot decode (stray
                # non-image files, corrupt images); resizing None would crash.
                warnings.warn(f"skipping unreadable image file: {path}")
                continue
            yield cv2.resize(img, img_size), landuse


# saving images and labels as a compressed .npz archive to prevent
# re-uploading from rawdata every time
def serialize(name="Base", dir="rawdata", img_size=(150, 150)):
    """Read every image via ``upload`` and store them in ``files/<name>CompressedData.npz``.

    The archive holds two arrays: ``images`` (the stacked image arrays) and
    ``labels`` (one integer per image). Class names are numbered 0, 1, 2, ...
    in order of first appearance, so a class always maps to a single index
    even if its images are not contiguous in the input.

    Args:
        name: Prefix of the archive file name.
        dir: Forwarded to ``upload`` as the raw-data sub-folder.
        img_size: Forwarded to ``upload`` as the resize target.

    Raises:
        ValueError: If ``upload`` yields no images at all.
    """
    print("Serializing images...")
    pairs = list(upload(img_size=img_size, dir=dir))
    if not pairs:
        # Unpacking an empty zip() would fail with an opaque message.
        raise ValueError(f"no images found under files/{dir}; nothing to serialize")

    images, labels = zip(*pairs)
    images = np.array(images)

    # setdefault evaluates len() before inserting, so each new class name
    # receives the next unused index.
    label_to_index = {}
    num_labels = np.array(
        [label_to_index.setdefault(label, len(label_to_index)) for label in labels]
    )

    with open(f"files/{name}CompressedData.npz", "wb") as file:
        np.savez_compressed(file, images=images, labels=num_labels)


# retrieve serialized images
def load(filename):
    """Return the ``images`` and ``labels`` arrays stored in ``files/<filename>.npz``.

    Args:
        filename: Archive name without the ``.npz`` extension.

    Returns:
        A ``(images, labels)`` tuple of the arrays saved under those keys.
    """
    archive_path = f"files/{filename}.npz"
    with open(archive_path, "rb") as handle:
        archive = np.load(handle)
        # Pull both arrays out while the underlying file is still open.
        images, labels = archive["images"], archive["labels"]
    return images, labels

In [None]:
from PIL import Image

# Open a sample training JPEG with Pillow; ending the cell on the bare name
# lets the notebook render the picture inline.
image = Image.open(
    fp="../input/siim-isic-melanoma-classification/jpeg/train/ISIC_4131810.jpg"
)
image

In [None]:
# NOTE(review): disabled draft, kept for reference only. As written it would
# read files from the chosen jpeg/<dataset> folder at their stored size (no
# resizing), print each file name, stop after 50 files, and return the images
# as a single NumPy array.
# def readImages(dataset="train"):
#     imgs = []
#     count = 0
#     for file in tqdm(os.listdir(f"../input/siim-isic-melanoma-classification/jpeg/{dataset}")):
#         imgs.append(cv2.imread(f"../input/siim-isic-melanoma-classification/jpeg/{dataset}/{file}", cv2.IMREAD_UNCHANGED))
#         print(file)
#         count += 1
#         if (count == 50):
#             break
#     return np.array(imgs)
        
# training_imgs = readImages("train")
# # testing_imgs = readImages("test")
# print(training_imgs.shape)
# print(training_imgs[1])