In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import glob
import cv2
from tqdm import tqdm
from PIL import Image, ImageFile
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import *

from sklearn import model_selection
import matplotlib.pyplot as plt
%matplotlib inline
from plotly.offline import iplot
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
import warnings
warnings.filterwarnings('ignore')


from kaggle_datasets import KaggleDatasets

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Loading and Exploring Dataset

In [None]:
# Load the competition's training metadata and show its shape.
train_df = pd.read_csv(
    filepath_or_buffer='../input/siim-isic-melanoma-classification/train.csv',
)
train_df.shape

In [None]:
# Peek at the first five rows of the training metadata.
train_df.head(n=5)

In [None]:
# Number of rows per value of the `target` column.
train_df.target.value_counts()

In [None]:
## Count the missing values in every column
train_df.isna().sum()

In [None]:
# Discard (in place) every row that is missing any of these three metadata
# fields, then re-count the missing values per column to confirm.
required_columns = ['sex', 'age_approx', 'anatom_site_general_challenge']
train_df.dropna(axis=0, how='any', subset=required_columns, inplace=True)

train_df.isna().sum()

In [None]:
# First five rows again, now that incomplete rows have been dropped.
train_df.head(n=5)

In [None]:
# Bar chart of the benign vs. malignant row counts, to show the class imbalance.
# The chart is drawn by cufflinks/plotly through `.iplot`, so no matplotlib
# figure is opened here: a `plt.figure(...)` call would not size this chart and
# would only add a separate, empty matplotlib figure to the cell output.
class_counts = train_df.benign_malignant.value_counts()
class_counts.iplot(kind='bar', color='blue', title='Data Imbalance')

## Exploring Image 

In [None]:
# Read one training image with OpenCV and print its array shape.
sample_image_path = '../input/siim-isic-melanoma-classification/jpeg/train/ISIC_0015719.jpg'
img = cv2.imread(sample_image_path)
# cv2.imread reports a missing or unreadable file by returning None rather than
# raising, which would otherwise show up as an opaque AttributeError on `.shape`.
if img is None:
    raise FileNotFoundError('Could not read image: ' + sample_image_path)
print(img.shape)

## Preparing Dataset

In [None]:
## Build a frame pairing each training image path with its `target` label
train_dir = '../input/siim-isic-melanoma-classification/jpeg/train/'
test_dir = '../input/siim-isic-melanoma-classification/jpeg/test/'

# One pass over each column instead of a positional `.iloc[i]` lookup per row.
data = [train_dir + image_name + '.jpg' for image_name in train_df['image_name']]
labels = train_df['target'].tolist()

# Built from plain lists, so `df` gets a fresh 0..n-1 index even though rows
# were dropped from `train_df` earlier.
df = pd.DataFrame({'images': data, 'target': labels})

In [None]:
# Load the test metadata and build a one-column frame of full test image paths.
test_df = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
# One pass over the column instead of a positional `.iloc[i]` lookup per row.
test_data = [test_dir + image_name + '.jpg' for image_name in test_df['image_name']]
df_test = pd.DataFrame({'images': test_data})

## Stratified KFolds

In [None]:
# Patient id of every training row, as a plain Python list (same order as train_df).
groups_by_patient_id_list = train_df.patient_id.tolist()

In [None]:
# Split train_df into 10 folds stratified on `target` and collect each fold as a
# (train DataFrame, validation DataFrame) tuple in `result`.
# NOTE(review): per the scikit-learn docs, StratifiedKFold.split() accepts `groups`
# only for API compatibility and ignores it — confirm. If rows of one patient must
# not land on both sides of a fold, a group-aware splitter would be needed here.
skf = model_selection.StratifiedKFold(n_splits=10)
ylabels = train_df.target.values
result = []   
for train_idx, val_idx in skf.split(train_df, ylabels, groups = groups_by_patient_id_list):
    # The split yields positional row indices, hence `.iloc`.
    train_fold = train_df.iloc[train_idx]
    val_fold = train_df.iloc[val_idx]
    result.append((train_fold, val_fold))

In [None]:
# Unpack the (train, validation) pair of each of the 10 folds into named variables.
# `result` is 0-indexed, so fold k is result[k - 1]. Every index 0..9 is used
# exactly once, so no fold is skipped (result[5] in particular) and the variable
# numbers line up with the fold positions.
train_fold_1, val_fold_1 = result[0]
train_fold_2, val_fold_2 = result[1]
train_fold_3, val_fold_3 = result[2]
train_fold_4, val_fold_4 = result[3]
train_fold_5, val_fold_5 = result[4]

train_fold_6, val_fold_6 = result[5]
train_fold_7, val_fold_7 = result[6]
train_fold_8, val_fold_8 = result[7]
train_fold_9, val_fold_9 = result[8]
train_fold_10, val_fold_10 = result[9]

In [None]:
## Sanity check: look up one patient's rows in the first training fold,
## then count the non-null entries per column for that patient.
sample = train_fold_1.groupby("patient_id")
patient_rows = sample.get_group("IP_0147446")
patient_rows
patient_rows.count()

## TPU/GPU Initialization

In [None]:
# Choose the tf.distribute strategy that matches the hardware we are running on.
try:
    # The resolver is created without arguments; it relies on the TPU_NAME
    # environment variable, which is expected to be set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved.
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

replica_count = strategy.num_replicas_in_sync
print("REPLICAS: ", replica_count)

In [None]:
# Parallelism setting handed to tf.data calls (e.g. `num_parallel_calls`).
AUTO = tf.data.experimental.AUTOTUNE

# GCS location of the competition dataset, resolved through KaggleDatasets.
GCS_PATH = KaggleDatasets().get_gcs_path('siim-isic-melanoma-classification')

# Training configuration: the batch size scales with the number of replicas.
EPOCHS = 3
image_size = 224
BATCH_SIZE = strategy.num_replicas_in_sync * 8

**The functions below take an image name and return the full path to that image. We apply them to the training and validation folds created above.**
**Implementation courtesy of this awesome [kernel](https://www.kaggle.com/reighns/groupkfold-and-stratified-groupkfold-efficientnet/comments)**

In [None]:
def train_image_paths(img_name):
    """Return the path of a training JPEG under ``GCS_PATH``.

    Args:
        img_name: Image name without the ``.jpg`` extension.

    Returns:
        ``GCS_PATH`` + ``'/jpeg/train/'`` + ``img_name`` + ``'.jpg'``.
    """
    path_parts = [GCS_PATH, '/jpeg/train/', img_name, '.jpg']
    return ''.join(path_parts)

def test_image_paths(image_name):
    return GCS_PATH + '/jpeg/test/' + img_name + '.jpg'

## Resolve the GCS image path of every row in the first five train/validation
## folds; each result is a numpy array of path strings.
train_path_fold_1 = train_fold_1['image_name'].apply(train_image_paths).to_numpy()
val_path_fold_1 = val_fold_1['image_name'].apply(train_image_paths).to_numpy()

train_path_fold_2 = train_fold_2['image_name'].apply(train_image_paths).to_numpy()
val_path_fold_2 = val_fold_2['image_name'].apply(train_image_paths).to_numpy()

train_path_fold_3 = train_fold_3['image_name'].apply(train_image_paths).to_numpy()
val_path_fold_3 = val_fold_3['image_name'].apply(train_image_paths).to_numpy()

train_path_fold_4 = train_fold_4['image_name'].apply(train_image_paths).to_numpy()
val_path_fold_4 = val_fold_4['image_name'].apply(train_image_paths).to_numpy()

train_path_fold_5 = train_fold_5['image_name'].apply(train_image_paths).to_numpy()
val_path_fold_5 = val_fold_5['image_name'].apply(train_image_paths).to_numpy()


# Labels are extracted for the first fold only.
train_labels_fold_1 = train_fold_1['target'].to_numpy()
val_labels_fold_1 = val_fold_1['target'].to_numpy()

## Preparing Dataset
- **The `decode_image` function below uses TensorFlow ops to read the image at a given path, decode it, scale it to [0, 1] and resize it; when a label is passed in, it is returned alongside the image.**

In [None]:
def decode_image(filename, label=None, image_size=(image_size,image_size)):
    """Read a JPEG file and return it as a resized float32 image.

    Args:
        filename: Path of the JPEG file, passed to ``tf.io.read_file``.
        label: Optional label that is passed through untouched.
        image_size: Target size handed to ``tf.image.resize``. The default is
            built from the module-level ``image_size`` at definition time, so
            later changes to that global do not affect it.

    Returns:
        The image alone when ``label`` is None, otherwise ``(image, label)``.
    """
    raw_bytes = tf.io.read_file(filename)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Cast to float32 and divide by 255 before resizing.
    pixels = tf.cast(pixels, tf.float32) / 255.0
    resized = tf.image.resize(pixels, size=image_size)

    return resized if label is None else (resized, label)

## Data augmentation
def augment(image, label=None):
    """Apply random flips and a random contrast change to ``image``.

    Args:
        image: Image to augment.
        label: Optional label that is passed through untouched.

    Returns:
        The augmented image alone when ``label`` is None, otherwise
        ``(augmented_image, label)``.
    """
    # Horizontal flip first, then vertical flip.
    flipped = tf.image.random_flip_up_down(tf.image.random_flip_left_right(image))
    # Disabled experiments, kept for reference:
#     image = tf.image.random_saturation(image, lower = 1, upper = 3)
#     image = tf.image.adjust_brightness(image, delta = 0.3)
    augmented = tf.image.random_contrast(flipped, lower=1, upper=2)
    return augmented if label is None else (augmented, label)

**Let's check it out**

In [None]:
# Display the first training fold.
train_fold_1

In [None]:
# Step-by-step walk through what decode_image does, on a single training image.
# The path is built from GCS_PATH (resolved through KaggleDatasets earlier in the
# notebook) so it agrees with every other path here; a literal gs://kds-... bucket
# name is not guaranteed to match the one resolved for the current session.
sample_filename = GCS_PATH + '/jpeg/train/ISIC_1102075.jpg'
sample_label = 0
image_size = 224

# 1. tf.io.read_file takes a string tensor (the filename) and returns a string
#    tensor holding the entire contents of that file.
bits = tf.io.read_file(sample_filename)

# 2. Decode the JPEG-encoded bytes into a uint8 tensor. tf.io.decode_jpeg also
#    works, but according to TensorFlow's website tf.image.decode_jpeg may be cleaner.
image = tf.image.decode_jpeg(bits, channels=3)

image.shape  # shape of the decoded image: (height, width, 3)

# 3. Cast the image to float32 and normalise it by dividing by 255.
image = tf.cast(image, tf.float32) / 255.0


# 4. Resize the image to the size we want. decode_image takes `image_size` as a
#    tuple, so a (height, width) tuple is passed here as well.
image = tf.image.resize(image, size = (image_size, image_size))

image.shape

Next, I will use [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset).

According to the TensorFlow website, the `tf.data.Dataset` API supports writing descriptive and efficient input pipelines. Dataset usage follows a common pattern:

1. Create a source dataset from your input data.
2. Apply dataset transformations to preprocess the data.
3. Iterate over the dataset and process the elements.

Iteration happens in a streaming fashion, so the full dataset does not need to fit into memory.

In [None]:
# Pair every fold-1 training path with its label and print the first element:
# its length, then the path, then the label.
# (train_labels_fold_1 = train_fold_1.target.values)
sample_data = tf.data.Dataset.from_tensor_slices((train_path_fold_1, train_labels_fold_1))
for data in sample_data.take(1):
    print(len(data), data[0], data[1], sep='\n')

In [None]:
# Same pairing as before, now mapped through decode_image, so the first element
# printed is the decoded image rather than its path.
path_label_pairs = tf.data.Dataset.from_tensor_slices((train_path_fold_1, train_labels_fold_1))
dataset = path_label_pairs.map(decode_image, num_parallel_calls=AUTO)
for data in dataset.take(1):
    print(len(data), data[0], data[1], sep='\n')