In [None]:
%%html
<!-- Notebook title rendered as a full-width, red, scrolling banner -->
<marquee style='width: 100%; color: red;'><H1>SKIN_CANCER</H1></marquee>

![](https://nci-media.cancer.gov/pdq/media/images/578083-750.jpg)

L'échelle de Clark comporte 5 niveaux de mélanome :

   1. Les cellules se trouvent dans la couche externe de la peau (épiderme)

   2. Les cellules se trouvent dans la couche située directement sous l'épiderme (derme papillaire)

   3. Les cellules touchent la couche suivante appelée derme profond

   4. Les cellules se sont répandues dans le derme réticulaire

   5. Les cellules se sont développées dans la couche de graisse

![](https://media.giphy.com/media/lSJElktZ5BKUvYSztq/giphy.gif)

## Références 
* [TensorFlow + Transfer Learning: Melanoma](https://www.kaggle.com/amyjang/tensorflow-transfer-learning-melanoma)
* [GENERAL INFORMATION ABOUT MELANOMA](https://www.uhhospitals.org/services/cancer-services/skin-cancer/melanoma/about-melanoma)


# 1. Introduction ▶

### 1.1 Qu'est-ce que le mélanome ?
* [Le mélanome est le cancer de la peau le moins fréquent mais le plus mortel, ne représentant qu'environ 1 % de tous les cas, mais la grande majorité des décès dus au cancer de la peau.](https://www.aimatmelanoma.org/about-melanoma/melanoma-stats-facts-and-figures/)
* Le mélanome est le troisième cancer le plus fréquent chez les hommes et les femmes âgés de 20 à 39 ans.
* Aux États-Unis, le mélanome continue d'être 
    * le cinquième cancer le plus fréquent chez les hommes de tous les groupes d'âge
    * le sixième cancer le plus fréquent chez les femmes de tous les groupes d'âge
* L'Australie et la Nouvelle-Zélande présentent la plus forte incidence de mélanomes au monde (plus de deux fois plus qu'en Amérique du Nord)


# Notre jeu de données
### Train Dataset se compose de:

   1. image_name -> le nom de fichier de l'image dans le train set
   2. patient_id -> identifiant unique du patient
   3. sex -> sexe du patient
   4. age_approx -> âge approximatif du patient
   5. anatom_site_general_challenge -> la localisation anatomique du site scanné
   6. diagnosis -> des informations détaillées sur le diagnostic
   7. benign_malignant -> indique si le résultat du scan est bénin ou malin
   8. target -> la même information que benign_malignant, mais sous forme binaire (0/1), donc plus pratique pour la modélisation

### Test Dataset se compose de:

   1. image_name -> le nom de fichier de l'image dans le test set
   2. patient_id -> identifiant unique du patient
   3. sex -> sexe du patient
   4. age_approx -> âge approximatif du patient
   5. anatom_site_general_challenge -> la localisation anatomique du site scanné




### 1.2 Objectifs
> L'objectif est d'identifier correctement les cas **bénins** et **malins**. Une tumeur bénigne est une tumeur qui n'envahit pas les tissus environnants et ne se propage pas dans le corps. Une tumeur maligne est une tumeur qui peut envahir les tissus environnants ou se propager dans le corps.
<img src = 'https://www.verywellhealth.com/thmb/IFgBpbmhYCJdS4rvLACzX3Ukqsc=/1500x0/filters:no_upscale():max_bytes(150000):strip_icc():format(webp)/514240-article-img-malignant-vs-benign-tumor2111891f-54cc-47aa-8967-4cd5411fdb2f-5a2848f122fa3a0037c544be.png' width = 300>

> Data: DICOM Files split in Train (33,126 observations) and Test (10,982 observations)
<img src='https://i.imgur.com/or0AoVs.png' width = 500>


# 3. Préparation de la base de données

## Visualisation de données

In [None]:
# Regular Imports
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.image as mpimg
from tabulate import tabulate
import missingno as msno 
from IPython.display import display_html
from PIL import Image
import gc
import cv2

import pydicom # for DICOM images
from skimage.transform import resize

# SKLearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import warnings
# Silence library warnings so they do not clutter the rendered output.
# NOTE(review): this blanket filter also hides deprecation notices from the
# plotting/imaging libraries -- consider narrowing it once the notebook is stable.
warnings.filterwarnings("ignore")


# Set Style
# The former `sns.despine(left=True, bottom=True)` call was removed: no axes
# exist at this point, so it had nothing to act on and only created an empty
# figure in this cell's output. Each plotting cell calls despine on its own figure.
sns.set(style="darkgrid")



In [None]:
# List the competition folder. `os.listdir` already returns a list (no need to
# wrap it) but in arbitrary order, so sort it to get a stable, readable output.
sorted(os.listdir('../input/siim-isic-melanoma-classification'))

In [None]:
# Root folder of the competition data
directory = '../input/siim-isic-melanoma-classification'

# Read the train and test metadata tables (<directory>/train.csv and <directory>/test.csv)
train_df, test_df = (pd.read_csv(directory + csv_name) for csv_name in ('/train.csv', '/test.csv'))

print('Train has {:,} rows and Test has {:,} rows.'.format(len(train_df), len(test_df)))

# Replace the column labels positionally with shorter names.
# The test table receives only the first five names.
new_names = ['dcm_name', 'ID', 'sex', 'age', 'anatomy', 'diagnosis', 'benign_malignant', 'target']
train_df.columns = new_names
test_df.columns = new_names[:5]

In [None]:
# Rich (HTML) display of the training table instead of a plain-text print
train_df

In [None]:
# Rich (HTML) display of the test table instead of a plain-text print
test_df

In [None]:
# First five rows of the training metadata
train_df.head(5)

In [None]:
# First five rows of the test metadata
test_df.head(5)

In [None]:
# Side-by-side counts of each anatomy site in the train and test tables
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
panels = (
    (ax1, train_df, "distribution de anatomy  dans  training data"),
    (ax2, test_df, "distribution de anatomy dans test data"),
)
for axis, frame, heading in panels:
    sns.countplot(ax=axis, x="anatomy", data=frame)
    axis.set_title(heading)
plt.show()

In [None]:
# Counts of benign vs malignant cases, split by anatomy site
plt.figure(figsize=(16, 6))
a = sns.countplot(data=train_df, x='benign_malignant', hue='anatomy')

# Write the count above every bar. Bar heights are floats, so cast to int to
# print "1,234" rather than "1,234.0"; bars without a finite height are skipped.
for bar in a.patches:
    count = bar.get_height()
    if np.isnan(count):
        continue
    a.annotate(f'{int(count):,}',
               (bar.get_x() + bar.get_width() / 2., count),
               ha='center', va='center',
               xytext=(0, 4), textcoords='offset points')

plt.title('distribution de Anatomy par Target', fontsize=16)
sns.despine(left=True, bottom=True);

1. Il y a plus d'hommes que de femmes dans l'ensemble de données
2. Cependant, les pourcentages sont presque les mêmes

In [None]:
# Frequencies of anatomy sites (left) and diagnoses (right) in the training table
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 6))

# Pass the column by keyword: handing the Series over positionally is
# deprecated in seaborn and no longer maps to `x` in recent releases.
a = sns.countplot(x='anatomy', data=train_df, ax=ax1)
b = sns.countplot(x='diagnosis', data=train_df, ax=ax2)

for axis in (a, b):
    # Tilt the category labels so long names do not overlap
    axis.set_xticklabels(axis.get_xticklabels(), rotation=35, ha="right")

    # Write the count above every bar (cast to int to avoid a trailing ".0")
    for bar in axis.patches:
        count = bar.get_height()
        if np.isnan(count):
            continue
        axis.annotate(f'{int(count):,}',
                      (bar.get_x() + bar.get_width() / 2., count),
                      ha='center', va='center',
                      xytext=(0, 4), textcoords='offset points')

ax1.set_title('Les fréquences de Anatomy', fontsize=16)
ax2.set_title('Les fréquences de Diagnosis', fontsize=16)
sns.despine(left=True, bottom=True);

In [None]:
# Number of benign vs malignant cases in the training table
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(20,5))
sns.countplot(data=train_df, x="benign_malignant", ax=ax1)
ax1.set_title("distribution de benign_malignant  dans  training data")
plt.show()

In [None]:
# Side-by-side counts of each sex in the train and test tables
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
panels = (
    (ax1, train_df, "distribution de sex  dans  training data"),
    (ax2, test_df, "distribution de sex dans test data"),
)
for axis, frame, heading in panels:
    sns.countplot(ax=axis, x="sex", data=frame)
    axis.set_title(heading)
plt.show()

In [None]:
# Counts of benign vs malignant cases, split by sex
plt.figure(figsize=(16, 6))
a = sns.countplot(data=train_df, x='benign_malignant', hue='sex')

# Write the count above every bar. Bar heights are floats, so cast to int to
# print "1,234" rather than "1,234.0"; bars without a finite height are skipped.
for bar in a.patches:
    count = bar.get_height()
    if np.isnan(count):
        continue
    a.annotate(f'{int(count):,}',
               (bar.get_x() + bar.get_width() / 2., count),
               ha='center', va='center',
               xytext=(0, 4), textcoords='offset points')

plt.title('distribution de sex par target', fontsize=16)
sns.despine(left=True, bottom=True);

In [None]:
# Side-by-side counts of each age value in the train and test tables
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,5))
panels = (
    (ax1, train_df, "distribution d'age  dans  training data"),
    (ax2, test_df, "distribution d'age dans test data"),
)
for axis, frame, heading in panels:
    sns.countplot(ax=axis, x="age", data=frame)
    axis.set_title(heading)
plt.show()

In [None]:
# Class balance of the binary `target` column in the training table
fig, ax1 = plt.subplots(1, 1, figsize=(20,5))
sns.countplot(ax=ax1, x="target", data=train_df)
# The title previously said "age" (copy-paste slip) although the plot shows `target`
ax1.set_title("distribution de target dans training data")
plt.show()

* **0=bénins**
* **1=malins**

In [None]:
# Diagnosis frequencies for target == 0 (left) and target == 1 (right)
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 6))

benign_rows = train_df[train_df['target'] == 0]
malignant_rows = train_df[train_df['target'] == 1]

# Pass the column by keyword: handing the Series over positionally is
# deprecated in seaborn and no longer maps to `x` in recent releases.
a = sns.countplot(x='diagnosis', data=benign_rows, ax=ax1)
b = sns.countplot(x='diagnosis', data=malignant_rows, ax=ax2)

for axis in (a, b):
    # Tilt the category labels so long names do not overlap
    axis.set_xticklabels(axis.get_xticklabels(), rotation=35, ha="right")

    # Write the count above every bar (cast to int to avoid a trailing ".0")
    for bar in axis.patches:
        count = bar.get_height()
        if np.isnan(count):
            continue
        axis.annotate(f'{int(count):,}',
                      (bar.get_x() + bar.get_width() / 2., count),
                      ha='center', va='center',
                      xytext=(0, 4), textcoords='offset points')

ax1.set_title('Cas bénins: vue de diagnostic', fontsize=16)
ax2.set_title('Cas malins: vue de diagnostic', fontsize=16)
sns.despine(left=True, bottom=True);

In [None]:
colors_nude = ['#e0798c','#65365a','#da8886','#cfc4c4','#dfd7ca']

# Number of images (rows) recorded for each patient ID
patients_count_train = train_df.groupby(by='ID')['dcm_name'].count().reset_index()
patients_count_test = test_df.groupby(by='ID')['dcm_name'].count().reset_index()

# Figure
f, (ax1, ax2) = plt.subplots(1, 2, figsize = (16, 6))

# Plain matplotlib histograms replace the deprecated `sns.distplot(kde=False)`;
# same 50 bins, same opaque colours.
panels = (
    (ax1, patients_count_train, colors_nude[0], 'Train: Images per Patient Distribution'),
    (ax2, patients_count_test, colors_nude[1], 'Test: Images per Patient Distribution'),
)
for axis, counts, colour, heading in panels:
    axis.hist(counts['dcm_name'], bins=50, color=colour, alpha=1)
    # The counted column is `dcm_name`, but the plotted quantity is images per patient
    axis.set_xlabel('Images per patient')
    axis.set_ylabel('Number of patients')
    axis.set_title(heading, fontsize=16)

sns.despine(left=True, bottom=True);


In [None]:
# For each table, derive the DICOM and JPEG file path of every image from its
# `dcm_name` and store them as the `path_dicom` / `path_jpeg` columns.
layout = (
    (train_df, '/train/', '/jpeg/train/'),
    (test_df, '/test/', '/jpeg/test/'),
)
for frame, dicom_dir, jpeg_dir in layout:
    names = frame['dcm_name']
    frame['path_dicom'] = directory + dicom_dir + names + '.dcm'
    frame['path_jpeg'] = directory + jpeg_dir + names + '.jpg'

# Keep the module-level names available; they end up holding the JPEG paths
path_train = train_df['path_jpeg']
path_test = test_df['path_jpeg']

In [None]:
# Preview the first training JPEG. The image is loaded inside this cell so it
# no longer relies on an `image` variable that is only created further down
# the notebook (which made the cell fail on a fresh kernel).
image = mpimg.imread(train_df['path_jpeg'][0])

fig, ax = plt.subplots()
ax.imshow(image)
ax.axis('off');

In [None]:
 plt.subplots(nrows=1, ncols=1, figsize=(16,6))

In [None]:
# Read training DICOM #0 and show it as a 512x512 grayscale image.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][0])
image = data.pixel_array
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # assumes a 3-channel RGB array -- TODO confirm
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# Read training DICOM #1 and show it resized to 512x512.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][1])
image = data.pixel_array
# No colour conversion is applied in this cell; the `bone` colormap below only
# takes effect if the array is single-channel.
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# Read training DICOM #2 and show it as a 512x512 grayscale image.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][2])
image = data.pixel_array
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # assumes a 3-channel RGB array -- TODO confirm
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# Read training DICOM #3 and show it as a 512x512 grayscale image.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][3])
image = data.pixel_array
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # assumes a 3-channel RGB array -- TODO confirm
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# Read training DICOM #4 and show it as a 512x512 grayscale image.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][4])
image = data.pixel_array
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # assumes a 3-channel RGB array -- TODO confirm
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# Read training DICOM #5 and show it as a 512x512 grayscale image.
# `pydicom.read_file` is a deprecated alias of `dcmread` (dropped in pydicom 3).
data = pydicom.dcmread(train_df['path_dicom'][5])
image = data.pixel_array
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)  # assumes a 3-channel RGB array -- TODO confirm
image = cv2.resize(image, (512,512))

fig, ax = plt.subplots(figsize=(16,16))
ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
def show_images(data, n = 5, rows=1, cols=6, title='Default'):
    """Display the first ``n`` DICOM images listed in ``data['path_dicom']`` on a grid.

    Parameters
    ----------
    data : object indexable by ``'path_dicom'`` (e.g. a DataFrame)
        ``data['path_dicom']`` must yield DICOM file paths and support slicing.
    n : int
        Maximum number of images to show (fewer are shown if fewer paths exist).
    rows, cols : int
        Shape of the subplot grid.
    title : str
        Figure-level title.

    Raises
    ------
    ValueError
        If the selected images do not fit in a ``rows`` x ``cols`` grid. This
        is checked before any file is read, instead of failing part-way
        through the loop when ``plt.subplot`` runs out of slots.
    """
    paths = list(data['path_dicom'][:n])
    if len(paths) > rows * cols:
        raise ValueError(
            'cannot place {} images on a {}x{} grid'.format(len(paths), rows, cols)
        )

    plt.figure(figsize=(16,4))
    # The title is figure-level, so set it once rather than on every iteration
    plt.suptitle(title, fontsize = 16)

    for k, path in enumerate(paths):
        # `dcmread` is the current name of the deprecated `read_file` alias
        image = pydicom.dcmread(path).pixel_array

        plt.subplot(rows, cols, k+1)
        plt.imshow(image)
        plt.axis('off')

In [None]:
# First ten rows with target == 0, shown on a 2x5 grid
benign_cases = train_df.loc[train_df['target'] == 0]
show_images(benign_cases, n=10, rows=2, cols=5, title='Benign Sample')

In [None]:
# Show training DICOM #0 in grayscale at 200x200.
# With nrows=1 and ncols=1, `plt.subplots` returns a single Axes object (not an
# array), so the former `axes[x, y]` indexing raised a TypeError.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16,6))
plt.suptitle("B&W", fontsize = 16)

# `dcmread` is the current name of the deprecated `read_file` alias
data = pydicom.dcmread(train_df['path_dicom'][0])
image = data.pixel_array

# Transform to B&W
image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
image = cv2.resize(image, (200,200))

ax.imshow(image, cmap=plt.cm.bone)
ax.axis('off');

In [None]:
# First 12 training DICOMs converted to HSV, on a 2x6 grid.
# The grid needs two rows: the loop draws 12 images, and the former 1x6 layout
# indexed as `axes[x, y]` raised an IndexError.
fig, axes = plt.subplots(nrows=2, ncols=6, figsize=(16,6))
plt.suptitle("Without Gaussian Blur", fontsize = 16)

for i, ax in enumerate(axes.flat):
    # `dcmread` is the current name of the deprecated `read_file` alias
    data = pydicom.dcmread(train_df['path_dicom'][i])
    image = data.pixel_array

    # Convert from RGB to the HSV colour space, then shrink to 200x200
    image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
    image = cv2.resize(image, (200,200))

    # No colormap: matplotlib ignores `cmap` for 3-channel images
    ax.imshow(image)
    ax.axis('off')

In [None]:
# First 12 training DICOMs converted to HSV and contrast-enhanced by blending
# each image with its Gaussian-blurred copy, on a 2x6 grid.
# The grid needs two rows: the loop draws 12 images, and the former 1x6 layout
# indexed as `axes[x, y]` raised an IndexError.
fig, axes = plt.subplots(nrows=2, ncols=6, figsize=(16,6))
plt.suptitle("With Gaussian Blur", fontsize = 16)

for i, ax in enumerate(axes.flat):
    # `dcmread` is the current name of the deprecated `read_file` alias
    data = pydicom.dcmread(train_df['path_dicom'][i])
    image = data.pixel_array

    # Convert from RGB to the HSV colour space, then shrink to 200x200
    image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
    image = cv2.resize(image, (200,200))
    # Weighted sum: 4 * image - 4 * blurred(image) + 128
    image=cv2.addWeighted(image, 4, cv2.GaussianBlur(image, (0,0) ,256/10), -4, 128)

    # No colormap: matplotlib ignores `cmap` for 3-channel images
    ax.imshow(image)
    ax.axis('off')

In [None]:
# First 12 training DICOMs converted to HLS, on a 2x6 grid.
# The grid needs two rows: the loop draws 12 images, and the former 1x6 layout
# indexed as `axes[x, y]` raised an IndexError.
fig, axes = plt.subplots(nrows=2, ncols=6, figsize=(16,6))
# Title now names the channels of the conversion actually used (COLOR_RGB2HLS)
plt.suptitle("Hue, Lightness, Saturation", fontsize = 16)

for i, ax in enumerate(axes.flat):
    # `dcmread` is the current name of the deprecated `read_file` alias
    data = pydicom.dcmread(train_df['path_dicom'][i])
    image = data.pixel_array

    # Convert from RGB to the HLS colour space, then shrink to 200x200
    image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
    image = cv2.resize(image, (200,200))

    # No colormap: matplotlib ignores `cmap` for 3-channel images
    ax.imshow(image)
    ax.axis('off')

In [None]:
# First 12 training DICOMs converted to LUV, on a 2x6 grid.
# The grid needs two rows: the loop draws 12 images, and the former 1x6 layout
# indexed as `axes[x, y]` raised an IndexError.
fig, axes = plt.subplots(nrows=2, ncols=6, figsize=(16,6))
plt.suptitle("LUV Color Space", fontsize = 16)

for i, ax in enumerate(axes.flat):
    # `dcmread` is the current name of the deprecated `read_file` alias
    data = pydicom.dcmread(train_df['path_dicom'][i])
    image = data.pixel_array

    # Convert from RGB to the LUV colour space, then shrink to 200x200
    image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
    image = cv2.resize(image, (200,200))

    # No colormap: matplotlib ignores `cmap` for 3-channel images
    ax.imshow(image)
    ax.axis('off')



In [None]:
# `image_list` was displayed here without ever being assigned (its definition
# only survived as commented-out code), so the cell raised a NameError.
# Build it here: 20 randomly sampled JPEG paths, re-indexed from 0, with a
# fixed seed so the sample is reproducible.
image_list = train_df['path_jpeg'].sample(20, random_state=42).reset_index(drop=True)
image_list

In [None]:
# Display the training table, which at this point also carries the
# `path_dicom` and `path_jpeg` columns added earlier in the notebook.
train_df

In [None]:
# Shortcuts to the JPEG paths of training rows 0..6.
# `t6` is included because a later preview cell reads it; it was never defined.
t0, t1, t2, t3, t4, t5, t6 = (train_df['path_jpeg'][i] for i in range(7))

In [None]:
# Show the JPEG at path `t0`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t0), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG at path `t1`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t1), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG at path `t2`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t2), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG at path `t3`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t3), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG at path `t4`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t4), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG at path `t5`, resized to 512x512, on a 16x16 figure without axes
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t5), (512,512))
ax.imshow(image)
ax.axis('off')

In [None]:
# Show the JPEG of training row 6, resized to 512x512.
# `t6` was read here without being assigned anywhere (only t0..t5 existed),
# which raised a NameError; define it in this cell so it runs on its own.
t6 = train_df['path_jpeg'][6]

plt.figure(figsize=(16,16))

image = mpimg.imread(t6)
image = cv2.resize(image, (512,512))
plt.imshow(image)
plt.axis('off')

In [None]:
# Show the JPEG at path `t0`, resized to 512x512, on a 16x16 figure without axes.
# NOTE(review): this displays `t0` a second time -- confirm whether another
# image was intended here.
fig, ax = plt.subplots(figsize=(16,16))
image = cv2.resize(mpimg.imread(t0), (512,512))
ax.imshow(image)
ax.axis('off')
