In [None]:
'''Import basic modules.'''
import pandas as pd
import numpy as np
import os
import pydicom as dcm
import cv2
import time

'''Customize visualization
Seaborn and matplotlib visualization.'''
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline
import folium 
from IPython.core.display import HTML
import urllib.request
from PIL import Image
import imageio



'''Plotly visualization .'''
import plotly.express as px
import plotly.offline as py
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
py.init_notebook_mode(connected = True) # Required to use plotly offline in jupyter notebook



'''Display markdown formatted output like bold, italic bold etc.'''
from IPython.display import Markdown
def bold(string):
    """Render ``string`` as Markdown in the notebook output (e.g. ``'**text**'``)."""
    rendered = Markdown(string)
    display(rendered)

import warnings
warnings.filterwarnings('ignore')

<h1 align="center" style="color:teal;font-family:Georgia">SEER & SIIM-ISIC Melanoma: Facts, Statistics, & EDA</h1>

<img src = "https://i0.wp.com/images-prod.healthline.com/hlcmsresource/images/topic_centers/2019-4/8211-Skin_Cancer-_Facts_Statistics_and_You-1296x728-Header.jpg?w=1155&h=1530"  width="700" height="700">

[Image source](https://www.healthline.com/health/skin-cancer/facts-and-stats#1)

# About Melanoma
Melanoma, also known as malignant melanoma, is a type of skin cancer that develops from the pigment-producing cells known as melanocytes. Melanomas typically occur in the skin but may rarely occur in the mouth, intestines or eye (uveal melanoma). In women, they most commonly occur on the legs, while in men they most commonly occur on the back. About 25% of melanomas develop from moles. Changes in a mole that can indicate melanoma include an increase in size, irregular edges, change in color, itchiness or skin breakdown.

### Cause
**Melanomas are usually caused by DNA damage** resulting from exposure to ultraviolet light from the sun. **Genetics also plays a role.** Melanoma can also occur in skin areas with little sun exposure (i.e. mouth, soles of feet, palms of hands, genital areas). People with dysplastic nevus syndrome, also known as familial atypical multiple mole melanoma (FAMMM), are at increased risk for the development of melanoma.

Having more than fifty moles indicates an increased risk that melanoma might arise. A weakened immune system makes it easier for cancer to arise due to the body's weakened ability to fight cancer cells.

### Where melanoma is most likely to develop
<img src = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/37/Diagram_showing_where_melanoma_is_most_likely_to_develop_CRUK_383.svg/800px-Diagram_showing_where_melanoma_is_most_likely_to_develop_CRUK_383.svg.png" width="300" height="300">

### Signs and symptoms
Early signs of melanoma are summarized by the mnemonic "ABCDEF":

* Asymmetry
* Borders (irregular with edges and corners)
* Colour (variegated)
* Diameter (greater than 6 mm (0.24 in), about the size of a pencil eraser)
* Evolving over time
* Funny looking

This classification does not apply to nodular melanoma, which has its own classifications:

* Elevated above the skin surface
* Firm to the touch
* Growing

### Prevention
* Minimizing exposure to sources of ultraviolet radiation (the sun and sunbeds), following sun protection measures and wearing sun protective clothing (long-sleeved shirts, long trousers, and broad-brimmed hats) can offer protection.

* Sunscreen appears to be effective in preventing melanoma.

* [Statin](https://en.wikipedia.org/wiki/Statin) and [Fibrate](https://en.wikipedia.org/wiki/Fibrate) medication may decrease the risk of melanoma.

**Reference:** 
* http://www.cancer.gov/types/skin/hp/melanoma-treatment-pdq
* https://www.iarc.fr/en/publications/pdfs-online/wcr/2003/WorldCancerReport.pdf

<h1 style="color:orangered; font-family:Georgia">U.S. Melanoma Skin Cancer: Facts & Statistics</h1>

In [None]:
# Load the U.S. melanoma statistics tables; every CSV sits in the same Kaggle input folder.
(incidence_rates,
 death_rates,
 age_incidence_rates,
 age_death_rates,
 state_death_rates) = [
    pd.read_csv('../input/melanoma-skin-cancer-dataset/' + csv_name)
    for csv_name in ('annual_incidence_rates.csv',
                     'annual_death_rates.csv',
                     'age_specific_incidence_rate.csv',
                     'age_specific_death_rate.csv',
                     'state_death_rates.csv')
]

# Two-letter codes used as choropleth locations. They are attached by position,
# so this assumes state_death_rates.csv has exactly these 51 rows in this order
# -- TODO confirm against the CSV.
code = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
    'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI',
    'WY',
]
state_death_rates['code'] = code

The incidence of both non-melanoma and melanoma skin cancers has been increasing over the past decades. Currently, between 2 and 3 million non-melanoma skin cancers and 132,000 melanoma skin cancers occur globally each year. One in every three cancers diagnosed is a skin cancer and, according to Skin Cancer Foundation Statistics, one in every five Americans will develop skin cancer in their lifetime. [Source](https://www.who.int/)

**Dataset of U.S. Melanoma Skin Cancer Facts & Statistics collected from:**
* https://seer.cancer.gov/ 
* https://gis.cdc.gov/Cancer/USCS/DataViz.html
* https://seer.cancer.gov/csr/1975_2017/results_figure/sect_01_intro2_25pgs.pdf

In [None]:
# --- Incidence rates by year, split by sex -----------------------------------
# One line trace per column; the column name doubles as the legend label.
trace1, trace2 = [
    go.Scatter(
        x=incidence_rates['Year of Diagnosis'],
        y=incidence_rates[column],
        name=column,
        marker=dict(size=3.5),
        mode='lines+markers',
        line_color=colour)
    for column, colour in (('All Races,Males', 'dodgerblue'),
                           ('All Races,Females', 'fuchsia'))
]

# Title typo fixed: "Incidencea" -> "Incidence".
layout = go.Layout(template='plotly_white', width=700, height=500,
                   title_text='<b>Incidence Rates by Year, All Race and Sex </b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Dash-dot vertical line at 2015, with a callout marking where the rates dip.
fig.add_shape(
    type="line",
    x0='2015', y0=6.5,
    x1='2015', y1=35,
    line=dict(color="black", width=1, dash="dashdot"))
fig.add_annotation(
    text=" Slightly Decreasing", x='2015', y=23, arrowhead=1, showarrow=True
)
fig.show()

# --- Incidence rates by year, split by race ----------------------------------
trace1, trace2 = [
    go.Scatter(
        x=incidence_rates['Year of Diagnosis'],
        y=incidence_rates[column],
        name=column,
        marker=dict(size=3.5),
        mode='lines+markers',
        line_color=colour)
    for column, colour in (('Whites,Both Sexes', 'lightsalmon'),
                           ('Blacks,Both Sexes', 'maroon'))
]

layout = go.Layout(template='plotly_white', width=700, height=500,
                   title_text='<b>Incidence Rates by Year, Between Race</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

📌**Inference**: Men are two times more likely to develop melanoma than women, as shown in the chart above. The incidence rate rises continuously until 2015, after which we can observe a slight decrease. White Americans are 20 times more likely to be diagnosed with melanoma skin cancer than African Americans.

📝**Footnotes:** SEER 9 areas (San Francisco, Connecticut, Detroit, Hawaii, Iowa, New Mexico, Seattle, Utah, and Atlanta). Rates are per 100,000 and are age-adjusted to the 2000 US Std Population (19 age groups - Census P25-1130).

In [None]:
# --- Death rates by year, split by sex ---------------------------------------
# Each (column, colour) pair becomes one line trace labelled with the column name.
trace1, trace2 = [
    go.Scatter(
        x=death_rates['Year of Death'],
        y=death_rates[column],
        name=column,
        marker=dict(size=3.5),
        mode='lines+markers',
        line_color=colour)
    for column, colour in (('All Races,Males', 'dodgerblue'),
                           ('All Races,Females', 'fuchsia'))
]

layout = go.Layout(template='plotly_white', width=700, height=500,
                   title_text='<b>Death  Rates by Year, All Race and Sex </b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1, trace2], layout=layout)

# Dash-dot vertical line at 2009, with a callout marking the start of the decline.
fig.add_shape(
    type="line",
    x0='2009', y0=1.3,
    x1='2009', y1=4.5,
    line=dict(color="black", width=1, dash="dashdot"))
fig.add_annotation(
    text="Start Decreasing", x='2009', y=2.5, arrowhead=1, showarrow=True
)
fig.show()

# --- Death rates by year, split by race --------------------------------------
trace1, trace2 = [
    go.Scatter(
        x=death_rates['Year of Death'],
        y=death_rates[column],
        name=column,
        marker=dict(size=3.5),
        mode='lines+markers',
        line_color=colour)
    for column, colour in (('Whites,Both Sexes', 'lightsalmon'),
                           ('Blacks,Both Sexes', 'maroon'))
]

layout = go.Layout(template='plotly_white', width=700, height=500,
                   title_text='<b>Death Rates by Year, Between Race</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

📌**Inference**: The death rate is also higher in males, where the chart shows an increasing trend, while the death rate in females stays flat over the years. After 2009, we can see that the death rate in both sexes is significantly decreasing.

📝**Footnotes:** US Mortality Files, National Center for Health Statistics, Centers for Disease Control and Prevention. Rates are per 100,000 and are age-adjusted to the 2000 US Std Population (19 age groups - Census P25-1130).

In [None]:

# --- Share of the age-specific incidence rate per age group (pie) ------------
# Note: the CSV column names contain the extra spaces reproduced below.
trace1 = go.Pie(
    labels=age_incidence_rates['  Age at Diagnosis'],
    values=age_incidence_rates['All Races,  Both Sexes'],
    hoverinfo='label+percent',
    textfont_size=12,
    marker=dict(colors=px.colors.sequential.RdBu,
                line=dict(color='#000000', width=2)))
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
layout = go.Layout(width=600, height=500,
                   title_text='<b>Age-Specific Incidence Rates By All Race and Sex ,  2013-2017</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

# --- Age-specific incidence rate, males vs. females (grouped bars) -----------
trace2, trace3 = [
    go.Bar(
        x=age_incidence_rates['  Age at Diagnosis'],
        y=age_incidence_rates[column],
        text=age_incidence_rates[column],   # print the rate on each bar
        name=column,
        textposition='auto',
        marker_color=colour)
    for column, colour in (('All Races,Males', 'dodgerblue'),
                           ('All Races,Females', 'fuchsia'))
]
layout = go.Layout(barmode='group', template='plotly_white', width=700, height=500,
                   title_text='<b>Age-Specific Incidence Rates Between Sex, 2013-2017</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace2, trace3], layout=layout)
fig.show()

📌**Inference:** By the age of 50, men are more likely to develop melanoma than women. Over an entire lifetime, 1 in 34 men will develop melanoma, while only 1 in 53 women will.

Moreover, men are two times more likely to develop melanoma than women after the age of 60. After age 80, men are three times more likely to develop melanoma. Before 60, though, the statistic is reversed. Women are nearly twice as likely to develop melanoma before their 60th birthday.

📝**Footnotes:** Rates are per 100,000 and are age-adjusted to the 2000 US Std Population (19 age groups - Census P25-1130)


In [None]:
# --- Share of the age-specific death rate per age group (pie) ----------------
# Note: the CSV column names contain the extra spaces reproduced below.
trace1 = go.Pie(
    labels=age_death_rates['  Age at Death'],
    values=age_death_rates['All Races,  Both Sexes'],
    hoverinfo='label+percent',
    textfont_size=12,
    marker=dict(colors=px.colors.sequential.RdBu,
                line=dict(color='#000000', width=2)))
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
layout = go.Layout(width=600, height=500,
                   title_text='<b>Age-Specific Death Rates By All Race and Sex ,  2013-2017</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace1], layout=layout)
fig.show()

# --- Age-specific death rate, males vs. females (grouped bars) ---------------
trace2, trace3 = [
    go.Bar(
        x=age_death_rates['  Age at Death'],
        y=age_death_rates[column],
        text=age_death_rates[column],   # print the rate on each bar
        name=column,
        textposition='auto',
        marker_color=colour)
    for column, colour in (('All Races,Males', 'dodgerblue'),
                           ('All Races,Females', 'fuchsia'))
]
layout = go.Layout(barmode='group', template='plotly_white', width=650, height=500,
                   title_text='<b>Age-Specific Death Rates Between Sex, 2013-2017</b>',
                   font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))
fig = go.Figure(data=[trace2, trace3], layout=layout)
fig.show()

📌**Inference:** Deaths increase significantly above the age of 50. The highest death rates are recorded above the age of 70 in both males and females.

📝**Footnotes:** Rates are per 100,000 and are age-adjusted to the 2000 US Std Population (19 age groups - Census P25-1130)

In [None]:
# --- U.S. map coloured by the combined-sex death rate of each state ----------
fig = go.Figure(data=go.Choropleth(
    locations=state_death_rates['code'],      # two-letter state codes
    z=state_death_rates['Both Sex '],         # column name has a trailing space
    locationmode='USA-states',
    colorscale='brbg',
    colorbar_title="Death Rate",
    text=state_death_rates['State'],
))

# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig.update_layout(
    width=650, height=600,
    title_text='<b>Individual State Death Rates</b>',
    font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'),
    geo=dict(
        scope='usa',
        projection=go.layout.geo.Projection(type='albers usa'),
    ))

fig.show()

# --- Ten states with the highest death rate ----------------------------------
# Take the ten largest values, then re-sort ascending so that the horizontal
# bar chart lists the highest rate at the top.
top10_states = (
    state_death_rates
    .sort_values(by='Both Sex ', ascending=False)
    .head(10)
    .sort_values('Both Sex ', ascending=True)
)
fig = px.bar(top10_states,
             x='Both Sex ', y="State",
             title='<b>Top 10 States by Death Rates</b>',
             text='Both Sex ',
             orientation='h',
             width=700, height=500)
fig.update_traces(marker_color='khaki', opacity=0.8, textposition='inside')

fig.update_layout(template='plotly_white')
fig.show()

In [None]:
# Female death rates per state, negated so the bars extend to the left of zero
# in the pyramid chart. The values are hard-coded, one per row of
# state_death_rates -- presumably in the same row order; TODO confirm, and
# derive them from the data frame if it carries a female-rate column.
Female = np.array([-1.50,-1.21,-1.67,-1.35,-1.26,-1.97,-1.31,-1.69,-0.00,-1.48,-1.39,-0.77,-2.15,-1.45,
         -1.72,-1.88,-1.82,-1.87,-1.07,-1.93,-1.22,-1.73,-1.41,-1.60,-1.24,-1.59,-1.50,-1.52,-1.35,
         -1.84,-1.44,-1.44,-1.17,-1.54,-1.26,-1.71,-1.61,-1.73,-1.72,-1.62,-1.49,-1.51,-1.71,-1.19,
         -1.64,-1.67,-1.51,-1.74,-2.00,-1.46,-1.69])

data = [
    # Males: positive values, bars grow to the right.
    go.Bar(y=state_death_rates['State'],
           x=state_death_rates['Male'],
           orientation='h',
           name='Male',
           hoverinfo='x',
           marker=dict(color='dodgerblue')),
    # Females: negative x positions, but the hover text shows the positive rate.
    go.Bar(y=state_death_rates['State'],
           x=Female,
           orientation='h',
           name='Female',
           text=-1 * Female,
           hoverinfo='text',
           marker=dict(color='fuchsia')),
]

# The x axis is mirrored: tick positions are signed, tick labels are absolute values.
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
layout = go.Layout(
    width=650, height=700,
    template='plotly_white',
    yaxis=go.layout.YAxis(title='State'),
    xaxis=go.layout.XAxis(
        range=[-6, 6],
        tickvals=[-5.5,-4.5,-3.5, -2.5, -1.5, 0, 1.5, 2.5, 3.5, 4.5, 5.5],
        ticktext=[5.5,4.5,3.5, 2.5, 1.5, 0, 1.5, 2.5, 3.5, 4.5, 5.5],
        title='Death Rate'),
    barmode='overlay',
    bargap=0.1,
    title_text='<b>Pyramid Chart of States Death Rates Between Sex</b>',
    font=dict(family="Arial, Balto, Courier New, Droid Sans", color='black'))

py.iplot(dict(data=data, layout=layout))

📌**Inference:** You may think places with sunnier, hotter weather have more cases of skin cancer. This isn't necessarily the case. In fact, the Centers for Disease Control and Prevention notes California and Florida had fewer cases per 100,000 people than states with cooler climates, like Wyoming, Montana, and Idaho. 
<br>The states with the fewest cases of skin cancer are:<br>
* D.C.	
* Hawaii
* Louisiana
* Florida
* New York	
* Texas

📝**Footnotes:** US Mortality Files, National Center for Health Statistics, Centers for Disease Control and Prevention. Rates are per 100,000 and are age-adjusted to the 2000 US Std Population (19 age groups - Census P25-1130).

<br><hr></hr></br>

<h1 style="color:orangered; font-family:Georgia">Competition: SIIM-ISIC Melanoma Classification - EDA</h1>

### About the competition
In this competition, you'll identify melanoma in images of skin lesions. In particular, you'll use images within the same patient and determine which are likely to represent a melanoma. Using patient-level contextual information may help the development of image analysis tools, which could better support clinical dermatologists.

### What am I predicting?
You are predicting a binary target for each image. Your model should predict the probability (floating point) between 0.0 and 1.0 that the lesion in the image is malignant (the target). In the training data, train.csv, the value 0 denotes benign, and 1 indicates malignant.

### Dataset
The dataset consists of images in :

* DICOM format
* JPEG format in JPEG directory
* TFRecord format in tfrecords directory
* Additionally, there is a metadata comprising of train, test and submission file in CSV format.

### Columns
* image_name - unique identifier, points to filename of related DICOM image
* patient_id - unique patient identifier
* sex - the sex of the patient (when unknown, will be blank)
* age_approx - approximate patient age at time of imaging
* anatom_site_general_challenge - location of imaged site
* diagnosis - detailed diagnosis information (train only)
* benign_malignant - indicator of malignancy of imaged lesion
* target - binarized version of the target variable

### Evaluation Metric
Submissions are evaluated on [area under the ROC curve](https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc) between the predicted probability and the observed target.

## 1. Reading The Dataset

In [None]:
IMAGE_PATH = "../input/siim-isic-melanoma-classification/"

# The competition metadata tables live directly under IMAGE_PATH.
train_df = pd.read_csv(IMAGE_PATH + 'train.csv')
test_df = pd.read_csv(IMAGE_PATH + 'test.csv')

# Report the (rows, columns) shape of each table.
print(f'Training data shape:  {train_df.shape}')
print(' ')
print(f'Test data shape:  {test_df.shape}')

# Preview the first five rows of each table under a bold heading.
for heading, frame in (('**TRAINING DATA**', train_df),
                       ('**TEST DATA**', test_df)):
    bold(heading)
    display(frame.head(5))

## 2. Data Exploration

In [None]:
bold('**MISSING DATA AND DATA TYPES**')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
print('===== Train Set =====')
train_df.info()
print('\n')
# Fix: this second report used to repeat the train set under a "Train Set" label.
print('===== Test Set =====')
test_df.info()

bold('**TOTAL NUMBER OF IMAGES**')
print("Total images in Train set: ", train_df['image_name'].count())
print("Total images in Test set: ", test_df['image_name'].count())

bold('**UNIQUE IDs**')
# .count() gives the number of non-null ids (one per image row),
# .nunique() the number of distinct patients.
print('Total number of patient ids in train set ', train_df['patient_id'].count())
print('Total number of patient ids in test set ', test_df['patient_id'].count())
print('Unique ids in train set ', train_df['patient_id'].nunique())
print('Unique ids in test set ', test_df['patient_id'].nunique())

# Patients appearing in both splits would leak information between train and test.
ids_train = train_df.patient_id.values
ids_test = test_df.patient_id.values
patient_overlap = list(set(ids_train) & set(ids_test))
print('Patient IDs in both the training and test sets ', len(patient_overlap))


*There are some missing values in both the train and test sets.*

## 3. EDA

### 3.1 Target (binarized version)

In [None]:
# Count the images per target class and derive each class's share.
# rename_axis(...).reset_index(name=...) yields the columns ['target', 'count']
# on both pandas 1.x and 2.x; the previous reset_index()/rename({'index': ...})
# combination relied on pre-2.0 column naming. The share is computed from the
# same counts, so it lines up row by row instead of by index label.
target_freq = train_df['target'].value_counts()
target_count = target_freq.rename_axis('target').reset_index(name='count')
target_count['percent'] = np.round(target_freq / target_freq.sum(), 2).values

# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(target_count,
             x='target', y="count",
             title='<b>Distribution of the Target (binarized version)</b>',
             text='percent',
             orientation='v',
             width=500, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='yellowgreen',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*Target variable is highly imbalanced*

### 3.2 Gender distribution

In [None]:
# Count the images per sex and derive each category's share.
# rename_axis(...).reset_index(name=...) yields the columns ['sex', 'count'] on
# both pandas 1.x and 2.x; the previous version read the proportions from
# reset_index()['sex'], which holds the category labels on pandas >= 2.0.
sex_freq = train_df['sex'].value_counts()
sex_count = sex_freq.rename_axis('sex').reset_index(name='count')
sex_count['percent'] = np.round(sex_freq / sex_freq.sum(), 2).values

# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(sex_count,
             x='sex', y="count",
             title='<b>Distribution of the gender</b>',
             text='percent',
             orientation='v',
             width=500, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='lightsalmon',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

### 3.3 Gender by target

In [None]:
# Number of images for every (target, sex) combination.
temp = (
    train_df
    .groupby(['target', 'sex'])['target']
    .count()
    .to_frame('count')
    .reset_index()
)
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(temp,
             x="target", y="count",
             color='sex',
             barmode='group',
             title='<b>Distribution of the gender by target</b>',
             text='count',
             orientation='v',
             width=500, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*It seems the gender variable is equally distributed between the target classes.*

### 3.4 Distribution of patients age

In [None]:
# Histogram of the approximate patient age, with a box plot in the top margin.
# Title fix: the heading was copy-pasted from the gender chart
# ("Distribution of the gender by target") and its bold tag was never closed.
fig = px.histogram(train_df,
                   x="age_approx",
                   nbins=30,
                   barmode='group',
                   title='<b>Distribution of patients age</b>',
                   marginal="box",
                   width=600, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='palegoldenrod',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*Age variable is normally distributed* 

### 3.5 Distribution of patients age by gender

In [None]:
# Keep only the rows where both age and sex are known, then compare the age
# histograms of the two sexes (box plots in the top margin).
temp = train_df[['age_approx', 'sex']].dropna()
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.histogram(temp,
                   x="age_approx",
                   color='sex',
                   nbins=30,
                   barmode='group',
                   title='<b>Distribution of patients age by gender</b>',
                   marginal="box",
                   width=600, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

### 3.6 Distribution of benign malignant (target variable non binarized version)

In [None]:
# Count the images per benign/malignant label and derive each label's share.
# rename_axis(...).reset_index(name=...) yields ['benign_malignant', 'count'] on
# both pandas 1.x and 2.x; the previous version read the proportions from
# reset_index()['benign_malignant'], which holds the labels on pandas >= 2.0.
benign_freq = train_df['benign_malignant'].value_counts()
benign_count = benign_freq.rename_axis('benign_malignant').reset_index(name='count')
benign_count['percent'] = np.round(benign_freq / benign_freq.sum(), 2).values

# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(benign_count,
             x='benign_malignant', y="count",
             title='<b>Distribution of the benign malignant</b>',
             text='percent',
             orientation='v',
             width=500, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='moccasin',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*It seems the dataset has more benign tumors than malignant tumors. There are two main classifications of tumors: one is known as benign and the other as malignant. A benign tumor is a tumor that does not invade its surrounding tissue or spread around the body. A malignant tumor is a tumor that may invade its surrounding tissue or spread around the body. [source](https://study.com/academy/lesson/benign-vs-malignant-definition-characteristics-differences.html).*
![](https://charlottedermatologypa.com/new/wp-content/uploads/2016/08/melanoma.jpeg)

### 3.7 Distribution of patients age by benign malignant

In [None]:
# Keep only the rows where both age and label are known, then compare the age
# histograms of benign and malignant lesions (box plots in the top margin).
temp = train_df[['age_approx', 'benign_malignant']].dropna()
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.histogram(temp,
                   x="age_approx",
                   color='benign_malignant',
                   nbins=30,
                   barmode='group',
                   title='<b>Distribution of patients age by benign malignant</b>',
                   marginal="box",
                   width=600, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

### 3.8 Location of imaged site

In [None]:
# Count the images per imaged body site and derive each site's share.
# rename_axis(...).reset_index(name=...) yields
# ['anatom_site_general_challenge', 'count'] on both pandas 1.x and 2.x; the
# previous version read the proportions from a reset_index() column that holds
# the site labels on pandas >= 2.0.
anatom_freq = train_df['anatom_site_general_challenge'].value_counts()
anatom_count = (
    anatom_freq
    .rename_axis('anatom_site_general_challenge')
    .reset_index(name='count')
)
anatom_count['percent'] = np.round(anatom_freq / anatom_freq.sum(), 2).values

# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(anatom_count,
             x='anatom_site_general_challenge', y="count",
             title='<b>Distribution of the Location of imaged site</b>',
             text='percent',
             orientation='v',
             width=600, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='lightcoral',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*It seems the torso (the trunk of the human body) is the area most affected by melanoma, followed by the extremities of the body (upper/lower), with very few cases on the palms/soles and in the oral/genital areas.*

### 3.9 Location of imaged site by gender

In [None]:
# Number of images for every (body site, sex) combination.
temp = (
    train_df
    .groupby(['anatom_site_general_challenge', 'sex'])['anatom_site_general_challenge']
    .count()
    .to_frame('count')
    .reset_index()
)
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(temp,
             x="anatom_site_general_challenge", y="count",
             color='sex',
             barmode='group',
             title='<b>Distribution of the Location of imaged site by gender</b>',
             text='count',
             orientation='v',
             width=700, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*In men, melanoma is most commonly found on the back and other places on the trunk (from the shoulders to the hips) or the head and neck. The most common sites in women are the arms and the legs.*

### 3.10 Distribution of Diagnosis

In [None]:
# Count the images per detailed diagnosis and derive each diagnosis's share.
# rename_axis(...).reset_index(name=...) yields ['diagnosis', 'count'] on both
# pandas 1.x and 2.x; the previous version read the proportions from
# reset_index()['diagnosis'], which holds the labels on pandas >= 2.0.
diagnosis_freq = train_df['diagnosis'].value_counts()
diagnosis_count = diagnosis_freq.rename_axis('diagnosis').reset_index(name='count')
diagnosis_count['percent'] = np.round(diagnosis_freq / diagnosis_freq.sum(), 2).values

# The bars are annotated with the raw counts here (not the percentages).
# Title fix: the bold tag is now closed with </b> (it was a second <b>).
fig = px.bar(diagnosis_count,
             x='diagnosis', y="count",
             title='<b>Distribution of the diagnosis</b>',
             text='count',
             orientation='v',
             width=600, height=600)
fig.update_traces(opacity=0.8,
                  marker=dict(color='rebeccapurple',
                              line=dict(width=2, color='DarkSlateGrey')))

fig.update_layout(template='plotly_white')
fig.show()

*These are the different forms of melanoma skin cancer. Most of them are unknown, but a nevus is a type of mole that looks different from a common mole, and it is the most likely to convert into melanoma skin cancer. Lentigo has a lower rate of transformation to invasive melanoma than the other forms of melanoma.*
* Unknown refers to metastatic melanoma in lymph nodes, subcutaneous tissue, or visceral sites in the absence of a detectable primary tumour despite detailed examination

[Source](https://www.cancer.gov/types/skin/moles-fact-sheet)

## 4. Visualising Images
Looking at the images we can observe the moles that are irregular in color or shape are typically treated as candidates. To detect melanomas (and increase survival rates), it is recommended to learn to recognize them (see "ABCDE" mnemonic), to regularly examine moles for changes (shape, size, color, itching or bleeding) and to consult a qualified physician when a candidate appears.

### 4.1 Exploration of Images

In [None]:
bold("**Let's load an image and observe its various properties in general**")

images = train_df['image_name'].values
# os.path.join avoids the doubled slash that IMAGE_PATH + '/jpeg/train' produced
# (IMAGE_PATH already ends with '/').
img_dir = os.path.join(IMAGE_PATH, 'jpeg', 'train')

# Load the image at position 1. Only that one file name is built, instead of
# appending '.jpg' to the whole array just to pick a single element.
img = imageio.imread(os.path.join(img_dir, images[1] + '.jpg'))
plt.figure(figsize=(5, 5))
plt.imshow(img)
plt.show()

bold('**Observe Basic Properties of Image**')
print('Type of the image : ', type(img))
print('Shape of the image : {}'.format(img.shape))
print('Image Height {}'.format(img.shape[0]))
print('Image Width {}'.format(img.shape[1]))
print('Dimension of Image {}'.format(img.ndim))

bold('**calculate the size of an RGB image**')
print('Image size {}'.format(img.size))
print('Maximum RGB value in this image {}'.format(img.max()))
print('Minimum RGB value in this image {}'.format(img.min()))
# Channel values of the single pixel at row 100, column 50.
print('Value of only R channel {}'.format(img[100, 50, 0]))
print('Value of only G channel {}'.format(img[100, 50, 1]))
print('Value of only B channel {}'.format(img[100, 50, 2]))

In [None]:
bold('**view of random images in single channel**')
# Build the file names once; the previous version re-created the whole
# '<name>.jpg' array for each of the nine draws.
image_files = images + '.jpg'
# Nine independent draws (with replacement). No seed is set, so the
# selection changes from run to run.
random_images = [np.random.choice(image_files) for _ in range(9)]
plt.figure(figsize=(10, 8))

# Plot channel 0 of each drawn image on a 3x3 grid.
for position, file_name in enumerate(random_images, start=1):
    plt.subplot(3, 3, position)
    img = imageio.imread(os.path.join(img_dir, file_name))
    plt.imshow(img[:, :, 0], cmap='gray')

# Adjust subplot parameters to give specified padding
plt.tight_layout()

### 4.2 Images corresponding to target

In [None]:
# https://www.kaggle.com/gpreda/siim-isic-melanoma-classification-eda
def show_dicom_images(data):
    """Plot up to nine training DICOM images on a 3x3 grid.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to display. Each row must provide 'image_name' (used to build
        the '<image_name>.dcm' path under IMAGE_PATH/train) and 'diagnosis'
        (shown in the title). Rows beyond the ninth are ignored.

    Each panel title also shows Modality, PatientAge and PatientSex as read
    from the DICOM file itself.
    """
    img_data = data.to_dict('records')
    f, ax = plt.subplots(3,3, figsize=(16,18))
    axes = list(ax.flat)
    # Hide every frame up front so unused panels stay blank when fewer than
    # nine rows are passed.
    for axis in axes:
        axis.axis('off')
    # zip() stops at the shorter sequence, so more than nine rows cannot
    # index past the grid.
    for axis, data_row in zip(axes, img_data):
        imagePath = os.path.join(IMAGE_PATH, "train", data_row['image_name'] + '.dcm')
        # Parse the file once; the same dataset supplies metadata and pixels.
        data_row_img = dcm.dcmread(imagePath)
        modality = data_row_img.Modality
        age = data_row_img.PatientAge
        sex = data_row_img.PatientSex
        axis.imshow(data_row_img.pixel_array, cmap=plt.cm.gray)
        axis.set_title(f"ID: {data_row['image_name']}\nModality: {modality} Age: {age} Sex: {sex}\nDiagnosis: {data_row['diagnosis']}")
    plt.show()

In [None]:
bold('**Images with Malignant lesions**')
# Nine randomly drawn rows whose target is 1.
show_dicom_images(train_df.loc[train_df['target'] == 1].sample(n=9))

In [None]:
bold('**Images with benign lesions**')
# Nine randomly drawn rows whose target is 0.
show_dicom_images(train_df.loc[train_df['target'] == 0].sample(n=9))

### 4.3 Histograms
Histograms are a graphical representation showing how frequently various color values occur in the image, i.e. the frequency of pixel intensity values. In an RGB color space, pixel values range from 0 to 255, where 0 stands for black and 255 stands for white. Analysis of a histogram can help us understand the brightness, contrast and intensity distribution of an image. Now let's look at the histogram of a randomly selected sample from each category.

![](https://opencv-python-tutroals.readthedocs.io/en/latest/_images/histogram_sample.jpg)

### Histogram of Grayscale Image

In [None]:
# Split the metadata by label; `benign` and `malignant` are reused by later cells.
benign = train_df.loc[train_df['benign_malignant'] == 'benign']
malignant = train_df.loc[train_df['benign_malignant'] == 'malignant']

f = plt.figure(figsize=(16, 8))

# Top-left: first channel of the benign sample stored under index label 1.
plt.subplot(2, 2, 1)
benign_img = plt.imread(os.path.join(img_dir, benign.loc[1, 'image_name'] + '.jpg'))
plt.imshow(benign_img[..., 0], cmap='gray')
plt.colorbar()
plt.title('Benign Image', fontsize=15)

# Top-right: intensity histogram over every value in the benign image array.
plt.subplot(2, 2, 2)
_ = plt.hist(benign_img.ravel(), bins=256, range=[0, 256])

# Bottom-left: first channel of the malignant sample stored under index label 235.
plt.subplot(2, 2, 3)
malignant_img = plt.imread(os.path.join(img_dir, malignant.loc[235, 'image_name'] + '.jpg'))
plt.imshow(malignant_img[..., 0], cmap='gray')
plt.colorbar()
plt.title('Malignant Image', fontsize=15)

# Bottom-right: intensity histogram over every value in the malignant image array.
plt.subplot(2, 2, 4)
_ = plt.hist(malignant_img.ravel(), bins=256, range=[0, 256])

plt.tight_layout()
plt.show()

*Images are stored as pixel values; each pixel value represents a color intensity. Histograms are frequency distributions of the intensity values that occur in an image.* ***Here we can distinguish the narrow gray-level distribution of the benign image, between 150 and 200, from the wider spread of the malignant image, between 100 and 250.*** *We can examine this further with the histogram of a masked image.*

### Histogram of Mask image
What if you want to find histograms of some regions of an image? Just create a mask image with white color on the region you want to find histogram and black otherwise. 

In [None]:
# Decode the benign sample (index label 2) once; the same array feeds the
# mask, both histograms and the preview panel.
benign_file = benign['image_name'][2]+'.jpg'
img = plt.imread(os.path.join(img_dir, benign_file))

# create a mask: 255 inside the region of interest, 0 elsewhere
mask = np.zeros(img.shape[:2], np.uint8)
mask[200:800, 500:1400] = 255
masked_img = cv2.bitwise_and(img,img,mask = mask)

# Calculate histogram with mask and without mask
# Check third argument for mask
hist_full = cv2.calcHist([img],[0],None,[256],[0,256])
hist_mask = cv2.calcHist([img],[0],mask,[256],[0,256])

f = plt.figure(figsize=(10,5))

f.add_subplot(2,2, 1)
plt.imshow(img[ : , : , 0], cmap='gray')
plt.colorbar()
plt.title('Benign Image', fontsize=15)

f.add_subplot(2,2, 2)
plt.imshow(mask, cmap='gray')
plt.colorbar()
plt.title('Mask', fontsize=15)

f.add_subplot(2,2, 3)
plt.imshow(masked_img[ : , : , 0], cmap='gray')
plt.colorbar()
plt.title('Mask Image', fontsize=15)

f.add_subplot(2,2, 4)
# Label the curves so the plot is readable without the surrounding text.
plt.plot(hist_full, label='Full image')
plt.plot(hist_mask, label='Masked region')
plt.legend()
plt.title('Histogram', fontsize=15)

plt.tight_layout()
plt.show()

In [None]:
# Decode the malignant sample (index label 235) once; the same array feeds
# the mask, both histograms and the preview panel.
malignant_file = malignant['image_name'][235]+'.jpg'
img = plt.imread(os.path.join(img_dir, malignant_file))

# create a mask: 255 inside the region of interest, 0 elsewhere
mask = np.zeros(img.shape[:2], np.uint8)
mask[500:2000, 500:2500] = 255
masked_img = cv2.bitwise_and(img,img,mask = mask)

# Calculate histogram with mask and without mask
# Check third argument for mask
hist_full = cv2.calcHist([img],[0],None,[256],[0,256])
hist_mask = cv2.calcHist([img],[0],mask,[256],[0,256])

f = plt.figure(figsize=(10,5))

f.add_subplot(2,2, 1)
plt.imshow(img[ : , : , 0], cmap='gray')
plt.colorbar()
plt.title('Malignant Image', fontsize=15)

f.add_subplot(2,2, 2)
plt.imshow(mask, cmap='gray')
plt.colorbar()
plt.title('Mask', fontsize=15)

f.add_subplot(2,2, 3)
plt.imshow(masked_img[ : , : , 0], cmap='gray')
plt.colorbar()
plt.title('Mask Image', fontsize=15)

f.add_subplot(2,2, 4)
# Label the curves so the plot is readable without the surrounding text.
plt.plot(hist_full, label='Full image')
plt.plot(hist_mask, label='Masked region')
plt.legend()
plt.title('Histogram', fontsize=15)

plt.tight_layout()
plt.show()

***In the histogram plot, the blue line shows the histogram of the full image while the orange line shows the histogram of the masked region.*** *Here we can clearly see the distribution of the benign image between 180 and 220 on the gray scale, and the spread of the malignant image's distribution between 150 and 250.*

### Histogram of Color Image

In [None]:
# https://www.kaggle.com/parulpandey/melanoma-classification-eda-starter
f = plt.figure(figsize=(16,8))

# Left column: the colour images. No cmap or colorbar here, because matplotlib
# ignores `cmap` for RGB input and a colorbar would show a scale unrelated to
# the picture.
f.add_subplot(2,2, 1)
benign_img = benign['image_name'][1]+'.jpg'
benign_img = plt.imread(os.path.join(img_dir, benign_img))
plt.imshow(benign_img)
plt.title('Benign Image', fontsize=15)

# Right column: histogram of all values plus one per channel. A fixed 0-256
# range gives one bin per integer intensity, and each series carries its own
# label so the legend does not depend on drawing order.
f.add_subplot(2,2, 2)
_ = plt.hist(benign_img.ravel(), bins = 256, range = (0, 256), color = 'orange', alpha = 0.3, label = 'Total')
_ = plt.hist(benign_img[:, :, 0].ravel(), bins = 256, range = (0, 256), color = 'red', alpha = 0.5, label = 'Red_Channel')
_ = plt.hist(benign_img[:, :, 1].ravel(), bins = 256, range = (0, 256), color = 'Green', alpha = 0.5, label = 'Green_Channel')
_ = plt.hist(benign_img[:, :, 2].ravel(), bins = 256, range = (0, 256), color = 'Blue', alpha = 0.5, label = 'Blue_Channel')
_ = plt.xlabel('Intensity Value')
_ = plt.ylabel('Count')
_ = plt.legend()

f.add_subplot(2,2, 3)
malignant_img = malignant['image_name'][235]+'.jpg'
malignant_img = plt.imread(os.path.join(img_dir, malignant_img))
plt.imshow(malignant_img)
plt.title('Malignant Image', fontsize=15)

f.add_subplot(2,2, 4)
_ = plt.hist(malignant_img.ravel(), bins = 256, range = (0, 256), color = 'orange', alpha = 0.3, label = 'Total')
_ = plt.hist(malignant_img[:, :, 0].ravel(), bins = 256, range = (0, 256), color = 'red', alpha = 0.5, label = 'Red_Channel')
_ = plt.hist(malignant_img[:, :, 1].ravel(), bins = 256, range = (0, 256), color = 'Green', alpha = 0.5, label = 'Green_Channel')
_ = plt.hist(malignant_img[:, :, 2].ravel(), bins = 256, range = (0, 256), color = 'Blue', alpha = 0.5, label = 'Blue_Channel')
_ = plt.xlabel('Intensity Value')
_ = plt.ylabel('Count')
_ = plt.legend()

plt.tight_layout()
plt.show()

## 5. Explore DICOM files
A DICOM file is an image saved in the Digital Imaging and Communications in Medicine (DICOM) format. It contains an image from a medical scan, such as an ultrasound or MRI. DICOM files may also include identification data for patients so that the image is linked to a specific individual. [source](https://fileinfo.com/extension/dicom#:~:text=A%20DICOM%20file%20is%20an,linked%20to%20a%20specific%20individual.)

<br>[1] https://pydicom.github.io/pydicom/stable/auto_examples/input_output/plot_read_dicom.html</br>
<br>[2] https://www.kaggle.com/gpreda/siim-isic-melanoma-classification-eda</br>
<br>[3] https://www.kaggle.com/schlerp/getting-to-know-dicom-and-the-data</br>

In [None]:
def show_dcm_info(dataset, file_path=None):
    """Print a short textual summary of a DICOM dataset.

    Parameters
    ----------
    dataset : pydicom dataset (or any object exposing the same attributes)
        Source of the metadata. Tags that are absent are reported as 'N/A'
        instead of raising AttributeError.
    file_path : str, optional
        Path shown on the "Filename" line. When omitted, `dataset.filename`
        is used if present, otherwise 'unknown'. The function no longer
        depends on a notebook-level `file_path` variable existing.
    """
    if file_path is None:
        file_path = getattr(dataset, 'filename', 'unknown')

    print("Filename.........:", file_path)
    print("Storage type.....:", getattr(dataset, 'SOPClassUID', 'N/A'))
    print()

    pat_name = getattr(dataset, 'PatientName', None)
    if pat_name is None:
        display_name = 'N/A'
    else:
        display_name = pat_name.family_name + ", " + pat_name.given_name
    print("Patient's name......:", display_name)
    print("Patient id..........:", getattr(dataset, 'PatientID', 'N/A'))
    print("Patient's Age.......:", getattr(dataset, 'PatientAge', 'N/A'))
    print("Patient's Sex.......:", getattr(dataset, 'PatientSex', 'N/A'))
    print("Modality............:", getattr(dataset, 'Modality', 'N/A'))
    print("Body Part Examined..:", getattr(dataset, 'BodyPartExamined', 'N/A'))

    # Image geometry is only reported when the dataset actually carries pixels.
    if 'PixelData' in dataset:
        rows = int(dataset.Rows)
        cols = int(dataset.Columns)
        print("Image size.......: {rows:d} x {cols:d}, {size:d} bytes".format(
            rows=rows, cols=cols, size=len(dataset.PixelData)))
        if 'PixelSpacing' in dataset:
            print("Pixel spacing....:", dataset.PixelSpacing)

In [None]:
def plot_pixel_array(dataset, figsize=(5,5)):
    """Show `dataset.pixel_array` in a new figure of size `figsize`, with grid lines off."""
    canvas = plt.figure(figsize=figsize)
    panel = canvas.gca()
    panel.grid(False)
    panel.imshow(dataset.pixel_array)
    plt.show()
    
num_to_plot = 5
dicom_dir = '../input/siim-isic-melanoma-classification/train/'

# os.listdir returns entries in arbitrary order; sorting makes the preview
# reproducible, and the extension filter skips anything that is not a DICOM file.
dicom_files = sorted(name for name in os.listdir(dicom_dir) if name.endswith('.dcm'))

# Slicing replaces the manual counter/break: at most `num_to_plot` files are shown.
for file_name in dicom_files[:num_to_plot]:
    # Keep the notebook-level name `file_path`; other cells may refer to it.
    file_path = os.path.join(dicom_dir, file_name)
    dataset = dcm.dcmread(file_path)
    show_dcm_info(dataset)
    plot_pixel_array(dataset)

## 6. tSNE Embeddings 
* t-SNE stands for t-distributed stochastic neighbor embedding. It is a technique for dimensionality reduction that is best suited for the visualization of high-dimensional datasets. t-SNE is a randomized algorithm, i.e. every time we run the algorithm it returns slightly different results on the same dataset. To control this we set a random state with some arbitrary value. The random state is used here to seed the cost function of the algorithm.

In [None]:
%%time
# https://www.kaggle.com/tunguz/melanoma-tsne-and-umap-embeddings-with-rapids/?

# INSTALL RAPIDS OFFLINE (FROM KAGGLE DATASET). TAKES 1 MINUTE :-)
# This cell must run before the later cell that imports `cuml`.
import sys
# Copy the RAPIDS 0.13.0 archive from the attached input dataset into the conda envs directory.
!cp ../input/rapids/rapids.0.13.0 /opt/conda/envs/rapids.tar.gz
# Unpack the archive in place (verbose output lists every extracted file).
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz
# Put the unpacked environment's lib, python3.6 and site-packages directories at the front of sys.path.
sys.path = ["/opt/conda/envs/rapids/lib"] + ["/opt/conda/envs/rapids/lib/python3.6"] + ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
# Copy the bundled libxgboost.so into the base conda lib directory.
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
# cuML estimators; importable only after the RAPIDS install cell has run.
from cuml.decomposition import PCA
from cuml.manifold import TSNE

# Load the resized image array and flatten every sample to 32*32*3 values.
train = np.load('../input/siimisic-melanoma-resized-images/x_train_32.npy')
train = np.reshape(train, (len(train), 32 * 32 * 3))
train.shape

In [None]:
# Project the flattened images onto two principal components and time the fit.
time_start = time.time()
pca = PCA(n_components=2)
pca_2D = pca.fit_transform(train.astype(np.float32))

# Share of variance captured by each component, then a spacer line.
print(pca.explained_variance_ratio_, ' ', sep='\n')

elapsed = time.time() - time_start
print('PCA done! Time elapsed: {} seconds'.format(elapsed))

In [None]:
# Separate the two projected coordinates for plotting.
pca_2D_one, pca_2D_two = pca_2D[:, 0], pca_2D[:, 1]

plt.figure(figsize=(16, 10))
# Colour each point by its target label.
ax = sns.scatterplot(
    x=pca_2D_one,
    y=pca_2D_two,
    hue=train_df['target'].values,
    palette=sns.color_palette("Paired", 2),
    legend="full",
    alpha=0.7,
)
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)
plt.show()

By using the attribute `explained_variance_ratio_`, you can see that the first principal component contains 70.27% of the variance and the second principal component contains 0.07% of the variance. Together, the two components contain 70.34% of the information. From the graph we can say that the image pixel data does not show a clear clustering structure.

Let's now take a look at what TSNE can discern.

In [None]:
# t-SNE is randomized; fixing the seed keeps the embedding comparable between
# runs, as the section introduction describes.
time_start = time.time()
tsne = TSNE(n_components=2, random_state=42)
# Cast to float32 explicitly, matching the input given to PCA above.
tsne_2D = tsne.fit_transform(train.astype(np.float32))
print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

In [None]:
# Separate the two embedding coordinates for plotting.
tsne_2D_one, tsne_2D_two = tsne_2D[:, 0], tsne_2D[:, 1]

plt.figure(figsize=(16, 10))
# Colour each point by its target label.
ax = sns.scatterplot(
    x=tsne_2D_one,
    y=tsne_2D_two,
    hue=train_df['target'].values,
    palette=sns.color_palette("Set2", 2),
    legend="full",
    alpha=0.7,
)
ax.set_xlabel('t-SNE 1', fontsize=15)
ax.set_ylabel('t-SNE 2', fontsize=15)
ax.set_title('2 component t-SNE', fontsize=20)
plt.show()

This is already a significant improvement over the PCA visualisation we used earlier. We see that the image pixel data definitely has some clustering structure. At first glance it doesn't seem easy to clearly separate the target cases, partly because they comprise less than 2% of all the points. However, it seems that most of them are concentrated in the lower areas.

## Thank you for reading my kernel
## Please give me your feedback; if you like my kernel, an UPVOTE will be appreciated.