# Melanoma Image Classification

### Objective: 
The purpose of this project is to identify melanoma in images of skin lesions. In particular, we need to create a model that predicts the probability that a lesion is malignant rather than benign.

### Data Overview:
**The dataset consists of images in:**
- DICOM format
- JPEG format in the JPEG directory
- TFRecord format in the tfrecords directory
- Metadata comprising train, test and submission files in CSV format.

In [None]:
# Import libraries 
import numpy as np 
import pandas as pd
import missingno as msno
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Silence every warning for the rest of the session to keep cell output compact
import warnings
warnings.filterwarnings(action='ignore')

import os
# Print the full path of every file found under the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

In [None]:
# Read the three metadata CSV files into DataFrames
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/siim-isic-melanoma-classification/train.csv',
        '../input/siim-isic-melanoma-classification/test.csv',
        '../input/siim-isic-melanoma-classification/sample_submission.csv',
    )
)

### Data cleaning

In [None]:
# Give the columns shorter names (assigned by position).
# The test set receives only the first five names.
new_names = [
    'image_name', 'patient_ID', 'sex', 'age',
    'anatomy', 'diagnosis', 'benign_malignant', 'target',
]
shared_column_count = 5
train.columns, test.columns = new_names, new_names[:shared_column_count]

In [None]:
# Print a concise summary of each DataFrame.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own line rather than wrapped in print() (which
# would append a stray "None" after each summary).
print('Train:')
train.info()
print('\nTest:')
test.info()

In [None]:
# Report the (rows, columns) shape of each DataFrame
train_shape, test_shape = train.shape, test.shape
print(f'Train: {train_shape!s}')
print(f'Test : {test_shape!s}')

In [None]:
# Show the column labels of each DataFrame
train_columns, test_columns = train.columns, test.columns
print(f'Train: {train_columns!s} \n')
print(f'Test: {test_columns!s}')

In [None]:
# Preview the first five rows of the training metadata
train.head(n=5)

In [None]:
# Preview the first five rows of the test metadata
test.head(n=5)

In [None]:
# Show the dtype of every column in each DataFrame
train_dtypes, test_dtypes = train.dtypes, test.dtypes
print(f'Train: {train_dtypes!s} \n')
print(f'Test: {test_dtypes!s}')

In [None]:
# Compare the number of non-null patient IDs with the number of distinct IDs
patient_ids = train['patient_ID']
total_ids = patient_ids.count()
unique_ids = len(patient_ids.value_counts())
print(f"The total patient IDs are {total_ids}, from those the unique IDs are {unique_ids} ")

The number of unique patients is less than the total number of records, which means that several patients have multiple records.

In [None]:
# Number of non-null image names in each split (train and test)
train_image_count = train['image_name'].count()
test_image_count = test['image_name'].count()
print("Total images in Train set: ", train_image_count)
print("Total images in Test set: ", test_image_count)

### Missing Data:

In [None]:
# Count the missing values per column in each DataFrame
train_missing = train.isna().sum()
test_missing = test.isna().sum()
print('Train:', train_missing, '\n')
print('Test:', test_missing)

In [None]:
# Draw a missingno nullity matrix for each split, side by side

f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(16, 6))

panels = (
    (train, ax1, 'Train Missing Values'),
    (test, ax2, 'Test Missing Values'),
)
for frame, axis, title in panels:
    msno.matrix(frame, ax=axis, fontsize=10)
    axis.set_title(title, fontsize=16)

**Train**:
- `sex`: 65 missing values (0.2% of the total data)
- `age`: 68 missing values (correspond with sex missingness)
- `anatomy`: 527 missing values (1.59% of the total data)

**Test**:
- `anatomy`: 351 missing values (3.1% of the total data)

## Dealing with the missing data

### Train: SEX Variable

In [None]:
# Frequency of each recorded sex (missing values are excluded)
train['sex'].value_counts(dropna=True)

In [None]:
# Most frequent value(s) of `sex`, ignoring missing entries
train['sex'].mode(dropna=True)

In [None]:
# Impute the missing values with the mode.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update `train` when pandas Copy-on-Write is active.
sex_mode = train['sex'].mode()[0]
train['sex'] = train['sex'].fillna(sex_mode)

### Train: AGE Variable

In [None]:
# Median and mean of `age`, truncated to whole numbers by int()
age_median, age_mean = (
    int(stat) for stat in (train['age'].median(), train['age'].mean())
)
print('Median:', age_median)
print('Mean:', age_mean)

The mean and median of the `age` variable have the same value of 50, while the mode is 45. The distribution is approximately normal, so we'll use the MEDIAN to impute. 

In [None]:
# Replace every missing age with the median computed above
age_is_missing = train['age'].isna()
train.loc[age_is_missing, 'age'] = age_median

### Train & Test: ANATOMY Variable

In [None]:
# Fill the missing data in anatomy with 'unknown'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not
# update the DataFrame when pandas Copy-on-Write is active.
train['anatomy'] = train['anatomy'].fillna('unknown')
test['anatomy'] = test['anatomy'].fillna('unknown')

## Data Exploration

In [None]:
# The sex in each dataset: one count plot per split, side by side.
# The column is passed through the `x=`/`data=` keywords; recent seaborn
# releases treat the first positional argument as `data`, not `x`.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
for axis, frame, split_name in zip(ax, (train, test), ('Train', 'Test')):
    sns.countplot(x='sex', data=frame, ax=axis)
    axis.set_title(f'sex count of {split_name}')

In [None]:
# Number of records for every (target, sex) combination, shown as a
# colour-graded table
counts_by_target_sex = train.groupby(['target', 'sex'])['benign_malignant'].count()
target_sex = counts_by_target_sex.reset_index()
target_sex.style.background_gradient()

In [None]:
# Bar chart of the (target, sex) counts computed in `target_sex`
sns.catplot(
    data=target_sex,
    kind='bar',
    x='target',
    y='benign_malignant',
    hue='sex',
)
plt.xlabel('benign:0 vs malignant:1')
plt.ylabel('Count');

In [None]:
# Age distribution: one count plot per split, side by side.
# The column is passed through the `x=`/`data=` keywords; recent seaborn
# releases treat the first positional argument as `data`, not `x`.
fig, ax = plt.subplots(1, 2, figsize=(17, 6))
for axis, frame, split_name in zip(ax, (train, test), ('Train', 'Test')):
    sns.countplot(x='age', data=frame, ax=axis)
    # Tilt the tick labels so neighbouring age values do not overlap
    axis.set_xticklabels(axis.get_xticklabels(), rotation=50)
    axis.set_title(f'{split_name} age distribution')

In [None]:
# Count the anatomy location (imaged site) for each dataset.
# The column is passed through the `x=`/`data=` keywords; recent seaborn
# releases treat the first positional argument as `data`, not `x`.
fig, ax = plt.subplots(1, 2, figsize=(17, 6))
for axis, frame, split_name in zip(ax, (train, test), ('Train', 'Test')):
    # Alphabetical bar order, requested explicitly via `order=` instead of
    # sorting the column values first; dropna() keeps sorted() safe if any
    # missing value is still present.
    site_order = sorted(frame['anatomy'].dropna().unique())
    sns.countplot(x='anatomy', data=frame, order=site_order, ax=axis)
    axis.set_xticklabels(axis.get_xticklabels(), rotation=50)
    axis.set_title(f'{split_name} imaged site')

In [None]:
# sex vs. anatomy: number of records for every (sex, anatomy) combination
sex_anatomy = train.groupby(['sex', 'anatomy'])['benign_malignant'].count().to_frame().reset_index()
sns.catplot(x='anatomy', y='benign_malignant', hue='sex', data=sex_anatomy, kind='bar')
plt.xlabel('Location of imaged site')
plt.xticks(rotation=90, fontsize=10)
# The count covers every record in the group (benign and malignant alike),
# so the axis is labelled as an image count rather than as melanoma cases.
plt.ylabel('Count of images');

In [None]:
# Count benign and malignant records.
# The column is passed through the `x=`/`data=` keywords (recent seaborn
# releases treat the first positional argument as `data`), and the plot is
# drawn explicitly on the axes created here.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='benign_malignant', data=train, ax=ax);

## Images Visualization

In [None]:
# Visualizing a random selection of images

images = train['image_name'].values

# Draw up to 9 distinct image names in a single call: sampling without
# replacement avoids showing the same image twice, and the '.jpg' suffix is
# appended only to the sampled names instead of to the whole array on
# every draw.
n_samples = min(9, len(images))
random_images = [
    name + '.jpg'
    for name in np.random.choice(images, size=n_samples, replace=False)
]

# Location of the image directory (os.path.join avoids a doubled '/')
IMAGE_PATH = "../input/siim-isic-melanoma-classification/"
img_dir = os.path.join(IMAGE_PATH, 'jpeg', 'train')

print('Display Random Images')

# Adjust the size of your images
plt.figure(figsize=(10,8))

# Iterate and plot the sampled images on a 3x3 grid
for i, image_file in enumerate(random_images):
    plt.subplot(3, 3, i + 1)
    img = plt.imread(os.path.join(img_dir, image_file))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

# Adjust subplot parameters to give specified padding
plt.tight_layout()

In [None]:
# Visualizing Images with benign lesions

# Split the training metadata by label (`malignant` is used by a later cell)
benign = train[train['benign_malignant']=='benign']
malignant = train[train['benign_malignant']=='malignant']

images = benign['image_name'].values

# Draw up to 9 distinct image names in a single call: sampling without
# replacement avoids showing the same image twice, and the '.jpg' suffix is
# appended only to the sampled names.
n_samples = min(9, len(images))
random_images = [
    name + '.jpg'
    for name in np.random.choice(images, size=n_samples, replace=False)
]
# os.path.join avoids a doubled '/' when IMAGE_PATH ends with a separator
img_dir = os.path.join(IMAGE_PATH, 'jpeg', 'train')

print('Display benign Images')

plt.figure(figsize=(10,8))
for i, image_file in enumerate(random_images):
    plt.subplot(3, 3, i + 1)
    img = plt.imread(os.path.join(img_dir, image_file))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

plt.tight_layout()

In [None]:
# Visualizing Images with Malignant lesions

images = malignant['image_name'].values

# Draw up to 9 distinct image names in a single call: sampling without
# replacement avoids showing the same image twice, and the '.jpg' suffix is
# appended only to the sampled names.
n_samples = min(9, len(images))
random_images = [
    name + '.jpg'
    for name in np.random.choice(images, size=n_samples, replace=False)
]
# os.path.join avoids a doubled '/' when IMAGE_PATH ends with a separator
img_dir = os.path.join(IMAGE_PATH, 'jpeg', 'train')

print('Display malignant Images')

plt.figure(figsize=(10,8))
for i, image_file in enumerate(random_images):
    plt.subplot(3, 3, i + 1)
    img = plt.imread(os.path.join(img_dir, image_file))
    plt.imshow(img, cmap='gray')
    plt.axis('off')

plt.tight_layout()