In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

In [None]:
# Constants
# Every location is derived from the dataset root `path`, so the root
# directory is spelled out only once.
path = '../input/siim-isic-melanoma-classification'
jpeg_path = f'{path}/jpeg'
test_images_path = f'{path}/test'
train_images_path = f'{path}/train'
tf_records_path = f'{path}/tfrecords'
train_csv_path = f'{path}/train.csv'
test_csv_path = f'{path}/test.csv'

In [None]:
# Read data
# Load the train metadata table, then report its shape and first rows.
train_csv = pd.read_csv(train_csv_path)
print(f"Train: {train_csv.shape}", train_csv.head(), sep="\n")

# Load the test metadata table and report its shape.
test_csv = pd.read_csv(test_csv_path)
print(f"Test: {test_csv.shape}")

# Show the column index of each table, one report per line.
column_reports = (
    f"Train columns {train_csv.columns}",
    f"Test columns {test_csv.columns}",
)
print(*column_reports, sep="\n")

In [None]:
# Let's check the target distribution
# Count how often each value of the `target` column occurs.
target_counts = train_csv["target"].value_counts()

# Blank line, heading, then the counts themselves.
print("", "Target distribution", target_counts, sep="\n")

# Bar chart of the counts, drawn on an explicitly created Axes.
fig, ax = plt.subplots()
sns.barplot(x=target_counts.index, y=target_counts.values, ax=ax)
ax.set_xlabel("Targets")
ax.set_ylabel("Counts")
plt.show()

In [None]:
# Sex
# Frequency of each value in the `sex` column.
sex_counts = train_csv["sex"].value_counts()
print("Sex distribution")
print(sex_counts)

# barplot returns the Axes it drew on; label it directly.
sex_ax = sns.barplot(x=sex_counts.index, y=sex_counts.values)
sex_ax.set_xlabel("Sex")
sex_ax.set_ylabel("Counts")
plt.show()

In [None]:
# Age
# Frequency of each value in the `age_approx` column.
age_counts = train_csv["age_approx"].value_counts()

# Use a larger (10x10) canvas for this chart.
age_fig, age_ax = plt.subplots(figsize=(10, 10))
sns.barplot(x=age_counts.index, y=age_counts.values, ax=age_ax)
age_ax.set_xlabel("Age")
age_ax.set_ylabel("Counts")
plt.show()

In [None]:
# Anatom Site
# Frequency of each value in the `anatom_site_general_challenge` column.
anatom_site_counts = train_csv["anatom_site_general_challenge"].value_counts()
print("Anatom Site distribution", anatom_site_counts, sep="\n")

site_ax = sns.barplot(x=anatom_site_counts.index, y=anatom_site_counts.values)
# Rotate the tick labels so the site names do not overlap.
plt.xticks(rotation=90)
site_ax.set_xlabel("Anatom Sites")
site_ax.set_ylabel("Counts")
plt.show()

In [None]:
# Diagnosis
# Frequency of each value in the `diagnosis` column.
diagnosis_counts = train_csv["diagnosis"].value_counts()
for report_line in ("Diagnosis distribution", diagnosis_counts):
    print(report_line)

diagnosis_fig, diagnosis_ax = plt.subplots()
sns.barplot(x=diagnosis_counts.index, y=diagnosis_counts.values, ax=diagnosis_ax)
# Rotate the tick labels so the diagnosis names do not overlap.
plt.xticks(rotation=90)
diagnosis_ax.set_xlabel("Diagnosis")
diagnosis_ax.set_ylabel("Counts")
plt.show()

In [None]:
# Type
# Frequency of each value in the `benign_malignant` column.
benign_malignant_counts = train_csv["benign_malignant"].value_counts()
print("benign_malignant distribution")
print(benign_malignant_counts)

type_labels = benign_malignant_counts.index
type_totals = benign_malignant_counts.values
type_ax = sns.barplot(x=type_labels, y=type_totals)
type_ax.set_xlabel("benign or malignant")
type_ax.set_ylabel("Counts")
plt.show()

In [None]:
# Let's look at some images
def show_sample_images(image_names, n_rows=2, n_cols=5, figsize=(20, 8)):
    """Show the first ``n_rows * n_cols`` training JPEGs in a grid.

    Parameters
    ----------
    image_names : sequence of str
        Image identifiers without the ``.jpg`` extension; each file is read
        from ``{jpeg_path}/train/{name}.jpg``.
    n_rows, n_cols : int
        Grid dimensions; at most ``n_rows * n_cols`` images are drawn.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.

    If fewer names than grid cells are supplied, the remaining panels are
    left blank instead of showing empty framed axes.
    """
    # squeeze=False keeps a 2-D array of axes even for 1xN or Nx1 grids,
    # so flatten() always works.
    _, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    axs = axs.flatten()

    # Hide every axis up front so panels without an image stay blank.
    for ax in axs:
        ax.axis('off')

    for f_name, ax in zip(image_names[:len(axs)], axs):
        # The context manager releases the file handle as soon as the
        # image has been handed to matplotlib.
        with Image.open(f"{jpeg_path}/train/{f_name}.jpg") as img:
            ax.imshow(img)
    plt.show()


print("Samples with Melanoma")
imgs = train_csv[train_csv.target == 1]['image_name'].values
show_sample_images(imgs)

print("Samples without Melanoma")
imgs = train_csv[train_csv.target == 0]['image_name'].values
show_sample_images(imgs)