# Contents

# Loading train and test data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from os import listdir
import time
import sys

from pydicom import dcmread
import matplotlib.pyplot as plt
import matplotlib.colors
import seaborn as sns

from scipy.stats import kurtosis, skew, mode
from scipy import stats


from PIL import Image
import cv2
from skimage import color

# Root directory of the competition data; later cells append file names
# (e.g. 'train.csv', 'jpeg/train/<image>.jpg') to this prefix, so the
# trailing slash is required.
path = "/kaggle/input/siim-isic-melanoma-classification/"

In [None]:
# Read the training metadata table and display it as the cell output.
train_csv = path + "train.csv"
train_df = pd.read_csv(train_csv)
train_df

In [None]:
# Read the test metadata table and display it as the cell output.
test_csv = path + "test.csv"
test_df = pd.read_csv(test_csv)
test_df

As we see above, we have 33126 images in the train set, and 10982 images in the test set. There are 8 columns in the train dataframe, whereas there are 5 columns in the test dataframe. The three columns "diagnosis", "benign_malignant", and "target" are the target columns.

In [None]:
# Number of train rows for each distinct value of the "target" column.
train_df["target"].value_counts()

In [None]:
# Number of train rows for each distinct value of "benign_malignant".
train_df["benign_malignant"].value_counts()

In [None]:
# Number of train rows for each distinct value of "diagnosis".
train_df["diagnosis"].value_counts()

In [None]:
# Rows where all three label columns agree on a positive melanoma case.
is_melanoma = train_df["diagnosis"] == "melanoma"
is_malignant = train_df["benign_malignant"] == "malignant"
is_positive = train_df["target"] == 1
train_df[is_melanoma & is_malignant & is_positive]

As we see above, a "target" value of 1 only appears when the "benign_malignant" column is "malignant" and the "diagnosis" column is "melanoma". There are 584 such cases, whereas the remaining 32542 cases are marked as 0 in the "target" column. There is a big imbalance between the target values: targets with a value of 1 constitute only 584/33126, about 1.8% of the train set.

We also notice that the majority of the "diagnosis" values fall into the "unknown" class, with a ratio of 27124/33126, about 82%.

In [None]:
# Count of distinct image names in the train table.
train_df["image_name"].nunique()

In [None]:
# Count of distinct image names in the test table.
test_df["image_name"].nunique()

"image_name"s are unique.

In [None]:
# Count of distinct patient ids in the train table.
train_df["patient_id"].nunique()

In [None]:
# Count of distinct patient ids in the test table.
test_df["patient_id"].nunique()

There are only 2056 patients in the train set and 690 patients in the test set, which means that patients have several images in each set.

In [None]:
# Number of train rows (images) per patient id, most frequent first.
train_df["patient_id"].value_counts()

In [None]:
# Number of test rows (images) per patient id, most frequent first.
test_df["patient_id"].value_counts()

The maximum number of images per patient is 115 in the train set and 240 in the test set, whereas the minimum is 2 in the train set and 3 in the test set.

In [None]:
# Distinct "age_approx" values in the train table, in ascending order.
a = np.sort(train_df["age_approx"].unique())
a

In [None]:
# Shape of the subset of train rows whose "age_approx" is missing.
train_age_missing = train_df["age_approx"].isna()
train_df[train_age_missing].shape

In [None]:
# Distinct "age_approx" values in the test table, in ascending order.
a = np.sort(test_df["age_approx"].unique())
a

In [None]:
# Shape of the subset of test rows whose "age_approx" is missing.
test_age_missing = test_df["age_approx"].isna()
test_df[test_age_missing].shape

The ages are in 5-year bins, ranging over 0-90 in the train set and 10-90 in the test set.
There are 68 null values in the "age_approx" column of the train set, whereas there aren't any in the test set.

The distribution graphs of ages in the datasets are as follows:

In [None]:
# Side-by-side age distributions for the train (left) and test (right) tables,
# drawn from the non-null "age_approx" values only.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; consider
# sns.histplot(..., kde=True) once the environment's seaborn version allows it.
fig, ax = plt.subplots(1, 2)

age_panels = (
    (train_df, "distribution of age in train", "#992299"),
    (test_df, "distribution of age in test", "#ee2200"),
)
for axis, (frame, title, colour) in zip(ax, age_panels):
    known_ages = frame["age_approx"].dropna()
    sns.distplot(known_ages, ax=axis, color=colour)
    axis.set_title(title)

fig.set_size_inches(10, 3)

In [None]:
# Relative frequency of each "sex" value in the train table.
train_df["sex"].value_counts(normalize=True)

In [None]:
# Relative frequency of each "sex" value in the test table.
test_df["sex"].value_counts(normalize=True)

Test dataset has more females than train dataset.

"anatom_site_general_challenge" column has the values distributed like:

In [None]:
# Relative frequency of each anatomical site in the train table.
train_df["anatom_site_general_challenge"].value_counts(normalize=True)

In [None]:
# Relative frequency of each anatomical site in the test table.
test_df["anatom_site_general_challenge"].value_counts(normalize=True)

In [None]:
# Bar charts of image counts per anatomical site, train (left) vs. test
# (right), with the exact count written above every bar.
fig, ax = plt.subplots(1, 2, figsize=(20, 8))

# (source table, panel title, vertical gap between a bar's top and its label)
site_panels = (
    (train_df, "anatomical sites in train", 200),
    (test_df, "anatomical sites in test", 100),
)

for axis, (frame, title, label_gap) in zip(ax, site_panels):
    site_counts = frame["anatom_site_general_challenge"].value_counts()

    bars = sns.barplot(x=site_counts.index, y=site_counts.values, ax=axis)
    axis.set_title(title)

    # Take each count from items() instead of indexing the string-labelled
    # Series with an integer: that positional fallback is deprecated in
    # pandas and raises KeyError once it is removed.
    for position, (_site, count) in enumerate(site_counts.items()):
        bars.text(position, count + label_gap, count, color='black', ha="center")

As we see above, the distributions of anatomical sites in the train and test datasets are similar. The majority of the images are from the torso, and the lowest number of images are from oral/genital sites.

Now, let's see how many null values are present in the columns:

In [None]:
# Print, for every train column, how many of its entries are null.
for col in train_df.columns:
    n_missing = int(train_df[col].isna().sum())
    print(col, ":", n_missing)

In [None]:
# Print, for every test column, how many of its entries are null.
for col in test_df.columns:
    n_missing = int(test_df[col].isna().sum())
    print(col, ":", n_missing)

# Images

Let's have a look at some images:

In [None]:
# Grid of sample images: one row per distinct "diagnosis" value, showing up
# to the first five train images of that class.
image_classes = train_df["diagnosis"].unique()
fig, ax = plt.subplots(len(image_classes), 5, figsize=(50, 50))

for row, imclass in enumerate(image_classes):
    in_class = train_df["diagnosis"] == imclass
    sample_names = train_df.loc[in_class, "image_name"].values[:5]

    for column, image_name in enumerate(sample_names):
        bgr_image = cv2.imread(f"{path}jpeg/train/{image_name}.jpg")
        # Convert BGR -> RGB before handing the pixels to matplotlib.
        rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
        ax[row, column].imshow(rgb_image)

        # Title the middle column of each row, and every column for rows
        # with index above 6.
        if column == 2 or row > 6:
            ax[row, column].set_title(label=imclass, fontdict={'fontsize': 50})