# Preparation before creating predictive model, EDA

## To prepare for building a melanoma classification model, I check the contents of the training and test data sets and perform exploratory data analysis (EDA).

# Libraries

In [None]:
# data processing
import numpy as np 
import pandas as pd 

# basic
import glob
import os
import time
import gc
import sys

# visualization
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use("fivethirtyeight")

# Open CV
import cv2

# dicom data
import pydicom

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

In [None]:
# Fix the seed of NumPy's global random generator; intended to make the random
# sampling done later in this notebook repeatable.
np.random.seed(100)

# Data loading

### Table data

In [None]:
# Root folder of the competition data; later cells build file paths from it.
path = '/kaggle/input/siim-isic-melanoma-classification/'

# Read the three metadata tables in one pass: train, test and the submission template.
train, test, sample = (
    pd.read_csv(os.path.join(path, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

## Check of table data

In [None]:
# Report (rows, columns) of the training table, then preview its first rows.
print("train data size:" + str(train.shape))
train.head()

In [None]:
# Report (rows, columns) of the test table, then preview its first rows.
print("test data size:" + str(test.shape))
test.head()

In [None]:
# Submission data format
# Preview the first rows of the sample submission table loaded above.
sample.head()

In [None]:
# Unique data
# List the distinct category values of the columns inspected below.
banner = "*" * 10

print(banner, "train data", banner)
for column in ("anatom_site_general_challenge", "diagnosis"):
    print(column + "\n" + str(train[column].unique()))
print("/" * 100)
print(banner, "test data", banner)
print("anatom_site_general_challenge" + "\n" + str(test["anatom_site_general_challenge"].unique()))

### Unique values
anatom_site_general_challenge : Including null data, 7 kind of category values.<br>
<br>
diagnosis : 9 kind of category values.

## patient_id Unique values

In [None]:
# Number of training rows (one row per image) for each patient_id.
train["patient_id"].value_counts()

In [None]:
# Distribution of the number of rows per patient in the training table.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
images_per_patient = train["patient_id"].value_counts()
fig, ax = plt.subplots(figsize=(10,6))
sns.distplot(images_per_patient, ax=ax)
ax.set_ylabel("frequency")
ax.set_title("train data")

In [None]:
# Number of test rows (one row per image) for each patient_id.
test["patient_id"].value_counts()

In [None]:
# Distribution of the number of rows per patient in the test table.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
images_per_patient = test["patient_id"].value_counts()
fig, ax = plt.subplots(figsize=(10,6))
sns.distplot(images_per_patient, ax=ax)
ax.set_ylabel("frequency")
ax.set_title("test data")

Most patients contribute many images rather than a single one. The number of images per patient also varies widely.

In [None]:
# Show every training row that belongs to patient IP_0656529.
train[train["patient_id"]=="IP_0656529"]

When I output the data of patients with multiple images, we can see that there are various parts of the images. In other words, multiple test results are given as data for one patient.

In [None]:
# Check patient overlap between training and test data
# Each id is listed once per data set, so a count of 2 marks a patient present in both.
combined_ids = pd.Series([*train["patient_id"].unique(), *test["patient_id"].unique()])
combined_ids.value_counts()

Patients do not overlap in training and test data.

In [None]:
# Null check
# Print the per-column count of missing values for both tables.
banner = "*" * 10

print(banner, "train data", banner)
print("train data null count\n" + str(train.isnull().sum()))
print("/" * 100)
print(banner, "test data", banner)
print("test data null count\n" + str(test.isnull().sum()))

sex, age_approx and anatom_site_general_challenge contain null values. In particular, anatom_site_general_challenge has many nulls: about 1.5% of the train data and 3% of the test data.

In [None]:
# Create dcm dataframe
# List the files in the train dicom folder and split each file name into id and
# extension. os.path helpers are used instead of stripping a hard-coded copy of
# the directory prefix, so the cell keeps working if `path` changes and does not
# fail on an entry without an extension.
img_list = [os.path.basename(file_path) for file_path in glob.glob(os.path.join(path, "train", "*"))]
name_parts = [os.path.splitext(file_name) for file_name in img_list]
img_id = [stem for stem, _ in name_parts]
extension_list = [ext.lstrip(".") for _, ext in name_parts]

img_train_dcm = pd.DataFrame({"img_id":img_id, "extension":extension_list})
print("train data size of dcm data:{}".format(len(img_train_dcm)))
img_train_dcm.head()

In [None]:
# Create jpeg dataframe
# List the files in the train jpeg folder and split each file name into id and
# extension. os.path helpers are used instead of stripping a hard-coded copy of
# the directory prefix, so the cell keeps working if `path` changes and does not
# fail on an entry without an extension.
img_list = [os.path.basename(file_path) for file_path in glob.glob(os.path.join(path, "jpeg", "train", "*"))]
name_parts = [os.path.splitext(file_name) for file_name in img_list]
img_id = [stem for stem, _ in name_parts]
extension_list = [ext.lstrip(".") for _, ext in name_parts]

img_train_jpg = pd.DataFrame({"img_id":img_id, "extension":extension_list})
print("train data size of jpeg data:{}".format(len(img_train_jpg)))
img_train_jpg.head()

In [None]:
# Create dcm dataframe (test data)
# List the files in the test dicom folder and split each file name into id and
# extension. os.path helpers are used instead of stripping a hard-coded copy of
# the directory prefix, so the cell keeps working if `path` changes and does not
# fail on an entry without an extension.
img_list = [os.path.basename(file_path) for file_path in glob.glob(os.path.join(path, "test", "*"))]
name_parts = [os.path.splitext(file_name) for file_name in img_list]
img_id = [stem for stem, _ in name_parts]
extension_list = [ext.lstrip(".") for _, ext in name_parts]

img_test_dcm = pd.DataFrame({"img_id":img_id, "extension":extension_list})
print("test data size of dcm data:{}".format(len(img_test_dcm)))
img_test_dcm.head()

In [None]:
# Create jpeg dataframe (test data)
# List the files in the test jpeg folder and split each file name into id and
# extension. os.path helpers are used instead of stripping a hard-coded copy of
# the directory prefix, so the cell keeps working if `path` changes and does not
# fail on an entry without an extension.
img_list = [os.path.basename(file_path) for file_path in glob.glob(os.path.join(path, "jpeg", "test", "*"))]
name_parts = [os.path.splitext(file_name) for file_name in img_list]
img_id = [stem for stem, _ in name_parts]
extension_list = [ext.lstrip(".") for _, ext in name_parts]

img_test_jpg = pd.DataFrame({"img_id":img_id, "extension":extension_list})
print("test data size of jpeg data:{}".format(len(img_test_jpg)))
img_test_jpg.head()

It was confirmed that similar data was prepared for dcm and jpg data.

## Table data information

In [None]:
# Organize category data for train data
# Count images per anatomical site (rows) for every sex / diagnosis combination (columns).
train.pivot_table(
    index="anatom_site_general_challenge",
    columns=["sex", "benign_malignant"],
    values="image_name",
    aggfunc="count",
)

It can be confirmed that the number of malignant data is very small compared to the benign data. Especially, oral/genital and palms/soles have very few positive numbers.<br>
The head/neck and palms/soles and torso seem to have a large gender difference of benign/malignant.

In [None]:
# test data
# Count images per anatomical site (rows) and sex (columns).
test.pivot_table(
    index="anatom_site_general_challenge",
    columns="sex",
    values="image_name",
    aggfunc="count",
)

About test dataset, female and male difference is similar to traindata.

In [None]:
# benign_malignant data count
# Bar chart of the number of benign and malignant rows in the training table.
cnt = train["benign_malignant"].value_counts()
plt.figure(figsize=(10,6))
plt.bar(cnt.index, cnt)
plt.ylabel("count")
# Look the counts up by label: positional access (values[0], values[1]) would
# silently swap the two numbers if the ordering of value_counts() changed.
plt.title("benign_count : {}\nmalignant_count : {}".format(cnt.get("benign", 0), cnt.get("malignant", 0)))

The training data set is heavily imbalanced between positive and negative cases.
The predictive model must account for this, because it will be trained on an imbalanced data set.

In [None]:
# benign vs malignant, age band
# Split the training table by diagnosis; these frames are reused by later cells.
benign_df = train[train["benign_malignant"]=="benign"]
malignant_df = train[train["benign_malignant"]=="malignant"]

# Visualization
fig, ax = plt.subplots(1,2, figsize=(20,6))
plt.rcParams["font.size"] = 15

# One violin plot of age by sex per diagnosis group.
panels = [(ax[0], benign_df, "benign group"), (ax[1], malignant_df, "malignant group")]
for axis, group_df, caption in panels:
    sns.violinplot(x="sex", y="age_approx", data=group_df, ax=axis)
    axis.set_title(caption)

# Same age range on both panels so they can be compared directly.
ax[0].set_ylim([0,100])
ax[1].set_ylim([0,100])

The malignant group tends to be older than the benign group, and this tendency is especially strong for females.

In [None]:
# test data
# Violin plot of age by sex for the test table, on the same 0-100 age range.
fig, ax = plt.subplots(figsize=(10,6))
plt.rcParams["font.size"] = 15

sns.violinplot(x="sex", y="age_approx", data=test, ax=ax)
ax.set_title("test data set age data")
ax.set_ylim([0,100])

The age distribution of the test data was confirmed.<br>
For male, the data bias characteristic of 70 years old can be confirmed. In addition, there are many data for women aged 40 to 50.<br>
In the training data, in particular, there were many positive cases of women around the age of 70, but there were few test data for that age group in test data. It may be necessary to pay attention to the difference in the parameter space of the learning model.

In [None]:
# Visualization
fig, ax = plt.subplots(1,2, figsize=(20,6))
plt.rcParams["font.size"] = 15

# One panel per diagnosis group, with the male and female age curves overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
panels = [(ax[0], benign_df, "Train data benign group"),
          (ax[1], malignant_df, "Train data malignant group")]
for axis, group_df, caption in panels:
    for sex in ("male", "female"):
        sns.distplot(group_df[group_df["sex"]==sex]["age_approx"], ax=axis, label=sex)
    axis.set_title(caption)
    axis.set_xlabel("age")
    axis.set_ylabel("frequency")

ax[0].legend()
ax[1].legend()

Possibly because of a difference in how the data were acquired, the exact age appears to be recorded for patients judged malignant, whereas for benign patients the age is recorded in age bands, i.e. as discrete data.<br>
<br>
The distribution of age is relatively regular. The benign group is particularly uniform, and the age of the health-care examinees is normally distributed. It can be inferred that there are various backgrounds, such as strong interest around 40 years old and little interest among young people.<br>
On the other hand, in the positive (malignant) group, females are normally distributed, but males are biased toward older age.

Understand the characteristics of the image data set to be handled. Especially, we checked dicom data and jpeg data.

## Image data check

The dicom data is a data format used in the medical system and includes not only image data but also various information as metadata.

In [None]:
# sample data
# Read the dicom file of the first entry in the train dicom listing.
samp_img_name = "{}.dcm".format(img_train_dcm["img_id"][0])
samp_dcm_file = os.path.join(path, "train", samp_img_name)
samp_dcm = pydicom.dcmread(samp_dcm_file)

# dicom data
# Printing the data set shows the content of the file.
print(samp_dcm)

In [None]:
# image data , comparison with jpeg
# Pixel array stored in the dicom sample.
samp_img = samp_dcm.pixel_array

# jpeg image loading
# Same image id as the dicom sample; OpenCV loads BGR, so convert to RGB for plotting.
samp_img_name = "{}.jpg".format(img_train_dcm["img_id"][0])
samp_img_jpg = cv2.cvtColor(
    cv2.imread(os.path.join(path, "jpeg", "train", samp_img_name)),
    cv2.COLOR_BGR2RGB,
)

# visualization
# Left panel: dicom pixels, right panel: jpeg pixels.
fig, ax = plt.subplots(1,2,figsize=(20,6))
for axis, pixels in zip(ax, (samp_img, samp_img_jpg)):
    axis.imshow(pixels)
    axis.grid()

ax[0].set_title("dcm image")
ax[1].set_title("jpeg image")

In [None]:
# Array shapes of the dicom and jpeg versions of the sample image.
for label, pixels in (("dicom", samp_img), ("jpeg", samp_img_jpg)):
    print(label + " data shape:" + str(pixels.shape))

For some of the sample data, images were acquired and compared with jpeg data.

The image data differ between the dcm and jpg images.

In [None]:
# Histogram
# Per-channel 256-bin histograms of the dicom pixels (left) and jpeg pixels (right).
fig, ax = plt.subplots(1, 2, figsize=(20,6))

# Line colour and legend label used for channel 0, 1 and 2 respectively.
color = ["red", "green", "blue"]

panels = [(ax[0], samp_img, "dcm data, YBR space"),
          (ax[1], samp_img_jpg, "jpg data, RGB space")]
for axis, pixels, caption in panels:
    for channel, line_color in enumerate(color):
        channel_hist = cv2.calcHist([pixels], [channel], None, [256], [0,256])
        axis.plot(channel_hist, color=line_color, label=line_color)
    axis.set_title(caption)
    axis.legend()

Looking at the histograms, the difference is more obvious.<br>
The dcm data has a sharper distribution, and its blue channel shows two characteristic peaks that are not present in the jpeg data.<br>
The data range of the jpeg is wider, so its contrast is visually emphasized for easier viewing.<br>

I'm not a medical expert, so I don't know what kind of information this difference in characteristics conveys, but when constructing a predictive model from images it is an important point that may affect the results.

## image size distribution

I want to unify the shapes of images when learning and building models. The image size varies, so I checked the data size.<br>
All data does not fit in the memory, so 1000 samples were acquired and verified. This would be good information to some extent in estimating the population from the sample.

In [None]:
%%time
# sampling 1000 data
# Check with jpeg data
# Build the jpeg folder from the data-set root instead of overwriting the shared
# `path` variable, so this cell does not change what other cells see in `path`.
jpeg_train_dir = os.path.join(path, "jpeg", "train")

# Height and width (in pixels) of each sampled image.
image_h = []
image_w = []
for name in train["image_name"].sample(1000):
    img = cv2.imread(os.path.join(jpeg_train_dir, name+'.jpg'))
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be read.
        continue
    image_h.append(img.shape[0])
    image_w.append(img.shape[1])

# visualization
# Scatter of width against height on equal 0-7000 pixel axes.
plt.figure(figsize=(6,6))
plt.scatter(image_w, image_h)
plt.xlabel("image width")
plt.ylabel("image height")
plt.xlim([0,7000])
plt.ylim([0,7000])

The size of the image data varies from small to large. It can be seen that the aspect ratios are not constant, and those that vary greatly are included.
However, as a tendency, it seems that width:height = 6:4 can be unified.

In [None]:
# Drop the size lists that are no longer needed and run a garbage collection pass.
del image_h, image_w
gc.collect()

## Time of image data processing

The larger the data size and the number of training data, the better.<br>
However, due to the limitations of the computing environment, we took samples and confirmed them in order to estimate the factors that affect memory and computation time.

In [None]:
# loading and reshape to (600,400)
# sampling 100 data
# Data-set root used to build the dicom and jpeg file paths in the cells below.
path = '/kaggle/input/siim-isic-melanoma-classification'
# 100 randomly drawn image names, shared by the dicom and jpeg timing cells.
sample_name = train["image_name"].sample(100)

In [None]:
%%time
# dicom data
# Load each sampled dicom image, cast the pixels to uint8 and resize to (600,400).
image = []

for name in sample_name:
    img = pydicom.dcmread(os.path.join(path, 'train', name+'.dcm')).pixel_array.astype("uint8")
    img = cv2.resize(img, (600,400), interpolation=cv2.INTER_AREA)
    image.append(img)

# sys.getsizeof(image) would only measure the list container (its references),
# not the arrays it holds, so sum the pixel buffers to get the real data size.
total_bytes = sum(arr.nbytes for arr in image)
print("dicom data size : {} byte".format(total_bytes))
print("dicom data loading and resize time :")

In [None]:
%%time
# jpeg data
# Load each sampled jpeg image, cast the pixels to uint8 and resize to (600,400).
image = []

for name in sample_name:
    img = cv2.imread(os.path.join(path, 'jpeg', 'train', name+'.jpg')).astype("uint8")
    img = cv2.resize(img, (600,400), interpolation=cv2.INTER_AREA)
    image.append(img)

# sys.getsizeof(image) would only measure the list container (its references),
# not the arrays it holds, so sum the pixel buffers to get the real data size.
total_bytes = sum(arr.nbytes for arr in image)
print("jpeg data size : {} byte".format(total_bytes))
print("jpeg data loading and resize time :")

We took 100 samples and compared the load time and the calculation time of the resizing process only.<br>
It was found that the dicom data requires more than twice the calculation time.

In [None]:
# data shape vs time and size
# jpeg data
# Scale factors: each one yields a (6*scale, 4*scale) target size.
times = [50,100,200,400,800]
pro_time = []   # seconds to load + resize one image at each size
data_size = []  # bytes of the resized pixel buffer at each size

# Benchmark one explicitly chosen image (the last of the sampled names) instead
# of relying on a loop variable left over from an earlier cell.
bench_name = sample_name.iloc[-1]
bench_file = os.path.join(path, 'jpeg', 'train', bench_name+'.jpg')

for scale in times:
    start = time.time()
    img = cv2.imread(bench_file).astype("uint8")
    img = cv2.resize(img, (6*scale, 4*scale), interpolation=cv2.INTER_AREA)
    # Keep the elapsed time in its own name; the original overwrote the loop variable.
    elapsed = time.time() - start
    pro_time.append(elapsed)
    data_size.append(img.nbytes)

# visualization
# Extrapolate the single-image measurements to the whole training table.
n_train = len(train)
# Derive the labels from `times` so they cannot drift from the sizes actually
# measured (the original last label read "(4800,2100)" for a (4800,3200) image).
image_shape = ["({},{})".format(6*scale, 4*scale) for scale in times]

fig, ax = plt.subplots(1,2,figsize=(20,6))

# Time
ax[0].bar(image_shape, np.array(pro_time)*n_train/60)
ax[0].set_xlabel("image shape")
ax[0].set_ylabel("Time(min)")
ax[0].set_title("image shape vs data processing time(load and resize)\n all train data {:,}".format(n_train))

# data size
ax[1].bar(image_shape, np.array(data_size)*n_train/1000000000)
ax[1].set_xlabel("image shape")
ax[1].set_ylabel("Data size(Gbyte)")
ax[1].set_yscale("log")
# A log axis cannot start at 0, so use a positive lower limit.
ax[1].set_ylim([1,10000])
ax[1].set_title("image shape vs data size\n all train data {:,}".format(n_train))

I compared the required processing time and data size for each image size. Based on the above example, a rough estimate was made by directly multiplying by the total number of training samples, assuming jpeg data processing.<br>
As a result, it can be seen that the processing time is shortest at the (600,400) size. On the other hand, the data size increases as the image size increases. Since the size is on the order of gigabytes, it must be handled carefully in memory.

In [None]:
# data size vs time and size
# image shape condition : (600,400)

# jpeg data
# Per-image cost at (600,400): index 1 of the benchmark lists built from `times`.
base_time = pro_time[1]
base_size = data_size[1]

# Numbers of images to extrapolate to; the last entry is the full training table.
# The original hard-coded 33216, which disagrees with the 33126 used in the
# previous cell, so the count is taken from the table itself.
size = np.array([5000,10000,15000,20000,30000,len(train)])

# Linear extrapolation: cost per image times number of images.
pro_time_size = base_time*size
data_size_size = base_size*size

fig, ax = plt.subplots(1,2,figsize=(20,6))

# Time
ax[0].plot(size, pro_time_size/60)
ax[0].set_xlabel("number of images")
ax[0].set_xlim([0,35000])
ax[0].set_ylabel("Time(min)")
ax[0].set_title("number of images vs data processing time(load and resize)\n image shape (600,400)")

# data size
ax[1].plot(size, data_size_size/1000000000)
ax[1].set_xlabel("number of images")
ax[1].set_xlim([0,35000])
ax[1].set_ylabel("Data size(Gbyte)")
ax[1].set_ylim([0,30])
ax[1].set_title("number of images vs data size\n image shape (600,400)")

When the image size is (600,400), the processing time of the number of training data and the data size when using the jpeg image were plotted. Assuming that the processing time increases according to the number of data, the data size is 100 giga in the case of all data. The premise of this calculation is the list format, but if you convert it to a numpy array for model training, the size will increase further.<br>
It is also necessary to optimize the prediction model within the memory limit based on the image size and the number of data.

## Image data check

Next, the images themselves are inspected in preparation for building an image-based prediction model. The aim is to consider how to optimize the training data by removing noise that appears not only on the tumor of interest but also in other areas.

In [None]:
# loading and reshape to (120,80)
# sampling 20 data from each pos and neg target.
path = '/kaggle/input/siim-isic-melanoma-classification'

# Draw 20 random rows with target 0 first, then 20 with target 1.
target = train["target"]
sample_neg = train.loc[target == 0].sample(20)
sample_pos = train.loc[target == 1].sample(20)

# Image names and labels of the negative draw.
sample_neg_name, sample_neg_target = sample_neg["image_name"], sample_neg["target"]

## Sample image from dcm data

In [None]:
%%time
dcm_dir = os.path.join(path, "train")

# Negative samples: read the dicom pixels, cast to uint8 and resize to (120,80).
sample_neg_img = []
for name in sample_neg_name:
    pixels = pydicom.dcmread(os.path.join(dcm_dir, name+'.dcm')).pixel_array
    sample_neg_img.append(cv2.resize(pixels.astype("uint8"), (120,80), interpolation=cv2.INTER_AREA))

# Image names and labels of the positive draw.
sample_pos_name, sample_pos_target = sample_pos["image_name"], sample_pos["target"]

# Positive samples, processed the same way.
sample_pos_img = []
for name in sample_pos_name:
    pixels = pydicom.dcmread(os.path.join(dcm_dir, name+'.dcm')).pixel_array
    sample_pos_img.append(cv2.resize(pixels.astype("uint8"), (120,80), interpolation=cv2.INTER_AREA))

In [None]:
# Visualization
# negative sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_neg_img[idx])
    axis.grid()
    axis.set_title("benign sample : \n{}".format(sample_neg_name.values[idx]))

In [None]:
# Visualization
# positive sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_pos_img[idx])
    axis.grid()
    axis.set_title("malignant sample : \n{}".format(sample_pos_name.values[idx]))

## Sample image from jpeg data

In [None]:
%%time
jpeg_dir = os.path.join(path, "jpeg", "train")

# Negative samples: read the jpeg, convert BGR to RGB and resize to (120,80).
sample_neg_img = []
for name in sample_neg_name:
    bgr = cv2.imread(os.path.join(jpeg_dir, name+'.jpg')).astype("uint8")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    sample_neg_img.append(cv2.resize(rgb, (120,80), interpolation=cv2.INTER_AREA))

# Image names and labels of the positive draw.
sample_pos_name, sample_pos_target = sample_pos["image_name"], sample_pos["target"]

# Positive samples, processed the same way.
sample_pos_img = []
for name in sample_pos_name:
    bgr = cv2.imread(os.path.join(jpeg_dir, name+'.jpg')).astype("uint8")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    sample_pos_img.append(cv2.resize(rgb, (120,80), interpolation=cv2.INTER_AREA))

In [None]:
# Visualization
# negative sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_neg_img[idx])
    axis.grid()
    axis.set_title("benign sample : \n{}".format(sample_neg_name.values[idx]))

In [None]:
# Visualization
# positive sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_pos_img[idx])
    axis.grid()
    axis.set_title("malignant sample : \n{}".format(sample_pos_name.values[idx]))

I confirmed the image based on dcm and jpeg data. <br>
From this result, the malignant group often has a large, dark lesion area. However, the benign group contains similar images, and it is impossible for an untrained person to judge. You can also see different types, such as different shapes and distributions. Dealing with such variety may require pre-classification, just as human experts do.<br>
Also, it can be seen that the image contains not only tumors but also image noise such as hairs and pores.

Next, I confirm images of each part.

# Comparison each point (anatom_site_general_challenge point)

In [None]:
# image_name from each part
# Boolean masks shared by all selections below.
site = train["anatom_site_general_challenge"]
is_benign = train["benign_malignant"]=="benign"
is_malignant = train["benign_malignant"]=="malignant"

# For every anatomical site: image names of the benign (neg_*) and malignant (pos_*) rows.
neg_hn_name = train[(site=="head/neck") & is_benign]["image_name"]
pos_hn_name = train[(site=="head/neck") & is_malignant]["image_name"]

neg_ue_name = train[(site=="upper extremity") & is_benign]["image_name"]
pos_ue_name = train[(site=="upper extremity") & is_malignant]["image_name"]

neg_le_name = train[(site=="lower extremity") & is_benign]["image_name"]
pos_le_name = train[(site=="lower extremity") & is_malignant]["image_name"]

neg_tr_name = train[(site=="torso") & is_benign]["image_name"]
pos_tr_name = train[(site=="torso") & is_malignant]["image_name"]

neg_ps_name = train[(site=="palms/soles") & is_benign]["image_name"]
pos_ps_name = train[(site=="palms/soles") & is_malignant]["image_name"]

neg_og_name = train[(site=="oral/genital") & is_benign]["image_name"]
pos_og_name = train[(site=="oral/genital") & is_malignant]["image_name"]

# Flat list ordered as (benign, malignant) pairs per site; the plotting cells
# index into it with an even offset for each site.
data_list = [
    neg_hn_name, pos_hn_name,
    neg_ue_name, pos_ue_name,
    neg_le_name, pos_le_name,
    neg_tr_name, pos_tr_name,
    neg_ps_name, pos_ps_name,
    neg_og_name, pos_og_name,
]
pos_neg = ["benign", "malignant"]

In [None]:
# head/neck
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[0+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("head/neck : {}".format(pos_neg[group]))

In [None]:
# upper extremity
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[2+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("upper extremity : {}".format(pos_neg[group]))

In [None]:
# lower extremity
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[4+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("lower extremity : {}".format(pos_neg[group]))

In [None]:
# torso
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[6+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("torso : {}".format(pos_neg[group]))

In [None]:
# palms/soles
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[8+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("palms/soles : {}".format(pos_neg[group]))

In [None]:
# oral/genital
fig, ax = plt.subplots(1, 4, figsize=(24, 6))

# Panels 0-1 show the first two benign images, panels 2-3 the first two malignant ones.
for panel in range(4):
    group, pick = divmod(panel, 2)
    img_file = os.path.join(path, "jpeg", "train", data_list[10+group].values[pick]+".jpg")
    img = cv2.cvtColor(cv2.imread(img_file), cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, (120,80), interpolation=cv2.INTER_AREA)

    ax[panel].imshow(img)
    ax[panel].grid()
    ax[panel].set_title("oral/genital : {}".format(pos_neg[group]))

When the images were checked for each part, it was found that the head/neck sample and the oral/genital sample had a lot of noise such as hair.
If the image classification for each part is possible in this way, it may be possible to examine the improvement of accuracy by composing a program such as filter processing according to each.

### Here, some image processing is applied to try to reduce image noise (such as hair), and the results are checked.

### Gaussian weighted averaging (GaussianBlur)

In [None]:
# Blur every negative sample with a 5x5 Gaussian kernel.
sample_neg_img_gb = []

for src_img in sample_neg_img:
    # Pass the sigmas by keyword: the 4th positional parameter of
    # cv2.GaussianBlur is the output array `dst`, not sigmaY.
    img = cv2.GaussianBlur(src_img, (5,5), sigmaX=1, sigmaY=1)
    sample_neg_img_gb.append(img)

# Visualization
# negative sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_neg_img_gb[idx])
    axis.grid()
    axis.set_title("benign sample : \n{}".format(sample_neg_name.values[idx]))

In [None]:
# Blur every positive sample with a 5x5 Gaussian kernel.
sample_pos_img_gb = []

for src_img in sample_pos_img:
    # Pass the sigmas by keyword: the 4th positional parameter of
    # cv2.GaussianBlur is the output array `dst`, not sigmaY.
    img = cv2.GaussianBlur(src_img, (5,5), sigmaX=1, sigmaY=1)
    sample_pos_img_gb.append(img)

# Visualization
# positive sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_pos_img_gb[idx])
    axis.grid()
    # These are the positive samples, so label them malignant (the title was
    # copied from the benign cell).
    axis.set_title("malignant sample : \n{}".format(sample_pos_name.values[idx]))

By adding filtering, in some cases noise such as hair and pores can be reduced. This may be effective in suppressing overfitting in the learning model. However, verification is required.
And, small tumors or those with many hairs may be difficult to identify. Again, it is necessary to actually create a model and verify it.

## Gray scale and Canny edge detection

In [None]:
# Gray-scale each negative sample, then run Canny edge detection (thresholds 50 / 150).
sample_neg_img_gc = [
    cv2.Canny(cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY), 50, 150)
    for rgb_img in sample_neg_img
]

# Visualization
# negative sample
fig, ax = plt.subplots(4,5, figsize=(30,24))
gray_cmap = plt.get_cmap("gray")

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_neg_img_gc[idx], cmap=gray_cmap, vmin=0, vmax=255)
    axis.grid()
    axis.set_title("benign sample : \n{}".format(sample_neg_name.values[idx]))

In [None]:
# Gray-scale each positive sample, then run Canny edge detection (thresholds 50 / 150).
sample_pos_img_gc = []

for rgb_img in sample_pos_img:
    img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2GRAY)
    img = cv2.Canny(img, 50, 150)
    sample_pos_img_gc.append(img)

# Visualization
# positive sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_pos_img_gc[idx], cmap=plt.get_cmap("gray"), vmin=0, vmax=255)
    axis.grid()
    # These are the positive samples, so label them malignant (the title was
    # copied from the benign cell).
    axis.set_title("malignant sample : \n{}".format(sample_pos_name.values[idx]))

Both negative and positive data show the shape of the tumor. However, the difference between them is not visible. <br>
In particular, there is a lot of noise such as hair and pores, and there are many data whose shapes are not captured, which is a problem.

The Canny process itself includes a filter process as a pre-process, but the result of applying the above-mentioned Blur process to the pre-process and strengthening the filter is confirmed.

In [None]:
# Same Canny pipeline as before, but starting from the Gaussian-blurred negative samples.
sample_neg_img_gb_gc = [
    cv2.Canny(cv2.cvtColor(blurred_img, cv2.COLOR_RGB2GRAY), 50, 150)
    for blurred_img in sample_neg_img_gb
]

# Visualization
# negative sample
fig, ax = plt.subplots(4,5, figsize=(30,24))
gray_cmap = plt.get_cmap("gray")

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_neg_img_gb_gc[idx], cmap=gray_cmap, vmin=0, vmax=255)
    axis.grid()
    axis.set_title("benign sample : \n{}".format(sample_neg_name.values[idx]))

In [None]:
# Same Canny pipeline as before, but starting from the Gaussian-blurred positive samples.
sample_pos_img_gb_gc = []

for blurred_img in sample_pos_img_gb:
    img = cv2.cvtColor(blurred_img, cv2.COLOR_RGB2GRAY)
    img = cv2.Canny(img, 50, 150)
    sample_pos_img_gb_gc.append(img)

# Visualization
# positive sample
fig, ax = plt.subplots(4,5, figsize=(30,24))

# Walk the 4x5 grid row by row; the flat position is also the sample index.
for idx, axis in enumerate(ax.ravel()):
    axis.imshow(sample_pos_img_gb_gc[idx], cmap=plt.get_cmap("gray"), vmin=0, vmax=255)
    axis.grid()
    # These are the positive samples, so label them malignant (the title was
    # copied from the benign cell).
    axis.set_title("malignant sample : \n{}".format(sample_pos_name.values[idx]))

For some samples, the negative data disappears but the positive data retains shape. This may suggest that the slope of the data is often stronger for positive data. But not all are true.

## One of patient images

### sample_patient IP_4938382

In [None]:
%%time
# All training rows of patient IP_4938382: image names and diagnosis labels.
patient_rows = train[train["patient_id"]=="IP_4938382"]
sample_pat_img_name = patient_rows["image_name"]
sample_pat_img_posneg = patient_rows["benign_malignant"]

# Load every image of the patient as a (48,32) RGB thumbnail.
jpeg_dir = os.path.join(path, 'jpeg', 'train')
sample_pat_img = []
for name in sample_pat_img_name:
    bgr = cv2.imread(os.path.join(jpeg_dir, name+'.jpg')).astype("uint8")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    sample_pat_img.append(cv2.resize(rgb, (48,32), interpolation=cv2.INTER_AREA))

In [None]:
# Visualization
# Up to the first 100 images of the patient, each titled with its diagnosis.
fig, ax = plt.subplots(10,10, figsize=(40,40))

labels = sample_pat_img_posneg.values
for idx, axis in enumerate(ax.ravel()):
    if idx >= len(sample_pat_img):
        # Fewer than 100 images: hide the unused panels instead of raising IndexError.
        axis.axis("off")
        continue
    axis.imshow(sample_pat_img[idx])
    axis.grid()
    axis.set_title("sample_patient\nIP_4938382\n{}".format(labels[idx]))

### sample_patient IP_0656529

In [None]:
%%time
# All training rows of patient IP_0656529: image names and diagnosis labels.
patient_rows = train[train["patient_id"]=="IP_0656529"]
sample_pat_img_name = patient_rows["image_name"]
sample_pat_img_posneg = patient_rows["benign_malignant"]

# Load every image of the patient as a (48,32) RGB thumbnail.
jpeg_dir = os.path.join(path, 'jpeg', 'train')
sample_pat_img = []
for name in sample_pat_img_name:
    bgr = cv2.imread(os.path.join(jpeg_dir, name+'.jpg')).astype("uint8")
    rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    sample_pat_img.append(cv2.resize(rgb, (48,32), interpolation=cv2.INTER_AREA))

In [None]:
# Visualization
# Up to the first 100 images of the patient, each titled with its diagnosis.
fig, ax = plt.subplots(10,10, figsize=(40,40))

labels = sample_pat_img_posneg.values
for idx, axis in enumerate(ax.ravel()):
    if idx >= len(sample_pat_img):
        # Fewer than 100 images: hide the unused panels instead of raising IndexError.
        axis.axis("off")
        continue
    axis.imshow(sample_pat_img[idx])
    axis.grid()
    axis.set_title("sample_patient\nIP_0656529\n{}".format(labels[idx]))

These are examples from 2 patients, and it can be seen that there are many very similar images. In particular, the images of the former are so similar that they are indistinguishable. Moreover, both examples are benign.<br>
This can bias the training data and increase the tendency for overfitting, so caution is required.<br>
This must be taken into consideration when selecting training data.

## Next step

From the results so far, it may be possible to improve accuracy in classification by images by combining not only one data-processed model but also a plurality of models.<br>
It is necessary to consider not only the image data but also other medical information.

## Based on the above results, the steps for creating the prediction model are:
- Decide how to deal with imbalanced data. Oversampling and undersampling, etc.
- Organize tabular data, digitize missing or categorical data, and create other models to build models.
- Pre-processing of image data and determination of data format. Image data size, number of training data, etc. according to the computing environment.
- Build a predictive model based on deep learning.