In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
import os
import cv2
import warnings
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from pylab import rcParams
warnings.simplefilter('ignore')
%matplotlib inline

In [None]:
# Single root for the dataset so the CSV files and the image folders are always
# resolved from the same location (independent of the current working directory).
DATA_DIR = '/kaggle/input/jpeg-melanoma-256x256'

# Metadata tables for the train / test splits
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

# Folders holding the JPEG files; file names are "<image_name>.jpg"
train_img_dir = os.path.join(DATA_DIR, 'train')
test_img_dir = os.path.join(DATA_DIR, 'test')


In [None]:
# Preview the first rows of the training metadata table
train_df.head()

In [None]:
# Confirm that `target` and `benign_malignant` agree:
# print the distinct `target` values found under each text label.
for label in ("benign", "malignant"):
    res = train_df[train_df["benign_malignant"] == label]
    print(label, res["target"].unique())

In [None]:
# Class balance: number and share of negative (target 0) and positive (target 1) images
print("陽性対陰性 : ")
l = len(train_df)
# Tally images per target value; the defaultdict yields 0 for a class that never occurs
dic1 = defaultdict(int, train_df["target"].value_counts().to_dict())
print(f'陰性者数 : {dic1[0]}')
print(f'陽性者数 : {dic1[1]}')
print(f'陰性率 : {dic1[0]/l*100:.3f} %')
print(f'陽性率 : {dic1[1]/l*100:.3f} %')

In [None]:
# dic2: patient_id -> number of images, over every training row
dic2 = defaultdict(int)
for patient in train_df["patient_id"]:
    dic2[patient] += 1
print("総患者数:",len(dic2))

# dic1: patient_id -> number of positive images, over rows with target == 1.
# Keys are inserted in order of first appearance; the next cell lists the first few.
res = train_df.loc[train_df["target"] == 1]
dic1 = defaultdict(int)
for patient in res["patient_id"]:
    dic1[patient] += 1
print("陽性患者数:",len(dic1))

In [None]:
# Show a sample of (patient_id, positive image count) pairs.
# The slice keeps the first 11 entries of dic1 (indices 0..10), or fewer if dic1 is shorter.
positive_id = list(dic1.items())[:11]
print(*positive_id)

In [None]:
# res: number of images belonging to patients that have at least one positive image
# (dic1 holds those patients, dic2 holds every patient)
res = sum(1 for patient in train_df["patient_id"] if patient in dic1)
# Average images per patient, for never-positive patients and for positive patients
print(f'陰性者平均撮影数 : {(len(train_df)-res)/(len(dic2)-len(dic1)):.2f}')
print(f'陽性者平均撮影数 : {res/len(dic1):.2f}')


In [None]:
# Sex distribution over all training images
l = len(train_df)
sex = train_df["sex"]

# value_counts() skips missing values, so dic1 first holds only the real labels.
dic1 = defaultdict(int, sex.value_counts().to_dict())
# Count missing entries with isna() instead of tallying NaN as a dict key:
# NaN != NaN, so a NaN key is only found again if it is the very same object.
n_missing = int(sex.isna().sum())
dic1[np.nan] = n_missing

print("男女比 : ")
print(dic1)
print("male : ",round(dic1["male"]/l*100,2),"%")
print("female : ",round(dic1["female"]/l*100,2),"%")
print("nan : ",round(n_missing/l*100,4),"%")

In [None]:
# Sex distribution among positive (target == 1) images only
print("男女比(陽性) : ")
res = train_df[train_df["target"]==1]
l = len(res)
sex = res["sex"]

# value_counts() skips missing values, so dic1 first holds only the real labels.
dic1 = defaultdict(int, sex.value_counts().to_dict())
# Count missing entries with isna() instead of tallying NaN as a dict key:
# NaN != NaN, so a NaN key is only found again if it is the very same object.
n_missing = int(sex.isna().sum())
dic1[np.nan] = n_missing

print("male : ",round(dic1["male"]/l*100,2),"%")
print("female : ",round(dic1["female"]/l*100,2),"%")
print("nan : ",round(n_missing/l*100,4),"%")

In [None]:
def plot_age(data):
    """Draw a bar chart with one bar per row of `data`.

    Args:
        data: sequence of rows where row[0] is an age (numeric, may be NaN)
            and row[1] is the bar height.

    Bars are placed at positions 0..len(data)-1 in the given order. Each tick
    is labelled with the age truncated to int, or "nan" when the age is NaN.
    """
    ages = [row[0] for row in data]
    heights = [row[1] for row in data]
    positions = list(range(len(data)))
    tick_labels = [int(age) if not np.isnan(age) else "nan" for age in ages]

    plt.bar(positions, heights, width=0.5, linewidth=2, tick_label=tick_labels)
    plt.title("age")
    plt.ylabel("count")
    plt.xlabel("age")
    plt.show()

# Number of images per approximate age.
# value_counts() leaves out missing ages and sort_index() orders the rest by age,
# so no NaN takes part in the ordering (a list containing NaN cannot be sorted reliably).
age_counts = train_df["age_approx"].value_counts().sort_index()
data = [[age, count] for age, count in age_counts.items()]
plot_age(data)

In [None]:
# Positive cases only: number of target == 1 images per approximate age.
# Grouping by age skips rows with a missing age and returns the groups ordered by age;
# ages without any positive image stay in the result with a count of 0.
is_positive = (train_df["target"] == 1).astype(int)
positive_counts = is_positive.groupby(train_df["age_approx"]).sum()
data = [[age, count] for age, count in positive_counts.items()]
plot_age(data)

In [None]:
# Positive rate per approximate age: positive images / all images at that age.
# The mean of a 0/1 indicator within each age group is exactly that ratio.
# Grouping by age skips rows with a missing age and returns the groups ordered by age.
is_positive = (train_df["target"] == 1).astype(int)
positive_rate = is_positive.groupby(train_df["age_approx"]).mean()
data = [[age, rate] for age, rate in positive_rate.items()]
# NOTE: plot_age labels the y-axis "count"; the bars drawn here are ratios in [0, 1].
plot_age(data)

In [None]:
print("撮影部位別割合")
def position(data):
    """Draw a bar chart with one bar per row of `data`.

    Args:
        data: sequence of rows where row[0] is the tick label (anatomical site)
            and row[1] is the bar height.
    """
    plt.figure(figsize=(10, 4))
    left = list(range(len(data)))
    height = [row[1] for row in data]
    labels = [row[0] for row in data]
    plt.bar(left, height, width=0.5,linewidth=2, tick_label=labels)
    plt.title("position")
    plt.ylabel("count")
    plt.xlabel("position")
    plt.show()

# Give missing sites the explicit label "nan" before counting: a comparison with
# NaN never matches, so without this the missing group would be drawn with height 0.
sites = train_df["anatom_site_general_challenge"].fillna("nan")
data = [[site, (sites == site).sum()] for site in sites.unique()]
position(data)


In [None]:
print("撮影部位別割合(陽性)")
# `position` is defined in the preceding cell; it is reused here instead of being redefined.

# Give missing sites the explicit label "nan" before counting: a comparison with
# NaN never matches, so without this the missing group would be drawn with height 0.
sites = train_df["anatom_site_general_challenge"].fillna("nan")
positive_sites = sites[train_df["target"] == 1]
# Iterate over the sites of the full table so the bars keep the same order as the
# all-images chart, including sites that have no positive image.
data = [[site, (positive_sites == site).sum()] for site in sites.unique()]
position(data)

In [None]:
# Load one sample training image (row label 4) with OpenCV
im_path = os.path.join(train_img_dir, train_df["image_name"][4] + '.jpg')
x = cv2.imread(im_path)
# cv2.imread reports an unreadable or missing file by returning None instead of
# raising, so fail here with a clear message rather than later inside imshow.
if x is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {im_path}")

In [None]:
# cv2.imread returns the channels in BGR order, while plt.imshow interprets the
# array as RGB, so the colours of this plot come out swapped.
plt.imshow(x)

In [None]:
# Fix: convert the cv2 array from BGR to RGB channel order before displaying it.
x_rgb = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
plt.imshow(x_rgb)

In [None]:
# For comparison: matplotlib's own reader already yields RGB, so no conversion is needed.
sample_file = train_df["image_name"][4] + ".jpg"
x2 = plt.imread(os.path.join(train_img_dir, sample_file))
plt.imshow(x2)

In [None]:
# Inspect one specific training image: print its target, then display it without axes
image_name = "ISIC_0155012"
print(train_df[train_df["image_name"] == image_name]["target"])
x2 = plt.imread(os.path.join(train_img_dir, image_name + ".jpg"))
plt.imshow(x2)
plt.axis("off")

In [None]:
# Inspect one specific training image: print its target, then display it without axes
image_name = "ISIC_0159568"
print(train_df[train_df["image_name"] == image_name]["target"])
x2 = plt.imread(os.path.join(train_img_dir, image_name + ".jpg"))
plt.imshow(x2)
plt.axis("off")

In [None]:
# Show the first 15 negative (target == 0) images as 3 figures of 5 images each.
# reset_index() renumbers the rows 0..n-1 (the old index is kept as a column),
# so the images can be addressed by their position in the filtered table.
res = train_df[train_df["target"] == 0].reset_index()

# Applies to every figure created from here on
rcParams['figure.figsize'] = 20,10
n_rows, n_cols = 3, 5
for row in range(n_rows):
    fig, axes = plt.subplots(1, n_cols)
    for col, ax in enumerate(axes):
        k = row * n_cols + col
        img = plt.imread(os.path.join(train_img_dir, res.loc[k, "image_name"] + ".jpg"))

        ax.imshow(img)
        ax.set_title(str(res.loc[k, "benign_malignant"]))
        ax.axis('off')

In [None]:
# Show the first 15 positive (target == 1) images as 3 figures of 5 images each.
# reset_index() renumbers the rows 0..n-1 (the old index is kept as a column),
# so the images can be addressed by their position in the filtered table.
res = train_df[train_df["target"] == 1].reset_index()

n_rows, n_cols = 3, 5
for row in range(n_rows):
    fig, axes = plt.subplots(1, n_cols)
    for col, ax in enumerate(axes):
        k = row * n_cols + col
        img = plt.imread(os.path.join(train_img_dir, res.loc[k, "image_name"] + ".jpg"))

        ax.imshow(img)
        ax.set_title(str(res.loc[k, "benign_malignant"]))
        ax.axis('off')