## Melanoma Detection:
In medical image diagnosis problems, positively labelled data is usually much scarcer than negatively labelled data, because the number of people who actually have the disease is small compared to the number of people tested. Our current dataset is no different.

About 98% of the images correspond to benign tumours, which leads to a severe class imbalance problem.

There are various techniques for handling class imbalance. The one used in this kernel is ***UnderSampling***.
In simple terms, undersampling means reducing the number of data points in the class that has significantly more data points in a class-imbalance scenario.

![](http://)




In [None]:
# Install the third-party packages this notebook imports but that are not
# assumed to be preinstalled: efficientnet and sweetviz.

!pip install efficientnet
!pip install sweetviz

In [None]:
import os
import albumentations
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import model_selection
import tensorflow as tf
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import cv2
import efficientnet.tfkeras as efn 
import tensorflow.keras.layers as L
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import (ModelCheckpoint, LearningRateScheduler,
                                        EarlyStopping, ReduceLROnPlateau, CSVLogger)
import math
import sweetviz as sv
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold

In [None]:
# Load the competition's train and test metadata CSV files into dataframes
df = pd.read_csv("../input/siim-isic-melanoma-classification/train.csv")
df_test = pd.read_csv("../input/siim-isic-melanoma-classification/test.csv")

In [None]:
# Preview the first rows of the training metadata
df.head()

In [None]:
# Pie chart of diagnosis frequencies. value_counts() orders categories by
# descending count, so the [1:] slices leave out the most frequent category.
diagnosis_counts = df['diagnosis'].value_counts()
labels = diagnosis_counts.index[1:]
values = diagnosis_counts.values[1:]
diagnosis_pie = go.Pie(labels=labels, values=values, textinfo='label+percent',
                       insidetextorientation='radial')
fig = go.Figure(data=[diagnosis_pie])
fig.show()

In [None]:
# Pie chart of how the training images are distributed over anatomical sites
# (missing sites are not counted by value_counts()).
site_counts = df['anatom_site_general_challenge'].value_counts()
labels = site_counts.index
values = site_counts.values
site_pie = go.Pie(labels=labels, values=values, textinfo='label+percent',
                  insidetextorientation='radial')
fig = go.Figure(data=[site_pie])
fig.show()

# 欠損値処理、testにないcolの消去

diagnosisの扱いについてはまた今度考える

In [None]:
# Remove the two diagnosis-related columns from the training frame
# (the surrounding notes say columns absent from the test set are removed).
df = df.drop(columns=['benign_malignant', 'diagnosis'])

In [None]:
# Fraction of missing values in each training column
df.isnull().mean()

In [None]:
# Fraction of missing values in each test column
df_test.isnull().mean()

### 欠損値代入

In [None]:
# Replace missing training ages with the mean of the observed ages
mean_age = df['age_approx'].mean()
df['age_approx'] = df['age_approx'].fillna(mean_age)

### 特徴量

In [None]:
# exam_num: number of training rows that share each row's patient_id
tmp = df.groupby('patient_id').size()
df['exam_num'] = df['patient_id'].map(tmp)

In [None]:
# exam_num for the test set, counted within the test rows only
tmp = df_test.groupby('patient_id').size()
df_test['exam_num'] = df_test['patient_id'].map(tmp)

In [None]:
# exam_freq: approximate age divided by the patient's number of rows
df['exam_freq'] = df['age_approx'].div(df['exam_num'])

In [None]:
# exam_freq for the test set: approximate age divided by the patient's row count
df_test['exam_freq'] = df_test['age_approx'].div(df_test['exam_num'])

In [None]:
# exam_std: standard deviation of age_approx over each patient's rows.
# std() is NaN for a patient with a single row, hence the fillna(0).
tmp = df.groupby('patient_id')['age_approx'].std()
df['exam_std'] = df['patient_id'].map(tmp).fillna(0)

In [None]:
# exam_std for the test set; NaN (e.g. single-row patients) becomes 0
tmp = df_test.groupby('patient_id')['age_approx'].std()
df_test['exam_std'] = df_test['patient_id'].map(tmp).fillna(0)

### encoding

In [None]:
# Turn each image name into a file name by appending '.png'
# NOTE(review): assumes the images consumed downstream are stored as PNG — confirm
df['image_name'] = df['image_name']+'.png'

In [None]:
# Make sure the label column is stored as integers
df['target'] = df['target'].astype('int')

In [None]:
def to_sex_encord(word):
    """Encode a sex label as a two-element (male, female) Series.

    'male' maps to [1, 0] and 'female' to [0, 1]; every other value,
    including a missing one, maps to [0.5, 0.5].
    """
    is_male = word == 'male'
    is_female = word == 'female'
    if not (is_male or is_female):
        return pd.Series([0.5, 0.5])
    encoding = [1, 0] if is_male else [0, 1]
    return pd.Series(encoding)

# Add 'male' / 'female' indicator columns to both frames; rows whose sex is
# neither 'male' nor 'female' receive 0.5 in both columns (see to_sex_encord).
df[['male','female']]=df['sex'].apply(to_sex_encord)
df_test[['male','female']]=df_test['sex'].apply(to_sex_encord)

In [None]:
# Fold head/neck, palms/soles, oral/genital and missing sites into a single
# 'other' category, then one-hot encode the reduced column in both frames.
site_to_other = {'head/neck':'other','palms/soles':'other', 'oral/genital':'other',np.nan:'other'}
df['anatom_site_general_challenge2'] = df['anatom_site_general_challenge'].replace(site_to_other)
df_test['anatom_site_general_challenge2'] = df_test['anatom_site_general_challenge'].replace(site_to_other)

df = pd.get_dummies(df, columns=['anatom_site_general_challenge2'])
df_test = pd.get_dummies(df_test, columns=['anatom_site_general_challenge2'])

In [None]:
# Count of missing values remaining in each training column
df.isnull().sum()

In [None]:
# Count of missing values remaining in each test column
df_test.isnull().sum()

In [None]:
# Mean of the two numeric features. Select the columns before averaging:
# calling mean() on the whole frame would also try to average the string
# columns, which raises TypeError on pandas 2.x.
df[['age_approx','exam_num']].mean()

In [None]:
# Normalization
def to_normalize( df,cols ):
    """Standardize the given columns of ``df`` in place and return ``df``.

    Each column in ``cols`` is replaced by ``(value - mean) / std``, with the
    mean and the (sample) standard deviation taken over that column of ``df``.

    Args:
        df: DataFrame to modify; the same object is returned.
        cols: column label or list of column labels to standardize.

    Returns:
        ``df`` with the selected columns overwritten by their standardized
        values. A column with zero standard deviation ends up as NaN/inf.
    """
    # Compute the statistics on the selected columns only, so unrelated
    # non-numeric columns (names, ids, ...) can never break mean()/std().
    selected = df[cols]
    df[cols] = (selected - selected.mean()) / selected.std()
    return df

## UnderSampling

Looking into the dataset, it can be noted that there are multiple images for the same person [Patient ID] and the same region of the body [anatom_site_general_challenge]. For benign cases, only one image per person per anatomy region is kept and the rest are dropped. The malignant data points are not touched.

malignant →　584

benign →　32542

benign(一致例消去) → 6271

In [None]:
def undersampling(df,n_models=1,random_state=None):
    """Undersample benign rows to one per (patient, anatomical site).

    All rows with ``target == 1`` are kept. Among rows with ``target == 0``
    only the first row of every ``(patient_id, anatom_site_general_challenge)``
    pair is kept. The two parts are concatenated and shuffled.

    Args:
        df: metadata frame with ``target``, ``patient_id`` and
            ``anatom_site_general_challenge`` columns.
        n_models: accepted for signature parity with the other samplers; this
            sampler always produces a single dataframe and prints a notice
            when another value is passed.
        random_state: optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` keeps the unseeded behaviour.

    Returns:
        A list containing exactly one shuffled dataframe with a fresh
        0..n-1 index.
    """
    if n_models!=1:
        print('undersamplingはデータフレーム１つしかない')

    df_malignant = df[df['target'] == 1]
    df_benign = df[df['target'] == 0].drop_duplicates(
        subset=['patient_id','anatom_site_general_challenge'], keep="first")

    # One shuffle followed by one index reset is enough; the index of the
    # concatenated frame is discarded anyway.
    df_concat = pd.concat([df_malignant, df_benign])
    df_concat = df_concat.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Note: unlike the other samplers, the returned list always has one entry.
    return [df_concat]

In [None]:
def undersampling2(df,n_models=6):
    """Split the benign rows into ``n_models`` parts, each joined with all malignant rows.

    The rows with ``target == 0`` are partitioned by a shuffled ``KFold``;
    every partition is concatenated with all ``target == 1`` rows, shuffled,
    and re-indexed.

    Returns:
        A list of ``n_models`` dataframes.
    """
    malignant_rows = df[df['target'] == 1]
    benign_rows = df[df['target'] == 0]

    splitter = KFold(n_splits=n_models,shuffle=True)

    dfs = []
    for _, part_idx in splitter.split(benign_rows):
        combined = pd.concat([malignant_rows, benign_rows.iloc[part_idx]])
        dfs.append(combined.sample(frac=1).reset_index(drop=True))

    return dfs

In [None]:
def undersampling3(df,n_models=6):
    """Split the benign rows into ``n_models`` parts, spreading duplicates apart.

    Returns a list of dataframes, each made of one part of the benign rows
    plus all malignant rows. The benign rows are split with a
    ``StratifiedKFold`` keyed on patient_id + anatomical site, so that rows
    sharing the same patient and site overlap within one part as little as
    possible.

    Args:
        df: metadata frame with ``target``, ``patient_id`` and
            ``anatom_site_general_challenge`` columns.
        n_models: number of parts / dataframes to produce.

    Returns:
        A list of ``n_models`` shuffled dataframes with fresh indices.
    """
    df_malignant = df[df['target'] == 1]
    # .copy(): the helper column below must be added to an independent frame,
    # not to a filtered slice of the caller's dataframe.
    df_benign = df[df['target'] == 0].copy()
    # Stratification key: patient id concatenated with the site ('NAN' when missing)
    df_benign['id'] = df_benign['patient_id']+(df_benign['anatom_site_general_challenge'].replace({np.nan:'NAN'}))
    df_benign = df_benign.sample(frac=1).reset_index(drop=True)

    # NOTE(review): keys with fewer rows than n_models presumably make
    # scikit-learn emit a "least populated class" warning — confirm it is benign here.
    kf = StratifiedKFold(n_models)

    dfs = []
    for _, index in kf.split(X=df_benign, y=df_benign['id'].values):
        df_benign2 = df_benign.iloc[index].drop('id',axis=1)
        dfs.append( pd.concat([df_malignant, df_benign2]).sample(frac=1).reset_index(drop = True) )

    return dfs

In [None]:
# Sanity check: print the shape of every dataframe produced by undersampling3
a = undersampling3(df)
for sub_df in a:
    print(sub_df.shape)

In [None]:
def undersampling4(df,n_models=6):
    """Split benign rows into ``n_models`` parts and drop duplicates inside each part.

    Returns a list of dataframes, each made of one part of the benign rows
    plus all malignant rows. The benign rows are split with a
    ``StratifiedKFold`` keyed on patient_id + anatomical site so that rows
    sharing the same patient and site overlap as little as possible, and any
    overlap that remains inside a part is removed (first row kept).

    Args:
        df: metadata frame with ``target``, ``patient_id`` and
            ``anatom_site_general_challenge`` columns.
        n_models: number of parts / dataframes to produce.

    Returns:
        A list of ``n_models`` shuffled dataframes with fresh indices.
    """
    df_malignant = df[df['target'] == 1]
    # .copy(): the helper column below must be added to an independent frame,
    # not to a filtered slice of the caller's dataframe.
    df_benign = df[df['target'] == 0].copy()
    # Stratification key: patient id concatenated with the site ('NAN' when missing)
    df_benign['id'] = df_benign['patient_id']+(df_benign['anatom_site_general_challenge'].replace({np.nan:'NAN'}))
    df_benign = df_benign.sample(frac=1).reset_index(drop=True)

    # NOTE(review): keys with fewer rows than n_models presumably make
    # scikit-learn emit a "least populated class" warning — confirm it is benign here.
    kf = StratifiedKFold(n_models)

    dfs = []
    for _, index in kf.split(X=df_benign, y=df_benign['id'].values):
        # Keep one row per patient+site key within this part, then drop the key
        df_benign2 = df_benign.iloc[index].drop_duplicates(subset='id', keep = "first").drop('id',axis=1)
        dfs.append( pd.concat([df_malignant, df_benign2]).sample(frac=1).reset_index(drop = True) )

    return dfs

In [None]:
# Sanity check: print the shape of every dataframe produced by undersampling4
a = undersampling4(df)
for sub_df in a:
    print(sub_df.shape)

In [None]:
def undersampling5(df ,normalized_cols ,n_models=6):
    """Like ``undersampling4``, with per-dataframe normalization of some columns.

    Returns a list of dataframes, each made of one part of the benign rows
    plus all malignant rows. The benign rows are split with a
    ``StratifiedKFold`` keyed on patient_id + anatomical site so that rows
    sharing the same patient and site overlap as little as possible; overlap
    remaining inside a part is removed (first row kept). Each resulting
    dataframe is passed through ``to_normalize`` before shuffling.

    Args:
        df: metadata frame with ``target``, ``patient_id`` and
            ``anatom_site_general_challenge`` columns.
        normalized_cols: columns handed to ``to_normalize``.
        n_models: number of parts / dataframes to produce.

    Returns:
        A list of ``n_models`` shuffled dataframes with fresh indices.
    """
    df_malignant = df[df['target'] == 1]
    # .copy(): the helper column below must be added to an independent frame,
    # not to a filtered slice of the caller's dataframe.
    df_benign = df[df['target'] == 0].copy()
    # Stratification key: patient id concatenated with the site ('NAN' when missing)
    df_benign['id'] = df_benign['patient_id']+(df_benign['anatom_site_general_challenge'].replace({np.nan:'NAN'}))
    df_benign = df_benign.sample(frac=1).reset_index(drop=True)

    # NOTE(review): keys with fewer rows than n_models presumably make
    # scikit-learn emit a "least populated class" warning — confirm it is benign here.
    kf = StratifiedKFold(n_models)

    dfs = []
    for _, index in kf.split(X=df_benign, y=df_benign['id'].values):
        # Keep one row per patient+site key within this part, then drop the key
        df_benign2 = df_benign.iloc[index].drop_duplicates(subset='id', keep = "first").drop('id',axis=1)
        # NOTE(review): every dataframe is normalized with its own statistics,
        # so the scaling differs between the returned dataframes — confirm intended.
        normalized = to_normalize( pd.concat([df_malignant, df_benign2]) , normalized_cols )
        dfs.append( normalized.sample(frac=1).reset_index(drop = True) )

    return dfs

# **生成部**

In [None]:
# Names of the numeric / one-hot metadata columns built above.
# NOTE(review): presumably the tabular features fed to the model; the list is
# not used in the visible part of the notebook — confirm against the consumer.
data_col = ['age_approx','exam_num',
       'exam_freq', 'exam_std', 
       'anatom_site_general_challenge2_lower extremity',
       'anatom_site_general_challenge2_other',
       'anatom_site_general_challenge2_torso',
       'anatom_site_general_challenge2_upper extremity', 'male', 'female']

In [None]:
# Build the list of undersampled training frames.
# undersampling() always returns a single dataframe, hence n_models = 1.
n_models=1

dfs = undersampling(df,n_models)

In [None]:
# Display the (only) undersampled dataframe
dfs[0]


# foldの作成

In [None]:
# Stratified K-fold assignment
def my_KFold( df , n_splits=5):
    """Return a shuffled copy of ``df`` with a ``fold`` column added.

    The rows are shuffled, split with ``StratifiedKFold(n_splits)`` on the
    ``target`` column, and each row is labelled with the index of the fold in
    which it is a validation row. The frame is shuffled once more before
    being returned with a fresh index.
    """
    shuffled = df.sample(frac=1).reset_index(drop=True)

    splitter = StratifiedKFold(n_splits)
    fold_iter = splitter.split(X=shuffled, y=shuffled.target.values)
    for fold_no, (_, val_rows) in enumerate(fold_iter):
        shuffled.loc[val_rows, 'fold'] = fold_no

    # Second shuffle so rows are not left grouped by fold
    return shuffled.sample(frac=1).reset_index(drop=True)

In [None]:
# Assign folds to every undersampled dataframe and save it as df<i>.csv.
# Iterating over dfs itself (instead of range(n_models)) cannot index past the
# end of the list when a sampler returns fewer dataframes than n_models.
for i, df_sub in enumerate(dfs):
    df_model = my_KFold( df_sub )
    df_model.to_csv('df'+str(i)+'.csv' , index=False)

In [None]:
# Preview the last dataframe that was written, including its fold column
df_model.head()

In [None]:
# Display the processed test metadata
df_test

In [None]:
# Export copy of the test frame: image names get a '.png' suffix and an
# explicit 0..n-1 'index' column is added; df_test itself is left unchanged.
df_test2 = df_test.copy()
df_test2['image_name'] = df_test2['image_name']+'.png'
df_test2['index'] =  np.arange(df_test2.shape[0])

In [None]:
# Preview the export copy of the test frame
df_test2.head()

In [None]:
# Save the test frame. The dataframe index is written as well (no index=False),
# unlike the training CSVs.
# NOTE(review): confirm the reader of df_test.csv expects that extra unnamed column.
df_test2.to_csv('df_test.csv')

In [None]:
# Submission skeleton holding only the original test image names (no '.png' suffix)
df_submit = pd.DataFrame({'image_name':df_test['image_name']})

In [None]:
# Save the submission skeleton (the dataframe index is written as the first column)
df_submit.to_csv('df_submit.csv')