In [None]:

import numpy as np
import pandas as pd 
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from collections import Counter
import keras
from keras.preprocessing.image import ImageDataGenerator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Root folder of the competition data; change this single constant to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = '/kaggle/input/siim-isic-melanoma-classification/'

sample = pd.read_csv(DATA_DIR + 'sample_submission.csv')  # submission template
train = pd.read_csv(DATA_DIR + 'train.csv')               # training metadata (includes 'target')
test = pd.read_csv(DATA_DIR + 'test.csv')                 # test metadata

# **1-DATA ANALYSIS.**
<div id="intAnalysis"></div>

# **A-Train dataset**

In [None]:
# Summary of the training metadata: columns, dtypes and non-null counts.
train.info()

There are NaN values in the columns 'sex', 'age_approx' and 'anatom_site_general_challenge'. We don't need to handle these NaN values because we are going to classify the images with a convolutional neural network (hence, the only columns we need are image_name and target). That doesn't mean we can't extract useful information from the remaining columns, and we will do that in the following analysis.

If you wanted to build a classic machine learning model with sklearn, you would need to handle the NaN values.

**A.1. Distribution of the train dataset by sex.**

In [None]:
# Number of training rows per sex.
sexDataset = (
    train
    .groupby('sex')
    .size()
    .reset_index(name='count')
)

# One bar per sex, coloured by sex.
fig = px.bar(sexDataset, x='sex', y='count', color='sex')

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Distribution of the train dataset by sex',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

**A.2.Distribution of the train dataset by age.**

In [None]:
# Number of training rows per approximate age.
ageDataset = (
    train
    .groupby('age_approx')
    .size()
    .reset_index(name='count')
)

# One bar per age value; the colour scale follows the bar height.
fig = px.bar(ageDataset, x='age_approx', y='count', color='count')

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Distribution of the train dataset by age',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

In [None]:
# Sample skewness of the approximate age in the training data.
# A value near 0 indicates a symmetric distribution; the further from 0, the more
# asymmetric it is. Symmetry alone does not prove the distribution is normal.
train['age_approx'].skew()

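As you can see, the skewness is close to 0, so the age distribution is roughly symmetric, which is consistent with an approximately normal distribution.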

**A.3.Images by patient.**

In [None]:
# Number of distinct patients in the training metadata.
train['patient_id'].nunique()

In the dataset train there are 2056 patients. 

In [None]:
# Number of images per patient in the training metadata.
patientsTrain = train['patient_id'].value_counts()

# Horizontal box plot: the counts are passed as `x` rather than `y`.
fig = go.Figure(
    data=[
        go.Box(
            x=patientsTrain,
            name="Images by patient in the train dataset",
            marker_color='white',
        )
    ]
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

The patient with the largest number of images has 115, and the patient with the fewest has 2. The median number of images per patient is 12. The upper fence is 47. There are a few outliers.

**A.4.Most common areas where the images are made in the train dataset.**

In [None]:
# Number of training rows per anatomic site.
anatomDataset = (
    train
    .groupby('anatom_site_general_challenge')
    .size()
    .reset_index(name='count')
)

# One pie slice per anatomic site, sized by its row count.
fig = px.pie(
    anatomDataset,
    names='anatom_site_general_challenge',
    values='count',
    color='count',
)

# Dark theme shared with the other figures. The axis settings are kept for parity
# with the bar charts (a pie chart is not expected to draw cartesian axes).
fig.update_layout(
    title_text='Distribution of the train dataset by anatomic areas',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

The most frequent anatomic areas are torso, lower extremity and upper extremity. 

**A.5.Are tumors mostly benign or malignant in the train dataset?**

In [None]:
# Number of training rows per benign/malignant label.
BMDataset = (
    train
    .groupby('benign_malignant')
    .size()
    .reset_index(name='count')
)

# One pie slice per label, sized by its row count.
fig = px.pie(BMDataset, names='benign_malignant', values='count', color='count')

# Dark theme shared with the other figures. The axis settings are kept for parity
# with the bar charts (a pie chart is not expected to draw cartesian axes).
fig.update_layout(
    title_text='Distribution of the train dataset by benign or malignant',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

There are a lot more of benign tumors in the train dataset, it's very imbalanced.

**A.6.Distribution of the dataset by benign and malignant and diagnosis.**

In [None]:
# Row counts for every (benign/malignant, diagnosis) combination in the training data.
BMDiagnosisDataset = (
    train
    .groupby(['benign_malignant', 'diagnosis'])
    .size()
    .reset_index(name='count')
)

# Horizontal bars: one per diagnosis, split by benign/malignant colour.
fig = px.bar(BMDiagnosisDataset, y='diagnosis', x='count', color='benign_malignant')

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by diagnosis',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

The only diagnosis with a value 'malignant' associated is melanoma. The rest of the diagnosis are 'benign'.

**A.7.Distribution in the dataset by benign and malignant and sex.**

In [None]:
# Row counts for every (benign/malignant, sex) combination in the training data.
BMSexDataset = (
    train
    .groupby(['benign_malignant', 'sex'])
    .size()
    .reset_index(name='count')
)

# One bar per sex, split by benign/malignant colour.
fig = px.bar(BMSexDataset, x='sex', y='count', color='benign_malignant')

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by sex',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

There are slightly more cases of malignant in persons with male sex. 

**A.8.Distribution in the dataset by benign and malignant and anatomic area.**

In [None]:

# Row counts for every (benign/malignant, anatomic site) combination in the training data.
BMAnatomicDataset = (
    train
    .groupby(['benign_malignant', 'anatom_site_general_challenge'])
    .size()
    .reset_index(name='count')
)

# Horizontal bars: one per anatomic site, split by benign/malignant colour.
fig = px.bar(
    BMAnatomicDataset,
    y='anatom_site_general_challenge',
    x='count',
    color='benign_malignant',
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by anatomic site',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

* Melanomas in head/neck: 74

* Melanomas in lower extremity: 124

* Melanomas in oral/genital: 4

* Melanomas in palms/soles: 5

* Melanomas in torso: 257

* Melanomas in upper extremity: 111

**A.8.Distribution in the train dataset by benign and malignant and age.**

In [None]:
# Row counts for every (benign/malignant, age, sex) combination in the training data.
BMAgeDataset = (
    train
    .groupby(['benign_malignant', 'age_approx', 'sex'])
    .size()
    .reset_index(name='count')
)

# Bars per age, split by benign/malignant colour; sex is shown on hover.
fig = px.bar(
    BMAgeDataset,
    x='age_approx',
    y='count',
    color='benign_malignant',
    hover_data=['sex'],
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by age',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

There are more malignant cases between 45 and 80 years old. According to various sources of information (Cancer Research UK, American Cancer Society), melanoma is more common in men, but before age 50/60 the rates are higher in women.

**A.9. Check that a target 0 is always associated with the value 'benign' in the benign_malignant column, and that a target 1 is always associated with the value 'malignant' and with the value 'melanoma' in the diagnosis column.**

In [None]:

# Row counts for every (benign/malignant, target, diagnosis) combination.
# This frame is reused by the "Diagnosis by target" figure in the next cell.
targetDataset = (
    train
    .groupby(['benign_malignant', 'target', 'diagnosis'])
    .size()
    .reset_index(name='count')
)

# Bars per target value, split by benign/malignant colour.
fig = px.bar(
    targetDataset,
    x="target",
    y="count",
    color='benign_malignant',
    height=500,
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by target',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

A target '0' is always associated with benign. A target '1' is always associated with malignant. The target is very imbalanced.

In [None]:
# Horizontal bars per diagnosis, coloured by target, built from the grouped
# `targetDataset` frame computed in the previous cell.
fig = px.bar(
    targetDataset,
    y="diagnosis",
    x="count",
    color='target',
    height=300,
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Diagnosis by target',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

A diagnosis 'melanoma' is always associated with a target 1.

**A.10.Check if the column image_name has unique values for each row .**

In [None]:
# How many times each image_name occurs in the training metadata
# (every count equals 1 when the names are unique).
imagesNameTrain = train['image_name'].value_counts()

# Horizontal box plot of those counts: the values are passed as `x` rather than `y`.
fig = go.Figure(
    data=[
        go.Box(
            x=imagesNameTrain,
            name="Has each image an unique image_name?",
            marker_color='white',
        )
    ]
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

Yes, the images names are unique. 

This is the whole analysis of the train dataset. 

# **B-Test Dataset.**

In [None]:
# Summary of the test metadata: columns, dtypes and non-null counts.
test.info()

There are NaN values in the column 'anatom_site_general_challenge'. 

**B.1. Distribution of the test dataset by sex.**

In [None]:
# Number of test rows per sex, as a two-column frame ('sex', 'count') used by the next cell's chart.
testSexDataset=test.groupby('sex').size().reset_index(name='count')

In [None]:
# One bar per sex (coloured by sex), built from `testSexDataset` computed in the previous cell.
fig = px.bar(
    testSexDataset,
    x='sex',
    y='count',
    color='sex',
    barmode='stack',
    title='Distribution of the test dataset by sex',
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

In the test dataset there are more males than females, whereas in the train dataset the numbers of both sexes were similar.

**B.2. Distribution of the test dataset by age.**

In [None]:
# Number of test rows per approximate age.
testAgeDataset = (
    test
    .groupby('age_approx')
    .size()
    .reset_index(name='count')
)

# One bar per age value; the colour scale follows the age itself.
fig = px.bar(
    testAgeDataset,
    x='age_approx',
    y='count',
    color='age_approx',
    barmode='stack',
    title='Distribution of the test dataset by age',
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

In [None]:
# Sample skewness of the approximate age in the test data.
# A value near 0 indicates a symmetric distribution; the further from 0, the more
# asymmetric it is. Symmetry alone does not prove the distribution is normal.
test['age_approx'].skew()

There are more older patients in the test dataset.

**B.3. Images by patient.**

In [None]:
# Number of images per patient in the test metadata.
patientsTest = test['patient_id'].value_counts()

# Horizontal box plot: the counts are passed as `x` rather than `y`.
fig = go.Figure(
    data=[
        go.Box(
            x=patientsTest,
            name="Images by patient in the test dataset",
            marker_color='white',
        )
    ]
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

The patient with the largest number of images has 240, and the patient with the fewest has 3. The median number of images per patient is 10. The upper fence is 43. There are fewer outliers here than in the train dataset, but there is a huge one: the patient with 240 images.

**B.4.Most common areas where the images are made in the test dataset.**

In [None]:
# Number of test rows per anatomic site.
# groupby() leaves out rows whose key is NaN, so missing sites are first given an
# explicit label; otherwise they would silently vanish from the chart.
# `test` itself is not modified: assign() returns a new frame.
_site_filled = test['anatom_site_general_challenge'].fillna('No anatomic site known')
testAnatomDataset = (
    test
    .assign(anatom_site_general_challenge=_site_filled)
    .groupby('anatom_site_general_challenge')
    .size()
    .reset_index(name='count')
)

# Horizontal bars: one per anatomic site; the colour scale follows the bar length.
fig = px.bar(testAnatomDataset, y='anatom_site_general_challenge', x='count', color='count')

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Distribution of the test dataset by anatomic areas',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

The distribution is similar to the one of the train dataset. There are a lot of rows with 'No anatomic site known'.

**B.5.Check if the column image_name has unique values for each row .**

In [None]:
# How many times each image_name occurs in the test metadata
# (every count equals 1 when the names are unique).
# Stored under its own name so the train counts in `imagesNameTrain` are not overwritten.
imagesNameTest = test['image_name'].value_counts()

# Horizontal box plot of those counts: the values are passed as `x` rather than `y`.
fig = go.Figure(
    data=[
        go.Box(
            x=imagesNameTest,
            name="Has each image an unique image_name?",
            marker_color='white',
        )
    ]
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

Yes, the images names are unique.

# **2-PREPROCESSING THE DATA.**

As you saw in the EDA above, the train dataset is very unbalanced, which can lead to a model that is biased towards the majority class and generalises poorly. There are many ways to deal with imbalanced data. The following articles describe several ways to address this problem.

* https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/
* https://medium.com/analytics-vidhya/how-to-apply-data-augmentation-to-deal-with-unbalanced-datasets-in-20-lines-of-code-ada8521320c9
* https://towardsdatascience.com/deep-learning-unbalanced-training-data-solve-it-like-this-6c528e9efea6
* https://towardsdatascience.com/handling-imbalanced-datasets-in-deep-learning-f48407a0e758

I'm going to fix the unbalanced data by oversampling with the RandomOverSampler algorithm from the imbalanced-learn library. For more information about this library, check the link: https://imbalanced-learn.readthedocs.io/en/stable/index.html

In [None]:
pip install -U imbalanced-learn

In [None]:
# Separate the label column from the remaining metadata columns.
y = train['target']
X = train.drop(columns='target')

# Class counts before resampling.
print('Original dataset shape %s' % Counter(y))

In [None]:
# Balance the classes by random oversampling; the seed is fixed (random_state=42)
# so the resampled rows are the same on every run.
# NOTE(review): the whole of X/y is resampled here, before the train/validation
# split performed further down, so the same image can end up in both splits and
# the validation score will look better than it really is.
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
# Class counts after resampling.
print('Resampled dataset shape %s' % Counter(y_res))

In [None]:
# Recombine the resampled features and labels into a single training frame.
# assign() returns a new DataFrame, so X_res is left untouched and train_resampled
# is not just another name for the same object.
train_resampled = X_res.assign(target=y_res)

In [None]:
# Row counts for every (benign/malignant, target, diagnosis) combination after oversampling.
targetDatasetResampled = (
    train_resampled
    .groupby(['benign_malignant', 'target', 'diagnosis'])
    .size()
    .reset_index(name='count')
)

# Bars per target value, split by benign/malignant colour.
fig = px.bar(
    targetDatasetResampled,
    x="target",
    y="count",
    color='benign_malignant',
    height=500,
)

# Dark theme: black canvas, white Helvetica text, no grid lines.
fig.update_layout(
    title_text='Benign and malignant by target',
    paper_bgcolor='rgb(0,0,0)',
    plot_bgcolor='rgb(0,0,0)',
    font_family="Helvetica",
    font_color="white",
    title_font_family="Helvetica",
    title_font_color="white",
    legend_title_font_color="white",
    xaxis=dict(showgrid=False, zeroline=True, visible=True),
    yaxis=dict(showgrid=False, zeroline=True, visible=True),
)

fig.show()

# 3-BUILDING A CONVOLUTIONAL NEURAL NETWORK TO PREDICT THE IMAGES.

I will use MobileNet. It's not the best-performing convolutional neural network, but I wanted to obtain the predictions as quickly as possible. If you want to compare all of the Keras convolutional neural networks and their performance, check the link: https://keras.io/api/applications/

In [None]:
import tensorflow as tf
# Imported under its own name: the notebook later binds `model` to the network it
# builds, and aliasing the class to `model` made this cell fail when re-run after that.
from keras.applications import MobileNet
from keras.layers import Dense, GlobalAveragePooling2D,Activation,Flatten
from keras.models import Model

# ImageNet-pretrained MobileNet without its classification head. The input shape is
# stated explicitly and matches the 224x224 RGB images produced by the generators.
base_model = MobileNet(weights='imagenet', include_top=False, input_shape=(224, 224, 3))







In [None]:
# Print the layer-by-layer architecture of the pretrained base network.
base_model.summary()

In [None]:
# Transfer learning: stack a new classification head on the pretrained base.
# Global average pooling collapses the base network's output feature maps.
x = GlobalAveragePooling2D()(base_model.output)

# Three fully-connected ReLU layers give the head capacity to learn more complex functions.
for units in (1024, 1024, 512):
    x = Dense(units, activation='relu')(x)

# Final two-unit softmax layer (one unit per class).
predictions = Dense(2, activation='softmax')(x)

In [None]:
# This is the model we will train: it maps the base network's input to the
# softmax output of the newly added head.
model = Model(inputs=base_model.input, outputs=predictions)

In [None]:
# First: train only the top layers (which were randomly initialized),
# i.e. freeze all layers of the pretrained convolutional base.
# Iterating over `model.layers` would also freeze the new Dense head and leave the
# network with nothing to train, so only `base_model.layers` are frozen here.
for layer in base_model.layers:
    layer.trainable = False

In [None]:
def _with_jpg_extension(image_names):
    """Return the Series `image_names` with '.jpg' appended to entries that lack it."""
    already_suffixed = image_names.str.endswith('.jpg', na=False)
    return image_names.where(already_suffixed, image_names + '.jpg')


# Build new frames instead of editing train_resampled / test through an alias:
# the source frames stay untouched and re-running this cell cannot produce
# names ending in '.jpg.jpg'.
train_data = train_resampled.assign(image_name=_with_jpg_extension(train_resampled['image_name']))
test_data = test.assign(image_name=_with_jpg_extension(test['image_name']))

# 80/20 train/validation split with a fixed seed.
# NOTE(review): train_resampled was oversampled before this split, so copies of the
# same image can fall into both X2_train and X2_val; the validation metrics are
# therefore optimistic.
X2_train, X2_val = train_test_split(train_data, test_size=0.2, random_state=42)



In [None]:
# Training-time augmentation: random rotations over the full 360 degrees and random
# horizontal flips, with pixel values rescaled by 1/255.
train_datagen=tf.keras.preprocessing.image.ImageDataGenerator(rotation_range=360,rescale=1./255, horizontal_flip=True)

In [None]:

# Options shared by the two labelled generators (training and validation): both read
# from the train image folder and yield 224x224 RGB batches of 8 with the raw
# 'target' values as labels.
_labelled_flow_options = dict(
    directory='../input/siim-isic-melanoma-classification/jpeg/train/',
    x_col="image_name",
    y_col="target",
    class_mode="raw",
    batch_size=8,
    target_size=(224, 224),
    color_mode="rgb",
)

# Training batches, drawn through the augmenting generator.
train_generator = train_datagen.flow_from_dataframe(dataframe=X2_train, **_labelled_flow_options)

# Validation batches: rescaling only, no augmentation.
validation_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
valid_generator = validation_datagen.flow_from_dataframe(dataframe=X2_val, **_labelled_flow_options)

# Test images: no labels, one image per batch, in the order of the dataframe.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_data,
    directory='../input/siim-isic-melanoma-classification/jpeg/test/',
    x_col="image_name",
    target_size=(224, 224),
    class_mode=None,
    batch_size=1,
    shuffle=False,
)

In [None]:
# Integer labels (0/1) with a two-unit softmax output -> sparse categorical cross-entropy.
model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# One epoch = one pass over the data. The step counts are derived from each generator's
# own batch size; the previous `n // 64` assumed a batch size of 64 although the
# generators use 8, so each "epoch" covered only an eighth of the samples.
# To shorten training, lower `epochs` rather than the step counts.
steps_per_epoch = max(1, train_generator.n // train_generator.batch_size)
validation_steps = max(1, valid_generator.n // valid_generator.batch_size)

# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=valid_generator,
    validation_steps=validation_steps,
    epochs=10,
)

# 4-MAKING THE PREDICTIONS.

In [None]:
import matplotlib.image as matimage
import cv2
from PIL import Image

# Predict a random test image.
# np.random.randint excludes the upper bound, so the index is always valid; the
# deprecated np.random.random_integers included it and could index one past the end.
pathImage = test_generator.filepaths[np.random.randint(0, test_generator.samples)]
print(pathImage)
img = matimage.imread(pathImage)
plt.imshow(img)

# Same preprocessing as the generators: RGB, 224x224, pixel values scaled by 1/255.
image = Image.open(pathImage)
image = image.convert('RGB')
image = image.resize((224,224))
pixels = np.asarray(image, dtype='float32') / 255.0

probabilities = model.predict(np.expand_dims(pixels, axis=0))
# Output index 1 corresponds to target == 1, i.e. the melanoma probability.
print(probabilities[0][1])



In [None]:
dataTestPathPredict='../input/siim-isic-melanoma-classification/jpeg/test/'
def make_predictions(image_name):
    """Return the model's melanoma probability for one test image.

    Parameters
    ----------
    image_name : str
        File name (including the '.jpg' extension) of an image inside
        `dataTestPathPredict`.

    Returns
    -------
    Probability the model assigns to class 1 (target == 1), read from the second
    unit of the softmax output.
    """
    # Imported here so the function does not depend on another cell having run.
    from PIL import Image

    # The context manager closes the file handle once the pixels have been read.
    with Image.open(str(dataTestPathPredict + image_name)) as image:
        resized = image.convert('RGB').resize((224,224))

    # Same scaling as the training generators (rescale=1./255); feeding raw 0-255
    # values would give the network inputs on a different scale than it was trained on.
    pixels = np.asarray(resized, dtype='float32') / 255.0
    pre = model.predict(np.expand_dims(pixels, axis=0))

    # Index 1 is the positive class; index 0 would be the benign probability.
    return pre[0][1]
    
    
    

In [None]:
# Quick check of make_predictions on a single, hard-coded test image.
make_predictions('ISIC_4809071.jpg')

In [None]:
# Score every test image, one at a time, and store the result in a new
# 'model_prediction' column of test_data.
test_data['model_prediction'] = test_data['image_name'].apply(make_predictions)

In [None]:
# Save the test metadata together with the predictions so they can be reused
# without re-running the model.
test_data.to_csv("saveData.csv")

In [None]:
# Predictions column that will be written into the submission file.
finalpred=test_data['model_prediction']

In [None]:
# Reload the submission template so `sample` starts from the file contents again.
sample=pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/sample_submission.csv')

In [None]:
# Copy the predictions into the submission frame; the assignment aligns on the index.
# NOTE(review): this presumes `sample` and `test_data` list the images in the same
# row order — confirm, or join on image_name instead.
sample['target']=finalpred

In [None]:
# Preview the first rows of the submission before writing it.
sample.head()

In [None]:
# Write the submission file; index=False leaves the row index out of the CSV.
sample.to_csv('FinalSubmission.csv',index=False)