# <font color = 'tomato'> Exploring and Sample Modelling on SIIM-ISIC Melanoma Data</font> 

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import plotly as py
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

from sklearn import preprocessing

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Read Data

In [None]:
# Root folder of the competition files inside the Kaggle runtime.
path = '/kaggle/input/siim-isic-melanoma-classification/'

# Load the training and test metadata tables from that folder.
train, test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Print a summary of the training table: columns, dtypes and non-null counts.
train.info()

In [None]:
# Peek at the first rows of the training table.
train.head()

In [None]:
# Show the dtype of every training column.
train.dtypes

In [None]:
# Count the missing values in each training column.
train.isna().sum()

## Filling NaN values in the data

In [None]:
# Replace missing values column by column.  Each result is assigned back to
# the DataFrame rather than calling ``fillna(..., inplace=True)`` on the column
# attribute: that chained in-place form raises a FutureWarning on pandas 2.x
# and leaves ``train`` untouched once Copy-on-Write is active.
# NOTE(review): 'Not Provoded' looks like a typo for 'Not Provided'; it is kept
# unchanged because the label is used as a category value downstream.
train['sex'] = train['sex'].fillna('Not Provoded')

# Missing approximate ages are imputed with the mean of the observed ages.
train['age_approx'] = train['age_approx'].fillna(train['age_approx'].mean())

train['anatom_site_general_challenge'] = (
    train['anatom_site_general_challenge'].fillna('UnKnown')
)

In [None]:
# Re-count the missing values per column to check the result of the fill step.
train.isna().sum()

## Visualise Data

In [None]:
# Draw a 4x4 grid of 16 training images picked at random (with replacement)
# from the rows whose target is 0.
b = train[train['target']==0]
n = 16
fig = plt.figure(figsize = (15,15))
sampled_rows = [b.index[np.random.randint(b.shape[0])] for _ in range(n)]
for i, ind in enumerate(sampled_rows, start=1):
    fig.add_subplot(4, 4, i)
    image = plt.imread(path+'jpeg/train/'+train.image_name[ind]+'.jpg')
    plt.imshow(image)
    plt.axis('off')
    # Title each tile with the class of the sampled row.
    if train.target[ind] == 0:
        label = 'Benign'
    else:
        label = 'Malignant'
    plt.title(label)
plt.show()

In [None]:
# Same 4x4 preview as for target 0, but sampling (with replacement) from the
# rows whose target is 1.
b = train[train['target']==1]
n = 16
fig = plt.figure(figsize = (15,15))
sampled_rows = [b.index[np.random.randint(b.shape[0])] for _ in range(n)]
for i, ind in enumerate(sampled_rows, start=1):
    fig.add_subplot(4, 4, i)
    image = plt.imread(path+'jpeg/train/'+train.image_name[ind]+'.jpg')
    plt.imshow(image)
    plt.axis('off')
    # Title each tile with the class of the sampled row.
    if train.target[ind] == 0:
        label = 'Benign'
    else:
        label = 'Malignant'
    plt.title(label)
plt.show()

## Few Insights
## Which gender is affected most?

In [None]:
# Count the records per value of ``sex`` and show the shares as a pie chart.
sex_counts = train.sex.value_counts()
x = pd.DataFrame(
    data={
        'sex': sex_counts.index.tolist(),
        'Count': sex_counts.values.tolist(),
    }
)
fig = px.pie(x, values='Count', names='sex', title='Gender Affected Most')
fig.show()

## Age

In [None]:
# Tally how many records carry each approximate age, then plot one bar per
# age, coloured by the age value.
x = train.age_approx.value_counts()
ages, tallies = x.index, x.values
df = pd.DataFrame({'Age': ages, 'Count': tallies})
px.bar(df, x='Age', y='Count', color='Age', barmode='group')

## Diagnosis

In [None]:
# Count how often each diagnosis label occurs and show the shares as a pie
# chart.  The frame column and the chart title are named after the diagnosis
# so the figure is not mislabelled as a gender breakdown.
x = train.diagnosis.value_counts()
x = pd.DataFrame(data={'diagnosis': x.index.tolist(), 'Count': x.values.tolist()})
fig = px.pie(x, values='Count', names='diagnosis', title='Diagnosis Distribution')
fig.show()

## Modelling - XGBoost

In [None]:
# Keep only the tabular features used by the model plus the target.  An
# explicit copy makes ``tr`` independent of ``train``, so the column
# assignments done during encoding do not raise SettingWithCopyWarning.
tr = train[['sex', 'age_approx', 'anatom_site_general_challenge', 'target']].copy()
tr.head()

In [None]:
# Show the dtypes of the selected modelling columns.
tr.dtypes

## Encoding Data

In [None]:
# Turn the two string-valued columns into integer codes.  The single encoder
# is re-fitted for each column in turn.
label_encoder = preprocessing.LabelEncoder()

for column in ('sex', 'anatom_site_general_challenge'):
    tr[column] = label_encoder.fit_transform(tr[column])

tr.head()

In [None]:
# Fill missing anatomical sites in the test table with the same placeholder
# used for the training table.  The result is assigned back because a chained
# ``fillna(..., inplace=True)`` on the column attribute warns on pandas 2.x
# and has no effect once Copy-on-Write is active.
test['anatom_site_general_challenge'] = (
    test['anatom_site_general_challenge'].fillna('UnKnown')
)
# Show the remaining missing-value counts per test column.
test.isna().sum()

In [None]:
# Select the same feature columns the model is trained on; copy so the column
# assignments below act on an independent frame.
ts = test[['sex', 'age_approx', 'anatom_site_general_challenge']].copy()

# Encode the test categories with encoders fitted on the *training* values.
# Fitting a fresh encoder on the test data would number the categories by the
# labels present in the test set only, so the same string could get a
# different integer than the one the model saw during training.
# ``transform`` raises ValueError if the test set holds a label that never
# occurs in ``train``, which surfaces the mismatch instead of hiding it.
# Assumes ``train`` still holds the filled string columns at this point.
for column in ('sex', 'anatom_site_general_challenge'):
    label_encoder = preprocessing.LabelEncoder().fit(train[column])
    ts[column] = label_encoder.transform(ts[column])

ts.head()

In [None]:
# Split the encoded table 70/30: every column but the last is a feature, the
# last column is the target.  ``stratify`` keeps the class ratio identical in
# both parts, which matters when one class is rare.
# NOTE(review): confirm the class balance with train.target.value_counts().
features = tr.iloc[:, :-1]
labels = tr.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=11, stratify=labels
)

model = XGBClassifier()
model.fit(X_train, y_train)
# Predict class labels for the held-out rows.  ``predict`` already returns
# integer labels, so no rounding step is needed.
y_pred = model.predict(X_test)
predictions = y_pred
# Report plain accuracy on the held-out rows.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
# Print per-class precision, recall and F1 for the held-out predictions.
print(classification_report(y_test, predictions))

In [None]:
# Confusion matrix of the held-out split, kept in ``x`` for the heatmap cell.
x = confusion_matrix(y_test, predictions)

In [None]:
# Render the confusion matrix ``x`` as an annotated heatmap: the cell values
# double as the annotation text, both axes are labelled with the class ids
# 0 and 1, and the colour bar is hidden.
# NOTE(review): matrix rows (presumably true labels) go on the y axis and
# columns (predicted labels) on the x axis -- confirm the orientation.
ff.create_annotated_heatmap(
    z=x,
    x=[0,1],
    y=[0,1],
    annotation_text=x,
    showscale=False, colorscale='Peach')

## Predictions on unseen data

In [None]:
# Score the encoded test features with the positive-class probability (column
# 1 of ``predict_proba``) instead of hard 0/1 labels: a submission ranked by
# ROC AUC needs graded scores, and hard labels throw that ranking away.
# NOTE(review): confirm the competition metric expects probabilities.
pred = model.predict_proba(ts)[:, 1]

In [None]:
# Load the sample submission file from the competition folder as a template.
sub = pd.read_csv(path+'sample_submission.csv')

In [None]:
# Print the columns, dtypes and non-null counts of the submission template.
sub.info()

In [None]:
# Peek at the first rows of the submission template.
sub.head()

In [None]:
# Overwrite the template's ``target`` column with the test predictions.
# NOTE(review): assumes the rows of ``sub`` are in the same order as ``test``
# -- confirm against image_name.
sub.target = pred

In [None]:
# Write the submission to the working directory without the index column.
sub.to_csv('submission_XGBoost.csv', index=False)

## Give an upvote if you like the work.