In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cv2 
from matplotlib.image import imread
# Technically not necessary in newest versions of jupyter
%matplotlib inline

In [None]:
# Root of the competition dataset; every path used below hangs off it.
data_root = '/kaggle/input/siim-isic-melanoma-classification/'

# Image folders keep their trailing slash because file names are appended by plain concatenation.
train_dir = data_root + 'jpeg/train/'
test_dir = data_root + 'jpeg/test/'

# Metadata tables and the submission template.
train = pd.read_csv(data_root + 'train.csv')
test = pd.read_csv(data_root + 'test.csv')
submission = pd.read_csv(data_root + 'sample_submission.csv')

In [None]:
train.head()

In [None]:
train['target'].value_counts()

In [None]:
# Count images per anatomical site once and reuse the result for labels and wedge sizes.
site_counts = train['anatom_site_general_challenge'].value_counts()
labels = site_counts.index
values = site_counts.values

# Axes spanning the whole figure, equal aspect so the pie is drawn as a circle.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')
ax.pie(values, labels=labels, autopct='%1.2f%%')
plt.show()

print(labels)


#### A "nevus" is basically a visible, circumscribed, chronic lesion of the skin.

In [None]:
# Remove the listed columns, then cross-tabulate diagnosis against the benign/malignant flag.
dropped_columns = ['image_name', 'patient_id', 'sex', 'age_approx',
                   'anatom_site_general_challenge', 'target']
new = train.drop(columns=dropped_columns)
pd.crosstab(new['diagnosis'].values, new['benign_malignant'])

### Balance the training set: sample 2,000 rows with target 0 and keep every row with target 1

In [None]:
# Keep a fixed-size random subset of the target == 0 rows and every target == 1 row.
# The seed makes the subset (and everything trained on it) reproducible across runs.
df_0 = train[train['target'] == 0].sample(2000, random_state=1234)
df_1 = train[train['target'] == 1]
train = pd.concat([df_0, df_1])
# drop=True: renumber the rows without keeping the old index as an extra 'index' column.
train = train.reset_index(drop=True)

### Preparing the Datasets

In [None]:
# Full path of every training image: <train_dir><image_name>.jpg
data = (train_dir + train['image_name'] + '.jpg').tolist()
# Dividing by 1.0 turns the integer target into a float label.
labels = (train['target'] / 1.0).tolist()

# Two-column frame (image path, float target) consumed by the split and the generators below.
df = pd.DataFrame({'images': data, 'target': labels})

In [None]:
df.head()

In [None]:
# Full path of every test image: <test_dir><image_name>.jpg
test_data = (test_dir + test['image_name'] + '.jpg').tolist()
df_test = pd.DataFrame({'images': test_data})

In [None]:
df_test.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the images for validation.
# stratify keeps the positive/negative ratio identical in both parts; without it the
# minority class can end up under-represented in the validation set purely by chance.
X_train, X_val, y_train, y_val = train_test_split(
    df['images'], df['target'],
    test_size=0.2, random_state=1234, stratify=df['target'])

# Re-assemble (path, label) frames; the Series share their index, so rows stay aligned.
train = pd.DataFrame({'images': X_train, 'target': y_train})
validation = pd.DataFrame({'images': X_val, 'target': y_val})

####  Preprocessing 

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Options shared by the training and validation iterators.
flow_options = dict(
    x_col='images',
    y_col='target',
    target_size=(224, 224),
    batch_size=8,
    class_mode='raw',   # labels are passed through as the raw numeric column
)

# Training images are rescaled to [0, 1] and randomly rotated / shifted / flipped.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
)
# Validation images are only rescaled.
val_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_dataframe(train, shuffle=True, **flow_options)
validation_generator = val_datagen.flow_from_dataframe(validation, shuffle=False, **flow_options)


### Modelling

In [None]:
from tensorflow.keras.applications import VGG16, Xception
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import *
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D

def vgg16_model(num_classes=None):
    """Build an ImageNet-pretrained Xception classifier for 224x224 RGB inputs.

    Despite its name the function uses an Xception backbone (the VGG16
    variant was previously commented out); the name is kept so existing
    calls keep working.

    Parameters
    ----------
    num_classes : int or None
        ``None`` or ``1`` builds a single sigmoid output unit (binary
        classification). Any larger value builds a softmax head with that
        many units.

    Returns
    -------
    Model
        Uncompiled Keras model mapping an image batch to class scores.
    """
    backbone = Xception(
        include_top=False,
        weights="imagenet",
        input_shape=(224, 224, 3),
    )

    # Flatten the convolutional feature map and attach the classification head.
    features = Flatten()(backbone.output)
    if num_classes is None or num_classes == 1:
        output = Dense(1, activation='sigmoid')(features)
    else:
        output = Dense(num_classes, activation='softmax')(features)

    return Model(backbone.input, output)

vgg_conv=vgg16_model(1)

In [None]:
import tensorflow.keras.backend as K 

def focal_loss(alpha=0.25, gamma=2.0):
    """Create a binary focal loss function for Keras.

    Parameters
    ----------
    alpha : float
        Weight applied to samples whose label is 1; samples whose label is 0
        are weighted by ``1 - alpha``.
    gamma : float
        Focusing exponent. The cross-entropy of each sample is scaled by
        ``(1 - p_t) ** gamma`` where ``p_t`` is the probability the model
        assigned to the true label, so confidently-correct samples contribute
        less. ``gamma=0`` reduces to alpha-weighted binary cross-entropy.

    Returns
    -------
    callable
        ``focal_crossentropy(y_true, y_pred)`` returning the loss averaged
        over the last axis.
    """
    def focal_crossentropy(y_true, y_pred):
        # Labels may arrive as integers or float64; align them with the predictions
        # so the element-wise arithmetic below does not hit a dtype mismatch.
        y_true = K.cast(y_true, y_pred.dtype)

        # Keep probabilities away from exactly 0 and 1.
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        bce = K.binary_crossentropy(y_true, y_pred)

        # Probability assigned to the true label.
        p_t = (y_true * y_pred) + ((1 - y_true) * (1 - y_pred))

        alpha_factor = y_true * alpha + ((1 - alpha) * (1 - y_true))
        modulating_factor = K.pow((1 - p_t), gamma)

        # compute the final loss and return
        return K.mean(alpha_factor * modulating_factor * bce, axis=-1)
    return focal_crossentropy

In [None]:
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.metrics  import AUC 
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
# rejected by newer Keras releases.
opt = SGD(learning_rate=0.001)
vgg_conv.compile(loss=focal_loss(), metrics=[AUC()], optimizer=opt)

In [None]:
nb_epochs = 10
# Take the batch size from the generator instead of hard-coding a second, different value:
# a step count derived from a larger batch size than the generator actually yields makes
# every "epoch" stop after a small fraction of the data.
batch_size = train_generator.batch_size
# len() of a Keras data iterator is the number of batches in one full pass (last partial batch included).
nb_train_steps = len(train_generator)
nb_val_steps = len(validation_generator)
print("Number of training and validation steps: {} and {}".format(nb_train_steps,nb_val_steps))


In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and absent from
# recent TensorFlow/Keras releases.
vgg_conv.fit(
    train_generator,
    steps_per_epoch=nb_train_steps,
    epochs=nb_epochs,
    validation_data=validation_generator,
    validation_steps=nb_val_steps)


### Save Model

In [None]:
vgg_conv.summary()
vgg_conv.save('Xception_model.h5') 


###  Load Model 

In [None]:
import tensorflow
from tensorflow.keras.models import *

# The model was compiled with a custom loss, so Keras needs a mapping from the names it may
# find in the saved file to the matching Python callables.
custom_losses = {
    'focal_loss': focal_loss,
    'focal_crossentropy': focal_loss(),
}
loaded_model = tensorflow.keras.models.load_model('Xception_model.h5', custom_objects=custom_losses)

### Improve predictions by combining the CNN output with patient metadata

In [None]:
import pandas as pd
import numpy as np

train_orig=pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/train.csv')
test_orig=pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')

train_sample =  train_orig.copy()
test_sample = test_orig.copy()

train_sample.info()
test_sample.info()

In [None]:
##sample 
# Keep a seeded random subset of the target == 0 rows plus every target == 1 row,
# so the sample is the same on every run.
df_0 = train_sample[train_sample['target'] == 0].sample(20000, random_state=1234)
df_1 = train_sample[train_sample['target'] == 1]
train_sample = pd.concat([df_0, df_1])
# drop=True avoids carrying the old row numbers along as an 'index' column, which would
# otherwise be written to the CSV and show up as a meaningless feature in the correlations.
train_sample = train_sample.reset_index(drop=True)

In [None]:
# Full path of every sampled training image: <train_dir><image_name>.jpg
image_data = (train_dir + train_sample['image_name'] + '.jpg').tolist()

train_sample['images'] = image_data

In [None]:
from tqdm import tqdm

# Score every sampled training image with the CNN and store the probability as a new feature.
# NOTE(review): train_sample is drawn from the same train.csv the CNN was fitted on (all
# target == 1 rows appear in both), so many of these predictions are in-sample and will look
# better than predictions on unseen images.
target = []
for path in tqdm(train_sample['images']):
    img = cv2.imread(str(path))
    # cv2.imread signals a missing or unreadable file by returning None instead of raising.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    img = cv2.resize(img, (224, 224))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; convert to RGB
    img = img.astype(np.float32) / 255.          # same 1/255 rescaling as the generators
    img = np.reshape(img, (1, 224, 224, 3))      # add the batch dimension
    # verbose=0: tqdm already reports progress; avoid one Keras progress line per image.
    prediction = vgg_conv.predict(img, verbose=0)
    target.append(prediction[0][0])

train_sample['predicted'] = target

In [None]:
train_sample.info()

In [None]:
train_sample.head()

In [None]:
train_sample.to_csv(r'traindata_withpredicted_xception.csv', index = False)

### Reload the saved predictions from CSV

In [None]:
ls


In [None]:
#train_sample = pd.read_csv('../input/siimisic-melanoma-saved-model/traindata_withpredicted_xception.csv')
train_sample = pd.read_csv('traindata_withpredicted_xception.csv')

In [None]:
train_sample.head()

In [None]:
train_sample[train_sample['target']==1].describe().transpose()

In [None]:
sns.countplot(x='benign_malignant',data=train_sample)

In [None]:
# Restrict to numeric/boolean columns explicitly: newer pandas raises when corr() meets
# string columns instead of silently dropping them.
train_sample.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Restrict to numeric/boolean columns explicitly: newer pandas raises when corr() meets
# string columns instead of silently dropping them.
sns.heatmap(train_sample.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# .copy() gives X its own data: the cells below assign into its columns, which on a plain
# slice of train_sample triggers SettingWithCopyWarning and may not behave as intended.
X = train_sample[['sex','age_approx','anatom_site_general_challenge','predicted']].copy()
# Dividing by 1.0 turns the integer target into floats.
y = train_sample[['target']]/1.0

In [None]:
X.head()

In [None]:
X.count()


In [None]:
y.count()

In [None]:
type(X)

In [None]:
X['age_approx'] = X['age_approx'].fillna(X['age_approx'].mean())

In [None]:
X[X['age_approx'].isnull()==True]

In [None]:
X['sex'] = X['sex'].fillna(value ='male')

In [None]:
X[X['sex'].isnull()==True]

In [None]:
sns.countplot(x='anatom_site_general_challenge',data=train_sample)

In [None]:
X['anatom_site_general_challenge'] = X['anatom_site_general_challenge'].fillna(value ='torso')

In [None]:
X[X['anatom_site_general_challenge'].isnull()==True]

In [None]:
X  = X.values

In [None]:
# Taking care of missing numerical data
# Column 1 of the array is 'age_approx' (second entry of the column list used to build X).
# NOTE(review): an earlier cell already fills the NaNs in 'age_approx' with the column mean,
# so when the notebook runs top to bottom this imputation should find nothing left to fill.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X[:, 1:2])
# The 1:2 slice keeps the column two-dimensional, as the imputer requires.
X[:, 1:2] = imputer.transform(X[:, 1:2])

In [None]:
print(X)

In [None]:
# Encoding categorical data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer 
from  sklearn.preprocessing import OneHotEncoder
# One-hot encode columns 0 ('sex') and 2 ('anatom_site_general_challenge'); the remaining
# columns are passed through unchanged.
# sparse_threshold=0 forces a dense array: the MinMaxScaler applied afterwards does not
# accept sparse input, and whether ColumnTransformer returns sparse otherwise depends on
# how many categories the data happens to contain.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 2])],
    remainder='passthrough',
    sparse_threshold=0)
X = ct.fit_transform(X)


In [None]:
print(X)

In [None]:
type(y)

In [None]:
y = y.values

In [None]:
from sklearn.model_selection import train_test_split
# stratify keeps the share of positive samples equal in the train and test parts, which
# matters for a heavily imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=101,stratify=y)



In [None]:
from sklearn.preprocessing import MinMaxScaler
# Learn the per-feature min/max on the training part only, then apply the same scaling
# to both parts.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
# Widths of the fully connected ReLU layers, in order.
# https://stats.stackexchange.com/questions/181/how-to-choose-the-number-of-hidden-layers-and-nodes-in-a-feedforward-neural-netw
# NOTE(review): the connections between these hidden layers alone add up to more than
# 500 million weights; confirm this size is intended for a handful of tabular features.
hidden_units = [1000, 2000, 5000, 10000, 20000, 10000, 5000, 2000, 1000,
                400, 200, 100, 50, 30, 10, 5]

model = Sequential()
for units in hidden_units:
    model.add(Dense(units=units, activation='relu'))
    if units == 400:
        # The single dropout layer sits directly after the 400-unit layer.
        model.add(Dropout(0.2))

# Single sigmoid unit for a binary classification problem
model.add(Dense(units=1, activation='sigmoid'))

# Use a fresh optimizer: an optimizer instance keeps state tied to the model it was first
# used with, so sharing the one already compiled into the CNN is unsafe (and an error in
# newer Keras releases).
model.compile(loss=focal_loss(), metrics=[AUC()], optimizer=SGD(learning_rate=0.001))

In [None]:
# Training schedule for the metadata network; the held-out split is evaluated after every epoch.
fit_options = dict(
    epochs=400,
    batch_size=128,
    validation_data=(X_test, y_test),
    verbose=1,
)
model.fit(x=X_train, y=y_train, **fit_options)

In [None]:
model_loss = pd.DataFrame(model.history.history)

In [None]:
model_loss.plot()

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score

# Gradient-boosted regressor fitted on the 0/1 target; its output is used as a score.
XGBModel = XGBRegressor()
XGBModel.fit(X_train,y_train , verbose=False)

# Get the mean absolute error on the validation data :
XGBpredictions = XGBModel.predict(X_test)
MAE = mean_absolute_error(y_test , XGBpredictions)
print('XGBoost validation MAE = ',MAE)

# MAE says little about how well a binary target is ranked; also report ROC AUC, the
# metric tracked for the neural networks above, so the models can be compared.
XGB_AUC = roc_auc_score(y_test.ravel(), XGBpredictions)
print('XGBoost validation ROC AUC = ',XGB_AUC)

### Submission

In [None]:
import cv2 
# Score every test image with the CNN, using the same preprocessing as for the training images.
target = []
for path in df_test['images']:
    img = cv2.imread(str(path))
    # cv2.imread signals a missing or unreadable file by returning None instead of raising.
    if img is None:
        raise FileNotFoundError("Could not read image: {}".format(path))
    img = cv2.resize(img, (224, 224))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; convert to RGB
    img = img.astype(np.float32) / 255.          # same 1/255 rescaling as the generators
    img = np.reshape(img, (1, 224, 224, 3))      # add the batch dimension
    # verbose=0 keeps Keras from printing a progress line for every single image.
    prediction = vgg_conv.predict(img, verbose=0)
    target.append(prediction[0][0])

submission['target'] = target

# NOTE(review): despite the file name, these scores come from the CNN (vgg_conv), not from
# the XGBoost model; the name is kept because a later cell reads this exact file.
submission.to_csv('submission_XGB.csv', index=False)
submission.head()

In [None]:
submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
#train_sample = pd.read_csv('../input/siimisic-melanoma-saved-model/traindata_withpredicted_xception.csv')
test_pred = pd.read_csv('submission_XGB.csv')
test_data=pd.read_csv('/kaggle/input/siim-isic-melanoma-classification/test.csv')

In [None]:
ls 