In [88]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix 
base_skin_dir = os.path.join('..', 'input')

Using TensorFlow backend.


In [51]:
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'melanoma ',   #this is an error in the other scripts
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [52]:
tile_df = pd.read_csv(os.path.join(base_skin_dir, 'skin-cancer-mnist-ham10000/HAM10000_metadata.csv'))
tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get) 
tile_df['cell_type_idx'] = pd.Categorical(tile_df['cell_type']).codes
tile_df.sample(3)
tile_df.describe(exclude=[np.number])

Unnamed: 0,lesion_id,image_id,dx,dx_type,sex,localization,path,cell_type
count,10015,10015,10015,10015,10015,10015,0.0,10015
unique,7470,10015,7,4,3,15,0.0,7
top,HAM_0003789,ISIC_0031288,nv,histo,male,back,,Melanocytic nevi
freq,6,1,6705,5340,5406,2192,,6705


In [53]:
tile_df.isnull().sum()

lesion_id            0
image_id             0
dx                   0
dx_type              0
age                 57
sex                  0
localization         0
path             10015
cell_type            0
cell_type_idx        0
dtype: int64

In [54]:
tile_df['age'].fillna((tile_df['age'].mean()), inplace=True)

In [55]:
tile_df.isnull().sum()

lesion_id            0
image_id             0
dx                   0
dx_type              0
age                  0
sex                  0
localization         0
path             10015
cell_type            0
cell_type_idx        0
dtype: int64

In [56]:
images=pd.read_csv('../input/skin-cancer-mnist-ham10000/hmnist_28_28_RGB.csv')

In [57]:
#check  image label equals tiledf celltype
(images.label==tile_df.cell_type_idx).mean()

1.0

In [58]:
from sklearn.preprocessing import LabelEncoder
Encoder_X = LabelEncoder() 
for col in tile_df.columns:
    if tile_df.dtypes[col]=='object':
        tile_df[col]=col+'_'+tile_df[col].map(str)
        tile_df[col] = Encoder_X.fit_transform(tile_df[col])
Encoder_y=LabelEncoder()
#tile_df

In [59]:
tile_df[['dx','dx_type','age','sex','localization','cell_type']].head()

Unnamed: 0,dx,dx_type,age,sex,localization,cell_type
0,2,3,80.0,1,11,2
1,2,3,80.0,1,11,2
2,2,3,80.0,1,11,2
3,2,3,80.0,1,11,2
4,2,3,75.0,1,4,2


In [60]:
images=images.reset_index()
images=(images.T.append(tile_df[['dx_type','age','sex','localization']].T)).T
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(images.drop(['label'],axis=1),images['label'], test_size=0.2, random_state=42)

In [97]:
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score

In [98]:
class TrAdaboostClassifier:
    def __init__(self, base_classifier=DecisionTreeClassifier(), N=10):
        self.base_classifier = base_classifier
        self.N = N
        self.beta_all = np.zeros([1, self.N])
        self.classifiers = []

    def fitdtree(self, x_source, x_target, y_source, y_target):
        dtree_model = DecisionTreeClassifier(max_depth = 2).fit(X_train, y_train) 
        dtree_predictions = dtree_model.predict(X_test) 
        accuracy = dtree_model.score(X_test, y_test) 
        print('Using Decision Tree as the base Learner : ')
        print('Accuracy:')
        print(accuracy)
        scores = cross_val_score(dtree_model, X_test, y_test, cv=10)
        print('Cross-Validation:')
        print(scores)
        #cm=confusion_matrix(y_test,dtree_predictions) 
        #print('Confusion Matrix when Decision Tree is the base Learner : ')
        #print(cm)
        f1=f1_score(y_test, dtree_predictions, average='weighted')
        print('f1-score: ') 
        print(f1)
        p=precision_recall_fscore_support(y_test,dtree_predictions, average='macro')
        print('Precision , Recall , F-score:') 
        print(p)
        
    def fitsvm(self, x_source, x_target, y_source, y_target):
        svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train) 
        svm_predictions = svm_model_linear.predict(X_test) 
        accuracy = svm_model_linear.score(X_test, y_test) 
        print('Using SVM as the base Learner: ') 
        print('Accuracy:')
        print(accuracy)
        scores = cross_val_score(svm_model_linear, X_test, y_test, cv=10)
        print('Cross-Validation:')
        print(scores)
        #cm=confusion_matrix(y_test, svm_predictions) 
        #print('Confusion Matrix when SVM is the base Learner : ')
        #print(cm)
        f1=f1_score(y_test, svm_predictions, average='weighted')
        print('f1-score:') 
        print(f1)
        p=precision_recall_fscore_support(y_test,svm_predictions, average='macro')
        print('Precision , Recall , F-score: ') 
        print(p)
        
         
        
    def fitnaive(self, x_source, x_target, y_source, y_target):
        gnb = GaussianNB().fit(X_train, y_train) 
        gnb_predictions = gnb.predict(X_test) 
        accuracy = gnb.score(X_test, y_test) 
        print('Using Naive Bayes as the base Learner: ')
        print('Accuracy:')
        print(accuracy)
        scores = cross_val_score(gnb, X_test, y_test, cv=10)
        print('Cross-Validation:')
        print(scores)
        #cm=confusion_matrix(y_test, gnb_predictions) 
        #print('Confusion Matrix when Naive Bayes is the base Learner : ')
        #print(cm)
        f1=f1_score(y_test, gnb_predictions, average='weighted')
        print('f1-score : ') 
        print(f1)
        p=precision_recall_fscore_support(y_test,gnb_predictions, average='macro')
        print('Precision , Recall , F-score:') 
        print(p)
        
    def fitrf(self, x_source, x_target, y_source, y_target):
        model = RandomForestClassifier(n_estimators=10)
        rf=model.fit(X_train, y_train)
        rf_predictions=rf.predict(X_test)
        accuracy=rf.score(X_test, y_test)
        print('Using Random Forest Classifier as the base Learner : ')
        print('Accuracy:')
        print(accuracy)
        scores = cross_val_score(rf, X_test, y_test, cv=10)
        print('Cross-Validation:')
        print(scores)
        #cm=confusion_matrix(y_test, rf_predictions) 
        #print('Confusion Matrix when Random Forest Classifier is the base Learner : ')
        #print(cm)
        f1=f1_score(y_test, rf_predictions, average='weighted')
        print('f1-score: ') 
        print(f1)
        p=precision_recall_fscore_support(y_test,rf_predictions, average='macro')
        print('Precision , Recall , F-score: ') 
        print(p)
        
    def fitNearestNeighbours(self, x_source, x_target, y_source, y_target):
        model = neighbors.KNeighborsClassifier()
        nn=model.fit(X_train, y_train)
        nn_predictions=nn.predict(X_test)
        accuracy=nn.score(X_test, y_test)
        print('Using K Nearest Neighbours as the base Learner : ')
        print(accuracy)
        scores = cross_val_score(nn, X_test, y_test, cv=10)
        print('Cross-Validation:')
        print(scores)
        #cm=confusion_matrix(y_test, rf_predictions) 
        #print('Confusion Matrix when Random Forest Classifier is the base Learner : ')
        #print(cm)
        f1=f1_score(y_test, nn_predictions, average='weighted')
        print('f1-score: ') 
        print(f1)
        p=precision_recall_fscore_support(y_test,nn_predictions, average='macro')
        print('Precision , Recall , F-score: ') 
        print(p)
  
    def predict(self, x_test):
        result = np.ones([x_test.shape[0], self.N + 1])
        predict = []

        i = 0
        for classifier in self.classifiers:
            y_pred = classifier.predict(x_test)
            result[:, i] = y_pred
            i += 1

        for i in range(x_test.shape[0]):
            left = np.sum(result[i, int(np.ceil(self.N / 2)): self.N] *
                          np.log(1 / self.beta_all[0, int(np.ceil(self.N / 2)):self.N]))

            right = 0.5 * np.sum(np.log(1 / self.beta_all[0, int(np.ceil(self.N / 2)): self.N]))

            if left >= right:
                predict.append(1)
            else:
                predict.append(0)
        return predict

    def predict_prob(self, x_test):
        result = np.ones([x_test.shape[0], self.N + 1])
        predict = []

        i = 0
        for classifier in self.classifiers:
            y_pred = classifier.predict(x_test)
            result[:, i] = y_pred
            i += 1

        for i in range(x_test.shape[0]):
            left = np.sum(result[i, int(np.ceil(self.N / 2)): self.N] *
                          np.log(1 / self.beta_all[0, int(np.ceil(self.N / 2)):self.N]))

            right = 0.5 * np.sum(np.log(1 / self.beta_all[0, int(np.ceil(self.N / 2)): self.N]))
            predict.append([left, right])
        return predict

    def _calculate_weight(self, weights):
        sum_weight = np.sum(weights)
        return np.asarray(weights / sum_weight, order='C')

    def _calculate_error_rate(self, y_target, y_predict, weight_target):
        sum_weight = np.sum(weight_target)
        return np.sum(weight_target[:, 0] / sum_weight * np.abs(y_target - y_predict))
    

In [99]:
tr=TrAdaboostClassifier()

In [100]:
tr.fitdtree(X_train, X_test, y_train, y_test)
tr.fitsvm(X_train, X_test, y_train, y_test)
tr.fitnaive(X_train, X_test, y_train, y_test)
tr.fitrf(X_train, X_test, y_train, y_test)
tr.fitNearestNeighbours(X_train, X_test, y_train, y_test)

Accuracy when Decision Tree is the base Learner : 
0.9266100848726909
f1-score : 
0.8997699963497814
Precision , Recall , F-score:
(0.5134042433110542, 0.5688044469282655, 0.5346267748793733, None)


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy when SVM is base Learner : 
0.9475786320519222
[0.91089109 0.92537313 0.92039801 0.89949749 0.91183879]
f1-score : 
0.9479362943574741
Precision , Recall , F-score: 
(0.8219100238047619, 0.8389472563696339, 0.8270580958780958, None)
Accuracy when Naive Bayes is base Learner : 
0.5771342985521717
f1-score : 
0.6180130700362945
Precision , Recall , F-score:
(0.36992632058391706, 0.5286572540515171, 0.40946705588247256, None)
Accuracy when Random Forest Classifier is base Learner : 
0.7863205192211683
f1-score : 
0.7595449365491649
Precision , Recall , F-score: 
(0.7291381746235626, 0.4143458805745654, 0.4583063060348839, None)
Accuracy when K Nearest Neighbours is base Learner : 
0.9515726410384423
f1-score : 
0.9495024901998802
Precision , Recall , F-score: 
(0.7985096427305969, 0.7552048600956925, 0.7704854426603603, None)


In [85]:
tr.predict(X_test)



[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
