In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import time
import seaborn as sns 

from sklearn.model_selection import train_test_split
import xgboost as xgb
%pylab inline
warnings.filterwarnings(action="ignore")

plt.style.use('seaborn-darkgrid')
sns.set(font_scale=1)

import os
print(os.listdir("../input"))

In [None]:
# The dataset stores three statistics (mean, standard error, worst) for each of
# ten measurements; the 30 column names are generated in that order.
feature_stems = ["radius", "perimeter", "area", "smoothness", "compactness",
                 "concavity", "concave points", "symmetry", "fractal_dimension",
                 "texture"]
stat_suffixes = ("mean", "se", "worst")
raw_columns = ["diagnosis"] + [f"{stem}_{stat}"
                               for stem in feature_stems
                               for stat in stat_suffixes]

# Keep only the target and the 30 measurements, in the order built above.
breastCancer = pd.read_csv('../input/data.csv')[raw_columns]

# Replace spaces with underscores so that every column ("concave points_*")
# becomes a valid attribute name.
breastCancer.columns = [column.replace(" ", "_") for column in raw_columns]

data = breastCancer
# Missing-value count per column.
data.isnull().sum()

In [None]:
# Integer encoding of the target: 'M' -> 0, 'B' -> 1.
diagnostic = {'M':0,'B':1}

# The encoding overwrites `diagnosis` in place, so running this cell a second
# time would look up integers in the mapping above and raise KeyError. The
# presence of the derived `diagnostic` column marks the work as already done.
if "diagnostic" not in data.columns:
    raw_codes = data["diagnosis"]
    # Human-readable label kept as the last column.
    data["diagnostic"] = raw_codes.apply(lambda x:"Maligne" if x == "M" else "Bénigne")
    # Numeric target used by the classifiers.
    data["diagnosis"] = raw_codes.apply(lambda x:diagnostic[x])

data.head()

In [None]:
# Pearson correlation between the encoded target and every measurement; the
# text label column is excluded because it cannot be cast to float.
colormap = plt.cm.RdBu
sns.set(font_scale=1)

numeric_columns = data.drop(columns=['diagnostic']).astype(float)
pearson = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(28,28))
ax.set_title('Correlation Pearson des variables', y=1.05, size=24)
sns.heatmap(pearson,
            ax=ax,
            annot=True,
            cmap=colormap,
            square=True,
            vmax=1.0,
            linewidths=0.3,
            linecolor='white')

# Split data 

In [None]:
# Target: the integer-encoded diagnosis (first column).
y = data["diagnosis"]
# Features: every column strictly between the first and the last one.
X = data.iloc[:, 1:-1]
y.unique()

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Train Classifiers

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, \
                             AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, accuracy_score, log_loss, hamming_loss, \
                            precision_score, recall_score, f1_score, jaccard_similarity_score, \
                            precision_recall_curve
from xgboost.sklearn import XGBClassifier

def comparaisonsTreeClassifieurs(X_train, X_test, y_train, y_test, n_tree = 5000):
    """Fit six tree-based classifiers and compare them on a held-out set.

    Each model is fitted on ``(X_train, y_train)`` and evaluated on
    ``(X_test, y_test)``. One line is printed per model (accuracy in percent,
    ROC AUC, elapsed time) and all ROC curves are drawn on a single figure.

    Parameters
    ----------
    X_train, X_test
        Feature tables passed to ``fit`` / ``predict`` / ``predict_proba``.
    y_train, y_test
        Targets. Column 1 of ``predict_proba`` is used as the score, so a
        binary target is assumed.
    n_tree : int, default 5000
        Number of estimators for every ensemble model (the single decision
        tree does not use it).

    Returns
    -------
    yClassifications : pandas.DataFrame
        ``Observations`` (the test targets) plus, for every model, a
        ``<name>_prob`` column (score of class 1) and a ``<name>_pred`` column
        (predicted label).
    resultats : pandas.DataFrame
        One row per model with the columns ``aucROC``, ``accuracy``,
        ``log_loss``, ``hamming_loss``, ``precision``, ``sensibilite``, ``f1``
        and ``jaccard_similarity``, sorted by ``aucROC`` in descending order.

    Notes
    -----
    * NumPy's global random generator is re-seeded with 123456 on every call.
    * ``log_loss`` is computed from the predicted probabilities, not from the
      hard 0/1 predictions.
    """
    t0 = time.time()
    # Side effect: resets the global NumPy seed; the estimators below are built
    # without an explicit random_state.
    np.random.seed(123456)

    yClassifications = pd.DataFrame()
    yClassifications['Observations'] = y_test
    observed = yClassifications['Observations']

    names = ["Decision_Tree", "Tree_Bagging",
             "Random_Forest", "Ada_Boost", "Gradient_Boosting",
             "XGBoost"]

    xgb_params = {'base_score'      : 0.5,
                  'booster'         : 'gbtree',
                  'gamma'           : 3,
                  'learning_rate'   : 0.3,
                  'max_depth'       : 4,
                  'min_child_weight': 2,
                  'n_estimators'    : n_tree,
                  'objective'       : 'binary:logistic',
                  'silent'          : True,
                  'subsample'       : 0.8}

    classifiers = [
        DecisionTreeClassifier(max_depth=5),
        BaggingClassifier(n_estimators=n_tree, bootstrap=True),
        RandomForestClassifier(max_depth=5, n_estimators=n_tree, max_features=5),
        AdaBoostClassifier(n_estimators=n_tree),
        GradientBoostingClassifier(n_estimators=n_tree, max_leaf_nodes=4, min_samples_split=5),
        XGBClassifier(**xgb_params)]

    metric_columns = ["aucROC", "accuracy", "log_loss", "hamming_loss",
                      "precision", "sensibilite", "f1", "jaccard_similarity"]
    metric_rows = []

    print('-'*100)
    plt.figure(figsize=(18,18))

    for name, clf in zip(names, classifiers):
        t1 = time.time()

        clf.fit(X_train, y_train)

        # Predict once and reuse the result for the accuracy, the stored
        # predictions and every label-based metric.
        predicted = clf.predict(X_test)
        # Second column of predict_proba = score of class 1, used for the ROC
        # curve and the log-loss.
        proba = clf.predict_proba(X_test)[:, 1]

        model_accuracy = accuracy_score(observed, predicted)
        score = model_accuracy*100
        print(f'Prédiction : {name:17s}'+(' %.8f' % score).lstrip('0'),end=' -- ')

        yClassifications[name+'_prob'] = proba
        yClassifications[name+'_pred'] = predicted

        fpr, tpr, _thresholds = roc_curve(observed, proba)
        model_auc = auc(fpr, tpr)

        metric_rows.append({
            "aucROC"            : model_auc,
            "accuracy"          : model_accuracy,
            # Log-loss is defined on probabilities; hard labels would only
            # produce the clipped extremes.
            "log_loss"          : log_loss(observed, proba),
            "hamming_loss"      : hamming_loss(observed, predicted),
            "precision"         : precision_score(observed, predicted),
            "sensibilite"       : recall_score(observed, predicted),
            "f1"                : f1_score(observed, predicted),
            # NOTE(review): jaccard_similarity_score is deprecated in recent
            # scikit-learn releases in favour of jaccard_score, which has
            # different semantics for binary targets; confirm the installed
            # version before migrating (the import lives outside this function).
            "jaccard_similarity": jaccard_similarity_score(observed, predicted),
        })

        print ("Area under the ROC curve : %0.8f" % model_auc,end=' -- ')
        plt.plot(fpr, tpr, label=f"{name}(AUC = {model_auc:0.8f})")
        print(('%0.2fs' % (time.time() - t1)).lstrip('0'))

    # Static decoration is applied once, after all curves: the chance diagonal
    # stays on top and the legend lists every model.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive rate(1 - Specificity)',size=18)
    plt.ylabel('True positive rate(Sensitivity)',size=18)
    plt.title('ROC curve (Receiver Operating Characteristic)',size=20)
    plt.legend(loc="lower right")
    plt.show()

    print('Execution  :'+('%.2fs' % (time.time() - t0)).lstrip('0'))

    # One record per model, indexed by model name, best AUC first.
    resultats = pd.DataFrame(metric_rows, index=names, columns=metric_columns)
    resultats = resultats.sort_values(by='aucROC', ascending=False)

    return yClassifications, resultats

In [None]:
# Benchmark all classifiers on the original features; the last line displays
# the metrics table.
yClassifications, resultats = comparaisonsTreeClassifieurs(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    n_tree=5000,
)
resultats

# Feature Importances

In [None]:
# Same XGBoost settings as in the benchmark function, with 5000 trees.
xgb_params = dict(
    base_score=0.5,
    booster='gbtree',
    gamma=3,
    learning_rate=0.3,
    max_depth=4,
    min_child_weight=2,
    n_estimators=5000,
    objective='binary:logistic',
    silent=True,
    subsample=0.8,
)

clf = XGBClassifier(**xgb_params)
clf.fit(X_train, y_train)

# One row per training feature, most important first (single column named 0).
feature_importance = pd.DataFrame(
    data=clf.feature_importances_,
    index=X_train.columns.values,
)
feature_importance.sort_values(by=0, ascending=False)

# New features

In [None]:
import copy as cp 
# Independent deep copy, so the engineered columns added to `data_ft` do not
# end up in `data`.
data_ft = data.copy(deep=True)

In [None]:
# Products of "worst" measurements shared by the three composite features.
worst_core = data_ft.radius_worst * \
             data_ft.smoothness_worst * \
             data_ft.concave_points_worst
worst_core_fractal_texture = worst_core * \
                             data_ft.fractal_dimension_worst * \
                             data_ft.texture_worst

# Composite signals, each discretised into 30 quantile bins (integer codes).
composite_signals = {
    'calc_sup01': worst_core_fractal_texture /
                  (data_ft.compactness_worst + data_ft.symmetry_worst),
    'calc_sup02': worst_core * data_ft.texture_worst * data_ft.symmetry_worst,
    'calc_sup03': worst_core_fractal_texture,
}
for column_name, signal in composite_signals.items():
    data_ft[column_name] = pd.qcut(signal, 30, labels=False)

# "worst minus standard error" spread, discretised into 100 quantile bins.
for stem in ('radius', 'perimeter', 'area', 'smoothness', 'texture', 'compactness'):
    spread = data_ft[f'{stem}_worst'] - data_ft[f'{stem}_se']
    data_ft[f'{stem}WQ'] = pd.qcut(spread, 100, labels=False)

# Final layout: target first, original measurements, engineered columns, and
# the text label last.
final_columns = [
    'diagnosis',
    'radius_mean', 'radius_se', 'radius_worst',
    'perimeter_mean', 'perimeter_se', 'perimeter_worst',
    'area_mean', 'area_se', 'area_worst',
    'smoothness_mean', 'smoothness_se', 'smoothness_worst',
    'compactness_mean', 'compactness_se', 'compactness_worst',
    'concavity_mean', 'concavity_se', 'concavity_worst',
    'concave_points_mean', 'concave_points_se', 'concave_points_worst',
    'symmetry_mean', 'symmetry_se', 'symmetry_worst',
    'fractal_dimension_mean', 'fractal_dimension_se', 'fractal_dimension_worst',
    'texture_mean', 'texture_se', 'texture_worst',
    'calc_sup01', 'calc_sup02', 'calc_sup03',
    'radiusWQ', 'perimeterWQ', 'areaWQ',
    'smoothnessWQ', 'textureWQ', 'compactnessWQ',
    'diagnostic',
]
data_ft = data_ft[final_columns]

data_ft.head()

# Split augmented data

In [None]:
# Target: the integer-encoded diagnosis (first column of the augmented table).
y = data_ft["diagnosis"]
# Features: every column strictly between the first and the last one, which
# now includes the engineered columns.
X = data_ft.iloc[:, 1:-1]
y.unique()

# Same 80/20 split and seed as for the original features.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

# Train classifiers with augmented data

In [None]:
# Re-run the benchmark on the augmented feature set; the last line displays
# the metrics table.
yClassifications, resultats = comparaisonsTreeClassifieurs(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    n_tree=5000,
)
resultats