In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Classification of the breast tumors

This notebook consists of 2 parts:

**1 EDA:**
> * Visualisation and feature selection (using a random forest and a logistic regression)

**2 . Prediction:**
> * 2.1 Comparison of model results and explanation of the best model with SHAP
> * 2.2 Resampling with ADASYN, comparison of results and explanation with SHAP

**Features descriptions:**

1) ID number

2) Diagnosis (M = malignant, B = benign)

3-32)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)

b) texture (standard deviation of gray-scale values)

c) perimeter

d) area

e) smoothness (local variation in radius lengths)

f) compactness (perimeter^2 / area - 1.0)

g) concavity (severity of concave portions of the contour)

h) concave points (number of concave portions of the contour)

i) symmetry

j) fractal dimension ("coastline approximation" - 1)

The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.

In [None]:
# Read the dataset CSV from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/breast-cancer-wisconsin-data/data.csv")
# Sanity check: (rows, columns), followed by a preview of the first records.
print(df.shape)
df.head()

In [None]:
# Remove the "Unnamed: 32" column (presumably an empty artefact of the CSV
# export -- TODO confirm). errors="ignore" keeps the cell idempotent:
# running it a second time no longer raises KeyError once the column is gone.
df=df.drop(columns=["Unnamed: 32"],errors="ignore")
print(df.columns)

In [None]:
# Summary statistics of every numeric column.
df.describe()

In [None]:
# Number of records per diagnosis value (class balance of the target).
sns.countplot(data=df,x="diagnosis")

# 1- EDA

In [None]:
def diag_encoder(letter):
    """Encode a diagnosis letter as an integer: 1 for "M", 0 for anything else."""
    return 1 if letter=="M" else 0

# Numeric working copy: identifier removed, target encoded, incomplete rows dropped.
X=df.drop(columns=["id"]).copy()
X["diagnosis"]=X["diagnosis"].map(diag_encoder)
X=X.dropna()

# Pairwise correlation between all columns (encoded target included).
corr=X.corr()

# Heatmap of the correlation matrix on a light-grey, semi-transparent figure.
fig=plt.figure(figsize=[12,9])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

tick_names=X.columns
plt.title("Correlation linéaire entre les variables",size=18)
ax=sns.heatmap(corr,cmap="bwr",vmin=-1,vmax=1,
               xticklabels=tick_names,yticklabels=tick_names)

In [None]:
import numpy as np

# Display threshold: a variable is kept when |coefficient| >= seuil.
seuil=0.35
# Bar colours: blue for a positive coefficient, red otherwise.
bleu="#9fb4ff"
rouge="#ffae9f"

# Variables under study.
variables=[ 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst']

# Correlation of every variable with the encoded diagnosis.
target_values=X['diagnosis'].values
pearson_by_var={name: stats.pearsonr(target_values,X[name])[0] for name in variables}
spearman_by_var={name: stats.spearmanr(target_values,X[name])[0] for name in variables}

# Pearson: retained names, their coefficients and the matching bar colours.
labelp=[name for name in variables if abs(pearson_by_var[name])>= seuil]
coef_pearson=[pearson_by_var[name] for name in labelp]
colorp=[bleu if value>0 else rouge for value in coef_pearson]

# Spearman: same three parallel lists.
labels=[name for name in variables if abs(spearman_by_var[name])>= seuil]
coef_spearman=[spearman_by_var[name] for name in labels]
colors=[bleu if value>0 else rouge for value in coef_spearman]

In [None]:
def _correlation_bars(position,title,names,values,bar_colors,ylabel=None):
    """Draw one bar chart of correlation coefficients in slot `position` of a 1x2 grid.

    names, values and bar_colors are parallel sequences (one entry per bar);
    ylabel is only drawn when given.
    """
    ticks=np.arange(len(names))+1
    plt.subplot(1,2,position)
    plt.title(title,size=15)
    plt.bar(ticks,values,color=bar_colors,edgecolor='black')
    plt.xticks(ticks,names,rotation=90,size=14)
    # Zero line with the same half-bar margin on both sides. The Spearman
    # panel previously ran to len+1, so its line overshot the last bar.
    plt.hlines(0,0.5,len(names)+0.5,color='black')
    if ylabel is not None:
        plt.ylabel(ylabel)
    plt.ylim(-0.1,1)
    plt.grid()


fig=plt.figure(1,figsize=[18,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("Variables correlated with the diagnosis",size=16)

# Left: Pearson coefficients; right: Spearman coefficients.
_correlation_bars(1,"Pearson correlation ",labelp,coef_pearson,colorp,
                  ylabel="coefficients value")
_correlation_bars(2,"Spearman correlation.",labels,coef_spearman,colors)

We select the features with a random forest and a logistic regression.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
# Feature matrix: the measurement columns only. "id" is a record identifier
# (feature description: "1) ID number"), so it must not be offered to the
# models as a predictor; it was previously left in X.
X=df.drop(columns=["id","diagnosis"])
y=df["diagnosis"]

# Random forest on the raw features: its feature_importances_ are used for selection.
RF=RandomForestClassifier(random_state=40,class_weight="balanced",criterion='entropy'
                                        ,n_estimators=300,max_depth=8)
RF.fit(X,y)

# Logistic regression on standardised features so its coefficients are comparable.
ss=StandardScaler()
Xs=ss.fit_transform(X)
lr=LogisticRegression(class_weight="balanced")
lr.fit(Xs,y)

In [None]:
# Selection thresholds: random-forest importance and |logistic-regression coefficient|.
tresh_rf=0.04
tresh_rl=0.5

# Keep a feature when either model considers it important; store both rounded scores.
selected_var={
    name: {'RF':round(randfor,3),"LR":round(reglog,2)}
    for name,reglog,randfor in zip(X.columns,lr.coef_[0],RF.feature_importances_)
    if randfor>tresh_rf or abs(reglog)>tresh_rl
}

In [None]:
# Names of the retained features, in selection order.
feature1=list(selected_var)

In [None]:
# Show each retained feature with its rounded RF importance and LR coefficient.
print("the selected variables with their coefficients:")
selected_var

In [None]:
# Flatten the per-feature score dictionaries into one table (one row per retained feature).
coef_rf=[scores['RF'] for scores in selected_var.values()]
coef_lr=[scores['LR'] for scores in selected_var.values()]
df_coef=pd.DataFrame({
    "Name":list(selected_var),
    "RF feature importance":coef_rf,
    "LR coef":coef_lr,
})
df_coef.head()

In [None]:
# Maximum number of features shown in each panel.
n_displayed=8

fig=plt.figure(1,figsize=[18,8])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

# Left panel: features with the largest random-forest importance.
d=df_coef.sort_values(by=["RF feature importance"],ascending=False)
d=d.iloc[:n_displayed,:]
# Bar positions follow the rows actually available: with fewer than
# n_displayed selected features a fixed range(n_displayed) raised a shape error.
positions=range(len(d))
plt.subplot(1,2,1)
plt.title("Feature importances of the Random forest .",size=16)
plt.bar(positions, d["RF feature importance"].values,color="#28a2b4",edgecolor='black')
plt.xticks(positions,d["Name"],rotation=90,size=13)
plt.grid()

# Right panel: features with the largest absolute logistic-regression coefficient
# (signed value plotted).
d=df_coef.copy()
d["LR coef abs"]=abs(d["LR coef"])
d=d.sort_values(by=["LR coef abs"],ascending=False)
d=d.iloc[:n_displayed,:]
positions=range(len(d))
plt.subplot(1,2,2)
plt.title("Coefficients of the logistic regression .",size=16)
plt.bar(positions, d["LR coef"].values,color="#28a2b4",edgecolor='black')
plt.xticks(positions,d["Name"],rotation=90,size=13)
plt.grid()

In [None]:

# Hand-picked feature subset used for the plots below, plus the target column.
eda_columns=[
    'radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean',
    'concavity_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'area_se', 'compactness_se',
    'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'concavity_worst', 'concave points_worst',
    'symmetry_worst', 'fractal_dimension_worst',
]
data=df[eda_columns+['diagnosis']].copy()
data.head(2)

In [None]:
fig=plt.figure(1,figsize=[16,7])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("Radius",size=16)

# Pin the category order (M = malignant, B = benign). Without it the colours
# and the hand-written tick labels below only match when the first row of
# the data happens to be an "M" record.
diagnosis_order=["M","B"]

# Left: mean radius against worst radius, coloured by diagnosis.
plt.subplot(1,2,1)
sns.scatterplot(data=data,x="radius_mean",y="radius_worst",hue="diagnosis",
                hue_order=diagnosis_order,palette=["#f65c5c","#92c58a"])

# Right: distribution of the radius standard error per diagnosis (outliers hidden).
plt.subplot(1,2,2)
plt.title("Radius Standard déviation")
sns.boxplot(data=data,y="radius_se",x="diagnosis",order=diagnosis_order,
            color="#afbbd0",width=0.4,showfliers=False)

plt.xticks([0,1],["Malignant","benign"])

In [None]:
fig=plt.figure(1,figsize=[16,7])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("Concavity",size=16)

# Pin the category order (M = malignant, B = benign) so that the palette and
# the hand-written tick labels do not depend on the row order of the data.
diagnosis_order=["M","B"]

# Left: mean concavity against worst concave points, coloured by diagnosis.
plt.subplot(1,2,1)
sns.scatterplot(data=data,x="concavity_mean",y='concave points_worst',hue="diagnosis",
                hue_order=diagnosis_order,palette=["#f65c5c","#92c58a"])

# Right: distribution of the worst concavity per diagnosis (outliers hidden).
plt.subplot(1,2,2)
plt.title("concavity_worst")
sns.boxplot(data=data,y='concavity_worst',x="diagnosis",order=diagnosis_order,
            color="#afbbd0",width=0.4,showfliers=False)
plt.xticks([0,1],["Malignant","benign"])

In [None]:

fig=plt.figure(1,figsize=[16,7])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("Perimetre, area and compactness",size=16)

# Pin the category order (M = malignant, B = benign) so that the palette and
# the hand-written tick labels do not depend on the row order of the data.
diagnosis_order=["M","B"]

# Left: mean perimeter against mean area, coloured by diagnosis.
plt.subplot(1,2,1)
sns.scatterplot(data=data,x='perimeter_mean',y='area_mean',hue="diagnosis",
                hue_order=diagnosis_order,palette=["#f65c5c","#92c58a"])

# Right: distribution of the mean compactness per diagnosis (outliers hidden).
plt.subplot(1,2,2)
plt.title("compactness_mean")
sns.boxplot(data=data,y='compactness_mean',x="diagnosis",order=diagnosis_order,
            color="#afbbd0",width=0.4,showfliers=False)
plt.xticks([0,1],["Malignant","benign"])

# PCA

In [None]:
# Reminder of the features retained by the RF / LR selection step.
feature1

In [None]:
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def display_scree_plot(pca):
    """Draw the scree plot of a fitted PCA.

    Bars show the percentage of variance explained by each axis; the red
    line shows the cumulative percentage.

    Parameters
    ----------
    pca : fitted object exposing ``explained_variance_ratio_``.
    """
    percent=pca.explained_variance_ratio_*100
    ranks=np.arange(len(percent))+1

    fig=plt.figure(figsize=[10,7])
    fig.patch.set_facecolor('#E0E0E0')
    fig.patch.set_alpha(0.7)

    plt.bar(ranks, percent, color="#6fd67b", edgecolor='black')
    plt.plot(ranks, np.cumsum(percent), c="red", marker='o')
    plt.xlabel("Axe rank")
    plt.ylabel("Percent of inertia")
    plt.title("Screen of explained variance")
    plt.grid()
    plt.show(block=False)

In [None]:

    
# Features fed to the PCA (the target is kept alongside in `data`).
pca_columns=[
    'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean',
    'concave points_mean',
    'radius_se', 'perimeter_se', 'area_se', 'compactness_se',
    'fractal_dimension_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'concavity_worst', 'concave points_worst',
    'symmetry_worst', 'fractal_dimension_worst',
]
data=df[pca_columns+['diagnosis']].copy()
y=data["diagnosis"]
X=data.drop(columns=["diagnosis"])

# Standardise before the PCA so every feature has the same scale.
scaler=StandardScaler()
Xs=scaler.fit_transform(X)

# Full decomposition (as many axes as features), then the scree plot.
pca=PCA(n_components=X.shape[1]).fit(Xs)
display_scree_plot(pca)

In [None]:
import math
lab=X.columns
(fig, ax) = plt.subplots(figsize=(10, 10))

fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

# One arrow per original feature, using its loadings on PC1 and PC2.
# - Iterate over the feature axis (components_.shape[1]); the previous bound,
#   len(pca.components_), is the number of components and only matched
#   because the PCA was fitted with as many components as features.
# - Dedicated local names are used so the notebook-level target `y` is no
#   longer overwritten by a loading value.
for i in range(pca.components_.shape[1]):
    load_pc1=pca.components_[0, i] #0 for PC1
    load_pc2=pca.components_[1, i] #1 for PC2
    # Only draw features whose (PC1, PC2) loading vector is long enough to read.
    if math.hypot(load_pc1,load_pc2)>0.2:
        ax.arrow(0,0,
                 load_pc1,
                 load_pc2,
                 head_width=0.02,
                 head_length=0.03)
        # Labels are offset so that they stay readable next to the arrow heads.
        if load_pc1>load_pc2:
            # Red label to the right of the head, nudged down for downward arrows.
            vertical_shift=-0.02 if load_pc2<0 else 0
            plt.text(load_pc1+0.06,
                     load_pc2+vertical_shift,
                     s=lab[i],c="r",size=11)
        else:
            # Green label placed above the head.
            plt.text(load_pc1,
                     load_pc2+0.15,
                     s=lab[i],c="g",size=11)

an = np.linspace(0, 2 * np.pi, 100)
plt.plot(np.cos(an), np.sin(an)) # Add a unit circle for scale
plt.axis('equal')
ax.set_title('correlation circle')
plt.grid()
plt.show()

In [None]:
# Split the clearly-loaded features into two groups with the same rule as the
# correlation circle: PC1 list when the PC1 loading exceeds the PC2 loading,
# PC2 list otherwise.
PC1=[]
PC2=[]
# Iterate over the feature axis of components_ (not over the number of
# components) and use dedicated local names so the notebook-level target `y`
# is not overwritten.
for i in range(pca.components_.shape[1]):
    load_pc1=pca.components_[0, i]
    load_pc2=pca.components_[1, i]
    # Same length threshold as the one used for drawing the arrows.
    if math.hypot(load_pc1,load_pc2)>0.2:
        if load_pc1>load_pc2:
            PC1.append(lab[i])
        else:
            PC2.append(lab[i])

print("PCA 1:")
print(PC1)
print("PCA 2:")
print(PC2)

In [None]:
# Refit a 2-component PCA and project the standardised features onto it
# so the data can be visualised in a 2D feature space.
pca=PCA(n_components=2)
projected=pca.fit_transform(Xs)
d=pd.DataFrame(projected,columns=["PC 1","PC 2"])
# Attach the diagnosis by position (the projection has no index of its own).
d["diagnosis"]=data["diagnosis"].values

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

# Samples in the plane of the first two principal components, coloured by diagnosis.
sns.scatterplot(x="PC 1",y="PC 2",hue="diagnosis",data=d,ax=ax)

On this projection onto the first PCA plane (PC 1, PC 2), we can see that the two classes of the target variable are easy to distinguish.

# Prediction:

## On the original train dataset.

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

def conv_y(y):
    """Convert diagnosis labels to integers.

    Every element equal to "M" becomes 1, any other value becomes 0.
    The result is a new pandas Series with a fresh positional index.
    """
    return pd.Series([1 if label =="M" else 0 for label in y])

def lets_try(train, y, random_state=1):
    """Cross-validate a fixed panel of classifiers and collect their F1 scores.

    Parameters
    ----------
    train : feature table handed to ``cross_val_score``.
    y : iterable of diagnosis labels, converted with ``conv_y``
        ("M" -> 1, anything else -> 0).
    random_state : seed for the CV splitter and for the randomised models.
        The default (1) is the seed the splitter has always used.

    Returns
    -------
    dict
        Model name -> array of scores returned by ``cross_val_score``
        (repeated stratified K-fold, 5 splits x 3 repeats, scoring 'f1').
    """
    # Local import so the function does not depend on the notebook's import cell.
    from sklearn.pipeline import make_pipeline

    target=conv_y(y)

    def test_model(clf):
        # Same splitter configuration for every model so that scores are comparable.
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=random_state)
        return cross_val_score(clf, train, target, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')

    def test_model_scaler(clf):
        # For the models that need standardised data. The scaler lives inside
        # the pipeline, so it is re-fitted on the training part of every fold.
        # Scaling the whole training set once beforehand leaked the statistics
        # of the validation folds into the fit.
        return test_model(make_pipeline(StandardScaler(), clf))

    # (result key, estimator, evaluation routine, progress message)
    candidates=[
        ("SVC", SVC(kernel="linear"), test_model_scaler, "SVC done"),
        ("Logistic Regression", LogisticRegression(), test_model_scaler, "Logistic Regression done"),
        # k-NN is distance based, so it is now evaluated on standardised features too.
        ("Kneighbors", KNeighborsClassifier(), test_model_scaler, "Kneighbors done"),
        ("SVC poly", SVC(kernel="poly"), test_model_scaler, "SVC poly done."),
        # Tree ensembles are scale invariant; they are seeded for reproducible scores.
        ("Random Forest Classifier", RandomForestClassifier(random_state=random_state),
         test_model, "Random Forest Classifier done"),
        ("SVC RBF", SVC(kernel='rbf'), test_model_scaler, "SVC rbf done"),
        ("GradientBoosting", GradientBoostingClassifier(random_state=random_state),
         test_model, "Grandient boosting done"),
        ("AdaBoostClassifier", AdaBoostClassifier(random_state=random_state),
         test_model, "AdaBoostClassifier done"),
    ]

    results = {}
    for name,clf,evaluate,message in candidates:
        results[name] = evaluate(clf)
        print(message)

    return results 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target and predictors taken from the selected-feature table.
y=data["diagnosis"]
X=data.drop(columns=["diagnosis"])

# Hold out a third of the records for the final evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.33)

In [None]:
# Cross-validated F1 scores of every candidate model on the training split.
results=lets_try(X_train, y_train)

In [None]:
fig=plt.figure(1,figsize=[10,7])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

# One box per model, built from its cross-validation scores (mean marker shown).
model_names=list(results.keys())
model_scores=list(results.values())
plt.title("CV resultas for each model on the train set ",size=16)
plt.boxplot(model_scores,labels=model_names,showmeans=True)
plt.ylabel("  Scores CV \n (f1)",size=14)
plt.ylim(0.5,1)
plt.xticks(rotation=90)

In [None]:
from sklearn.metrics import confusion_matrix
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix no longer exists in recent scikit-learn releases.
    # Provide a stand-in with the same call shape so that this cell and the
    # later ones that call plot_confusion_matrix keep working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Forward to ``ConfusionMatrixDisplay.from_estimator`` and return the display."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Seeded so that the confusion matrix is reproducible across runs.
model0=RandomForestClassifier(random_state=42)
model0.fit(X_train, y_train)

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model0,X_test, y_test
                           , cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the testing set ",size=14)
plt.show()


In [None]:
# Fit the scaler on the training set only, then apply that same transformation
# to the test set. Calling fit_transform on X_test re-estimated the mean and
# standard deviation from the test data, so train and test were on different scales.
ss=StandardScaler()
scaled_train=ss.fit_transform(X_train)
scaled_test=ss.transform(X_test)

In [None]:
# Logistic regression fitted on the standardised training features.
model1=LogisticRegression().fit(scaled_train, y_train)

# Confusion matrix on the standardised test features.
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model1, scaled_test, y_test, ax=ax, cmap=plt.cm.Blues)
disp.ax_.set_title("Results of the Logistic regression on the testing set ",size=14)
plt.grid()
plt.show()


In [None]:
# Linear-kernel SVC fitted on the standardised training features.
model2=SVC(kernel="linear").fit(scaled_train, y_train)

# Confusion matrix on the standardised test features.
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model2, scaled_test, y_test, ax=ax, cmap=plt.cm.Blues)
disp.ax_.set_title("Results of the SVC on the testing set ",size=14)
plt.show()

In [None]:
import shap

# Load SHAP's JavaScript so that force plots can render in the notebook output.
shap.initjs()

In [None]:
# Explain the logistic regression (model1), with the standardised training set as background data.
# NOTE(review): confirm that the installed shap version honours model_output='probability' for LinearExplainer.
explainer = shap.LinearExplainer(model1, scaled_train, model_output = 'probability')
# SHAP values for every row of the standardised test set.
shap_values = explainer.shap_values(scaled_test)

In [None]:
# NOTE: plt.style.use changes the matplotlib style globally, so it also affects the figures drawn by later cells.
plt.style.use('fivethirtyeight')
# show=False keeps the figure open so that its labels can be customised below.
shap.summary_plot(shap_values, scaled_test, feature_names=X_test.columns, show=False)
fig = plt.gcf()
ax = plt.gca()
ax.set_xlabel("Benign<===   Target class     ===> malignant \n SHAP value (impact of the model output)")
ax.set_title("Impact of each feature in the Logistic regression predictions")

As we can see on this graph, a high value for most of the features indicates a high probability of a malignant tumor, except for the compactness (mean and se). A very compact tumor has a high chance of being benign.

The features are ranked in descending order of their impact 

In [None]:
# Give the standardised test matrix the feature names, for row selection by position.
scaled_test=pd.DataFrame(scaled_test,columns=X_test.columns)
def shap_plot(i, explainer=explainer, shap_values=shap_values):
    """Print the prediction summary of one test case and return its SHAP force plot.

    Parameters
    ----------
    i : position of the case in the test set (0-based).
    explainer, shap_values : explainer and SHAP value matrix to draw from.
        They default to the objects that exist when this cell is run, so the
        function keeps explaining the logistic regression even after later
        cells rebind the notebook-level names ``explainer`` / ``shap_values``.

    Returns
    -------
    The object returned by ``shap.force_plot`` for case ``i``; the displayed
    feature values are the unscaled ones from ``X_test``.
    """
    individual=scaled_test.iloc[[i],:]
    # Probability of the second class of model1 (the "M" label when classes are sorted B, M).
    malignancy=model1.predict_proba(individual)[0][1]
    print("___________________________________")
    print("The case n° {}".format(i))
    print("___________________________________")
    # Same two-decimal rounding as before, formatted without floating-point
    # noise (0.57*100 used to print as 56.99999999999999).
    print("Has a  malignancy score of {:.0f} %".format(round(malignancy,2)*100))

    print('True class :', y_test.iloc[i])
    return(shap.force_plot( explainer.expected_value, shap_values[i,:], X_test.iloc[i,:],
    feature_names=X_test.columns))

Thanks to `shap.force_plot`, we can explain the impact of the features on **individual predictions**.

In [None]:
# Columns whose distributions are drawn in the next cell (8 entries, one per subplot).
# NOTE(review): 'area_mean' appears twice, so the same histogram is drawn two times -- confirm which column was intended.
col=['area_mean', 'texture_worst', 'perimeter_mean','concave points_se',
       'area_mean','radius_se', 'perimeter_se','smoothness_worst']


In [None]:
fig=plt.figure(1,figsize=[18,8])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("Distribution",size=16)
print("---------")
print("MEAN:")
print("---------")
print("concave_points_mean mean:",round(df["concave points_mean"].mean(),2))
# The grid has 2x4 slots, so at most the first 8 columns are drawn.
for i,col_ in enumerate(col[:8]):
    # col_ is bound by the loop before it is used. It was previously assigned
    # after the print, which raised NameError on a fresh kernel and otherwise
    # printed the mean of the previous column.
    print(col_,"mean:",round(df[col_].mean(),2))
    plt.subplot(2,4,i+1)
    plt.title(col_,size=16)
    plt.hist(df[col_],bins=20, alpha=0.5, label=col_,edgecolor="black")

In [None]:

# Explain the prediction for the test case at position 3.
shap_plot(3)


This case has low values for each feature, except for smoothness_worst (0.145), which is
a little high (mean = 0.13).

In [None]:
# Explain the prediction for the test case at position 15.
shap_plot(15)

**Case n° 15 has high values for:**

concave points_mean (mean=0.05) in this case 0.146

texture_worst (mean=25.68) in this case,32.72

perimeter_se (mean =2.87) in this case 5.8

perimeter_mean (mean=91.97) in this case 129.1


> **This explains the score of 100%**

In [None]:
# List every test case as "<position> <true label>" to help pick cases for shap_plot.
for position, label in enumerate(y_test.values):
    print(position, label)

## Improve the prediction by using the resampler ADASYN.

In [None]:
from imblearn.over_sampling import ADASYN
# Seed the resampler: ADASYN generates synthetic samples randomly, so without
# a seed the resampled set, and every score computed from it, changes on each run.
ada=ADASYN(random_state=42)
X_train_res, y_train_res=ada.fit_resample(X_train, y_train)
# Same cross-validation panel as before, now on the resampled training set.
adasyn_results=lets_try(X_train_res, y_train_res)

In [None]:
def _cv_boxes(position,title,scores,ylabel=None):
    """Box plot of per-model CV scores in slot `position` of a 1x2 grid.

    Returns the value of the final ``plt.xticks`` call.
    """
    plt.subplot(1,2,position)
    plt.title(title,size=16)
    plt.boxplot(scores.values(),labels=scores.keys(),showmeans=True)
    if ylabel is not None:
        plt.ylabel(ylabel,size=14)
    plt.ylim(0.7,1)
    return plt.xticks(rotation=90)


fig=plt.figure(1,figsize=[15,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("CV results on the train set . 5 folds",size=16)

# Left: scores after ADASYN oversampling; right: scores on the original training set.
_cv_boxes(1,"Adasyn oversampling",adasyn_results,ylabel="  Scores CV \n (f1)")
_cv_boxes(2,"With no resampling",results)

Let's check the results of the SVC RBF (fit on the resampled set) on the testing set.

In [None]:
 ss=StandardScaler()
scaled_train_r=ss.fit_transform(X_train_res)

clf =SVC(kernel='rbf',probability=True)
clf .fit(scaled_train_r,y_train_res)

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(clf ,scaled_test, y_test
                           , cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the SVC RBF (with resampled train set) on the testing set ",size=14)
plt.grid()
plt.show()


With this model we have a few more false positives but fewer false negatives (here the positive class is "Malignant").
In **a medical context**, it is more important to correctly diagnose the malignant tumors, so this model seems to be better.



In [None]:
# Explain the SVC on inputs scaled the way it was trained: `ss` is the scaler
# fitted on the resampled training set, whereas `scaled_test` was produced by
# a different scaler.
scaled_test_svc=ss.transform(X_test)
# NOTE(review): the whole resampled training set is used as background data,
# which makes KernelExplainer slow; consider summarising it if runtime matters.
explainer = shap.KernelExplainer(clf.predict_proba, scaled_train_r)
shap_values = explainer.shap_values(scaled_test_svc)

In [None]:

# SHAP values of the second predict_proba output (presumably the malignant
# class, consistent with the axis label below -- TODO confirm against clf.classes_).
# Older shap versions return a list with one array per output; newer ones
# return a single array with the outputs on the last axis. Handle both.
if isinstance(shap_values,list):
    shap_values_class1=shap_values[1]
else:
    shap_values_class1=np.asarray(shap_values)[:,:,1]

shap.summary_plot(shap_values_class1,X_test, show=False)
fig = plt.gcf()
ax = plt.gca()
ax.set_xlabel("Benign<===   Target class     ===> malignant \n SHAP value (impact of the model output)")
ax.set_title("Impact of each feature in the SVC(rbf) predictions \n with Adasyn resampling on the trainning set")

This summary_plot is consistent with the previous one.

# Conclusion:

Two models obtain comparable results:

the **logistic regression** on the selected features and the **SVC** (with an RBF kernel) trained on the resampled dataset. The latter minimizes the number of undetected malignant tumors.