In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt 
import os


# **Personal project:** Stroke prediction with very unbalanced classes.

**Goal:** *Predict the stroke risk thanks to the healthcare-dataset-stroke*

In this notebook I have to deal with a very unbalanced target variable.

So I'm going to use several class rebalancing strategies: two **oversampling** methods, one **undersampling** method and **class_weight = "balanced"** for models that support this option.

This notebook consists of 5 parts:

> **1. Exploratory data analysis**
> 
> **2. Features engineering**
> 
> **3. Class rebalancing strategies**
> 
> **4. Classification**
> 
>  **5.Let's Interpret the predictions**
> 
> >I will use the **Shapley** method which explains the contribution of each feature in the prediction. This method explains how the model works globally, but also each prediction individually. 

* In the last section **(Bonus)** I propose my method to test the model on false data samples.

***I hope you will enjoy reading :)***


In [None]:
df=pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
df.head(2)

In [None]:
# Overall size of the dataset, then the number of missing values per column.
n_rows, n_cols = df.shape
print("The dasaset have {} columns and  {} rows".format(n_cols, n_rows))
print("------------------------")
for col, n_missing in df.isna().sum().items():
    print("{} nan(s) in columns  {}".format(n_missing, col))

 # 1.Exploratory data analysis

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Stroke classes .",size=16)
sns.countplot(data=df, x="stroke",edgecolor="black",color="#b8c7e1")

The target classes are very unbalanced!

* **Smoking status**

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("smoking_status classes repartion.")
sns.countplot(data=df, x="smoking_status",hue="stroke",edgecolor="black",color="#b8c7e1")

In [None]:
def exact_percent(var, data=None):
    """Compute the stroke rate (in percent) for each category of a column.

    Parameters
    ----------
    var : str
        Name of the categorical column to group on.
    data : pandas.DataFrame, optional
        Frame holding ``var`` and a ``stroke`` column. Defaults to the
        notebook-level ``df`` so existing calls keep working.

    Returns
    -------
    (list, list)
        The category labels in order of first appearance, and the matching
        percentage of rows with ``stroke == 1``.
    """
    frame = df if data is None else data
    percent = []
    lab = []
    # Missing values are skipped: `frame[var] == NaN` never matches a row, so
    # the subset would be empty and the rate would divide by zero.
    for s in frame[var].dropna().unique():
        d = frame.loc[frame[var] == s]
        n1 = int((d["stroke"] == 1).sum())
        percent.append(n1 / d.shape[0] * 100)
        lab.append(s)
    return lab, percent

In [None]:
def plot_percent(lab,percent,name):
    """Draw a bar chart of stroke rates with the value written above each bar.

    lab     -- category labels, used as x tick labels
    percent -- one rate (in percent) per label, same order as ``lab``
    name    -- name of the variable, inserted in the plot title
    """
    figure = plt.figure(figsize=[8, 6])
    figure.patch.set_facecolor('#E0E0E0')
    figure.patch.set_alpha(0.7)
    plt.title("Stroke rate according the {} .".format(name))

    positions = np.arange(len(lab))
    plt.bar(positions, percent, edgecolor="black", color="#b8c7e1")
    plt.xticks(positions, lab, rotation=45)
    plt.ylabel("percent %")
    # Two points of headroom so the annotations stay inside the axes.
    plt.ylim(0, max(percent) + 2)

    for idx in range(len(percent)):
        value = percent[idx]
        plt.text(idx - 0.1, value + 0.5, "{}%".format(round(value, 1)), size=12)
    plt.show()


In [None]:
lab,percent=exact_percent("smoking_status")
plot_percent(lab,percent,"smoking_status")

*  **GENDER and MARITAL STATUS**

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Gender classes repartion.")
sns.countplot(data=df, x='gender',hue="stroke",edgecolor="black",color="#b8c7e1")

In [None]:
lab,percent=exact_percent("gender")
plot_percent(lab,percent,"gender")

In [None]:
lab,percent=exact_percent('ever_married')
plot_percent(lab,percent,'ever_married')

There is a bias: the stroke rate is not directly related to whether a person has been married or not. People who have been married are on average much older (as we see on the next graph), so age acts as a confounding factor here.

**==>Beware of misinterpretation**

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Age according the marital status classes.")
sns.boxplot(data=df,y='age',x='ever_married',width=0.4,showfliers=False,color="#b8c7e1")

* **work_type**

In [None]:
lab,percent=exact_percent('work_type')
plot_percent(lab,percent,'work_type')

* **Residence_type**

In [None]:
lab,percent=exact_percent('Residence_type')
plot_percent(lab,percent,'Residence_type')


In [None]:
lab,percent=exact_percent('hypertension')
plot_percent(lab,percent,'hypertension')


In [None]:
lab,percent=exact_percent('heart_disease')
plot_percent(lab,percent,'heart_disease')

* **'avg_glucose_level' and 'bmi'**

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("avg_glucose_level repartion according the stoke classes.")
sns.boxplot(data=df,y='avg_glucose_level',x="gender",hue="stroke",width=0.4,showfliers=False,color="#b8c7e1")

In [None]:
fig=plt.figure(figsize=[8,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
# The title names the variable plotted on the y axis (bmi).
plt.title("bmi repartion according the stoke classes.")
sns.boxplot(data=df,y='bmi',x="gender",hue="stroke",width=0.4,showfliers=False,color="#b8c7e1")

 # 2. Feature engineering

In [None]:
def smoke_encoder(val):
    """Encode a smoking status as an integer.

    'smokes' -> 1, 'Unknown' or 'never smoked' -> 0, any other value -> 2.
    """
    if val == "smokes":
        return 1
    if val == 'Unknown' or val == 'never smoked':
        return 0
    return 2
    

In [None]:
data=df[["stroke","avg_glucose_level","bmi","age","hypertension","heart_disease","gender","ever_married"]].copy()
data["smoking_status"]=df["smoking_status"].apply(smoke_encoder)

In [None]:
def work_type_encoder(val):
    """Encode a work type as an integer.

    'Private' or 'Govt_job' -> 1, 'Self-employed' -> 2, any other value -> 0.
    """
    if val in ('Private', 'Govt_job'):
        return 1
    return 2 if val == 'Self-employed' else 0

# Encoded from the raw frame; the column keeps the name of the feature it encodes.
data["work_type"]=df["work_type"].apply(work_type_encoder)
    

In [None]:
def encode_gender(val):
    """Encode a gender label: 'Male' -> 1, 'Female' -> 0, anything else -> NaN."""
    if val == "Female":
        return 0
    return 1 if val == "Male" else np.nan
# Encode from the raw frame so the cell is idempotent: re-encoding the already
# encoded column would turn every value into NaN. Assignment aligns on the index.
data["gender"]=df["gender"].apply(encode_gender)

In [None]:
def encode_married(val):
    """Encode the marital flag: 'Yes' -> 1, any other value -> 0."""
    return 1 if val == "Yes" else 0
# Encode from the raw frame so the cell is idempotent: re-encoding the already
# encoded column would silently set every value to 0. Assignment aligns on the index.
data["ever_married"]=df["ever_married"].apply(encode_married)

We remove the individual whose gender is "Other"

In [None]:
# encode_gender returns NaN for anything other than Male/Female; keep the encoded rows only.
data=data.loc[data["gender"].notna()]


In [None]:
data=data.fillna(data.mean())

* **Check the correlations**

The following function keeps all the individuals of class 1 for the target variable, plus a random sample of N individuals from the other class.

In [None]:

def get_sample(N, df, random_state=None):
    """Keep every stroke case and add a random sample of N non-stroke rows.

    Parameters
    ----------
    N : int
        Number of rows with ``stroke == 0`` to draw.
    df : pandas.DataFrame
        Frame with a ``stroke`` column.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.

    Returns
    -------
    pandas.DataFrame
        All rows with ``stroke == 1`` followed by the sampled ``stroke == 0`` rows.
    """
    d0 = df.loc[df["stroke"] == 0]
    d1 = df.loc[df["stroke"] == 1]

    if d0.empty:
        # Nothing to draw from (the original N / n ratio would divide by zero).
        picked = d0
    else:
        # Draw exactly N rows; sample with replacement only when more rows are
        # requested than exist, so the sample holds no needless duplicates.
        picked = d0.sample(n=int(N), replace=N > d0.shape[0], random_state=random_state)

    d = pd.concat([d1, picked])
    print("A {} rows sample as been extracted.".format(d.shape[0]))

    return d
    

In [None]:
sample=get_sample(300,data)

In [None]:

# calculate the correlation matrix
corr = sample.corr()

# plot the heatmap
fig=plt.figure(figsize=[12,9])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)

plt.title("Correlations",size=18)
ax=sns.heatmap(corr, vmin=-1, vmax=1,cmap="bwr",
        xticklabels=data.columns,
        yticklabels=data.columns)

In [None]:
#We take a sample for a better visualization
sample=get_sample(800,data)
sample.shape

 # 3.  **Class rebalancing strategies**

We use 3 class rebalancing strategies:

* Two Oversampling strategies:

**ADASYN** (Adaptive Synthetic): is an algorithm that generates synthetic data, and its greatest advantages are not copying the same minority data, and generating more data for “harder to learn” examples.

**SMOTE** (Synthetic Minority Oversampling Technique): first selects a minority class instance **A** at random and finds its k nearest minority class neighbors. The synthetic instance is then created by choosing one of the k nearest neighbors **B** at random and connecting **A** and **B** to form a line segment in the feature space. The synthetic instances are generated as a convex combination of the two chosen instances **A** and **B**.

* One undersampling method:

**Random Undersampling**: This method seeks to randomly select and remove samples from the majority class, consequently reducing the number of examples in the majority class in the transformed data.

> *Warning*:This can be highly problematic, as the loss of such data can make the decision boundary between the minority and majority instances harder to learn, resulting in a loss in classification performance.

We also use the **class_weight="balanced"** option for the randomforest and the logistic regression .


In [None]:
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SMOTE

# Features / target of the visualisation sample (`sample`, built with get_sample).
X=sample.drop(columns=["stroke"])
y=sample["stroke"]

pca = PCA(n_components=2)
# Fit and transform X to visualise it inside a 2D feature space
X_vis = pca.fit_transform(X)

# Oversample with ADASYN
ada = ADASYN()
X_resampled, y_resampled = ada.fit_resample(X, y)
# Project the resampled points with the PCA fitted on the original sample
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: original sample, one scatter per class
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

# Right panel: sample after resampling
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)
ax2.set_title('ADASYN')

# Cosmetic: hide the top/right spines and offset the remaining ones
for ax in (ax1, ax2):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))

# Shared legend built from the handles of the left panel
plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center',
              ncol=2, labelspacing=0.)
plt.tight_layout(pad=3)
plt.show()

In [None]:
# Oversample with SMOTE.
# X, y, pca and X_vis are reused from the ADASYN cell.
smo = SMOTE()
X_resampled, y_resampled = smo.fit_resample(X, y)
# Project the resampled points with the PCA fitted on the original sample
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: original sample, one scatter per class
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

# Right panel: sample after resampling
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)
ax2.set_title('SMOTE')

# Cosmetic: hide the top/right spines and offset the remaining ones
for ax in (ax1, ax2):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))

# Shared legend built from the handles of the left panel
plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center',
              ncol=2, labelspacing=0.)
plt.tight_layout(pad=3)
plt.show()


In [None]:

from imblearn.under_sampling import RandomUnderSampler

# Undersample with RandomUnderSampler (despite the `ros` name, this is an under-sampler).
# X, y, pca and X_vis are reused from the ADASYN cell.
ros = RandomUnderSampler()
X_resampled, y_resampled = ros.fit_resample(X, y)
# Project the resampled points with the PCA fitted on the original sample
X_res_vis = pca.transform(X_resampled)

# Two subplots, unpack the axes array immediately
f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: original sample, one scatter per class
c0 = ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0",
                 alpha=0.5)
c1 = ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1",
                 alpha=0.5)
ax1.set_title('Original set')

# Right panel: sample after resampling
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5)
ax2.set_title('Random under sampler')

# Cosmetic: hide the top/right spines and offset the remaining ones
for ax in (ax1, ax2):
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.spines['left'].set_position(('outward', 10))
    ax.spines['bottom'].set_position(('outward', 10))

# Shared legend built from the handles of the left panel
plt.figlegend((c0, c1), ('Class #0', 'Class #1'), loc='lower center',
              ncol=2, labelspacing=0.)
plt.tight_layout(pad=3)
plt.show()


 # 4. **Classification**

* **class rebalancing**

We use the **F1 score** as cross validation score for the rest of the notebook. 

Using accuracy would be irrelevant on the test set where the classes are unbalanced(The models would look falsely efficient)

In [None]:
#under and over sampler 
ada = ADASYN()
smo = SMOTE()
rus = RandomUnderSampler()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

def lets_try(train, y):
    """Cross-validate six baseline classifiers and collect their F1 scores.

    Parameters
    ----------
    train : feature matrix accepted by scikit-learn estimators
    y : target vector aligned with ``train``

    Returns
    -------
    dict
        Maps a model label to the array of fold scores returned by
        ``cross_val_score`` (5 folds x 3 repeats).
    """
    from sklearn.pipeline import make_pipeline

    def test_model(clf):
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
        return cross_val_score(clf, train, y, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')

    # For the models which need standardized data: the scaler lives inside the
    # pipeline so it is refitted on the training part of every fold instead of
    # seeing the validation rows.
    def test_model_scaler(clf):
        return test_model(make_pipeline(StandardScaler(), clf))

    # (label, estimator, needs scaling, progress message)
    # KNeighbors is distance based, so it is scaled like the SVCs.
    candidates = [
        ("SVC", SVC(kernel="linear"), True, "SVC done"),
        ("Logistic Regression", LogisticRegression(), True, "Logistic Regression done"),
        ("Kneighbors", KNeighborsClassifier(), True, "Kneighbors done"),
        ("SVC poly", SVC(kernel="poly"), True, "SVC poly done."),
        ("Random Forest Classifier", RandomForestClassifier(), False, "Random Forest Classifier done"),
        ("SVC RBF", SVC(kernel='rbf'), True, "SVC rbf done"),
    ]

    results = {}
    for label, clf, needs_scaling, message in candidates:
        results[label] = test_model_scaler(clf) if needs_scaling else test_model(clf)
        print(message)

    return results

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X=data.drop(columns=["stroke"])
y=data["stroke"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


In [None]:
# Resample the training split only, so the held-out rows never shape the synthetic samples.
X_train_res, y_train_res=ada.fit_resample(X_train, y_train)
adasyn_results=lets_try(X_train_res, y_train_res)

In [None]:
# Resample the training split only, so the held-out rows never shape the synthetic samples.
X_train_res, y_train_res=smo.fit_resample(X_train, y_train)
smote_results=lets_try(X_train_res, y_train_res)

In [None]:
# Undersample the training split only; the held-out rows stay out of the CV data.
X_train_res, y_train_res=rus.fit_resample(X_train, y_train)
rus_results=lets_try(X_train_res, y_train_res)

* **With the *class_weight="balanced"* option**

In [None]:
def lets_try2(train, y):
    """Cross-validate the two models that support ``class_weight="balanced"``.

    Parameters
    ----------
    train : feature matrix accepted by scikit-learn estimators
    y : target vector aligned with ``train``

    Returns
    -------
    dict
        Maps a model label to the array of F1 fold scores returned by
        ``cross_val_score`` (5 folds x 3 repeats).
    """
    from sklearn.pipeline import make_pipeline

    def test_model(clf):
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
        return cross_val_score(clf, train, y, scoring='f1', cv=cv, n_jobs=-1, error_score='raise')

    results = {}

    # The logistic regression needs standardized data; the scaler is part of the
    # pipeline so it is refitted on the training part of every fold.
    clf = make_pipeline(StandardScaler(), LogisticRegression(class_weight="balanced"))
    results["Logistic Regression"] = test_model(clf)
    print("Logistic Regression done")

    clf = RandomForestClassifier(class_weight="balanced")
    results["Random Forest Classifier"] = test_model(clf)
    print("Random Forest Classifier done")

    return results

In [None]:
balanced_results=lets_try2(X_train, y_train)

In [None]:
fig=plt.figure(1,figsize=[20,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("CV results on the train set . 5 folds",size=16)

# One panel per rebalancing strategy. All panels share the same 0-1 F1 scale so
# they can be compared visually and no score is clipped out of view.
cv_panels = [
    ("Adasyn oversampling", adasyn_results),
    ("Smote oversampling", smote_results),
    ("Random Undersampling", rus_results),
    ("Balanced class_weight", balanced_results),
]
for position, (panel_title, panel_scores) in enumerate(cv_panels, start=1):
    plt.subplot(1,4,position)
    plt.title(panel_title,size=16)
    plt.boxplot(list(panel_scores.values()),labels=list(panel_scores.keys()),showmeans=True)
    if position == 1:
        # The y label is only written on the left-most panel.
        plt.ylabel("  Scores CV \n (f1)",size=14)
    plt.ylim(0,1)
    plt.xticks(rotation=90)
    plt.grid()

* **Check results on the testing set.**

In [None]:
from sklearn.metrics import f1_score
def lets_try_test(test, y,X_train,y_train):
    """Fit six baseline classifiers on a training set and score them on a hold-out set.

    Parameters
    ----------
    test : hold-out feature matrix
    y : hold-out target, aligned with ``test``
    X_train : training feature matrix
    y_train : training target, aligned with ``X_train``

    Returns
    -------
    dict
        Maps a model label to its F1 score on the hold-out set.
    """
    # The scaler is fitted on the training data only and then applied to the hold-out set.
    ss = StandardScaler()
    scaled_train = ss.fit_transform(X_train)
    scaled_test = ss.transform(test)

    def fit_and_score(clf, fit_features, eval_features):
        clf.fit(fit_features, y_train)
        return f1_score(y, clf.predict(eval_features))

    # (label, estimator, needs scaling, progress message)
    # KNeighbors is distance based, so it is scaled like the SVCs.
    candidates = [
        ("SVC", SVC(kernel="linear"), True, "SVC done"),
        ("Logistic Regression", LogisticRegression(), True, "Logistic Regression done"),
        ("Kneighbors", KNeighborsClassifier(), True, "Kneighbors done"),
        ("SVC poly", SVC(kernel="poly"), True, "SVC poly done."),
        ("Random Forest Classifier", RandomForestClassifier(), False, "Random Forest Classifier done"),
        ("SVC RBF", SVC(kernel='rbf'), True, "SVC rbf done"),
    ]

    results = {}
    for label, clf, needs_scaling, message in candidates:
        if needs_scaling:
            results[label] = fit_and_score(clf, scaled_train, scaled_test)
        else:
            results[label] = fit_and_score(clf, X_train, test)
        print(message)

    return results

In [None]:
# Fit the resampler on the training split only: resampling the full X, y would
# put the test rows (and synthetic points derived from them) into the training data.
X_train_res, y_train_res=ada.fit_resample(X_train, y_train)
adasyn_results=lets_try_test(X_test,y_test,X_train_res, y_train_res)

In [None]:
# Fit the resampler on the training split only: resampling the full X, y would
# put the test rows (and synthetic points derived from them) into the training data.
X_train_res, y_train_res=smo.fit_resample(X_train, y_train)
smote_results=lets_try_test(X_test,y_test,X_train_res, y_train_res)

In [None]:
# Undersample the training split only: resampling the full X, y would put
# test rows into the training data.
X_train_res, y_train_res=rus.fit_resample(X_train, y_train)
rus_results=lets_try_test(X_test,y_test,X_train_res, y_train_res)

In [None]:
fig=plt.figure(1,figsize=[20,6])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.suptitle("F1 results on the testing set",size=16)
n=len(adasyn_results)

# One bar chart per rebalancing strategy, drawn left to right.
test_panels = (
    ("Adasyn oversampling", adasyn_results),
    ("Smote oversampling", smote_results),
    ("Random Undersampling", rus_results),
)
for position, (panel_title, panel_scores) in enumerate(test_panels, start=1):
    plt.subplot(1,3,position)
    plt.title(panel_title,size=16)
    plt.bar(range(0,n),panel_scores.values(),edgecolor="black",color="#b8c7e1")
    if position == 1:
        # The y label is only written on the left-most panel.
        plt.ylabel("F1 score",size=14)
    plt.ylim(0,1.01)
    plt.xticks(range(0,n),panel_scores.keys(),rotation=90)
    plt.grid()


In [None]:
# Train on the resampled training split only; X_test must stay unseen for the
# confusion matrix below to measure generalisation.
X_train_res, y_train_res=ada.fit_resample(X_train, y_train)

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
model0=RandomForestClassifier()
model0.fit(X_train_res, y_train_res)

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model0,X_test, y_test
                           , cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the testing set \n with ADASYN",size=14)
plt.show()

Let's take a look at the KNeighbors performance on the testing set (using SMOTE).

In [None]:
# Train on the resampled training split only; X_test must stay unseen for the
# confusion matrix below to measure generalisation.
X_train_res, y_train_res=smo.fit_resample(X_train, y_train)

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
model0=KNeighborsClassifier()
model0.fit(X_train_res, y_train_res)

fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(model0,X_test, y_test
                           , cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the KNeighbors on the testing set \n with smote",size=14)
plt.show()

In [None]:
#We need to convert the target variable in a categoriel one 
# for a better displayed summary_plot (shap )
def convert_variable(val):
    """Return the string form of ``val`` (used to turn the 0/1 target into labels)."""
    return f"{val!s}"

In [None]:
# Train on the resampled training split only; X_test must stay unseen for the
# evaluation and the interpretation below to be meaningful.
X_train_res, y_train_res=smo.fit_resample(X_train, y_train)
y_train_res=y_train_res.apply(convert_variable)

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix

rf_smote=RandomForestClassifier()
rf_smote.fit(X_train_res, y_train_res)

# The model now predicts string labels, so compare against a string copy of the
# target. y_test itself is left untouched so earlier cells can still be re-run.
y_test_str=y_test.apply(convert_variable)
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(rf_smote,X_test, y_test_str
                           , cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the testing set \n with smothe",size=14)
plt.show()

# 5. **Let's Interpret the predictions**

In [None]:
fig=plt.figure(figsize=[10,10])
fig.patch.set_facecolor('#E0E0E0')
fig.patch.set_alpha(0.7)
plt.title("Feature importances of the Random forest using SMOTE.",size=16)
plt.barh(X_test.columns, rf_smote.feature_importances_,color="#28a2b4",edgecolor='black')
plt.grid()

* **SHAP**:

In the following section,I will use **the shapley method** to interpret the model predictions

SHAP (SHapley Additive exPlanations) by Lundberg and Lee (2016) is a method to explain individual predictions. SHAP is based on the game-theoretically optimal Shapley values.

More informations? follow this link: https://christophm.github.io/interpretable-ml-book/shap.html#examples-4


In [None]:
import shap

shap.initjs()

In [None]:
explainer = shap.TreeExplainer(rf_smote)
shap_values = explainer.shap_values(X_test)

In [None]:
shap.summary_plot(shap_values[1],X_test)


Features are displayed according to their impact on the prediction in descending order.

For example, age is the most impactful feature, a high age will tend to increase
 stroke risk score
 
 This summary_plot is globally consistent with the previous feature importances plot 

In [None]:
def display_gender(val):
    """Return the display label of an encoded gender: 1 -> "Male", otherwise "Female"."""
    return "Male" if val == 1 else "Female"
def display_married(val):
    """Return the display label of the encoded marital flag: 1 -> "Ever married", otherwise "Never married"."""
    return "Ever married" if val == 1 else "Never married"
    
    
def show_one_case(num):
    """Print the features, true class and predicted stroke risk of one test row.

    num -- positional index of the row in the notebook-level ``X_test`` / ``y_test``.

    The prediction comes from the notebook-level ``rf_smote`` model.
    """
    print("--------------------")
    print("Case N° {}".format(num))
    print("--------------------")
    # Double brackets keep a one-row DataFrame, which predict_proba needs.
    case=X_test.iloc[[num],:]
    for i in case.columns:
        value=case[i].values[0]
        if i=="gender":
            print(i,":  ", display_gender(value))
        elif i=="ever_married":
            print("Marital status :  ", display_married(value))
        else:
            print(i,":  ", value)
    pred=rf_smote.predict_proba(case)[0]
    print("--------------------")
    print('True class :', y_test.iloc[num])
    # NOTE(review): assumes the stroke class is the second entry of rf_smote.classes_.
    print("Stroke risks: {} %".format(pred[1]*100))

In [None]:
n=1674

#421
show_one_case(n)

In [None]:
case1 = X_test.iloc[[n],:]
shap_values = explainer.shap_values(case1)
shap.force_plot(explainer.expected_value[1], shap_values[1], case1)


In the case of this 82-year-old woman, her age, her work type and her high BMI considerably increase her risk of stroke.
Risk = 82%


In [None]:
n=89
show_one_case(n)

In [None]:
case2 = X_test.iloc[[n],:]
shap_values = explainer.shap_values(case2)
shap.force_plot(explainer.expected_value[1], shap_values[1], case2)


Here we observe the case of a child: it is mainly his very young age (5 years old) which protects him from the risk of stroke.

# **Bonus**: "Test of fire".

We  submit our model to two robustness tests by offering it a false sample to predict.

**Test 1:** For this first test, we generate false data by randomly choosing values among those present in the training set.

**Test 2:** For the second test, we generate random numbers between 0 and 300 for each feature.

In [None]:
from random import sample
from random import choice
def generate_fake_sample1(N,Xtest,ytest):
    """Build a fake sample by drawing each feature independently from real values.

    Parameters
    ----------
    N : int
        Number of fake rows to generate.
    Xtest : pandas.DataFrame
        Frame whose columns provide the pool of values for each feature.
    ytest : unused, kept for backward compatibility.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The fake features (same columns as ``Xtest``) and random "0"/"1" labels.
    """
    # Imported locally so the function does not depend on the notebook-level
    # name `sample`, which is also used for a DataFrame in earlier cells.
    from random import choice, choices, sample

    fake_columns = {}
    for feature in Xtest.columns:
        list_values = list(Xtest[feature])
        if N <= len(list_values):
            fake_columns[feature] = sample(list_values, N)
        else:
            # More rows requested than available: draw with replacement
            # instead of raising.
            fake_columns[feature] = choices(list_values, k=N)
    X_fake_sample = pd.DataFrame(fake_columns, columns=Xtest.columns)

    l = ["0", "1"]
    y_fake = [choice(l) for _ in range(0, N)]

    return X_fake_sample, pd.Series(y_fake)
    
    

In [None]:
def generate_fake_sample2(N,Xtest,ytest):
    """Build a fake sample made of random integers for every feature.

    Parameters
    ----------
    N : int
        Number of fake rows to generate.
    Xtest : pandas.DataFrame
        Only its column names are used.
    ytest : unused, kept for backward compatibility.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Features drawn uniformly from the integers 0..299 and random "0"/"1" labels.
    """
    from random import choice

    # numpy's randint takes (low, high, size); the upper bound is exclusive.
    fake_columns = {feature: np.random.randint(0, 300, N) for feature in Xtest.columns}
    X_fake_sample = pd.DataFrame(fake_columns, columns=Xtest.columns)

    l = ["0", "1"]
    y_fake = [choice(l) for _ in range(0, N)]

    return X_fake_sample, pd.Series(y_fake)

In [None]:
x_fake,y_fake=generate_fake_sample1(800,X_train,y_train)

In [None]:
fake_pred=rf_smote.predict(x_fake)


In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(rf_smote,x_fake,y_fake
                           ,normalize="true", cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the Fake set N°1 \n with smothe",size=14)
plt.show()

In [None]:
# Test 2: every feature is a random number, generated by generate_fake_sample2.
# numpy's randint is made available at notebook level because the generator
# draws its values with it.
from numpy.random import randint
x_fake,y_fake=generate_fake_sample2(800,X_train,y_train)
fake_pred=rf_smote.predict(x_fake)
fig, ax = plt.subplots(figsize=(8, 6))
disp=plot_confusion_matrix(rf_smote,x_fake,y_fake
                           ,normalize="true", cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title("Results of the Random forest on the Fake set N°2 \n with smothe",size=14)
plt.show()

**On fake samples, the performance is not good. So we can reasonably think that the scores on real data reflect a real ability to predict stroke risks.**

# Conclusion:
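The random forest obtains perfect results regardless of the oversampling strategy used.
**SMOTE** and **ADASYN** can therefore be used interchangeably.

Caveat: scores this high are only trustworthy if none of the test rows were used when fitting the resamplers or the models; otherwise they mostly measure how well the model memorised the test set.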

The **age**, the **avg_glucose_level** and the **body mass index** are the features which have the biggest impact on the prediction.

With the Shapley method we see that the **marital status** and the **gender** also impact the prediction.