<a href="https://colab.research.google.com/github/semoennaciri/Tipemohamedennaciri/blob/main/stroke_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stroke prediction

## Importation des données

In [1]:
# Upload the dataset into the Colab runtime. Later cells read
# 'healthcare-dataset-stroke-data.xlsx' by file name, so the uploaded file must
# carry that name. This import only works inside Google Colab.
from google.colab import files 
# NOTE(review): `io` is not used by any cell visible in this notebook.
import io 
# `uploaded` keeps the return value of files.upload() for the selected file(s).
uploaded = files.upload()

Saving healthcare-dataset-stroke-data.xlsx to healthcare-dataset-stroke-data.xlsx


### libraries

In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import KNNImputer
import sklearn

## Data

In [9]:
# Read the stroke spreadsheet into a DataFrame; every later cell works from `data`.
DATA_FILE = 'healthcare-dataset-stroke-data.xlsx'
data = pd.read_excel(DATA_FILE)

In [None]:
# Summary statistics of `data`, rendered as the cell output.
data.describe()

## Correlation


In [None]:
# Pairwise correlation between the numeric columns.
# `data` also holds text columns (ever_married, work_type, smoking_status, ...);
# on pandas >= 2.0 a bare `data.corr()` raises on those, so restrict the
# computation to numeric columns explicitly.
mat = data.corr(numeric_only=True)
# Show the matrix: the result was previously computed but never displayed.
mat

## Data visualisation

In [None]:
# Scatter of age against the stroke label, with one facet per stroke value.
age_vs_stroke = dict(x="age", y="stroke", facet_col="stroke")
fig = px.scatter(data, **age_vs_stroke)
fig.show()

In [None]:
# Histogram of the `stroke` column grouped and coloured by marital status.
category = "ever_married"
fig = px.histogram(data, x=category, y='stroke', color=category)
fig.show()

In [None]:
# Histogram of the `stroke` column grouped and coloured by work type.
category = "work_type"
fig = px.histogram(data, x=category, y='stroke', color=category)
fig.show()

In [None]:
# Histogram of the `stroke` column grouped and coloured by smoking status.
category = "smoking_status"
fig = px.histogram(data, x=category, y='stroke', color=category)
fig.show()

In [None]:
# Distribution of the target itself: one bar per stroke value.
target = "stroke"
fig = px.histogram(data, x=target, color=target)
fig.show()

In [None]:
# Scatter of average glucose level against the stroke label.
glucose = data['avg_glucose_level']
outcome = data['stroke']
fig = px.scatter(x=glucose, y=outcome)
fig.show()

## Data cleaning

In [12]:
# Train-test split.
# Features are every column except the target. The target is selected by name,
# matching how it is dropped from X, so the split stays correct even if
# `stroke` is not the last column of the sheet.
X = data.drop(columns='stroke')
y = data['stroke']
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=34)

In [13]:
# Handle the categorical variables: integer-encode every object-dtype column.
nominals = [name for name, dtype in X_train.dtypes.items() if dtype == "object"]
for c in nominals:
    # The encoder is fitted on the train and test values together, so any
    # category present in either split gets a code.
    label_encoder = LabelEncoder().fit(list(X_train[c]) + list(X_test[c]))
    for split in (X_train, X_test):
        split[c] = label_encoder.transform(split[c])

In [14]:
# Standardisation: scale the float64 columns with a scaler fitted on the
# training split only, then apply the same fitted scaler to the test split.
continious = [name for name, dtype in X_train.dtypes.items() if dtype == "float64"]
sc = StandardScaler()
train_transformed = sc.fit_transform(X_train[continious])
X_train[continious] = pd.DataFrame(
    train_transformed, columns=continious, index=X_train.index
)
X_test[continious] = pd.DataFrame(
    sc.transform(X_test[continious]), columns=continious, index=X_test.index
)
# Fill the NaN values with a KNN imputer, again fitted on the training split
# only. The imputer returns plain arrays, so the column names and row index
# are restored by rebuilding the DataFrames.
imputer = KNNImputer()
X_train = pd.DataFrame(
    imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test), columns=X_test.columns, index=X_test.index
)

## Imbalanced data

In [15]:
# Baseline: logistic regression fitted on the training split as-is (no resampling).
lg = LogisticRegression(random_state=34).fit(X_train, y_train)
# Hard class predictions for the held-out rows.
y_pred = lg.predict(X_test)

In [16]:
# Confusion matrix of the baseline predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1219,    0],
       [  59,    0]])

In [17]:
# Score of the fitted model on each split, as returned by `lg.score`.
train_score = lg.score(X_train, y_train)
test_score = lg.score(X_test, y_test)
print('Train Score is : ' , train_score)
print('Test Score is : ' , test_score)

Train Score is :  0.9504175365344467
Test Score is :  0.9538341158059468


In [None]:
# ROC curve of the current model `lg` on the test split.
# roc_curve needs a continuous score to sweep the decision threshold. Feeding it
# the hard 0/1 predictions (`y_pred`) collapses the curve to a single operating
# point, so use the predicted probability of the positive class instead
# (column 1 of predict_proba, the labels being 0 and 1).
y_score = lg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Diagonal = performance of a random classifier, for visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend()
plt.show()

## Résolution du problème

### Under-sampling

In [25]:
from sklearn.utils import resample
# Re-attach the target to the preprocessed training features so that rows can
# be resampled per class; `assign` returns a new frame and leaves X_train untouched.
train = X_train.assign(stroke=y_train)
# The two classes.
stroke = train[train['stroke'] == 1]
not_stroke = train[train['stroke'] == 0]
# Draw, without replacement, as many negative rows as there are positive rows.
not_stroke_downsampled = resample(not_stroke, replace=False, n_samples=len(stroke), random_state=34)
# Shuffle the balanced frame. The shuffle is seeded like every other random
# step in this notebook so that a re-run yields the same row order.
train_downsampled = pd.concat([not_stroke_downsampled, stroke]).sample(frac=1, random_state=34)
# Check that both classes now have the same number of rows.
train_downsampled['stroke'].value_counts()

1    190
0    190
Name: stroke, dtype: int64

In [26]:
# Split the under-sampled frame back into features and target.
downsampled_X = train_downsampled.drop(columns='stroke')
downsampled_y = train_downsampled['stroke']

### Logistic regression after under-sampling




In [27]:
# Refit the logistic regression on the under-sampled training data.
# `lg` and `y_pred` are rebound, replacing the baseline model and its predictions.
lg = LogisticRegression(random_state=34).fit(downsampled_X, downsampled_y)
y_pred = lg.predict(X_test)

In [28]:
# Confusion matrix of the under-sampled model's predictions on the test split.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[876, 343],
       [ 16,  43]])

In [None]:
# ROC curve of the current model `lg` (fitted on the under-sampled data) on the test split.
# roc_curve needs a continuous score to sweep the decision threshold. Feeding it
# the hard 0/1 predictions (`y_pred`) collapses the curve to a single operating
# point, so use the predicted probability of the positive class instead
# (column 1 of predict_proba, the labels being 0 and 1).
y_score = lg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Diagonal = performance of a random classifier, for visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (under-sampling)')
plt.legend()
plt.show()

In [31]:
# Score of the current model on each split, as returned by `lg.score`.
# Note: the "Train" figure is measured on the full X_train / y_train split,
# not on the resampled frame the model was fitted on.
train_score = lg.score(X_train, y_train)
test_score = lg.score(X_test, y_test)
print('Train Score is : ' , train_score)
print('Test Score is : ' , test_score)

Train Score is :  0.7429540709812108
Test Score is :  0.7190923317683882


### over-sampling

In [37]:
# Over-sampling: draw positive rows with replacement until they match the
# number of negative rows. `stroke` and `not_stroke` are the per-class frames
# built in the under-sampling cell, which must have been run first.
stroke_upsampled = resample(stroke, replace=True, n_samples=len(not_stroke), random_state=34)
# Shuffle the balanced frame. The shuffle is seeded like every other random
# step in this notebook so that a re-run yields the same row order.
train_upsampled = pd.concat([stroke_upsampled, not_stroke]).sample(frac=1, random_state=34)
# Check that both classes now have the same number of rows.
train_upsampled['stroke'].value_counts()

0    3642
1    3642
Name: stroke, dtype: int64

In [38]:
# Split the over-sampled frame back into features and target.
upsampled_X = train_upsampled.drop(columns='stroke')
upsampled_y = train_upsampled['stroke']

### Logistic regression after over-sampling

In [39]:
# Refit the logistic regression on the over-sampled training data.
# `lg` and `y_pred` are rebound, replacing the previous model and its predictions.
lg = LogisticRegression(random_state=34).fit(upsampled_X, upsampled_y)
y_pred = lg.predict(X_test)
# Confusion matrix of the new predictions on the test split (cell output).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[896, 323],
       [ 14,  45]])

In [40]:
# Score of the current model on each split, as returned by `lg.score`.
# Note: the "Train" figure is measured on the full X_train / y_train split,
# not on the resampled frame the model was fitted on.
train_score = lg.score(X_train, y_train)
test_score = lg.score(X_test, y_test)
print('Train Score is : ' , train_score)
print('Test Score is : ' , test_score)

Train Score is :  0.7533924843423799
Test Score is :  0.7363067292644757


In [None]:
# ROC curve of the current model `lg` (fitted on the over-sampled data) on the test split.
# roc_curve needs a continuous score to sweep the decision threshold. Feeding it
# the hard 0/1 predictions (`y_pred`) collapses the curve to a single operating
# point, so use the predicted probability of the positive class instead
# (column 1 of predict_proba, the labels being 0 and 1).
y_score = lg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.roc_auc_score(y_test, y_score)

plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.3f}')
# Diagonal = performance of a random classifier, for visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (over-sampling)')
plt.legend()
plt.show()