<h1 style="background-color:powderblue;font-family:'Times New Roman',serif;font-size:350%;text-align:center;border-radius: 15px 50px;"><b>Stroke Prediction</b></h1>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings  
warnings.filterwarnings('ignore')

In [None]:
# Read the stroke dataset CSV from the Kaggle input directory and preview the first rows.
data=pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
data.head()

In [None]:
# Remove the 'id' column from the frame, then preview the result.
data = data.drop(columns=['id'])
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Summary statistics of the frame's columns.
data.describe()

In [None]:
# 2x2 grid of histograms: age, bmi, heart_disease and avg_glucose_level.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
data.plot.hist(y="age", bins=70, color="b", ax=axes[0, 0])
data.plot.hist(y="bmi", bins=100, color="r", ax=axes[0, 1])
data.plot.hist(y="heart_disease", bins=6, color="g", ax=axes[1, 0])
data.plot.hist(y="avg_glucose_level", bins=100, color="orange", ax=axes[1, 1])
plt.show()

In [None]:
# Pie chart of the stroke class distribution

# Count each class once; the index of the counts doubles as the wedge labels.
sizes = data['stroke'].value_counts(sort=True)
labels = sizes.index

colors = ["green", "red"]
explode = (0.05, 0)  # pull the first wedge slightly away from the centre

plt.figure(figsize=(7, 7))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
plt.show()


In [None]:
# Displaying Missing Values
# Bar chart of how complete each column is.
msno.bar(data)

In [None]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column explicitly: calling
# fillna(..., inplace=True) on `data['bmi']` operates on an intermediate
# object and, under pandas Copy-on-Write, leaves `data` unchanged.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())

# Confirm that no column has missing values left.
data.isnull().sum()

**Successfully imputed all missing values — no nulls remain**

In [None]:
# Gender distribution. The column is passed by keyword because recent seaborn
# releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['gender'])

In [None]:
# Age distribution as a horizontal box plot. `x=` is given explicitly because
# recent seaborn releases interpret the first positional argument as `data`.
sns.boxplot(x=data['age'])

In [None]:
# Work-type distribution. The column is passed by keyword because recent seaborn
# releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['work_type'])

**A large number of people work in the private sector**

In [None]:
# Residence-type distribution. The column is passed by keyword because recent
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['Residence_type'])

In [None]:
# Smoking-status distribution. The column is passed by keyword because recent
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['smoking_status'])

In [None]:
# Marital-status distribution. The column is passed by keyword because recent
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['ever_married'])

In [None]:
# Target class distribution. The column is passed by keyword because recent
# seaborn releases interpret the first positional argument as `data`, not `x`.
sns.countplot(x=data['stroke'])

**The Data is Highly Imbalanced**

In [None]:
# Number of distinct work_type categories.
data['work_type'].nunique()

#### MAPPING OF CATEGORICAL VARIABLES

In [None]:
# Encode work_type as integer codes.
# NOTE: Series.map turns any value that is not a key of the dict into NaN, so
# running this cell a second time (on already-encoded data) blanks the column.
data['work_type'] = data['work_type'].map({
    'Private': 0,
    'Self-employed': 1,
    'Govt_job': 2,
    'children': 3,
    'Never_worked': 4,
})

In [None]:
# Preview the encoded work_type column.
data['work_type'].head()

In [None]:
# Encode the remaining categorical columns as integer codes.
# NOTE(review): any category that is not listed in a mapping becomes NaN —
# confirm each dict covers every value present in its column.
data['gender'] = data['gender'].map({
    'Male': 0,
    'Female': 1,
})
data['Residence_type'] = data['Residence_type'].map({
    'Urban': 0,
    'Rural': 1,
})
data['smoking_status'] = data['smoking_status'].map({
    'formerly smoked': 0,
    'never smoked': 1,
    'smokes': 2,
    'Unknown': 3,
})
data['ever_married'] = data['ever_married'].map({
    'Yes': 0,
    'No': 1,
})


In [None]:
# Preview the frame after categorical encoding.
data.head()

In [None]:
# Annotated heatmap of pairwise Pearson correlations (pandas' default method).
plt.figure(figsize=(16, 10))
sns.heatmap(data.corr(), annot=True)

In [None]:
# Scatter plot of average glucose level against age.
plt.figure(figsize=(16, 10))
sns.scatterplot(data=data, x='age', y='avg_glucose_level')

In [None]:
# Bar plot of age by heart-disease status, split by work type.
# catplot is a figure-level function that always creates its own figure, so a
# preceding plt.figure(figsize=...) only produced an empty extra canvas and did
# not size this plot. The requested 20x15 inch size is passed through
# height/aspect instead (aspect = width / height).
sns.catplot(
    data=data,
    x='heart_disease',
    y='age',
    hue="work_type",
    kind="bar",
    height=15,
    aspect=20 / 15,
)

In [None]:
# Bar plot of age by hypertension status, split by work type.
sns.catplot(
    data=data,
    kind="bar",
    x='hypertension',
    y='age',
    hue="work_type",
)

In [None]:
# Bar plot of stroke by smoking status, split by work type.
sns.catplot(
    data=data,
    kind="bar",
    x="smoking_status",
    y="stroke",
    hue="work_type",
)


In [None]:
# Bar plot of stroke by hypertension status, split by work type.
sns.catplot(
    data=data,
    kind="bar",
    x="hypertension",
    y="stroke",
    hue="work_type",
)

In [None]:
# Bar plot of stroke by residence type, split by work type.
sns.catplot(
    data=data,
    kind="bar",
    x="Residence_type",
    y="stroke",
    hue="work_type",
)

In [None]:
# Box plot of average glucose level for each stroke class.
sns.catplot(
    data=data,
    kind="box",
    x='stroke',
    y="avg_glucose_level",
)

## Applying ML Algorithm

In [None]:
# Predictor columns fed to the models.
features = [
    'age', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type',
    'avg_glucose_level', 'bmi', 'gender', 'work_type', 'smoking_status',
]

# Target column, kept as a one-element list so that `y` is a single-column DataFrame.
label = ['stroke']

X, y = data[features], data[label]

In [None]:
# Count missing values per feature column.
X.isnull().sum()

In [None]:
# Count missing values per feature column (same check as the previous cell).
X.isnull().sum()

In [None]:
# Fill missing gender codes with 1.
# `assign` builds a new frame instead of setting an attribute on a frame that
# was selected out of `data`, which avoids SettingWithCopy ambiguity and keeps
# the cell safe to re-run.
X = X.assign(gender=X['gender'].fillna(1))

In [None]:
# Treating imbalanced data using SMOTE

from imblearn.over_sampling import SMOTE

# SMOTE draws synthetic samples at random; a fixed seed makes the resampled
# data (and every metric computed from it) reproducible across runs.
smote = SMOTE(random_state=42)
x_smote, y_smote = smote.fit_resample(X, y)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Split the ORIGINAL data first: 67% train, then the held-out 33% is halved
# into validation and test. Stratifying keeps the rare stroke class present in
# the same proportion in every part.
X_train, X_hold, y_train, y_hold = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_hold, y_hold, test_size=0.5, random_state=42, stratify=y_hold
)

# Oversample the training part only. Resampling before the split lets synthetic
# points interpolated from validation/test rows leak into training and inflates
# every reported metric; validation and test must keep the real class balance.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Preview the training features.
X_train.head()

In [None]:
# Preview the training labels.
y_train.head()

In [None]:
# Rescaling data
# The scaler is fitted on the training features only and then applied to the
# other splits, so no statistics from held-out rows influence training.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# The validation split was previously left unscaled, which would make it
# unusable with models trained on the scaled features.
X_valid = sc.transform(X_valid)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Display the scaled test features.
X_test

In [None]:
# Create a logistic-regression classifier with default hyperparameters.
from sklearn.linear_model import LogisticRegression
LR=LogisticRegression()

In [None]:
# Fit the classifier. `y_train` is a single-column DataFrame; scikit-learn
# expects a 1-D target, so it is flattened explicitly rather than relying on an
# implicit conversion (whose warning is silenced in this notebook).
LR.fit(X_train, np.ravel(y_train))


In [None]:
# Predict class labels for the test features with the fitted logistic regression.
y_pred=LR.predict(X_test)

In [None]:
 # Display the predicted class labels for the test split.
 y_pred

In [None]:
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score,confusion_matrix,precision_recall_curve,auc,roc_curve,recall_score, classification_report


In [None]:
# Text classification report for the logistic-regression test predictions;
# print() is used so the report's line breaks render.
class_report=classification_report(y_test,y_pred)
print(class_report)

In [None]:
# ROC AUC measures how well a continuous score ranks positives above negatives,
# so it is computed from the predicted probability of class 1 rather than from
# hard 0/1 predictions (which reduce the curve to a single operating point).
# The result is stored as `lr_auc` so that the `auc` function imported from
# sklearn.metrics is not overwritten.
lr_auc = roc_auc_score(y_test, LR.predict_proba(X_test)[:, 1])
lr_auc

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Keep only the probability of the positive class (column 1) and derive the
# ROC curve points from it.
predicted_probab_log = LR.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, predicted_probab_log)

In [None]:
# Plot the ROC curve (true positive rate against false positive rate) for logistic regression.
from matplotlib import pyplot
pyplot.plot(fpr, tpr, marker='.', label='Logistic Regression')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed seed makes the forest (and the metrics reported for it) reproducible.
RF = RandomForestClassifier(random_state=42)
# `y_train` is a single-column DataFrame; flatten it to the 1-D target that
# scikit-learn expects.
RF.fit(X_train, np.ravel(y_train))

In [None]:
# Hard class predictions, kept for the confusion matrix.
RF_pred = RF.predict(X_test)
# ROC AUC is computed from the predicted probability of class 1: scoring hard
# 0/1 labels collapses the curve to a single point and misstates the ranking
# quality of the model.
roc_auc_score(y_test, RF.predict_proba(X_test)[:, 1])

In [None]:
# Confusion matrix of the random-forest predictions on the test split.
cm = confusion_matrix(y_test, RF_pred)
cm


In [None]:
# Random-forest probability of the positive class (column 1) for each test row.
predicted_probab = RF.predict_proba(X_test)[:, 1]

In [None]:
# ROC curve points for the random forest; the thresholds array is discarded.
fpr, tpr, _ = roc_curve(y_test, predicted_probab)

In [None]:
# Plot the ROC curve (true positive rate against false positive rate) for the random forest.
from matplotlib import pyplot
pyplot.plot(fpr, tpr, marker='.', color='red', label='Random Forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()