# Stroke-Prediction (Classification Problem)

In [None]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence every warning category for the rest of the session so library notices
# do not clutter the cell outputs.
# NOTE(review): this also hides warnings that can signal real problems
# (deprecated call forms, solver convergence) — consider narrowing the filter.
warnings.filterwarnings(action="ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer

In [None]:
# Read the Kaggle stroke dataset into a DataFrame and preview its first ten rows
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
df.head(n=10)

**Data Attributes**

* id: unique identifier
* gender: "Male", "Female" or "Other"
* age: age of the patient
* hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
* heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
* ever_married: "No" or "Yes"
* work_type: "children", "Govt_job", "Never_worked", "Private" or "Self-employed"
* Residence_type: "Rural" or "Urban"
* avg_glucose_level: average glucose level in blood
* bmi: body mass index
* smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown" ("Unknown" means the information is unavailable for this patient)
* stroke: 1 if the patient had a stroke or 0 if not

In [None]:
# Datatypes of attributes
# Prints each column's name, non-null count and dtype, plus memory usage
df.info()

In [None]:
# Statistical data
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Checking for null values: number of missing entries in each column
df.isna().sum()

In [None]:
# Shape of our data as a (rows, columns) tuple
df.shape

In [None]:
# Drop every row that has a missing value in any column, then remove the "id"
# column (a unique identifier, so it is not a feature).
# errors="ignore" keeps this cell safe to re-run: `df` is reassigned here, so
# on a second execution "id" is already gone and a plain drop would raise
# KeyError.
df = df.dropna(axis=0).drop(columns="id", errors="ignore")

In [None]:
# Column labels currently present in `df`
df.columns

# Data Visualization

In [None]:
# Preview the first five rows of `df` before plotting
df.head()

In [None]:
# Pairwise relationships between the numeric columns, coloured by stroke outcome.
# The frame is passed exactly once: supplying it both positionally and as
# `data=df` raises "TypeError: got multiple values for argument 'data'".
sns.pairplot(df, hue="stroke")
plt.show()

In [None]:
# countplots
# Number of patients per gender. The column is named through `x=` and the frame
# through `data=`, because recent seaborn releases treat the first positional
# argument as `data` rather than `x`.
sns.countplot(x="gender", data=df)
plt.title("Patients by gender")
plt.show()

In [None]:
# Number of patients without (0) and with (1) a heart disease. Explicit
# `x=`/`data=` keywords are used because recent seaborn releases treat the
# first positional argument as `data` rather than `x`.
sns.countplot(x="heart_disease", data=df)
plt.title("Patients by heart disease status")
plt.show()

In [None]:
# Number of patients per residence type. Explicit `x=`/`data=` keywords are
# used because recent seaborn releases treat the first positional argument as
# `data` rather than `x`.
sns.countplot(x="Residence_type", data=df)
plt.title("Patients by residence type")
plt.show()

In [None]:
# Number of patients per smoking status. Explicit `x=`/`data=` keywords are
# used because recent seaborn releases treat the first positional argument as
# `data` rather than `x`.
sns.countplot(x="smoking_status", data=df)
plt.title("Patients by smoking status")
plt.show()

In [None]:
# Number of patients per work type. Explicit `x=`/`data=` keywords are used
# because recent seaborn releases treat the first positional argument as
# `data` rather than `x`.
sns.countplot(x="work_type", data=df)
plt.title("Patients by work type")
plt.show()

In [None]:
# Distribution of patient age
sns.histplot(data=df, x="age")
plt.show()

In [None]:
# Distribution of BMI on a 14x7 inch canvas, split into 30 bins
fig, ax = plt.subplots(figsize=(14, 7))
sns.histplot(data=df, x="bmi", bins=30, ax=ax)
plt.show()

# Feature Engineering

In [None]:
# Preview the first five rows of `df` ahead of encoding
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# One shared encoder instance; it is re-fitted on every column it is applied to
le = LabelEncoder()

In [None]:
# Integer-encode only the non-numeric (text) columns. Running the encoder over
# every column would also replace continuous values such as age, bmi and
# avg_glucose_level with the rank of each unique value, discarding their scale.
# NOTE(review): label codes impose an arbitrary order on nominal categories
# (e.g. work_type); one-hot encoding may suit the linear/SVM models better.
categorical_cols = df.select_dtypes(exclude="number").columns
df1 = df.copy()
df1[categorical_cols] = df1[categorical_cols].apply(le.fit_transform)

In [None]:
# Preview the first five rows of the encoded frame
df1.head()

In [None]:
# Separate the target ("stroke") from the predictors (every other column)
y = df1["stroke"]
x = df1.drop("stroke", axis=1)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible. `stratify=y` keeps the proportion of stroke cases the
# same in both partitions.
# NOTE(review): stratification matters most if the positive class is rare —
# confirm with y.value_counts().
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=24, stratify=y
)
# Shapes in the order: X_train, X_test, y_train, y_test
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
# Preview the training features; a handful of rows is enough to check the
# columns and encoded values without rendering the whole frame
X_train.head()

In [None]:
# Standardise the features: the scaler is fitted on the training split only,
# then that same fitted transformation is applied to both splits
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
# accuracy_score is imported here because this is the first cell that calls it;
# without it a fresh "Restart & Run All" stops with NameError.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a random forest on the scaled training data. The fixed seed makes the
# reported scores repeatable between runs.
model = RandomForestClassifier(random_state=24)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); reversing the arguments swaps
# precision with recall in the report.
report = classification_report(y_test, y_pred)
print(report)
print("Accuracy of Random Forest Classifier Model:", accuracy_score(y_test, y_pred)*100,"%")

In [None]:
# Confusion matrix of the random forest predictions, with true labels on the
# rows and predicted labels on the columns (scikit-learn's (y_true, y_pred)
# argument order). fmt="d" prints the cell counts as plain integers rather than
# in the default ".2g" format, which abbreviates larger counts.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(data=cm, annot=True, fmt="d")
ax.set(xlabel="Predicted label", ylabel="True label")
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Support vector classifier with scikit-learn's default settings
model2 = SVC()
model2.fit(X_train, y_train)
y_pred2 = model2.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); reversing the arguments swaps
# precision with recall in the report.
report2 = classification_report(y_test, y_pred2)
print(report2)
print("Accuracy of SVM Model:", accuracy_score(y_test, y_pred2)*100,"%")

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings
model3 = LogisticRegression()
model3.fit(X_train, y_train)
y_pred3 = model3.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); reversing the arguments swaps
# precision with recall in the report.
report3 = classification_report(y_test, y_pred3)
print(report3)
print("Accuracy of Logistic Regression Model:", accuracy_score(y_test, y_pred3)*100,"%")

In [None]:
import xgboost

# Gradient-boosted tree classifier with 500 estimators
xgb = xgboost.XGBClassifier(n_estimators=500)
xgb.fit(X_train, y_train)
# Predict with `xgb` itself, not an earlier model, so that the metrics below
# actually describe XGBoost.
y_pred4 = xgb.predict(X_test)

# scikit-learn metrics take (y_true, y_pred); reversing the arguments swaps
# precision with recall in the report.
report4 = classification_report(y_test, y_pred4)
print(report4)
print("Accuracy of XGBoost Model:", accuracy_score(y_test, y_pred4)*100,"%")

* Author: Purvit Vashishtha
* Created on : 30.03.2021 at 11:55:40 pm