### Predicting Stroke using machine learning

Stroke is a leading cause of death, yearly, about 800,000 people in the United States have a stroke, so predicting Stroke is an important mission.

Aim: based on the (10) given parameters if we can predict Stroke?

In [None]:
import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
# Preparing the necessary tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline  
from sklearn.ensemble import RandomForestClassifier
# model evaluation
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

### Data exploration and processing

In [None]:
# Summary of the DataFrame: column names, dtypes, and non-null counts.
df.info()

In [None]:
# The "stroke" column, used later as the prediction target.
df["stroke"]

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Filling missing values
# Replace missing entries with the column's most frequent value (mode).
# The result is assigned back to df[column] rather than using
# df[column].fillna(..., inplace=True): the in-place form operates on an
# intermediate object, is deprecated in pandas 2.x (FutureWarning), and does
# not update df once Copy-on-Write is enabled.
for column in ['bmi']:
    df[column] = df[column].fillna(df[column].mode()[0])

In [None]:
# Re-check the per-column missing-value counts after the fill.
df.isna().sum()

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Histogram of the age column; the trailing semicolon hides the cell's text output.
df.age.plot.hist();

In [None]:
# Dropping unnecessary columns
df.drop("id", axis=1, inplace = True)

In [None]:
df.shape

In [None]:
# Print the label of every column that pandas reports as string-typed.
string_labels = (
    name
    for name, values in df.items()
    if pd.api.types.is_string_dtype(values)
)
for name in string_labels:
    print(name)

In [None]:
# turning strings into categories
for label, content in df.items():
    if pd.api.types.is_string_dtype(content):
        df[label] = content.astype("category").cat.as_ordered()

In [None]:
df.work_type.cat.categories

In [None]:
# Call info() (with parentheses) so the dtype / non-null summary is printed;
# the bare attribute only displays the bound-method repr.
df.info()

In [None]:
# turn categorical variables into numbers
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        df[label] = pd.Categorical(content).codes+1  

In [None]:
df.info()

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heatmap.
# The matrix is computed once (the earlier standalone df.corr() call discarded
# its result) and the heatmap is drawn explicitly on the axes created here.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            fmt=".2f",
            cmap="YlGnBu",
            ax=ax);

In [None]:
# Modelling
X = df.drop("stroke", axis = 1)
y = df["stroke"]

In [None]:
X.head()

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Split the data
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
X_train

In [None]:
y_train, len(y_train)

In [None]:
# fit and score the model
# Train a random forest with default hyperparameters (fit returns the fitted
# estimator), then report its mean accuracy on the held-out test set.
clf = RandomForestClassifier().fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# First rows of the feature matrix again, for reference.
X.head()

In [None]:
# Predicted class labels for the test set.
clf.predict(X_test)

In [None]:
# True test labels as an array; wrapping y_test in a list adds a leading axis of length 1.
np.array([y_test])

In [None]:
# Accuracy computed by hand: fraction of predictions equal to the true labels.
y_preds = clf.predict(X_test)
np.mean(y_preds==y_test)

In [None]:
# Mean accuracy on the training data.
clf.score(X_train, y_train)

In [None]:
# Mean accuracy on the test data.
clf.score(X_test, y_test)

In [None]:
# Cross-validated scores on the full dataset, using scikit-learn's default number of folds.
cross_val_score(clf, X, y)

In [None]:
cross_val_score = cross_val_score(clf, X, y, cv=5)
np.mean(cross_val_score)

In [None]:
print(f"Stroke Prediction Classifier cross-validated Accuracy:{np.mean(cross_val_score) *100:.2f}%")

In [None]:
from sklearn.metrics import roc_curve

# make predictions with probabilities
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)

y_probs = clf.predict_proba(X_test)
y_probs[:10]

In [None]:
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

In [None]:
# False-positive rates, true-positive rates, and the score thresholds that produce them.
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

def plot_roc_curve(fpr, tpr):
    """Draw a ROC curve together with the chance-level diagonal.

    Args:
        fpr: False positive rates, used as x-axis values.
        tpr: True positive rates, used as y-axis values.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    chance_line = [0, 1]

    # Model curve first, then the dashed diagonal labelled "guessing".
    plt.plot(fpr, tpr, label="ROC", color="orange")
    plt.plot(chance_line, chance_line, label="guessing", linestyle="--", color="darkblue")

    plt.title("Receiver operating characteristics (ROC) curve")
    plt.ylabel("true_positive_rate(tpr)")
    plt.xlabel("false_positive_rate(fpr)")

    plt.legend()
    plt.show()
    
# Plot the ROC curve from the rates computed above.
plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve for the class-1 probabilities on the test set.
roc_auc_score(y_test, y_probs_positive)