In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Handle missing values: replace missing BMI entries with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df['bmi'] selection: that chained in-place call
# raises a FutureWarning on pandas 2.x and, with Copy-on-Write enabled, only
# modifies a temporary object and leaves df unchanged.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

# Confirm that no missing values remain.
df.isnull().sum()

In [None]:
# Number of people per gender, split by stroke outcome (raw counts, not rates).
sns.countplot(data=df, x='gender', hue='stroke')
plt.show()

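##### Females account for more stroke cases in this plot, but it shows raw counts rather than rates, so on its own it does not mean females have a higher chance of getting a stroke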

In [None]:
# Histogram of age in 10 bins with a kernel-density curve overlaid.
sns.displot(df['age'], bins=10, kde=True)
plt.show()

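##### The age distribution peaks around the 40s–60s; note that this plot shows how many people of each age are in the dataset, not their stroke risk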

In [None]:
# Switch seaborn to the "darkgrid" theme, then count people by marital status.
sns.set_theme(style="darkgrid")
sns.countplot(x="ever_married", data=df)
plt.show()

In [None]:
# Number of people in each work type.
sns.countplot(x="work_type", data=df)
plt.show()

In [None]:
# Number of people in each work type, split by gender.
sns.countplot(x="work_type", hue='gender', data=df)
plt.show()

##### The private sector looks the most vulnerable, which makes sense — though these plots show head counts per work type, not stroke rates

In [None]:
# Number of people in each smoking-status category.
sns.countplot(x="smoking_status", data=df)
# Rotate the category labels so the longer names do not overlap.
plt.xticks(rotation=90)
plt.show()

##### Surprisingly, people who never smoked appear to have more chances — but this plot shows counts per smoking status, not stroke rates

In [None]:
# Distribution of BMI (histogram + KDE) on an explicit 7x7 figure.
# sns.displot is a figure-level function that always creates its own figure,
# so the figure created here used to be left empty. The axes-level
# sns.histplot draws onto the axes we pass in instead.
fig, ax = plt.subplots(figsize=(7, 7))
sns.histplot(df.bmi, color="orange", label="bmi", kde=True, ax=ax)
ax.legend()
plt.show()

In [None]:
# Age distribution of people without a stroke (green) vs with a stroke (red).
# sns.distplot is deprecated; sns.histplot with kde=True and stat="density"
# is its documented replacement. Density normalisation keeps the two groups
# comparable even though they contain very different numbers of people.
age_fig, age_ax = plt.subplots(figsize=(12, 10))

sns.histplot(df.loc[df.stroke == 0, "age"], color="green", label="No stroke",
             kde=True, stat="density", ax=age_ax)
sns.histplot(df.loc[df.stroke != 0, "age"], color="red", label="Stroke",
             kde=True, stat="density", ax=age_ax)

# The plotted variable is age, so the title says so (it used to say BMI).
age_ax.set_title("No Stroke vs Stroke by Age", fontsize=15)
age_ax.set_xlim([10, 100])
age_ax.legend()
plt.show()


## Preprocessing for ML

In [None]:
# Encode every non-numeric (text/categorical) column of df as integer codes.
# LabelEncoder is imported in the first cell of the notebook.

# Kept so that `le` is always defined after this cell, even when nothing
# needs encoding (e.g. when the cell is run a second time).
le = LabelEncoder()

# One fitted encoder per encoded column, so the code -> category mapping can
# be recovered later via label_encoders[col].classes_ or .inverse_transform().
label_encoders = {}

for col in df.columns:
    # "Not numeric" rather than dtype == 'object': this also catches pandas'
    # dedicated string dtype, which does not compare equal to 'object'.
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Classifier

In [None]:
# Features: every column except the row identifier and the target.
# (KNeighborsClassifier and train_test_split are imported in the first cell.)
X = df.drop(columns=['id', 'stroke'])
y = df.stroke

# Hold out 30% of the rows for testing.
# stratify=y keeps the proportion of the rare stroke class the same in both
# splits; random_state makes the split (and everything downstream of it)
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Class balance of the target shown as a pie chart.
sizes = df.stroke.value_counts(sort=True)
labels = sizes.index

colors = ["lightblue", "red"]
explode = (0.05, 0)  # pull the first wedge slightly away from the centre

plt.figure(figsize=(7, 7))
pie_options = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct="%1.1f%%",
    shadow=True,
    startangle=90,
)
plt.pie(sizes, **pie_options)

plt.title("Stroke Percent")
plt.show()

##### As you can see, the percentage of people with a stroke is low, so the classes are heavily imbalanced and this will affect our model

In [None]:
# Balance the training split with SMOTE (imported in the first cell).
# Only the training data is resampled; the test split is left untouched.

# Class counts before oversampling.
print("Before OverSampling, counts of label '1': {}".format((y_train == 1).sum()))
print("Before OverSampling, counts of label '0': {} \n".format((y_train == 0).sum()))

sm = SMOTE(random_state=2)
# y_train is passed as-is: it is already one-dimensional, and Series.ravel()
# is deprecated in recent pandas releases.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Class counts after oversampling.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

## Searching right hyperparameters

In [None]:
# 5-fold grid search over the number of neighbours for KNN.
# (GridSearchCV is imported in the first cell.)
knn = KNeighborsClassifier()

# n_neighbors must be at least 1. The grid used to start at 0, which is not a
# valid value and made every fit for that candidate fail.
para = {'n_neighbors': range(1, 10)}

grid_cv = GridSearchCV(knn, para, cv=5)

grid_cv.fit(X_train_res, y_train_res)

In [None]:
# Test-set accuracy of KNN for k = 1..49, stored as {k: accuracy}.
# NOTE(review): this curve is measured on the test split, so it should only be
# used for illustration; choosing k from it would leak test information.
best = {}
for i in range(1, 50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train_res, y_train_res)
    acc = knn.score(X_test, y_test)
    best[i] = round(acc, 3)

plt.plot(list(best.keys()), list(best.values()))
# Ticks cover only the evaluated range; ticks up to 95 used to stretch the
# x-axis to twice the width of the data.
plt.xticks(range(0, 51, 5))
plt.grid(True)
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()

## Fitting best model

In [None]:
# Best KNN found by the grid search. GridSearchCV refits the best estimator on
# the whole data passed to fit() by default (refit=True), so best_estimator_
# is already trained and does not need to be fitted again here.
best_model = grid_cv.best_estimator_

# Predictions for the held-out test split.
y_pred = best_model.predict(X_test)

In [None]:
# Mean accuracy of the selected KNN model on the held-out test split.
best_model.score(X_test, y_test)

In [None]:
# Per-class precision / recall / F1 and overall accuracy on the test split.
# (classification_report and accuracy_score are imported in the first cell.)
print(classification_report(y_test, y_pred))

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy happens to
# be symmetric, but keeping the conventional order avoids mistakes when this
# line is adapted to an asymmetric metric such as precision or recall.
print(accuracy_score(y_test, y_pred))

In [None]:
# Sanity check on one known stroke case: the row at position 10 among the
# stroke-positive rows, reduced to the feature columns the model was trained on.
# Building the sample from df (instead of a hand-copied list of numbers) keeps
# the values and the feature names in sync with the data.
# NOTE(review): this row may be part of the training split, so a correct
# prediction here is not evidence of generalisation.
stroke_sample = df[df['stroke'] == 1].iloc[[10]].drop(columns=['id', 'stroke'])

best_model.predict(stroke_sample)

In [None]:
# PCA is needed for the projection below
from sklearn.decomposition import PCA

# PCA with all components kept, fitted on the resampled training features
model = PCA()
pca_features = model.fit_transform(X_train_res)

# The first two principal components become the scatter coordinates
xs, ys = pca_features[:, 0], pca_features[:, 1]

# Scatter plot of component 0 against component 1 with equal axis scaling
plt.scatter(xs, ys)
plt.axis('equal')
plt.show()

## Dimension reduction (if necessary)

In [None]:
# Imports used by this cell
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Standardise the features, then run PCA on the standardised values
scaler, pca = StandardScaler(), PCA()
pipeline = make_pipeline(scaler, pca)

# Fitting the pipeline fits both steps on the resampled training features
pipeline.fit(X_train_res)

# Bar chart of the variance explained by each principal component
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

##### Apart from the first component, the remaining components explain a similar amount of variance, so I don't think dimensionality reduction is necessary

# DecisionTrees

In [None]:
# 10-fold grid search over tree depth and minimum leaf size, scored by ROC AUC.
# A fixed random_state makes the search reproducible: a decision tree breaks
# ties between equally good splits at random, so an unseeded search can select
# different "best" hyperparameters on every run.
dtc = DecisionTreeClassifier(random_state=1)
depth = np.arange(1, 30)
leaves = [1, 2, 4, 5, 10, 20, 30, 40, 80, 100]
param_grid = [{'max_depth': depth,
               'min_samples_leaf': leaves}]
grid_search = GridSearchCV(estimator=dtc, param_grid=param_grid,
                           scoring='roc_auc', cv=10)
grid_search = grid_search.fit(X_train_res, y_train_res)

In [None]:
# Best decision tree found by the grid search.
dt = grid_search.best_estimator_
# NOTE: these are predictions on the resampled *training* data, not on the test split.
y_pred = dt.predict(X_train_res)

In [None]:
# Hyperparameter combination with the best cross-validated ROC AUC in the search.
grid_search.best_params_

In [None]:
from sklearn.metrics import roc_auc_score

# Predicted probability of the positive class (second column) for each test row
y_pred_proba = dt.predict_proba(X_test)[:, 1]

# ROC AUC of the tree on the held-out test split
test_roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

# Report it with three decimals
print('Test set ROC AUC score: {:.3f}'.format(test_roc_auc))

In [None]:
# `y_pred` was computed from X_train_res, so comparing it with y_train_res
# gives the accuracy on the resampled *training* data, which is optimistic.
# The held-out test accuracy is reported next to it so the two are not confused.
train_accuracy = accuracy_score(y_train_res, y_pred)
test_accuracy = accuracy_score(y_test, dt.predict(X_test))

print('Decision tree train accuracy: {:.3f}'.format(train_accuracy))
print('Decision tree test accuracy : {:.3f}'.format(test_accuracy))

In [None]:
# Sanity check of the tree on one known stroke case: the row at position 10
# among the stroke-positive rows, reduced to the model's feature columns.
# Taking the row from df keeps the values and feature names in sync with the
# data instead of relying on a hand-copied list of numbers.
# NOTE(review): this row may be part of the training split, so a correct
# prediction here is not evidence of generalisation.
tree_sample = df[df['stroke'] == 1].iloc[[10]].drop(columns=['id', 'stroke'])

dt.predict(tree_sample)

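##### Accuracy is about 95%, which looks impressive — but note that it is measured on the resampled training data, so check the test-set metrics before trusting it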

# Comparing Models and Choosing best one

In [None]:
# Fit three classifiers on the resampled training data and compare their
# accuracy on the held-out test split.

# Set seed for reproducibility
SEED = 1

# Logistic regression. max_iter is raised above the default of 100 because the
# features are not scaled here and the solver may otherwise stop before it has
# converged (NOTE(review): confirm by checking for a ConvergenceWarning).
lr = LogisticRegression(random_state=SEED, max_iter=1000)

# K nearest neighbours (the KNN alias is kept in case other cells use it).
KNN = KNeighborsClassifier
knn = KNN(n_neighbors=1)

# Decision tree. NOTE(review): max_depth and min_samples_leaf are hard-coded,
# presumably copied from an earlier grid-search result; confirm they still match.
dt = DecisionTreeClassifier(max_depth=27, min_samples_leaf=5, random_state=SEED)

# (display name, estimator) pairs, in the format VotingClassifier also accepts
classifiers = [('Logistic Regression', lr), ('K Nearest Neighbours', knn), ('Classification Tree', dt)]

for clf_name, clf in classifiers:

    # Fit clf to the resampled training set
    clf.fit(X_train_res, y_train_res)

    # Predict the test set labels
    y_pred = clf.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred) in that order
    accuracy = accuracy_score(y_test, y_pred)

    # Report clf's accuracy on the test set
    print('{:s} : {:.3f}'.format(clf_name, accuracy))

## VotingClassifier

In [None]:
# Combine the classifiers from the `classifiers` list into a voting ensemble.
# (VotingClassifier is imported in the first cell of the notebook.)
vc = VotingClassifier(estimators=classifiers)

# Fit vc to the resampled training set
vc.fit(X_train_res, y_train_res)

# Predict the test set labels
y_pred = vc.predict(X_test)

# Accuracy on the test set; scikit-learn metrics take (y_true, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))