In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Separate Dependent Variable From Independents

In [None]:
# Load the raw stroke dataset; the target is split off from the predictors later.
X = pd.read_csv(
    '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
)
X.head()

In [None]:
# (number of rows, number of columns) of the raw frame
X.shape

# Dealing with missing values

In [None]:
# Per-column summary of missing values: absolute count and percentage of rows.
is_missing = X.isna()  # computed once and reused for both statistics
null = is_missing.sum()
per_null = (is_missing.mean() * 100).round(1)
missing_data = pd.concat(
    [null, per_null],
    axis=1,
    keys=['number of missing values', '% of missing values'],
)
missing_data

# Missing Values Visualization

In [None]:
# Visualize where values are missing in the raw data.
# Author's observation from the plot: only the bmi column contains nulls.
import missingno as ms
ms.matrix(X)

In [None]:
from sklearn.impute import SimpleImputer

# Imputation
# 'most_frequent' is kept because the frame mixes string and numeric columns.
my_imputer = SimpleImputer(strategy='most_frequent')

# For a mixed-type frame fit_transform returns an untyped (object) array, which
# loses the column names, the index and the per-column dtypes. Restore all
# three so numeric columns stay numeric in every later step.
X_impute = pd.DataFrame(
    my_imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
).astype(X.dtypes.to_dict())

In [None]:
# Re-plot after imputation; per the author's observation, bmi no longer has
# any null values.
ms.matrix(X_impute)

# Find Correlation

In [None]:
import seaborn as sns
# Class balance: number of rows for each value of the stroke label.
sns.countplot(data=X_impute, x='stroke')
plt.title('Number of patient stroked')
plt.show()

In [None]:
# Stroke counts split by gender.
sns.countplot(data=X_impute, x='gender', hue='stroke')
plt.title('Relation between gender and stroke')
plt.show()

In [None]:
# Exact stroke / no-stroke counts per gender.
X_impute['stroke'].groupby(X_impute['gender']).value_counts()

In [None]:
# Stroke counts split by smoking status.
sns.countplot(data=X_impute, x='smoking_status', hue='stroke')
plt.title('relation between smoking and stroke')
plt.show()

In [None]:
# Counts per smoking status, taken from the imputed frame so the table is
# computed from the same data as the smoking-status plot and the gender table.
X_impute.groupby("smoking_status")['stroke'].value_counts()

In [None]:
# One bar pair per distinct age is too crowded to read well, but it still
# gives the intuition that stroke cases are more likely at older ages.
sns.countplot(data=X_impute, x='age', hue='stroke')
plt.title('relation between age and stroke')
plt.show()

# We change age into 3 groups, which gives a clearer result

In [None]:
def seprate_ages(row):
    """Return a copy of ``row`` with ``age`` replaced by an age-group code.

    Groups: 3 for age > 45, 2 for 30 < age <= 45, and 1 otherwise (this also
    covers a missing age, because comparisons with NaN are false).

    The input row is copied before it is changed. ``DataFrame.apply`` may hand
    the function a row that shares memory with the source frame, so assigning
    to it directly can silently overwrite the frame being iterated.

    Parameters
    ----------
    row : pandas.Series
        One record with a numeric ``age`` entry.

    Returns
    -------
    pandas.Series
        A new row; only ``age`` differs from the input.
    """
    binned = row.copy()
    if row['age'] > 45:
        binned['age'] = 3
    elif row['age'] > 30:
        binned['age'] = 2
    else:
        binned['age'] = 1
    return binned

# Apply the age grouping row by row (axis=1 is the same as axis='columns').
age_seprated_x = X_impute.apply(seprate_ages, axis=1)
age_seprated_x.head()

In [None]:
# Same plot as for raw ages, now drawn on the three age groups so the
# stroke / no-stroke bars can be compared within each group.
sns.countplot(data=age_seprated_x, x='age', hue='stroke')
plt.title('relation between age and stroke')
plt.show()

In [None]:
# Counts per age group. Author's observation: most strokes happened for ages
# above 45 (group 3).
age_seprated_x['stroke'].groupby(age_seprated_x['age']).value_counts()

# Find And Keep Categorical And Numerical Data

In [None]:
# Split the columns by dtype: object (string) columns are label-encoded in the
# next step, integer/float columns are used as they are.
categorical_data = age_seprated_x.select_dtypes(include=['object'])
numerical_data = age_seprated_x.select_dtypes(include=['integer', 'float'])

# Label Encode Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Turn every string column into integer codes (the encoder is refit for each
# column), then put the numeric columns back next to the encoded ones.
encoded_columns = {
    column: label_encoder.fit_transform(categorical_data[column])
    for column in categorical_data.columns
}
features = pd.DataFrame(encoded_columns, index=categorical_data.index).join(numerical_data)
features.head()

In [None]:
# Split off the target; every remaining column is used as a predictor.
# NOTE(review): if the frame carries a row-identifier column it is still among
# the predictors here — confirm whether it should be dropped as well.
y = features['stroke']
features = features.drop(columns='stroke')


# Divide data into training and validation subsets

In [None]:
from sklearn.model_selection import train_test_split
# A fixed seed makes the split, and every score derived from it, reproducible.
# stratify=y keeps the stroke / no-stroke ratio the same in both subsets.
X_train, X_val, y_train, y_val = train_test_split(
    features, y, random_state=0, stratify=y
)
X_train.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Candidate classifiers. Seeds are fixed wherever the estimator accepts one.
random_forest = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
k_neighbor = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
xgboost = XGBClassifier(n_estimators=100, random_state=0)
svc = SVC(kernel='rbf', random_state=0)
naive_bayes = GaussianNB()

# Every candidate, naive_bayes included, so the list can be looped over.
models = [random_forest, k_neighbor, decision_tree, xgboost, svc, naive_bayes]

# Fit each candidate on the training split.
for model in models:
    model.fit(X_train, y_train)

In [None]:
from sklearn.model_selection import cross_val_score
def mean_accuracy_score(model, cv=5, scoring='accuracy'):
    """Return the mean cross-validated score of ``model`` on the training split.

    The data is read from the notebook-level ``X_train`` and ``y_train``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``cross_val_score``.
    cv : int, default 5
        Passed through to ``cross_val_score`` as ``cv``.
    scoring : str, default 'accuracy'
        Passed through to ``cross_val_score`` as ``scoring``.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring=scoring)
    return scores.mean()

In [None]:
# (label, estimator) pairs in reporting order; one loop scores them all with
# the same helper.
scored_models = [
    ("xgboost_pipeline", xgboost),
    ("random_forest_pipeline", random_forest),
    ("k_neighbor_pipeline", k_neighbor),
    ("decision_tree_pipeline", decision_tree),
    ("svc_pipeline", svc),
    ("naive_bayes", naive_bayes),
]
for label, estimator in scored_models:
    print(label + " Score:", mean_accuracy_score(estimator))

# Predicting the validation data with the best model

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
y_pred = svc.predict(X_val)

# Accuracy alone does not show how the errors are spread over the two classes,
# so also print the confusion matrix (rows: true class, columns: predicted).
print(confusion_matrix(y_val, y_pred))
accuracy_score(y_val, y_pred)

