In [None]:
import os
# Lazily enumerate every file under the Kaggle input mount (directory by
# directory, in os.walk order) and echo its full path.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_files:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 

Load Dataset

In [None]:
# Read the stroke-prediction CSV from the Kaggle input directory into `data`.
data = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")

In [None]:
# Preview the first rows of the raw table.
data.head()

Exploratory analysis

In [None]:
# Per-column summary: dtype and non-null count (shows which columns have gaps).
data.info()

In [None]:
# Report the distinct values of every categorical/ordinal column, followed by
# the summary statistics of the numeric columns.
rule = "=" * 50

# (printed label, column name) pairs; each label carries its own tab padding.
categorical_labels = [
    ("gender \t\t", "gender"),
    ("hypertension \t\t", "hypertension"),
    ("heart_disease \t", "heart_disease"),
    ("ever_married \t\t", "ever_married"),
    ("work_type \t\t", "work_type"),
    ("Residence_type \t", "Residence_type"),
    ("smoking_status \t", "smoking_status"),
]

print(rule)
print(" Categorical and Ordinal variables ")
print(rule)
for label, column in categorical_labels:
    print(label, data[column].unique())
# The target gets an extra legend after its values.
print("stroke \t\t", data["stroke"].unique(), " yes | no ")
print(rule)
print(" Numerical variables ")
print(rule)
print(data.describe())

Data cleaning & feature engineering

In [None]:
# Integer codes for each text-valued column, one mapping per column.
gender_codes = {"Male": 1, "Female": 2, "Other": 3}
married_codes = {"No": 1, "Yes": 2}
residence_codes = {"Urban": 1, "Rural": 2}
work_type_codes = {
    'Private': 5,
    'Self-employed': 4,
    'Govt_job': 3,
    'children': 2,
    'Never_worked': 1,
}
smoking_codes = {
    'formerly smoked': 2,
    'never smoked': 1,
    'smokes': 3,
    'Unknown': 0,
}

# Column name -> {original value: integer code}, in the nested form that
# DataFrame.replace accepts.
cleanup_dict = {
    "gender": gender_codes,
    "ever_married": married_codes,
    "Residence_type": residence_codes,
    "work_type": work_type_codes,
    "smoking_status": smoking_codes,
}

In [None]:
# Apply the per-column value mappings from cleanup_dict; `data` itself is left
# untouched and the encoded result is stored in `clean_data`.
clean_data = data.replace(cleanup_dict)

In [None]:
# Count the missing values column by column (axis=0 sums down the rows).
clean_data.isnull().sum(axis = 0) # number of NaN values in every column

In [None]:
clean_data.shape[0] # number of rows before rows with missing values are dropped

In [None]:
# Drop every row that contains at least one NaN; the result is a new frame.
clean_data2 = clean_data.dropna()

In [None]:
clean_data2.shape[0] # number of rows remaining after dropna()

In [None]:
# Rows lost to dropna(): size before minus size after.
rows_deleted = clean_data.shape[0] - clean_data2.shape[0]
print("Number of rows deleted for missing value ", rows_deleted)

### Model creation

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics

In [None]:
# Predictor columns fed to the model, in the order they appear in the frame.
features = [
    "gender",
    "age",
    "hypertension",
    "heart_disease",
    "ever_married",
    "work_type",
    "Residence_type",
    "avg_glucose_level",
    "bmi",
    "smoking_status",
]

# Column to predict, kept as a list so it can be passed as `columns=`.
target = ["stroke"]

In [None]:
# Feature matrix: the columns named in `features`, taken from the frame that
# had its NaN rows dropped.
X = pd.DataFrame(clean_data2, columns=features)
# Target, kept as a one-column DataFrame (the column(s) named in `target`).
Y = pd.DataFrame(clean_data2, columns=target)
# Quick look at the first feature rows.
X.head()

In [None]:
# Hold out 40% of the rows for testing; random_state pins the shuffle so the
# same split is produced on every run.
# NOTE(review): the split is not stratified on the target — if the stroke
# classes are imbalanced, consider passing stratify=Y (confirm against the data).
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4, random_state = 42)

In [None]:
# Seed the tree: scikit-learn randomly permutes the candidate features at each
# split, so an unseeded DecisionTreeClassifier can produce a different tree
# (and a different score) on every run. 42 matches the seed used for the
# train/test split.
model = DecisionTreeClassifier(random_state=42)

# Train the decision tree classifier on the training split
model = model.fit(X_train, y_train)

# Predict the response for the held-out test set
y_pred = model.predict(X_test)

In [None]:
# Overall fraction of test rows classified correctly.
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Accuracy is a single aggregate and hides how each class is handled, so also
# report the confusion matrix and per-class precision / recall / F1.
print("Confusion matrix (rows = true class, columns = predicted class):")
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

### Model visualization 

In [None]:
! pip install graphviz
! pip install pydotplus

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus


In [None]:
# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
my_file = StringIO()
export_graphviz(model, out_file=my_file,
                filled=True, rounded=True,
                special_characters=True,
                feature_names = features,
                class_names=['0','1'])

# Parse the DOT text and render it to stroke.png.
graph = pydotplus.graph_from_dot_data(my_file.getvalue())
graph.write_png('stroke.png')

# Display the file that was just written rather than calling create_png()
# again, so Graphviz lays out the tree only once.
Image(filename='stroke.png')