In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the stroke-prediction CSV from the Kaggle input directory and preview its first five rows.
dataset = pd.read_csv(
    "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)
dataset.head(n=5)

In [None]:
# Display pandas' summary statistics for the freshly loaded dataset.
dataset.describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Count the rows in each stroke class, print the counts, and show them as a small bar chart.
strokes = dataset["stroke"].value_counts()
print(strokes)
plt.figure(figsize=(3, 4))
sns.barplot(
    x=strokes.index,
    y=strokes.values,
    alpha=0.7,
)
plt.show()

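The class counts above show that the dataset is heavily imbalanced (far fewer stroke cases than non-stroke cases), so we balance it by undersampling the majority class before modelling.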

In [None]:
# Balance the data set by undersampling the no-stroke rows to 300 and keeping
# every stroke row.
# random_state pins which rows are drawn, so the balanced frame (and every plot
# and metric computed from it) is the same on each Restart & Run All.
dataNoStrok = dataset[dataset.stroke != 1]
dataYesStroke = dataset[dataset.stroke != 0]
dataNoStrok = dataNoStrok.sample(300, random_state=5)
dataset = pd.concat([dataNoStrok, dataYesStroke], axis=0)
dataset.shape

In [None]:
# Draw one stacked histogram per numeric feature, coloured by stroke outcome.
plt.figure(figsize=(15, 17))
fft = ["age", "avg_glucose_level", "bmi"]
for n, f in enumerate(fft, start=1):
    # Panels fill a 4x2 grid in reading order; only the first len(fft) slots are used.
    plt.subplot(4, 2, n)
    sns.histplot(
        data=dataset,
        x=f,
        hue='stroke',
        multiple="stack",
        edgecolor="black",
        alpha=0.7,
    )
    sns.despine()
    plt.title("Countplot of Strokes  by {}".format(f))
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of smoking-status counts, restricted to rows where stroke == 1.
sm = dataset.loc[dataset.stroke == 1, "smoking_status"].value_counts()
sns.barplot(x=sm.index, y=sm.values, alpha=0.7)

In [None]:
# Bar chart of work-type counts, restricted to rows where stroke == 1.
sm = dataset.loc[dataset.stroke == 1, "work_type"].value_counts()
sns.barplot(x=sm.index, y=sm.values, alpha=0.7)

In [None]:
# Preview the first rows of the dataset before the categorical columns are encoded.
dataset.head()

In [None]:
# Tally how many rows fall under each smoking_status label and display the tally.
smoking = dataset["smoking_status"].value_counts()
smoking

In [None]:
def gender_conv(x):
    """Encode a gender label as an integer.

    Returns 1 for 'Male', 2 for 'Female', and 0 for any other value.
    """
    for label, code in (('Male', 1), ('Female', 2)):
        if x == label:
            return code
    return 0

def work_type_conv(x):
    """Encode a work-type label as an integer.

    Returns 1 for 'Private', 2 for 'Self-employed', 3 for 'Govt_job',
    4 for 'children', and 0 for any other value (including non-strings).

    Surrounding whitespace on string inputs is ignored. The previous version
    compared against 'Self-employed' followed by a tab character, so the
    clean label never matched and was silently encoded as 0.
    """
    # Strip only real strings; NaN / None fall through to the 0 default.
    label = x.strip() if isinstance(x, str) else x
    if label == 'Private'       : return 1
    if label == 'Self-employed' : return 2
    if label == 'Govt_job'      : return 3
    if label == 'children'      : return 4
    return 0
    
def status_conv(x):
    """Encode a yes/no answer as an integer.

    Returns 1 for 'Yes', 2 for 'No', and 0 for any other value.
    """
    if x == 'Yes':
        code = 1
    elif x == 'No':
        code = 2
    else:
        code = 0
    return code

def residence_type(x):
    """Encode a residence label as an integer.

    Returns 1 for 'Urban', 2 for 'Rural', and 0 for any other value.
    """
    return 1 if x == 'Urban' else (2 if x == 'Rural' else 0)

def smokingSt_conv(x):
    """Encode a smoking-status label as an integer.

    Returns 1 for "never smoked", 2 for "formerly smoked", 3 for "smokes",
    and 0 for any other value (including non-strings).

    Surrounding whitespace on string inputs is ignored. The previous version
    compared against "never smoked " with a trailing space, so the clean
    label never matched and was silently encoded as 0.
    """
    # Strip only real strings; NaN / None fall through to the 0 default.
    label = x.strip() if isinstance(x, str) else x
    if label == "never smoked" : return 1
    if label == "formerly smoked" : return 2
    if label == "smokes" : return 3
    return 0

In [None]:
# Inspect the ever_married column before it is replaced by its integer encoding.
dataset.ever_married.describe()

In [None]:
# Replace each categorical column with its integer encoding, then preview two rows.
# Gotcha: every converter returns 0 for values it does not recognise, so running
# this cell a second time on already-encoded columns turns them all into 0.
for column, convert in (
    ("gender", gender_conv),
    ("smoking_status", smokingSt_conv),
    ("ever_married", status_conv),
    ("Residence_type", residence_type),
    ("work_type", work_type_conv),
):
    dataset[column] = dataset[column].apply(convert)
dataset.head(2)

In [None]:
# Report whether any cell of the dataset is still null.
has_missing = dataset.isnull().values.any()
print("Any missing values in training set:", has_missing)

In [None]:
# Fill the null entries of each listed column with that column's own mean.
for column in (
    "gender",
    "smoking_status",
    "ever_married",
    "Residence_type",
    "work_type",
    "age",
    "hypertension",
    "heart_disease",
    "avg_glucose_level",
    "bmi",
):
    dataset[column] = dataset[column].fillna(dataset[column].mean())

In [None]:
# Re-check for nulls after the mean fill and preview the first two rows.
has_missing = dataset.isnull().values.any()
print("Any missing values in training set:", has_missing)
dataset.head(n=2)

In [None]:
# Columns used as model inputs; the stroke column is the prediction target.
features = [
    "age",
    "bmi",
    "avg_glucose_level",
    "hypertension",
    "heart_disease",
    "smoking_status",
]
X = dataset.loc[:, features]
Y = dataset["stroke"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=5,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [None]:
# Fit a logistic-regression classifier on the training split.
# max_iter is raised from scikit-learn's default of 100: the features are fed in
# unscaled, and the solver can hit the default cap before converging, which
# would leave a partially optimised model. If the solver already converges
# within 100 iterations the fitted model is unchanged.
logistic_regression = LogisticRegression(max_iter=1000)
logistic_regression.fit(X_train, Y_train)

In [None]:
# Predict labels for the held-out rows and print the accuracy against the true labels.
Y_pred = logistic_regression.predict(X_test)
accuracy = metrics.accuracy_score(Y_test, Y_pred)
print('Accuracy: ', accuracy)

In [None]:
# Cross-tabulate actual vs. predicted labels and render the table as an annotated heatmap.
confusion_matrix = pd.crosstab(
    index=Y_test,
    columns=Y_pred,
    rownames=['Actual'],
    colnames=['Predicted'],
)
sns.heatmap(confusion_matrix, annot=True)
plt.show()

In [None]:
# Print scikit-learn's classification report comparing the true test labels with the predictions.
print(metrics.classification_report(Y_test,Y_pred))