In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Importing the libraries

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import warnings
# NOTE(review): no category is given, so this silences every warning for the rest of
# the notebook, including deprecation and convergence notices raised by the libraries
# imported above. Consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke CSV into a DataFrame and discard its 'id' column.

DATA_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
stroke = pd.read_csv(DATA_PATH).drop(columns=['id'])

In [None]:
# Dimensions of the loaded frame as (rows, columns).
stroke.shape

In [None]:
# Preview the first few rows to see the column names and value formats.
stroke.head()

In [None]:
# Per-column dtype and non-null count; columns with fewer non-null entries than rows contain missing values.
stroke.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
stroke.describe()

In [None]:
# Correlation heatmap
#
# Only numeric columns are correlated: at this point the frame still holds
# string-valued columns (they are encoded in later cells), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0. select_dtypes is used instead of
# corr(numeric_only=True) so the cell also runs on older pandas releases.

numeric_corr = stroke.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)
ax.set_title('Correlation between numeric features');

In [None]:
# Number of distinct values per column; low counts point to categorical features.
stroke.nunique()

In [None]:
# Frequency of each gender label.
stroke.gender.value_counts()

In [None]:
# Remove the rows whose gender is recorded as 'Other'.
other_gender_rows = stroke.index[stroke['gender'] == 'Other']
stroke = stroke.drop(index=other_gender_rows)

In [None]:
# Age distribution of the records with stroke == 0.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# the replacement recommended by seaborn.
ax = sns.histplot(x=stroke.loc[stroke['stroke'] == 0, 'age'], kde=True, stat='density')
ax.set(title='Age distribution - no stroke', xlabel='age');

In [None]:
# Age distribution of the records with stroke == 1.
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale is
# the replacement recommended by seaborn.
ax = sns.histplot(x=stroke.loc[stroke['stroke'] == 1, 'age'], kde=True, stat='density')
ax.set(title='Age distribution - stroke', xlabel='age');

In [None]:
# Column medians for the records with stroke == 1.
# numeric_only=True is required because the frame still contains string-valued
# columns here, and DataFrame.median() raises on those in pandas >= 2.0.
stroke.loc[stroke['stroke'] == 1, :].median(numeric_only=True)

In [None]:
# Box plot of age for the records with stroke == 1.
# The Series is passed as x= explicitly: a bare positional argument is read as `x`
# by older seaborn releases but as `data` by newer ones, which changes the plot.
ax = sns.boxplot(x=stroke.loc[stroke['stroke'] == 1, 'age'])
ax.set(title='Age of stroke patients', xlabel='age');

In [None]:
# Age distribution of stroke patients, one panel per gender.
# Filters on the raw 'Male' / 'Female' labels, so this must run before the cell
# that encodes gender numerically.
# Uses explicit axes and histplot (the replacement for the deprecated distplot)
# in a loop instead of two copy-pasted pyplot calls.
fig, axes = plt.subplots(1, 2, figsize=(10, 6))
for ax, gender_label in zip(axes, ('Male', 'Female')):
    ages = stroke.loc[(stroke['stroke'] == 1) & (stroke['gender'] == gender_label), 'age']
    sns.histplot(x=ages, kde=True, stat='density', ax=ax)
    ax.set(title=f'Stroke patients - {gender_label}', xlabel='age')
fig.tight_layout();

In [None]:
# Encode smoking_status as a binary flag:
#   'never smoked' -> 0, 'formerly smoked' / 'smokes' -> 1.
# 'Unknown' is deliberately absent from the mapping, so Series.map turns it into NaN,
# which is then filled with the most frequent encoded value.
#
# Two fixes over the previous version:
#   * value_counts().argmax() returns the *position* of the largest count (always 0,
#     because value_counts sorts descending), not the most frequent label; mode()
#     returns the label itself.
#   * replace(..., inplace=True) on an attribute-accessed Series works on a temporary
#     under pandas Copy-on-Write and leaves the frame untouched; the result is now
#     assigned back to the column.
#
# Expects the raw string labels: re-running this cell on already-encoded values would
# map everything to NaN.
smoking_codes = {'never smoked': 0, 'formerly smoked': 1, 'smokes': 1}
smoking_encoded = stroke['smoking_status'].map(smoking_codes)
most_common_code = smoking_encoded.mode().iloc[0]
stroke['smoking_status'] = smoking_encoded.fillna(most_common_code).astype('int64')

In [None]:
# Encode gender as a binary flag (Male -> 1, Female -> 0).
# The result is assigned back to the column: replace(..., inplace=True) on an
# attribute-accessed Series works on a temporary under pandas Copy-on-Write and
# leaves the frame unchanged.
# Series.map turns any label missing from the mapping into NaN, so this expects the
# raw 'Male' / 'Female' strings; re-running on encoded values would yield NaN.
stroke['gender'] = stroke['gender'].map({'Male': 1, 'Female': 0})

In [None]:
# Encode ever_married as a binary flag (Yes -> 1, No -> 0).
# The result is assigned back to the column: replace(..., inplace=True) on an
# attribute-accessed Series works on a temporary under pandas Copy-on-Write and
# leaves the frame unchanged.
# Series.map turns any label missing from the mapping into NaN, so this expects the
# raw 'Yes' / 'No' strings; re-running on encoded values would yield NaN.
stroke['ever_married'] = stroke['ever_married'].map({'Yes': 1, 'No': 0})

In [None]:
# Relabel work_type: 'Never_worked' and 'children' are merged into 'Student', and the
# remaining categories get consistently formatted names.
# The result is assigned back to the column: replace(..., inplace=True) on an
# attribute-accessed Series works on a temporary under pandas Copy-on-Write and
# leaves the frame unchanged.
work_type_labels = {
    'Never_worked': 'Student',
    'children': 'Student',
    'Private': 'Private-Job',
    'Self-employed': 'Self-Employed',
    'Govt_job': 'Govt-Job',
}
stroke['work_type'] = stroke['work_type'].replace(work_type_labels)

In [None]:
# Count of records per category, split by stroke outcome, one panel per feature.
cols = ['gender', 'hypertension', 'heart_disease', 'ever_married',
        'work_type', 'Residence_type', 'smoking_status']

# data=/x=/hue= are passed by keyword: a bare positional Series is read as `data` by
# seaborn >= 0.12, which does not produce the intended per-category counts.
fig, axes = plt.subplots(len(cols), 1, figsize=(7, 25))
for ax, col in zip(axes, cols):
    sns.countplot(data=stroke, x=col, hue='stroke', ax=ax)
    ax.set_xlabel(col)
fig.tight_layout();

In [None]:
# Separate the target column (y) from the predictor columns (X).

y = stroke['stroke']
X = stroke.drop(columns=['stroke'])

In [None]:
# Handle missing data
#
# Median-impute the 'bmi' column. It is addressed by name instead of the positional
# slice 8:9, which silently targets a different column if the column order changes.
# (Position 8 of X is 'bmi' in this dataset's column order - verify with X.columns.)
#
# NOTE(review): the imputer is fitted on the full feature matrix before the
# train/test split done in a later cell, so test rows influence the median. Fit it
# on the training split only if a strict hold-out evaluation is required.

imputer = SimpleImputer(strategy='median')
X['bmi'] = imputer.fit_transform(X[['bmi']]).ravel()

In [None]:
# One-hot encode every predictor column whose dtype is still object.

is_object_column = X.dtypes == 'O'
cat_feature = X.columns[is_object_column].tolist()
X = pd.get_dummies(X, columns=cat_feature)

In [None]:
# Hold out 20% of the rows as a test set, stratified on the target so both splits
# keep the same class proportions; the seed makes the split repeatable.

split_options = dict(test_size=0.2, random_state=32, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Oversample the training split with SMOTE; the test split is left untouched.

sm = SMOTE(random_state=42)
resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = resampled

In [None]:
# Training the Logistic Regression model
#
# max_iter is raised from scikit-learn's default of 100: the features are not
# scaled anywhere in this notebook, so the solver is likely to stop before
# converging, and the resulting ConvergenceWarning would be hidden by the
# notebook-wide warnings.filterwarnings('ignore').

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
# Predicting
# Class labels predicted by the fitted model for the held-out test features.

y_pred = model.predict(X_test)

In [None]:
# Per-class precision, recall and F1 on the test set.

report = classification_report(y_test, y_pred)
print(report)

**TODO:** Work in progress - improving the model is the next step.