In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder





import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# # EDA

In [None]:
# Read the stroke-prediction CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded data (head() returns 5 rows by default).
dataset.head()

In [None]:
# Basic structure of the dataset: column names, non-null counts and dtypes.
dataset.info()

In [None]:
## Summary statistics for the dataset; include='all' also summarises non-numeric columns.
dataset.describe(include='all')

In [None]:
# Number of missing values in each column.
dataset.isna().sum()

In [None]:
# 'bmi' has null values; replace them with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form is
# deprecated and, with pandas copy-on-write enabled, leaves `dataset` unchanged.
dataset['bmi'] = dataset['bmi'].fillna(dataset['bmi'].mean())

In [None]:
# Check whether the null values were successfully replaced.
dataset.isna().sum()

In [None]:
# Drop the 'id' column, which does not contribute to the analysis of the dataset.
# Rebinding the result (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: the in-place version raised KeyError on a second
# execution because 'id' had already been removed.
dataset = dataset.drop(columns='id', errors='ignore')


In [None]:
# Preview the data again after the 'bmi' imputation and the removal of 'id'.
dataset.head()

# # Univariate Analysis

In [None]:
# Class balance of the target: print the count of each class, then plot the counts as bars.
df_value_stroke = dataset['stroke'].value_counts()
non_stroke_total = df_value_stroke.loc[0]
stroke_total = df_value_stroke.loc[1]
print('Non Stroke:{}'.format(non_stroke_total))
print('Stroke:{}'.format(stroke_total))
df_value_stroke.plot.bar()
plt.title('Stroke Analysis')
plt.show()

In [None]:
# Numerical features: names of all columns whose dtype is not object.
column_dtypes = dataset.dtypes
dataset_numerical = column_dtypes[column_dtypes != 'O'].index.tolist()
dataset_numerical

In [None]:
# Categorical features: names of all columns whose dtype is object.
column_dtypes = dataset.dtypes
dataset_categorical = column_dtypes[column_dtypes == 'O'].index.tolist()
dataset_categorical

In [None]:
# Relationship between each categorical feature and the stroke target:
# one count plot per feature, with bars split by the value of 'stroke'.
# The plots only read from `dataset`, so the per-iteration `dataset.copy()` was
# dropped; it deep-copied the whole frame on every pass for no effect.
for feature in dataset_categorical:
    sns.countplot(data=dataset, x=feature, hue="stroke")
    plt.xlabel(feature)
    plt.title(feature)
    plt.show()

# Feature Engineering

In [None]:
# Check whether each numerical feature looks normally distributed.
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws the
# equivalent histogram-plus-density figure. The plot only reads from `dataset`,
# so the per-iteration copy of the frame was removed.
for feature in dataset_numerical:
    sns.histplot(x=dataset[feature], kde=True, stat="density")
    plt.xlabel(feature)
    plt.title(feature)
    plt.show()


In [None]:
# Histogram of each categorical feature.
# The plot only reads from `dataset`, so the per-iteration deep copy of the frame
# was removed.
for feature in dataset_categorical:
    sns.histplot(x=dataset[feature], bins=50)
    plt.xlabel(feature)
    plt.title(feature)
    plt.show()

# Outliers 

In [None]:
# Look for outliers with one box plot per numerical feature.
# Plotting straight from `dataset` avoids deep-copying the frame on every iteration.
for feature in dataset_numerical:
    dataset.boxplot(column=feature)
    plt.ylabel(feature)
    plt.title(feature)
    plt.show()

In [None]:
# Print the number of distinct values in every column (a missing value counts as one value).
for feature in dataset.columns:
    distinct_total = dataset[feature].nunique(dropna=False)
    print(feature, ':', distinct_total)

In [None]:
# One-hot encode 'gender' and 'work_type'; drop_first=True drops the first level of
# each so the remaining indicator columns are not redundant.
# dtype=int pins the indicator columns to 0/1 integers on every pandas version
# (since pandas 2.0 the default indicator dtype is bool), so the numeric plotting
# and scaling later in the notebook see plain numbers.
data = pd.get_dummies(dataset, columns = ["gender", "work_type"], drop_first = True, dtype = int)

In [None]:
# Display the frame produced by the one-hot encoding step.
data

In [None]:
# Replacing categorical features with numerical values.
# Build a {category: count} dict for 'ever_married'. value_counts() sorts by
# descending count by default, so the key order of this dict (which the following
# cell enumerates to assign integer codes) depends on the data.
ever_married_map = data['ever_married'].value_counts().to_dict()
ever_married_map

In [None]:
# Give each 'ever_married' category an integer code (0, 1, ...) following the key
# order of ever_married_map, then replace the column's values with those codes.
ordinal_label = dict(zip(ever_married_map, range(len(ever_married_map))))
ever_married_codes = data['ever_married'].map(ordinal_label)
data['ever_married'] = ever_married_codes

In [None]:
# Build a {category: count} dict for 'Residence_type'. value_counts() sorts by
# descending count by default, so the key order of this dict (which the following
# cell enumerates to assign integer codes) depends on the data.
Residence_type_map = data['Residence_type'].value_counts().to_dict()
Residence_type_map

In [None]:
# Give each 'Residence_type' category an integer code (0, 1, ...) following the key
# order of Residence_type_map, then replace the column's values with those codes.
ordinal_label_1 = dict(zip(Residence_type_map, range(len(Residence_type_map))))
residence_codes = data['Residence_type'].map(ordinal_label_1)
data['Residence_type'] = residence_codes

In [None]:
# Encode 'smoking_status' as numeric codes with scikit-learn's OrdinalEncoder, then
# show how many rows received each code.
# NOTE(review): the codes impose an ordering on the categories - confirm that is intended.
ord_encoder = OrdinalEncoder()
smoking_column = data["smoking_status"].to_numpy().reshape(-1, 1)  # encoder expects a 2-D input
smoking_codes = ord_encoder.fit_transform(smoking_column)
data["smoking_status"] = smoking_codes
data["smoking_status"].value_counts()

In [None]:
# Preview the first rows of the encoded frame.
data.head()

In [None]:
# Column names, non-null counts and dtypes of the encoded frame.
data.info()

In [None]:
# Check whether each column of the encoded data looks normally distributed.
# sns.distplot is deprecated; histplot with kde=True and stat="density" draws the
# equivalent histogram-plus-density figure. Values are cast to float first so that
# indicator columns (bool on recent pandas) are plotted as ordinary numbers.
for feature in data.columns:
    sns.histplot(data[feature].astype(float), kde=True, stat="density")
    plt.title(feature)
    plt.show()

# Removing Outliers

In [None]:
# Box plot of avg_glucose_level to inspect its outliers; the plot's return value is kept in `fig`.
fig = data.boxplot(column='avg_glucose_level')

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of avg_glucose_level.
data['avg_glucose_level'].describe()

In [None]:
## Compute the interquartile range (IQR) of avg_glucose_level and the extreme-outlier
## boundaries located 3 * IQR below the first quartile and above the third quartile.
glucose_q1 = data['avg_glucose_level'].quantile(0.25)
glucose_q3 = data['avg_glucose_level'].quantile(0.75)
IQR = glucose_q3 - glucose_q1

lower_bridge = glucose_q1 - (IQR*3)
upper_bridge = glucose_q3 + (IQR*3)

# Print each boundary with its own statement. `print(a),print(b)` formed a tuple as
# the cell's last expression, so the notebook additionally echoed `(None, None)`.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Cap avg_glucose_level at 165: every value above the threshold is replaced by 165.
data['avg_glucose_level'] = data['avg_glucose_level'].clip(upper=165)

In [None]:
# Box plot of avg_glucose_level again, now that values above 165 have been capped.
data.boxplot(column='avg_glucose_level')

In [None]:
## Compute the interquartile range (IQR) of bmi and the extreme-outlier boundaries
## located 3 * IQR below the first quartile and above the third quartile.
bmi_q1 = data['bmi'].quantile(0.25)
bmi_q3 = data['bmi'].quantile(0.75)
IQR = bmi_q3 - bmi_q1

lower_bridge = bmi_q1 - (IQR*3)
upper_bridge = bmi_q3 + (IQR*3)

# Print each boundary with its own statement. `print(a),print(b)` formed a tuple as
# the cell's last expression, so the notebook additionally echoed `(None, None)`.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Box plot of bmi to inspect its outliers.
data.boxplot(column='bmi')

In [None]:
# Cap bmi at 45: every value above the threshold is replaced by 45.
data['bmi'] = data['bmi'].clip(upper=45)

In [None]:
# Box plot of bmi again, now that values above 45 have been capped.
data.boxplot(column='bmi')

In [None]:
# Histogram of bmi with 50 bins, titled and x-labelled with the column name.
figure = data.bmi.hist(bins=50)
figure.set_title('bmi')
figure.set_xlabel('bmi')


In [None]:

# Histogram of avg_glucose_level with 50 bins, titled and x-labelled with the column name.
figure = data.avg_glucose_level.hist(bins=50)
figure.set_title('avg_glucose_level')
figure.set_xlabel('avg_glucose_level')


In [None]:
# Separate the predictors (every column except 'stroke') from the target column.
X = data.drop("stroke", axis=1)
y = data.loc[:, "stroke"]

In [None]:
# Split the data into train and test sets to avoid data leakage.
from sklearn.model_selection import train_test_split
# stratify=y keeps the proportion of each 'stroke' class the same in the train and
# test sets, so the 20% test split cannot end up with a distorted share of a class
# (important if the target is imbalanced - see the class counts printed earlier).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 24, stratify = y
)
print(f"Train Data: {X_train.shape}, {y_train.shape}")
print(f"Test Data: {X_test.shape}, {y_test.shape}")

# Scaling the Data

In [None]:
# Standardise the features: learn the scaling parameters from the training set only,
# then apply that same fitted scaler to both the training and the test set.
std_scaler = StandardScaler()
std_scaler.fit(X_train)
X_train = std_scaler.transform(X_train)
X_test = std_scaler.transform(X_test)

In [None]:
# Fit a logistic regression (default settings) on the training data and predict
# the class of every test row.
from sklearn.linear_model import LogisticRegression
classifer = LogisticRegression().fit(X_train, y_train)
y_pred = classifer.predict(X_test)


In [None]:
# Evaluate the model's predictions on the test set.
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print("Accuaracy-score:{}".format(accuracy_score(y_test,y_pred)))
print((confusion_matrix(y_test,y_pred)))
# Accuracy alone can look high even when one class is never predicted, so also print
# per-class precision, recall and F1. zero_division=0 reports 0 (instead of emitting
# an undefined-metric warning) for a class that received no predictions.
print(classification_report(y_test, y_pred, zero_division=0))