In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# os.walk() only descends into directories: handed the path of the CSV file
# itself it yields nothing and the loop never prints. Walk the dataset folder
# instead so the files it contains are actually listed.
for dirname, _, filenames in os.walk('../input/stroke-prediction-dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the stroke-prediction CSV into a pandas DataFrame

csv_path = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
data = pd.read_csv(csv_path)
print(type(data))  # sanity check: shows the type of the loaded object

In [None]:
# Overview of the frame (dtypes / non-null counts), then summary statistics for 'bmi'.
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would emit an extra "None" line).
data.info()

print (data['bmi'].describe()) # For Statistical Info

# Remove outliers from the 'bmi' column with the 1.5 * IQR rule

Q1 = data['bmi'].quantile(0.25)
Q3 = data['bmi'].quantile(0.75)
IQR = Q3 - Q1
print ("The interquartile range between Q3 and Q1 >>" , IQR)
# Keep only rows whose bmi lies inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
# Rows with a missing bmi fail both comparisons and are therefore dropped as well.
filtered_data = data.query('(@Q1 - 1.5 * @IQR) <= bmi <= (@Q3 + 1.5 * @IQR)') # query function

# Mean of the outlier-free 'bmi' values.
# NOTE: this value is computed on the whole dataset and is for exploration only.
# For modelling, the statistic should be derived from X_train alone; otherwise
# information from the test rows leaks into the preprocessing.
bmi_mean = filtered_data['bmi'].mean()
print (bmi_mean)

In [None]:
# Separate the target column from the predictors

y = data['stroke']               # target labels (pandas Series)
X = data.drop(columns='stroke')  # every remaining column is used as a training feature
# NOTE(review): if the frame carries a pure row-identifier column, it is kept
# as a feature here - confirm that this is intended.

In [None]:
data.isna().sum() # missing values per column; presumably 'bmi' is the column with gaps (it is imputed later on) - confirm from the output

In [None]:
# Train-Test split stage

from sklearn.model_selection import train_test_split

# 80 / 20 split with a fixed seed for reproducibility.
# stratify=y keeps the class proportions of the target identical in both
# splits; without it a rare class can end up over- or under-represented in
# the test set purely by chance.
X_train,X_test,y_train,y_test = train_test_split(
    X, y, train_size = 0.8, test_size = 0.2, random_state = 0, stratify = y)


# Names of the categorical (object-dtype) feature columns
categorical_col  = [col for col in X_train.columns if X_train[col].dtype == "object"]
print (categorical_col)

# Categorical columns whose test-split values all occur in the training split too.
# An encoder fitted on X_train can transform these columns of X_test without
# meeting an unseen category. This check matters most on larger data sets,
# where rare categories may be missing from the training split.
good_label_col = [col for col in categorical_col if
                   set(X_test[col]).issubset(set(X_train[col]))]

print ("Good_label",good_label_col)

In [None]:
from sklearn.preprocessing import OneHotEncoder # use this for categorical data when there is no relationship
from sklearn.preprocessing import OrdinalEncoder # use ordinal encoder for ordered relationship data
from sklearn.preprocessing import LabelBinarizer # use this for binary classification (Yes or No)
from sklearn.impute import SimpleImputer

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing: the training features contain categorical data that needs
# encoding and a 'bmi' column that needs imputing. Everything is bundled in a
# ColumnTransformer so it is fitted on X_train and re-applied unchanged to X_test.


# Defining Encoders
# (the variables keep their "_imputer" suffix, although the first two are encoders)

# Dense one-hot output; categories not seen during fit are ignored instead of raising.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new spelling first and fall back for older versions.
try:
    OH_imputer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_imputer = OneHotEncoder(handle_unknown='ignore', sparse=False)

# LabelBinarizer was tried for the two-valued columns but raised an error inside
# the ColumnTransformer, so OrdinalEncoder is used for them instead.
ordinal_imputer = OrdinalEncoder()

# Fill value for missing 'bmi': mean of the training values that lie inside the
# 1.5 * IQR fences. It is derived from X_train only, so no statistic of the
# test rows leaks into the preprocessing (a constant computed on the whole
# dataset would do exactly that).
bmi_train = X_train['bmi']
bmi_q1 = bmi_train.quantile(0.25)
bmi_q3 = bmi_train.quantile(0.75)
bmi_iqr = bmi_q3 - bmi_q1
bmi_inliers = bmi_train[(bmi_train >= bmi_q1 - 1.5 * bmi_iqr) &
                        (bmi_train <= bmi_q3 + 1.5 * bmi_iqr)]
bmi_fill_value = float(bmi_inliers.mean())
simple_imputer = SimpleImputer(strategy='constant', fill_value=bmi_fill_value)


# Defining the features handled by each transformer (only categorical columns are inspected)
binary_feature_col = [col for col in categorical_col if X_train[col].nunique() < 3]  # fewer than 3 distinct values
one_hot_col = [col for col in categorical_col if X_train[col].nunique() >= 3]        # 3 or more distinct values
simple_impute_col = ['bmi']
# Since there is no ordinal data column in our dataset we leave this step


# Creating column transformer; columns not listed below are passed through untouched
imputer = ColumnTransformer([
    ('ordinal_imputer', ordinal_imputer, binary_feature_col),
    ('one_hot_imputer', OH_imputer, one_hot_col),
    ('Simple_imputer', simple_imputer, simple_impute_col),
], remainder='passthrough')


# Creating pipeline - each step is a (name, estimator) tuple
pipeline = Pipeline(steps=[('preprocessor', imputer)])

In [None]:
# Transformation
# The pipeline is fitted on the training split only (fit_transform); the test
# split is then transformed with those already-fitted steps (transform), so
# it is never used for fitting.
imputed_X_train = pipeline.fit_transform(X_train)
imputed_X_test = pipeline.transform(X_test)

In [None]:
# Creating Models

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# random_state is fixed so that repeated runs build the same model
clf = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=0)
clf.fit(imputed_X_train, y_train)
prediction = clf.predict(imputed_X_test)

# accuracy_score expects (y_true, y_pred) in that order.
# NOTE(review): if the target classes are heavily imbalanced, accuracy alone can
# look good for a model that rarely predicts the minority class - consider
# reporting recall / F1 / ROC-AUC as well.
print (accuracy_score(y_test, prediction))