In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder, MinMaxScaler

In [None]:
# One seed shared by NumPy's global RNG and the scikit-learn calls further down.
seed = 2021
np.random.seed(seed)

# Use seaborn's dark-grid theme for every figure in the notebook.
sns.set_style("darkgrid")

In [None]:
# Load the raw stroke-prediction table from the Kaggle input directory.
data = pd.read_csv(
    "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna()

In [None]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so the train and test partitions keep the same class proportions;
# a plain random split lets the share of stroke cases in the test set drift.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=seed,
    stratify=data["stroke"],
)

# **Data exploration**

In [None]:
# Column dtypes and non-null counts of the training split.
train_data.info()

In [None]:
# Summary statistics of the training split.
train_data.describe()

In [None]:
# Histograms of the three continuous columns, before any outlier trimming.
train_data[["age", "avg_glucose_level", "bmi"]].hist(
    bins=50,
    figsize=(20, 15),
)
plt.show()

Let's remove some outliers.

In [None]:
# Compute every cut-off on the untrimmed training split first. Filtering step by
# step would evaluate each later quantile on an already-shrunken frame, so the
# thresholds would depend on statement order instead of being the stated
# percentiles of the training data.
bmi_low = train_data["bmi"].quantile(0.01)
bmi_high = train_data["bmi"].quantile(0.97)
glucose_high = train_data["avg_glucose_level"].quantile(0.99)

# Keep rows strictly inside the BMI band and strictly below the glucose cap.
# Note: train_data is reassigned, so re-running this cell trims the frame again.
train_data = train_data[
    (train_data["bmi"] > bmi_low)
    & (train_data["bmi"] < bmi_high)
    & (train_data["avg_glucose_level"] < glucose_high)
]

In [None]:
# Re-plot the two trimmed columns to check the effect of the outlier filter.
train_data[["bmi", "avg_glucose_level"]].hist(
    bins=35,
    figsize=(20, 10),
)
plt.show()

In [None]:
# Count plot of smoking status, bars ordered from most to least frequent.
b1 = sns.catplot(
    x="smoking_status",
    kind="count",
    palette="ch:.25",
    data=train_data,
    aspect=3,
    order=train_data["smoking_status"].value_counts().index,
)
plt.subplots_adjust(top=0.9)  # leave headroom for the figure title
b1.fig.suptitle("Smoking distribution", fontsize=17)  # title typo fixed

In [None]:
# Count plot of work type, bars ordered from most to least frequent.
b1 = sns.catplot(
    x="work_type",
    kind="count",
    palette="ch:.25",
    data=train_data,
    aspect=3,
    order=train_data["work_type"].value_counts().index,
)
plt.subplots_adjust(top=0.9)  # leave headroom for the figure title
b1.fig.suptitle("Working distribution", fontsize=17)  # title typo fixed

In [None]:
fig, ax = plt.subplots(figsize=(9, 7))

# Restrict the correlation to numeric/bool columns explicitly. The frame also
# holds categorical columns (e.g. gender, work_type), and from pandas 2.0 on
# DataFrame.corr() no longer skips such columns silently but raises instead.
b5 = sns.heatmap(
    train_data.select_dtypes(include=["number", "bool"]).corr(),
    ax=ax,
)
plt.subplots_adjust(top=0.9)  # leave headroom for the title
b5.set_title('CORRELATION BETWEEN INPUTS', fontsize=17)

# Data preprocessing

In [None]:
# Numeric columns go through a single min-max scaling step.
numerical_pipeline = Pipeline([("scaler", MinMaxScaler())])

In [None]:
# Encoder for the two-valued columns: each category becomes an integer code.
binary_pipeline = OrdinalEncoder()

In [None]:
# One-hot encoder for the multi-valued categorical columns; handle_unknown='ignore'
# keeps transform() from failing on categories that were not seen during fit.
categorical_pipeline = OneHotEncoder(handle_unknown='ignore')

In [None]:
# List the distinct values found in each column.
print(train_data.apply(lambda column: column.unique()))

In [None]:
# Continuous columns (scaled).
numerical_features = [
    "age",
    "avg_glucose_level",
    "bmi",
]
# Two-valued columns (ordinal-encoded).
binary_features = [
    "hypertension",
    "heart_disease",
    "ever_married",
    "Residence_type",
]
# Multi-valued categorical columns (one-hot encoded).
categorical_features = [
    "gender",
    "work_type",
    "smoking_status",
]

In [None]:
# Custom engineered feature; it didn't work well, so it is not used in the pipeline.
from sklearn.preprocessing import FunctionTransformer
def add_extra_features(X):
    """One-hot encode ``avg_glucose_level`` into three buckets.

    Parameters
    ----------
    X : mapping or DataFrame
        Must provide an ``"avg_glucose_level"`` entry holding numeric values.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(n_rows, 3)``. Per row exactly one column is 1:
        column 0 when the value is > 200, column 1 when it is < 140, and
        column 2 otherwise (this includes 140..200 and NaN, for which both
        comparisons are false). An empty input yields shape ``(0, 3)``.
    """
    glucose = np.asarray(X["avg_glucose_level"], dtype=float)

    # The cut-offs look like clinical glucose thresholds -- TODO confirm source.
    is_high = glucose > 200
    is_low = ~is_high & (glucose < 140)
    is_other = ~(is_high | is_low)

    # Stack the three indicator vectors as columns; 0/1 integers like before.
    return np.column_stack([is_high, is_low, is_other]).astype(int)


In [None]:
# Route each feature group to its own transformer. Columns that are not named
# in any group are dropped (ColumnTransformer's default remainder="drop").
preprocess_pipeline = ColumnTransformer(
    transformers=[
        # Disabled experiment (glucose buckets from add_extra_features):
        # ('attribs_adder', FunctionTransformer(add_extra_features, validate=False), train_data.columns),
        ("num", numerical_pipeline, numerical_features),
        ("bin", binary_pipeline, binary_features),
        ("cat", categorical_pipeline, categorical_features),
    ]
)

In [None]:
# Target first, then fit the preprocessing on the training features and encode them.
y = train_data["stroke"]
X = preprocess_pipeline.fit_transform(train_data.drop(columns=["stroke"]))

In [None]:
# Encode the held-out features with the transformers already fitted on the
# training data (transform only, no refit).
y_test = test_data["stroke"]
X_test = preprocess_pipeline.transform(test_data.drop(columns=["stroke"]))

# **Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
# Depth-limited random forest; class_weight='balanced_subsample' reweights the
# classes within each bootstrap sample.
rf = RandomForestClassifier(
    max_depth=8,
    criterion='entropy',
    class_weight='balanced_subsample',
    random_state=seed,
)
rf.fit(X, y)
predicted_y = rf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Passing the
# predictions first transposes the confusion matrix and swaps precision and
# recall in the report, so the ground truth goes first here.
print(confusion_matrix(y_test, predicted_y))
print(classification_report(y_test, predicted_y))