In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the stroke-prediction CSV, relative to the notebook.
DATA_PATH = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the last; the last column is the target.
# NOTE(review): the stroke dataset presumably carries a row-identifier column
# named "id" -- confirm. It is dropped when present so an arbitrary key is not
# fed to the model; errors="ignore" makes this a no-op if the column is absent.
X = df.iloc[:, :-1].drop(columns=["id"], errors="ignore")
y = df.iloc[:, -1]

# A fixed seed yields the same split on every Restart & Run All.
# stratify=y keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Show column dtypes and non-null counts of the training features.
X_train.info()

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric features: every training column with a NumPy numeric dtype.
num_cols = X_train.select_dtypes(include=np.number).columns

# Impute missing values with the column median, then apply StandardScaler.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

# Categorical features: every training column stored with object dtype.
cat_cols = X_train.select_dtypes(include=[object]).columns

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value.

    Attributes
    ----------
    most_frequent_ : pd.Series
        Set by ``fit``. Maps a column label to the most frequent non-missing
        value seen in that column. Columns that had no non-missing values get
        no entry and are therefore left untouched by ``transform``.
    """

    def fit(self, X, y=None):
        """Record the most frequent non-missing value of every column of ``X``.

        ``X`` may be a DataFrame or anything ``pd.DataFrame`` accepts; ``y`` is
        ignored. Returns ``self``.
        """
        frame = pd.DataFrame(X)
        modes = {}
        for col in frame.columns:
            # value_counts() sorts by descending count and skips missing values.
            counts = frame[col].value_counts()
            # An all-missing column yields empty counts; indexing [0] would raise.
            if not counts.empty:
                modes[col] = counts.index[0]
        # dtype=object keeps an empty result well-defined and stores values as-is.
        self.most_frequent_ = pd.Series(modes, dtype=object)
        return self

    def transform(self, X, y=None):
        """Return ``X`` as a new DataFrame with missing values replaced.

        Each column is filled with the value stored for its label in
        ``most_frequent_``; ``y`` is ignored.
        """
        return pd.DataFrame(X).fillna(self.most_frequent_)
    
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and later removed the old name. Try the new spelling first and fall back for
# older installs; either way the encoder returns a dense array.
# handle_unknown="ignore" encodes a category that was absent during fit as an
# all-zero row instead of raising, so a rare level that lands only in the test
# split cannot break transform.
try:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Categorical branch: fill missing values with the most frequent value,
# then one-hot encode.
cat_pipeline = Pipeline([
    ("imputer", MostFrequentImputer()),
    ("cat_encoder", cat_encoder),
])


In [None]:
# Combine the numeric and categorical pipelines into one full preprocessing pipeline.
from sklearn.compose import ColumnTransformer

# Route each group of columns through its matching sub-pipeline.
column_routes = [
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
]
full_pipeline = ColumnTransformer(transformers=column_routes)

In [None]:
# Fit the preprocessing on the training features and transform them in one step.
X_processed = full_pipeline.fit_transform(X_train)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameter values the randomized search samples from.
params = {
    'max_depth': list(range(3, 10)),
    'min_child_weight': list(range(1, 6)),
    'n_estimators': list(range(1, 150)),
    'gamma': [0, 0.1, 0.001],
}

# The name `grid` is kept because later cells read `grid.best_estimator_`,
# although this is a randomized search rather than an exhaustive grid.
# NOTE(review): accuracy can be misleading if the target classes are
# imbalanced -- confirm this is the metric that should drive model selection.
grid = RandomizedSearchCV(
    estimator=XGBClassifier(),
    param_distributions=params,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,  # same sampled candidates on every run
)

In [None]:
# Run the hyper-parameter search on the preprocessed training data.
grid.fit(X_processed, y_train)

In [None]:
# Chain the preprocessing with the best estimator found by the search.
final_steps = [
    ("preprocessing", full_pipeline),
    ("xgb", grid.best_estimator_),
]
full_pipeline_with_predictor = Pipeline(steps=final_steps)


In [None]:
# Fit preprocessing and classifier together on the raw training split.
full_pipeline_with_predictor.fit(X_train,y_train)

In [None]:
# Score the fitted pipeline on the held-out test split (raw features go in;
# the pipeline applies its own preprocessing before the classifier scores them).
full_pipeline_with_predictor.score(X_test, y_test)