# First attempt at xgb

Fitting an XGBoost (XGB) classifier and doing a bit of hyperparameter tuning.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# Load the heart-failure clinical records and normalise the column names to
# lower case so later cells can refer to them consistently.
file = '/kaggle/input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file)
df.columns = df.columns.str.lower()

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# List the (lower-cased) column names.
df.columns

In [None]:
# Columns handled as numeric features (imputed and scaled by the pipeline
# steps defined in this notebook).
numeric_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

# Columns handled as categorical features; they are passed through the
# preprocessing untouched.
cat_cols = [
    'anaemia',
    'diabetes',
    'high_blood_pressure',
    'sex',
    'smoking',
]

In [None]:
# One histogram per numeric column, to eyeball skew and outliers.
df[numeric_cols].hist()

In [None]:
# Histogram of the age column on its own.
df['age'].hist()

# Building up a pipeline

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn import metrics

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class BasePipeStep(BaseEstimator, TransformerMixin):
    """Common base for the column-oriented pipeline steps in this notebook.

    Stores the column names a step operates on. ``fit`` learns nothing and
    ``transform`` hands back a copy of its input; subclasses override
    whichever of the two they need.
    """

    def __init__(self, columns):
        # Stored under the constructor argument's own name.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing and return ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` so the caller's object is left untouched."""
        return X.copy()
    
class SelectColumns(BasePipeStep):
    """Pipeline step that keeps only the configured columns."""

    def transform(self, X):
        """Return a new object holding just ``self.columns`` of ``X``.

        The selection happens first and only the selected part is copied,
        so columns that are about to be dropped are never duplicated.
        """
        return X[self.columns].copy()
    
class FillNumericData(BasePipeStep):
    """Pipeline step that fills missing values with per-column means."""

    def fit(self, X, y=None):
        """Record the mean of every configured column of ``X``.

        The means are stored in ``self.means`` keyed by column name.
        Returns ``self``.
        """
        self.means = {name: X[name].mean() for name in self.columns}
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in the configured
        columns replaced by the means recorded in ``fit``."""
        filled = X.copy()
        for name in self.columns:
            column_mean = self.means[name]
            filled[name] = filled[name].fillna(column_mean)
        return filled
    
class LogTransform(BasePipeStep):
    """Pipeline step that applies the natural logarithm to the configured
    columns."""

    def transform(self, X):
        """Return a copy of ``X`` with each configured column replaced by
        its natural log.

        No guarding is done here: the columns are assumed to hold strictly
        positive values (``np.log`` gives ``-inf`` for 0 and ``nan`` for
        negative input).
        """
        logged = X.copy()
        for name in self.columns:
            logged[name] = np.log(logged[name])
        return logged
    
    
class ScaleNumeric(BasePipeStep):
    """Pipeline step that standardises the configured columns with a
    ``StandardScaler``, leaving every other column as it is."""

    def fit(self, X, y=None):
        """Fit a fresh ``StandardScaler`` on the configured columns of ``X``
        and keep it in ``self.scaler``. Returns ``self``."""
        # StandardScaler.fit returns the scaler itself, so this can be chained.
        self.scaler = StandardScaler().fit(X[self.columns])
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose configured columns are replaced by
        their scaled values."""
        scaled = X.copy()
        scaled[self.columns] = self.scaler.transform(scaled[self.columns])
        return scaled

In [None]:
# Preprocessing: select the feature columns, impute numeric gaps with the
# column mean, log-transform three of the numeric columns, then standardise
# all numeric columns. Order matters: imputation runs before the log, and
# scaling sees the already-logged values.
preprocessing = Pipeline(steps=[
    ('feature_selection', SelectColumns(cat_cols + numeric_cols)),
    ('fill_missing', FillNumericData(numeric_cols)),
    (
        'log_transform',
        LogTransform(['creatinine_phosphokinase', 'serum_creatinine', 'age']),
    ),
    ('standard_scaling', ScaleNumeric(numeric_cols)),
])

# Full model: preprocessing followed by the XGBoost classifier. The step
# names ('preprocessing', 'learning') are what the grid-search parameter
# keys are built from, so they must not change.
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    # ('create_new_features', features),
    (
        'learning',
        XGBClassifier(
            objective='binary:logistic',
            eval_metric='auc',
            use_label_encoder=False,
            random_state=42,
        ),
    ),
])

In [None]:
# Feature matrix: categorical columns first, then numeric ones.
X = df[cat_cols + numeric_cols]
# Take the target as a 1-D Series (single brackets) rather than a one-column
# DataFrame; estimators and metrics expect a 1-D label vector.
y = df['death_event']

# Hold out 20% for testing. The split is seeded so results can be reproduced
# between runs, and stratified so both parts keep the same share of death
# events on this small dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# List the tunable parameter names of the pipeline; these are the keys that
# can be used in the grid-search parameter dictionary.
pipeline.get_params().keys()

In [None]:
# Hyper-parameter grid. Keys follow the '<step name>__<parameter>' form so
# they reach the XGBoost classifier inside the pipeline's 'learning' step.
parameters = {
    # Tree depths 1 through 8.
    'learning__max_depth': list(range(1, 9)),
    # 1, 6, 11 and 16 trees, plus XGBoost's default of 100 spelled out
    # explicitly: passing None relies on version-specific handling of a
    # missing value in the sklearn wrapper.
    'learning__n_estimators': list(range(1, 17, 5)) + [100],
    'learning__learning_rate': [0.8],
}

# Exhaustive search over the grid with 3-fold cross-validation, run on
# 5 parallel workers. No `scoring` is given, so candidates are ranked by the
# estimator's own score method.
grid = GridSearchCV(pipeline, parameters, cv=3, n_jobs=5)
grid.fit(X_train, y_train)

In [None]:
# The pipeline with the best cross-validated score from the grid search.
model = grid.best_estimator_

In [None]:
# Show the parameter combination that produced the best estimator.
grid.best_params_

In [None]:


# Hard class predictions on the held-out set, scored by accuracy.
test_predictions = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, test_predictions)

# Second column of predict_proba (taken here as the score for the positive
# class), scored by area under the ROC curve.
test_prediction_probs = model.predict_proba(X_test)[:, 1]
auc_score = metrics.roc_auc_score(y_test, test_prediction_probs)

print(f'Area under ROC of Model On Test Set - {auc_score:,.2%}')
print(f'The accuracy is {accuracy:,.2%}')