# Setup

# Import Modules

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from time import time

# Exploration

In [None]:
# Load the fetal-health CSV into a DataFrame. The relative path expects the
# dataset under ../input/ relative to the current working directory.
df = pd.read_csv('../input/fetal-health-classification/fetal_health.csv')

In [None]:
# Preview the first rows to sanity-check column names and values.
df.head()

In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()

In [None]:
# Print the frame's column summary (dtypes and non-null counts).
df.info()

# Feature and Target Split

In [None]:
# Target column first, then every remaining column as a predictor.
# NOTE(review): recent XGBoost releases expect class labels coded 0..n-1;
# if 'fetal_health' is coded differently the XGB fit may reject it — confirm
# against the installed version.
y = df['fetal_health']
X = df.drop('fetal_health', axis='columns')

# Model Selection

In [None]:
# Shared 5-fold stratified splitter; train_models() reads this module-level
# name when it iterates over the folds.
kf = StratifiedKFold(n_splits=5)

In [None]:
def train_models(models, X, y, cv=None):
    """Cross-validate each candidate model and report the best mean accuracy.

    Every model is refit on each training fold and scored on the matching
    held-out fold. Per-fold timings, train/test accuracy, the classification
    report, the fitted class labels and the confusion matrix are printed as
    the run progresses.

    Parameters
    ----------
    models : iterable of (str, estimator)
        Display name paired with an estimator exposing ``fit``, ``predict``,
        ``score`` and, once fitted, ``classes_``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series or array-like
        Target labels aligned row-for-row with ``X``.
    cv : splitter, optional
        Object whose ``split(X, y)`` yields positional
        ``(train_index, test_index)`` pairs. When omitted, the module-level
        ``kf`` splitter is used.

    Returns
    -------
    dict
        ``{'best_score': ..., 'best_model': ...}`` holding the highest mean
        fold accuracy and the name of the model that achieved it. If no
        model scores above 0 the initial ``0`` / ``None`` values are returned.
    """
    splitter = kf if cv is None else cv

    # Select rows of y by position, exactly as X.iloc does. Plain y[...] on a
    # Series is label-based and only coincides with positions for a default
    # RangeIndex; array-likes without .iloc are already positional.
    y_rows = y.iloc if hasattr(y, 'iloc') else y

    status = {'best_score': 0, 'best_model': None}
    for name, model in models:
        model_scores = []
        print('=' * 80)
        print("Training: ")
        print(model)
        for i, (train_index, test_index) in enumerate(splitter.split(X, y)):
            print('_' * 80)
            print("Iteration: ", i)
            t0 = time()
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y_rows[train_index], y_rows[test_index]
            model.fit(X_train, y_train)

            train_time = time() - t0
            print("Train time: %0.3fs" % train_time)

            t0 = time()
            pred = model.predict(X_test)
            test_time = time() - t0
            print("Test time:  %0.3fs" % test_time)

            print("Accuracy of fit on train set:   %0.3f" % model.score(X_train, y_train))

            score = accuracy_score(y_test, pred)
            print("Accuracy on test set:   %0.3f" % score)

            print("Classification report:")
            print(classification_report(y_test, pred))
            print("Class names identified: ", model.classes_)

            print("Confusion matrix:")
            print(confusion_matrix(y_test, pred))

            model_scores.append(score)

        # Strict '>' keeps the earliest model when two means tie.
        mean_score = np.mean(model_scores)
        if mean_score > status['best_score']:
            status['best_score'] = mean_score
            status['best_model'] = name

    return status

In [None]:
# Registry of (name, estimator) candidates consumed by train_models().
models = []

In [None]:
# Register the candidate classifiers as (name, estimator) pairs. Logistic
# regression is wrapped in a pipeline so the scaler is fitted together with
# it on whatever data the pipeline is trained on.
models.extend([
    ('XGB', XGBClassifier(random_state=0)),
    (
        'Logistic',
        Pipeline([('scaler', StandardScaler()), ('logistic', LogisticRegression())]),
    ),
    ('Decision Tree', DecisionTreeClassifier(random_state=0)),
    ('Random Forest', RandomForestClassifier(random_state=0)),
])

In [None]:
# Cross-validate every registered model. Despite its name, `best_model` holds
# the status dict returned by train_models() (keys 'best_score' and
# 'best_model'), not a fitted estimator.
best_model = train_models(models, X, y)

In [None]:
# Display the status dict: best mean fold accuracy and the winning model's name.
best_model

# Hyperparameter Tuning

In [None]:
# Candidate values for the randomized hyperparameter search over XGBClassifier.
# 'min_child_weight' must appear only once: a dict literal keeps just the last
# duplicate key, so the low (1, 3, 5) and high (6-12) candidates are combined
# into a single list instead of the low range being silently discarded.
parameters = {
    'max_depth': list(range(3, 10, 2)),
    'min_child_weight': [*range(1, 6, 2), 6, 8, 10, 12],
    'gamma': [i / 10.0 for i in range(0, 5)],
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)],
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

In [None]:
# Randomized search over `parameters` for XGBClassifier. Both the estimator
# and the sampler are seeded with 0, matching the seeded models in the model
# comparison, so repeated runs draw the same candidates and give the same
# scores. The shared `kf` splitter is passed explicitly so tuning is scored
# on the same folds as the comparison.
rgs = RandomizedSearchCV(
    XGBClassifier(random_state=0),
    parameters,
    cv=kf,
    random_state=0,
    verbose=10,
)
rgs.fit(X, y)
# List the available result columns (displayed as the cell output).
sorted(rgs.cv_results_)

In [None]:
# Report the best sampled hyperparameters, then their mean cross-validated score.
print(rgs.best_params_, rgs.best_score_, sep='\n')