In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [None]:
# Plot styling first (matplotlib theme, then seaborn's grid style on top of it).
plt.style.use('ggplot')
sns.set_style('darkgrid')

# Load the fetal health dataset from the Kaggle input directory.
data = pd.read_csv(
    r'/kaggle/input/fetal-health-classification/fetal_health.csv'
)

In [None]:
# Overview of the loaded frame: columns, dtypes and non-null counts.
data.info()

In [None]:
# Count of missing values in each column.
data.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Remove exact duplicate rows (the original index labels are kept, so the index has gaps).
data = data.drop_duplicates()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

In [None]:
!pip install pingouin

In [None]:
import pingouin as pg

In [None]:
# Spearman correlation of every column pair, reduced to the rows whose second
# variable is the target and to the columns of interest.
pair_corr = (
    pg.pairwise_corr(data, method='spearman')
    .loc[lambda pairs: pairs['Y'] == 'fetal_health', ['X', 'Y', 'r', 'p-unc']]
)

# Display the pairs ordered by uncorrected p-value (pair_corr itself stays unsorted).
pair_corr.sort_values(by='p-unc')

In [None]:
# Bar chart of how many rows fall in each fetal_health class.
# The column is passed by keyword: seaborn's handling of a positional Series in
# countplot changed between releases, so the positional form is not reliably
# interpreted as the x variable.
sns.countplot(x='fetal_health', data=data)
plt.title('Rows per fetal_health class')
plt.show()
# unbalanced classes

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Training rows are positions 0-299 and 500-1799 of the deduplicated frame.
frame = [data.iloc[:300], data.iloc[500:1800]]
train_data = pd.concat(frame)

# Compare the training subset's shape with the full frame's.
train_data.shape, data.shape

In [None]:
# Features are every column except the last; labels come from 'fetal_health'.
# Train: the concatenated training rows.
X_train, y_train = train_data.iloc[:, :-1], train_data['fetal_health']

# Validation: positions 1800 to the end.
X_val, y_val = data.iloc[1800:, :-1], data.iloc[1800:, -1]

# Test: positions 300-499.
X_test, y_test = data.iloc[300:500, :-1], data['fetal_health'][300:500]

In [None]:
# Sanity check: the three splits together should account for every row.
sum(len(part) for part in (X_train, X_test, X_val)), len(data)

In [None]:
# SMOTE generates synthetic minority-class rows at random; a fixed random_state
# makes the resampled data (and everything trained on it) reproducible across runs.
# 'not majority' resamples every class except the largest one.
smote = SMOTE(sampling_strategy='not majority', k_neighbors=3, random_state=42).fit(X_train, y_train)

In [None]:
# Oversample the validation and training splits with the SMOTE sampler.
# NOTE(review): the validation split is resampled as well, so anything evaluated
# on val_X / val_y sees partly synthetic rows — confirm this is intended.

val_X, val_y = smote.fit_resample(X_val, y_val)

train_X, train_y = smote.fit_resample(X_train, y_train)

# Wrap the resampled training features in a DataFrame carrying the original column names.
train_X = pd.DataFrame(train_X)
train_X.columns = X_train.columns

# Wrap the resampled training labels in a one-column DataFrame named 'target'.
# (The validation outputs are left in whatever type fit_resample returned.)
train_y = pd.DataFrame(train_y)
train_y.columns = ['target']

In [None]:
# Gradient-boosted tree classifier for the three fetal_health classes.
# The objective keyword was previously misspelled ('objetive'), so the setting was
# passed along as an unknown parameter and never took effect. It is now spelled
# correctly and set to 'multi:softprob': per the XGBoost docs this is the same
# objective as 'multi:softmax' but outputs per-class probabilities, from which the
# classifier's predict() picks the most likely class, and predict_proba() stays usable.
# NOTE(review): 'feature_selector' is documented for the linear booster only, and
# newer XGBoost releases expect early_stopping_rounds / eval_metric on the
# constructor rather than on fit() — confirm against the installed version.
model = XGBClassifier(booster='gbtree', colsample_bylevel=0.6,
              colsample_bynode=0.7, colsample_bytree=1, gamma=0.3, gpu_id=-1,
              importance_type='gain',
              learning_rate=0.06, max_delta_step=0.9, max_depth=7,
              min_child_weight=0.8,
              n_estimators=300, n_jobs=4, num_class=3, num_parallel_tree=1, objective='multi:softprob',
              reg_alpha=0, reg_lambda=0.1, scale_pos_weight=None,
              subsample=0.8, tree_method='approx',
              verbosity=0, grow_policy='lossguide', feature_selector='greedy')

# Train on the oversampled training data; stop early when the multiclass error
# on the oversampled validation data has not improved for 10 rounds.
model.fit(train_X, train_y, early_stopping_rounds=10, eval_metric='merror', eval_set=[(val_X, val_y)])

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
def report(test_Y, pred_Y):
    """Print classification metrics and draw the confusion matrix as a heatmap.

    Parameters
    ----------
    test_Y : array-like
        True class labels.
    pred_Y : array-like
        Predicted class labels, in the same order as ``test_Y``.

    Returns
    -------
    None
        The classification report and accuracy are printed; the confusion
        matrix is drawn with seaborn on the current matplotlib axes.
    """
    model_cm = confusion_matrix(test_Y, pred_Y)
    print('\t\t\tReport\n')
    print(classification_report(test_Y, pred_Y))
    print('\t\t\tAccuracy\n')
    print(accuracy_score(test_Y, pred_Y))
    print('\t\t\tHeatmap\n')
    # fmt='d' prints the integer counts as-is; the default annotation format
    # switches to scientific notation for counts of 100 or more.
    ax = sns.heatmap(model_cm, annot=True, fmt='d', cmap='Blues')
    # confusion_matrix puts true classes on rows and predicted classes on columns.
    ax.set(xlabel='Predicted class', ylabel='True class')

In [None]:
# Predict classes for the held-out test features.
pred = model.predict(X_test)

In [None]:
# Metrics and confusion matrix for the held-out test split.
report(y_test, pred)

*The metric to watch is recall for class 3 ('Pathological'): it is better to flag a patient as pathological even when she is not, and let follow-up diagnoses rule the classification out, than to miss a true case.
In this way we would help prevent cases of maternal and infant mortality.*

**BONUS: REPORTS FOR THE VALIDATION AND TRAINING DATA**

In [None]:
# Validation split as originally cut from the data (no oversampling applied).

pred_val = model.predict(X_val)

report(y_val, pred_val)

In [None]:
# Oversampled validation split — the same data passed as eval_set during training.
# Note: this overwrites pred_val from the previous cell.
pred_val = model.predict(val_X)

report(val_y, pred_val)

In [None]:
# Training split as originally cut from the data (no oversampling applied).
pred_train = model.predict(X_train)

report(y_train, pred_train)

In [None]:
# Oversampled training split — the exact data the model was fitted on.
# Note: this overwrites pred_train from the previous cell.
pred_train = model.predict(train_X)

report(train_y, pred_train)