In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

### Data Preprocessing

In [2]:
df = pd.read_csv('../datasets/fetal_health.csv')
df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [3]:
## Categorical columns... 
categorical_columns = [columns for columns in df.columns if len(df[columns].unique()) <= 10]
categorical_columns

['severe_decelerations',
 'prolongued_decelerations',
 'histogram_number_of_zeroes',
 'histogram_tendency',
 'fetal_health']

In [4]:
for feature in categorical_columns:
    print(df[feature].value_counts())

severe_decelerations
0.000    2119
0.001       7
Name: count, dtype: int64
prolongued_decelerations
0.000    1948
0.002      72
0.001      70
0.003      24
0.004       9
0.005       3
Name: count, dtype: int64
histogram_number_of_zeroes
0.0     1624
1.0      366
2.0      108
3.0       21
4.0        2
5.0        2
10.0       1
8.0        1
7.0        1
Name: count, dtype: int64
histogram_tendency
 0.0    1115
 1.0     846
-1.0     165
Name: count, dtype: int64
fetal_health
1.0    1655
2.0     295
3.0     176
Name: count, dtype: int64


In [5]:
numerical_columns = [columns for columns in df.columns if columns not in categorical_columns]
numerical_columns

['baseline value',
 'accelerations',
 'fetal_movement',
 'uterine_contractions',
 'light_decelerations',
 'abnormal_short_term_variability',
 'mean_value_of_short_term_variability',
 'percentage_of_time_with_abnormal_long_term_variability',
 'mean_value_of_long_term_variability',
 'histogram_width',
 'histogram_min',
 'histogram_max',
 'histogram_number_of_peaks',
 'histogram_mode',
 'histogram_mean',
 'histogram_median',
 'histogram_variance']

In [27]:
X = df.drop(columns=['fetal_health'])
y = df['fetal_health']

In [28]:
y.unique()

array([2., 1., 3.])

In [29]:
y = y.astype(int) - 1

In [44]:
y.unique()

array([1, 0, 2])

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((1488, 21), (638, 21))

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("StandardScaler", numeric_transformer, numerical_columns)
    ],
    remainder='passthrough'
)

In [32]:
preprocessor

0,1,2
,transformers,"[('StandardScaler', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [33]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [19]:
import pickle
import os

# Relative path from notebooks/ to models/
save_path = '../models/preprocessor.pkl'

# Ensure directory exists (optional)
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save preprocessor
with open(save_path, 'wb') as f:
    pickle.dump(preprocessor, f)

### Model Building for Prediction

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

In [35]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [48]:
weights = {
    0: 1.0,
    1: 1655 / 295,
    2: 1655 / 176
}

models = {
    'XbBoost' : XGBClassifier(scale_pos_weight=weights, n_estimators=1000),
    "Adaboost":AdaBoostClassifier(),
    "Random Forest":RandomForestClassifier(class_weight=weights, n_estimators=100),
    "Decision Tree":DecisionTreeClassifier(),
    "Logistic":LogisticRegression(class_weight=weights, max_iter=1000),
}

trained_models = {}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    ## Training set performance
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    model_train_f1 = f1_score(y_train,y_train_pred,average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(y_train, y_train_pred, average='weighted')

    ## Testing set performance
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test,y_test_pred,average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted')


    print(list(models.keys())[i])

    print("Model Performance for Training set")
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print("- F1 score: {:.4f}".format(model_train_f1))
    print("- Precision: {:.4f}".format(model_train_precision))
    print("- Recall: {:.4f}".format(model_train_recall))


    print('------------------------------------------------------------------')

    print("Model Performance for Test set")
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- F1 score: {:.4f}".format(model_test_f1))
    print("- Precision: {:.4f}".format(model_test_precision))
    print("- Recall: {:.4f}".format(model_test_recall))

    trained_models[list(models.keys())[i]] = model
    
    print()

Parameters: { "scale_pos_weight" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XbBoost
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.9467
- F1 score: 0.9463
- Precision: 0.9463
- Recall: 0.9467

Adaboost
Model Performance for Training set
- Accuracy: 0.9140
- F1 score: 0.9144
- Precision: 0.9155
- Recall: 0.9140
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.8997
- F1 score: 0.9007
- Precision: 0.9022
- Recall: 0.8997

Random Forest
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.9389
- F1 score: 0.9378
- Precision: 0.9374
- Recall: 0.9389

Decision Tree
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 

XbBoost
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.9467
- F1 score: 0.9463
- Precision: 0.9463
- Recall: 0.9467

Adaboost
Model Performance for Training set
- Accuracy: 0.9140
- F1 score: 0.9144
- Precision: 0.9155
- Recall: 0.9140
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.8997
- F1 score: 0.9007
- Precision: 0.9022
- Recall: 0.8997

Random Forest
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.9389
- F1 score: 0.9378
- Precision: 0.9374
- Recall: 0.9389

Decision Tree
Model Performance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.9326
- F1 score: 0.9322
- Precision: 0.9321
- Recall: 0.9326

Logistic
Model Performance for Training set
- Accuracy: 0.8737
- F1 score: 0.8838
- Precision: 0.9094
- Recall: 0.8737
------------------------------------------------------------------
Model Performance for Test set
- Accuracy: 0.8213
- F1 score: 0.8378
- Precision: 0.8767
- Recall: 0.8213

In [52]:
model_dir = '../models' 
os.makedirs(model_dir, exist_ok=True)

# Save XGBClassifier
with open(os.path.join(model_dir, 'xgboost_model.pkl'), 'wb') as f:
    pickle.dump(trained_models['XbBoost'], f)

# Save RandomForestClassifier
with open(os.path.join(model_dir, 'random_forest_model.pkl'), 'wb') as f:
    pickle.dump(trained_models['Random Forest'], f)

print("Models saved as pickle files successfully!")

Models saved as pickle files successfully!
