Machine Learning for Fetal Health Classification

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

In [48]:
orig_fetal = pd.read_excel("C:/Users/reems/Downloads/fetal_health_excel.xlsx")
classified_fetal = pd.read_excel("C:/Users/reems/Downloads/fetal_classified2.xlsx")
fetal_normal = pd.read_excel("C:/Users/reems/Downloads/fetal_health_normal.xlsx")
fetal_suspect = pd.read_excel("C:/Users/reems/Downloads/fetal_health_suspect.xlsx")
fetal_pathological = pd.read_excel("C:/Users/reems/Downloads/fetal_health_pathological.xlsx")
summary_stats = pd.read_excel("C:/Users/reems/Downloads/fetal_summary_stats.xlsx")

In [49]:
X = classified_fetal.drop(columns=['health_status'])  # Features
y = classified_fetal['health_status']               # Target variable

# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    results[name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Display results
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                     Accuracy  Precision    Recall  F1 Score
Logistic Regression  0.992958   0.992938  0.992958  0.992938
Decision Tree        1.000000   1.000000  1.000000  1.000000
Random Forest        1.000000   1.000000  1.000000  1.000000


In [50]:
import joblib

# Save each trained model
for name, model in models.items():
    # Define file name for the model
    file_name = f"{name.lower().replace(' ', '_')}_model.pkl"
    # Save the model to a file
    joblib.dump(model, file_name)
    print(f"Saved {name} model as {file_name}")


Saved Logistic Regression model as logistic_regression_model.pkl
Saved Decision Tree model as decision_tree_model.pkl
Saved Random Forest model as random_forest_model.pkl


In [51]:
from sklearn.metrics import classification_report

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    # Classification report
    report = classification_report(y_test, y_pred, target_names=['Normal', 'Suspect', 'Pathological'], output_dict=True)
    results[name] = {
        'Precision (Normal)': report['Normal']['precision'],
        'Recall (Normal)': report['Normal']['recall'],
        'F1 Score (Normal)': report['Normal']['f1-score'],
        'Precision (Suspect)': report['Suspect']['precision'],
        'Recall (Suspect)': report['Suspect']['recall'],
        'F1 Score (Suspect)': report['Suspect']['f1-score'],
        'Precision (Pathological)': report['Pathological']['precision'],
        'Recall (Pathological)': report['Pathological']['recall'],
        'F1 Score (Pathological)': report['Pathological']['f1-score'],
        'Overall Accuracy': report['accuracy']
    }

# Display results
results_df = pd.DataFrame.from_dict(results, orient='index')
(results_df.head())


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Precision (Normal),Recall (Normal),F1 Score (Normal),Precision (Suspect),Recall (Suspect),F1 Score (Suspect),Precision (Pathological),Recall (Pathological),F1 Score (Pathological),Overall Accuracy
Logistic Regression,0.994286,0.997135,0.995708,1.0,1.0,1.0,0.985714,0.971831,0.978723,0.992958
Decision Tree,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Random Forest,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [67]:
# Define the number of synthetic samples to generate
num_samples = 1000

# Generate random samples for each feature
baseline_value = np.random.randint(low=120, high=160, size=num_samples)
accelerations = np.random.uniform(low=0, high=1, size=num_samples)
fetal_movement = np.random.uniform(low=0, high=0.5, size=num_samples)
uterine_contractions = np.random.uniform(low=0, high=1, size=num_samples)
light_decelerations = np.random.uniform(low=0, high=0.1, size=num_samples)
severe_decelerations = np.random.uniform(low=0, high=0.05, size=num_samples)
prolongued_decelerations = np.random.uniform(low=0, high=0.05, size=num_samples)
abnormal_short_term_variability = np.random.randint(low=0, high=100, size=num_samples)
mean_value_of_short_term_variability = np.random.uniform(low=0, high=5, size=num_samples)
percentage_of_time_with_abnormal_long_term_variability = np.random.uniform(low=0, high=100, size=num_samples)
mean_value_of_long_term_variability = np.random.uniform(low=0, high=10, size=num_samples)
histogram_width = np.random.randint(low=0, high=100, size=num_samples)
histogram_min = np.random.randint(low=50, high=200, size=num_samples)
histogram_max = np.random.randint(low=100, high=300, size=num_samples)
histogram_number_of_peaks = np.random.randint(low=0, high=20, size=num_samples)
histogram_number_of_zeroes = np.random.randint(low=0, high=10, size=num_samples)
histogram_mode = np.random.randint(low=50, high=200, size=num_samples)
histogram_mean = np.random.randint(low=100, high=250, size=num_samples)
histogram_median = np.random.randint(low=50, high=200, size=num_samples)
histogram_variance = np.random.randint(low=0, high=100, size=num_samples)
histogram_tendency = np.random.randint(low=-1, high=2, size=num_samples)
fetal_health = np.random.randint(low=0, high=2, size=num_samples)

# Generate synthetic health status
#health_status = np.random.choice(['Normal', 'Suspect', 'Pathological'], size=num_samples)

# Combine features into a DataFrame
synthetic_data = pd.DataFrame({
    'health_status': health_status,
    'baseline value': baseline_value,
    'accelerations': accelerations,
    'fetal_movement': fetal_movement,
    'uterine_contractions': uterine_contractions,
    'light_decelerations': light_decelerations,
    'severe_decelerations': severe_decelerations,
    'prolongued_decelerations': prolongued_decelerations,
    'abnormal_short_term_variability': abnormal_short_term_variability,
    'mean_value_of_short_term_variability': mean_value_of_short_term_variability,
    'percentage_of_time_with_abnormal_long_term_variability': percentage_of_time_with_abnormal_long_term_variability,
    'mean_value_of_long_term_variability': mean_value_of_long_term_variability,
    'histogram_width': histogram_width,
    'histogram_min': histogram_min,
    'histogram_max': histogram_max,
    'histogram_number_of_peaks': histogram_number_of_peaks,
    'histogram_number_of_zeroes': histogram_number_of_zeroes,
    'histogram_mode': histogram_mode,
    'histogram_mean': histogram_mean,
    'histogram_median': histogram_median,
    'histogram_variance': histogram_variance,
    'histogram_tendency': histogram_tendency,
    'fetal_health': fetal_health
})

# Print synthetic_data without the first column
(synthetic_data.iloc[:, 1:])


Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,142,0.078301,0.153330,0.809639,0.087914,0.043432,0.035466,8,0.833488,43.144031,...,152,265,3,5,135,218,190,14,-1,1
1,148,0.543085,0.366937,0.595929,0.086812,0.037450,0.026321,21,0.613859,42.178901,...,63,103,9,5,122,207,60,14,1,1
2,150,0.148892,0.235325,0.211427,0.064587,0.035278,0.032732,8,1.249377,33.060715,...,103,220,1,1,179,189,119,13,1,0
3,156,0.147746,0.421176,0.515718,0.006439,0.023028,0.003833,10,0.443198,10.067395,...,112,103,9,1,112,246,180,38,1,1
4,155,0.107925,0.244960,0.787958,0.016048,0.019259,0.041833,88,2.970772,18.283313,...,75,217,6,4,142,138,145,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,147,0.766156,0.054539,0.807327,0.012946,0.018078,0.037809,82,1.214075,2.926814,...,81,141,19,3,80,143,187,42,-1,1
996,154,0.039432,0.132625,0.031030,0.030513,0.031053,0.013258,80,2.137066,5.470829,...,184,237,9,9,57,189,130,46,0,0
997,125,0.474801,0.270993,0.266775,0.070187,0.031403,0.005733,71,4.332108,3.449766,...,157,116,12,5,190,165,149,90,-1,1
998,149,0.663162,0.464275,0.258587,0.098287,0.045165,0.043166,32,3.767187,57.124796,...,140,201,4,8,70,165,81,69,-1,1


In [75]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Drop the 'health_status' column from the feature set
X_train = classified_fetal.drop(columns=['health_status'])
y_train = classified_fetal['health_status']

# Train a classification model
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Now suppose you have new data without the 'health_status' column
# Drop the 'health_status' column from the synthetic data
new_data = synthetic_data.drop(columns=['health_status'])

# Use the trained model to predict the 'health_status' for the new data
predicted_health_status = classifier.predict(new_data)


In [76]:
# These are the health_status predictions that the random forest classifier makes 
# Replace the values in the first column with the new health_status predictions
synthetic_data.iloc[:, 0] = predicted_health_status

# Print the modified synthetic_data DataFrame
(synthetic_data)


Unnamed: 0,health_status,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,Normal,142,0.078301,0.153330,0.809639,0.087914,0.043432,0.035466,8,0.833488,...,152,265,3,5,135,218,190,14,-1,1
1,Suspect,148,0.543085,0.366937,0.595929,0.086812,0.037450,0.026321,21,0.613859,...,63,103,9,5,122,207,60,14,1,1
2,Suspect,150,0.148892,0.235325,0.211427,0.064587,0.035278,0.032732,8,1.249377,...,103,220,1,1,179,189,119,13,1,0
3,Suspect,156,0.147746,0.421176,0.515718,0.006439,0.023028,0.003833,10,0.443198,...,112,103,9,1,112,246,180,38,1,1
4,Suspect,155,0.107925,0.244960,0.787958,0.016048,0.019259,0.041833,88,2.970772,...,75,217,6,4,142,138,145,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Suspect,147,0.766156,0.054539,0.807327,0.012946,0.018078,0.037809,82,1.214075,...,81,141,19,3,80,143,187,42,-1,1
996,Suspect,154,0.039432,0.132625,0.031030,0.030513,0.031053,0.013258,80,2.137066,...,184,237,9,9,57,189,130,46,0,0
997,Normal,125,0.474801,0.270993,0.266775,0.070187,0.031403,0.005733,71,4.332108,...,157,116,12,5,190,165,149,90,-1,1
998,Suspect,149,0.663162,0.464275,0.258587,0.098287,0.045165,0.043166,32,3.767187,...,140,201,4,8,70,165,81,69,-1,1


In [78]:
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Now suppose you have new data without the 'health_status' column
# Drop the 'health_status' column from the synthetic data
new_data = synthetic_data.drop(columns=['health_status'])

# Use the trained model to predict the 'health_status' for the new data
predicted_health_status = classifier.predict(new_data)


In [79]:
#these are the health status predictions that the decision tree classifier makes
# This is the health_status predictions that the random forest classifier makes 
# Replace the values in the first column with the new health_status predictions
synthetic_data.iloc[:, 0] = predicted_health_status

# Print the modified synthetic_data DataFrame
(synthetic_data)


Unnamed: 0,health_status,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,Normal,142,0.078301,0.153330,0.809639,0.087914,0.043432,0.035466,8,0.833488,...,152,265,3,5,135,218,190,14,-1,1
1,Suspect,148,0.543085,0.366937,0.595929,0.086812,0.037450,0.026321,21,0.613859,...,63,103,9,5,122,207,60,14,1,1
2,Suspect,150,0.148892,0.235325,0.211427,0.064587,0.035278,0.032732,8,1.249377,...,103,220,1,1,179,189,119,13,1,0
3,Suspect,156,0.147746,0.421176,0.515718,0.006439,0.023028,0.003833,10,0.443198,...,112,103,9,1,112,246,180,38,1,1
4,Suspect,155,0.107925,0.244960,0.787958,0.016048,0.019259,0.041833,88,2.970772,...,75,217,6,4,142,138,145,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Suspect,147,0.766156,0.054539,0.807327,0.012946,0.018078,0.037809,82,1.214075,...,81,141,19,3,80,143,187,42,-1,1
996,Suspect,154,0.039432,0.132625,0.031030,0.030513,0.031053,0.013258,80,2.137066,...,184,237,9,9,57,189,130,46,0,0
997,Normal,125,0.474801,0.270993,0.266775,0.070187,0.031403,0.005733,71,4.332108,...,157,116,12,5,190,165,149,90,-1,1
998,Suspect,149,0.663162,0.464275,0.258587,0.098287,0.045165,0.043166,32,3.767187,...,140,201,4,8,70,165,81,69,-1,1


In [80]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Now suppose you have new data without the 'health_status' column
# Drop the 'health_status' column from the synthetic data
new_data = synthetic_data.drop(columns=['health_status'])

# Use the trained model to predict the 'health_status' for the new data
predicted_health_status = classifier.predict(new_data)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [81]:
#these are the health status predictions that the logistic regression classifier makes

synthetic_data.iloc[:, 0] = predicted_health_status

# Print the modified synthetic_data DataFrame
(synthetic_data)


Unnamed: 0,health_status,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,Suspect,142,0.078301,0.153330,0.809639,0.087914,0.043432,0.035466,8,0.833488,...,152,265,3,5,135,218,190,14,-1,1
1,Suspect,148,0.543085,0.366937,0.595929,0.086812,0.037450,0.026321,21,0.613859,...,63,103,9,5,122,207,60,14,1,1
2,Normal,150,0.148892,0.235325,0.211427,0.064587,0.035278,0.032732,8,1.249377,...,103,220,1,1,179,189,119,13,1,0
3,Suspect,156,0.147746,0.421176,0.515718,0.006439,0.023028,0.003833,10,0.443198,...,112,103,9,1,112,246,180,38,1,1
4,Normal,155,0.107925,0.244960,0.787958,0.016048,0.019259,0.041833,88,2.970772,...,75,217,6,4,142,138,145,1,-1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Normal,147,0.766156,0.054539,0.807327,0.012946,0.018078,0.037809,82,1.214075,...,81,141,19,3,80,143,187,42,-1,1
996,Pathological,154,0.039432,0.132625,0.031030,0.030513,0.031053,0.013258,80,2.137066,...,184,237,9,9,57,189,130,46,0,0
997,Normal,125,0.474801,0.270993,0.266775,0.070187,0.031403,0.005733,71,4.332108,...,157,116,12,5,190,165,149,90,-1,1
998,Pathological,149,0.663162,0.464275,0.258587,0.098287,0.045165,0.043166,32,3.767187,...,140,201,4,8,70,165,81,69,-1,1
