### Preprocessing & Cleaning

In [1]:
import pandas as pd

# Load the cleaned dataset
data = pd.read_excel(r"C:\Ovarian Cancer Prediction\Supplementary data 1.xlsx")

# Display dataset dimensions
print(f"Dataset shape: {data.shape[0]:,} samples × {data.shape[1]} features")

# Configure display for full column visibility
pd.set_option('display.max_columns', None)
data.head()

Dataset shape: 349 samples × 51 features


Unnamed: 0,SUBJECT_ID,AFP,AG,Age,ALB,ALP,ALT,AST,BASO#,BASO%,BUN,Ca,CA125,CA19-9,CA72-4,CEA,CL,CO2CP,CREA,TYPE,DBIL,EO#,EO%,GGT,GLO,GLU.,HCT,HE4,HGB,IBIL,K,LYM#,LYM%,MCH,MCV,Menopause,Mg,MONO#,MONO%,MPV,Na,NEU,PCT,PDW,PHOS,PLT,RBC,RDW,TBIL,TP,UA
0,1,3.58\t,19.36,47,45.4,56.0,11.0,24.0,0.01,0.3,5.35,2.48,15.36\t,36.48\t,6.42,1.4,107.4,19.9,103.0,0,2.0,0.04,1.0,16.0,28.5,4.67,0.273,,89.0,3.5,5.36,0.65,16.8,33.7,103.4,0,0.78,0.22,5.7,11.7,141.3,76.2,0.09,13.4,1.46,74,2.64,13.7,5.5,73.9,396.4
1,2,34.24\t,23.98,61,39.9,95.0,9.0,13.0,0.02,0.3,3.21,2.62,2444.00\t,19.98\t,,2.46,100.1,22.3,45.0,0,2.6,0.04,0.5,13.0,32.1,10.5,0.417,934.1,128.0,4.2,4.38,1.27,17.2,26.2,85.3,1,0.82,0.41,5.5,10.0,142.0,76.5,0.3,11.2,1.09,304,4.89,12.7,6.8,72.0,119.2
2,3,1.50\t,18.4,39,45.4,77.0,9.0,18.0,0.03,0.6,3.8,2.57,56.08\t,12.18\t,,0.77,102.6,22.2,48.0,0,4.7,0.03,0.6,10.0,32.5,4.64,0.391,47.56,131.0,10.1,4.3,1.1,23.7,28.4,84.6,0,1.0,0.25,5.4,11.4,138.9,69.7,0.13,15.2,0.97,112,4.62,12.0,14.8,77.9,209.2
3,4,2.75,16.6,45,39.2,26.0,16.0,17.0,0.05,0.74,5.27,2.35,2555,18.41,131.6,0.82,103.2,24.0,65.7,0,2.9,0.0,0.07,17.0,26.9,4.76,0.372,853.5,123.0,8.0,4.7,1.73,27.2,30.6,92.6,1,1.11,0.42,6.55,7.38,139.1,65.5,0.25,17.4,1.25,339,4.01,14.6,10.9,66.1,215.6
4,5,2.36,19.97,45,35.0,47.0,21.0,27.0,0.01,0.1,4.89,2.48,1391,11.15,,0.42,99.6,26.2,70.3,0,2.2,0.11,1.6,24.0,31.5,4.07,0.383,404.9,122.0,3.1,4.77,1.98,28.8,27.7,87.0,0,1.08,0.69,10.0,10.4,141.0,59.5,0.28,11.9,0.94,272,4.4,13.4,5.3,66.5,206.0


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 51 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   SUBJECT_ID  349 non-null    int64  
 1   AFP         327 non-null    object 
 2   AG          348 non-null    float64
 3   Age         349 non-null    int64  
 4   ALB         339 non-null    float64
 5   ALP         339 non-null    float64
 6   ALT         339 non-null    float64
 7   AST         339 non-null    float64
 8   BASO#       349 non-null    float64
 9   BASO%       349 non-null    float64
 10  BUN         349 non-null    float64
 11  Ca          349 non-null    float64
 12  CA125       332 non-null    object 
 13  CA19-9      325 non-null    object 
 14  CA72-4      109 non-null    float64
 15  CEA         327 non-null    float64
 16  CL          349 non-null    float64
 17  CO2CP       348 non-null    float64
 18  CREA        349 non-null    float64
 19  TYPE        349 non-null    i

In [3]:
# Check for missing values
has_missing = data.isnull().any().any()
print(f"Dataset contains missing values: {has_missing}")

# Summarize missing values per feature
missing_summary = data.isnull().sum()
print("\nMissing values per feature:")
print(missing_summary[missing_summary > 0])  # Show only features with missing values

Dataset contains missing values: True

Missing values per feature:
AFP        22
AG          1
ALB        10
ALP        10
ALT        10
AST        10
CA125      17
CA19-9     24
CA72-4    240
CEA        22
CO2CP       1
DBIL       10
GGT        10
GLO        10
HE4        20
IBIL       10
MPV         2
NEU        91
PCT         2
PDW         2
TBIL       10
TP         10
dtype: int64


In [4]:
# STEP 1: DATA LOADING AND PREPROCESSING

print("\n" + "="*120)
print("STEP 1: DATA LOADING AND PREPROCESSING")
print("="*120)

X = data.drop(columns=['TYPE', 'SUBJECT_ID'])
y = data['TYPE']

print(f"Dataset: {data.shape[0]} samples, {X.shape[1]} features")
print(f"Class distribution: Benign: {(y==0).sum()}, Malignant: {(y==1).sum()}")

def clean_numeric(series):
    if series.dtype == 'object':
        return pd.to_numeric(
            series.astype(str).str.replace(r'[^\d\\.\\-eE]', '', regex=True).str.strip(),
            errors='coerce'
        )
    return series

X = X.apply(clean_numeric)

# Drop CA72-4 becuase it has 69% of missing data
if 'CA72-4' in X.columns:
    X = X.drop(columns=['CA72-4'])

# Derive NEU% from complementary percentages if possible
pct_complements = ['LYM%', 'MONO%', 'EO%', 'BASO%']
if all(col in X.columns for col in pct_complements):
    neu_derived = 100 - X[pct_complements].sum(axis=1)
    mask = X['NEU'].isna() & neu_derived.between(0, 100)
    X.loc[mask, 'NEU'] = neu_derived[mask]

# Create missingness indicators for key biomarkers
for col in ['AFP', 'CA125', 'CA19-9', 'CEA', 'HE4', 'NEU']:
    if col in X.columns:
        X[f'{col}_missing'] = X[col].isna().astype(int)

print(f"Total features: {X.shape[1]}")
print("Name of all features: ")
for col in X.columns:
    print(col)



STEP 1: DATA LOADING AND PREPROCESSING
Dataset: 349 samples, 49 features
Class distribution: Benign: 171, Malignant: 178
Total features: 54
Name of all features: 
AFP
AG
Age
ALB
ALP
ALT
AST
BASO#
BASO%
BUN
Ca
CA125
CA19-9
CEA
CL
CO2CP
CREA
DBIL
EO#
EO%
GGT
GLO
GLU.
HCT
HE4
HGB
IBIL
K
LYM#
LYM%
MCH
MCV
Menopause
Mg
MONO#
MONO%
MPV
Na
NEU
PCT
PDW
PHOS
PLT
RBC
RDW
TBIL
TP
UA
AFP_missing
CA125_missing
CA19-9_missing
CEA_missing
HE4_missing
NEU_missing


In [6]:
# STEP 2: TRAIN-TEST SPLIT

from sklearn.model_selection import train_test_split

print("\n" + "="*120)
print("STEP 2: STRATIFIED TRAIN-TEST SPLIT")
print("="*120)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training: {X_train.shape[0]} samples, Test: {X_test.shape[0]} samples")


STEP 2: STRATIFIED TRAIN-TEST SPLIT
Training: 279 samples, Test: 70 samples


In [10]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler

# STEP 3: PREPROCESSING

print("\n" + "="*120)
print("STEP 3: PREPROCESSING (LOG TRANSFORM, IMPUTATION, SCALING)")
print("="*120)

tumor_markers = [col for col in ['AFP', 'CA125', 'CA19-9', 'CEA', 'HE4'] if col in X_train.columns]

for col in tumor_markers:
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])

knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
X_train[tumor_markers] = knn_imputer.fit_transform(X_train[tumor_markers])
X_test[tumor_markers] = knn_imputer.transform(X_test[tumor_markers])

clinical_features = [col for col in X_train.columns
                     if col not in tumor_markers and not col.endswith('_missing')
                     and X_train[col].dtype in ['float64', 'int64']]

median_imputer = SimpleImputer(strategy='median')
X_train[clinical_features] = median_imputer.fit_transform(X_train[clinical_features])
X_test[clinical_features] = median_imputer.transform(X_test[clinical_features])

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

print(f"Preprocessing complete")

# Save processed dataset
processed_data = pd.concat([X_train_scaled, y_train], axis=1)
processed_data.to_csv(output_path, index=False)

print(f"Data cleaning and imputation complete. Saved to '{output_path}'.")


STEP 3: PREPROCESSING (LOG TRANSFORM, IMPUTATION, SCALING)
Preprocessing complete
Data cleaning and imputation complete. Saved to 'cleaned_dataset.csv'.


In [12]:
initial_missing = data.isna().sum()
remaining_missing = processed_data.isna().sum()

valid_columns = initial_missing.index.intersection(remaining_missing.index)

comparison = pd.DataFrame({
    "Initial Missing": initial_missing[valid_columns],
    "Remaining Missing": remaining_missing[valid_columns]
})

comparison = comparison[comparison["Initial Missing"] > 0]

print("\nMissing Values Comparison (Before vs After):\n", comparison)



Missing Values Comparison (Before vs After):
         Initial Missing  Remaining Missing
AFP                  22                  0
AG                    1                  0
ALB                  10                  0
ALP                  10                  0
ALT                  10                  0
AST                  10                  0
CA125                17                  0
CA19-9               24                  0
CEA                  22                  0
CO2CP                 1                  0
DBIL                 10                  0
GGT                  10                  0
GLO                  10                  0
HE4                  20                  0
IBIL                 10                  0
MPV                   2                  0
NEU                  91                  0
PCT                   2                  0
PDW                   2                  0
TBIL                 10                  0
TP                   10                  0


In [None]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

# Define numeric columns (exclude identifiers/categoricals)
num_cols = [col for col in data.columns if col not in ['TYPE', 'Menopause']]

def clean_numeric(series):
    """Clean non-numeric characters and coerce to numeric."""
    return pd.to_numeric(
        series.astype(str).str.replace(r'[^\d\.\-eE]', '', regex=True).str.strip(),
        errors='coerce'
    )

# Clean numeric columns
data[num_cols] = data[num_cols].apply(clean_numeric)

# Create missingness indicators for key biomarkers
missing_flags = ['AFP', 'CA125', 'CA19-9', 'CA72-4', 'CEA', 'HE4', 'NEU']
for col in missing_flags:
    if col in data.columns:
        data[f'{col}_missing'] = data[col].isna().astype(int)

# Derive NEU% from complementary percentages if possible
pct_complements = ['LYM%', 'MONO%', 'EO%', 'BASO%']
if all(col in data.columns for col in pct_complements):
    data['NEU_derived'] = 100 - data[pct_complements].sum(axis=1)
    mask = data['NEU'].isna() & data['NEU_derived'].between(0, 100)
    data.loc[mask, 'NEU'] = data.loc[mask, 'NEU_derived']
    data = data.drop(columns='NEU_derived')  # Non-inplace for safety

# Median imputation for low-missing-rate features
low_missing_cols = ['AG', 'ALB', 'ALP', 'ALT', 'AST', 'DBIL', 'GGT', 'GLO', 
                    'IBIL', 'TBIL', 'TP', 'MPV', 'PCT', 'PDW', 'CO2CP']
for col in low_missing_cols:
    if col in data.columns:
        data[col] = data[col].fillna(data[col].median())

# Iterative imputation for moderate-missing tumor markers
tumor_markers = ['AFP', 'CA125', 'CA19-9', 'CEA', 'HE4']
if any(col in data.columns for col in tumor_markers):
    available_markers = [col for col in tumor_markers if col in data.columns]
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=100, random_state=42),
        max_iter=10, random_state=42
    )
    sub_df = data[available_markers].copy()
    data[available_markers] = imputer.fit_transform(sub_df)

# Median imputation for high-missing CA72-4
if 'CA72-4' in data.columns:
    data['CA72-4'] = data['CA72-4'].fillna(data['CA72-4'].median())

# Iterative imputation for remaining NEU values
if data['NEU'].isna().sum() > 0 and 'NEU' in data.columns:
    ref_cols = ['LYM%', 'MONO%', 'EO%', 'BASO%', 'HGB', 'HCT', 'PLT']
    available_refs = [col for col in ref_cols if col in data.columns]
    temp_df = data[available_refs + ['NEU']].copy()
    imputer_neu = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=50, random_state=42),
        random_state=42
    )
    neu_imputed = imputer_neu.fit_transform(temp_df)[:, -1]  # Last column is NEU
    data['NEU'] = neu_imputed

# Log-transform skewed tumor markers (using log1p for stability)
log_cols = ['AFP', 'CA125', 'CA19-9', 'CEA', 'HE4', 'CA72-4']
for col in log_cols:
    if col in data.columns:
        data[f'{col}_log'] = np.log1p(data[col])

# Clip percentage columns to valid range [0, 100]
pct_cols = [col for col in data.columns if '%' in col]
data[pct_cols] = data[pct_cols].clip(lower=0, upper=100)

# Save processed dataset
output_path = "cleaned_dataset.csv"
data.to_csv(output_path, index=False)
print(f"✅ Data cleaning and imputation complete. Saved to '{output_path}'.")
print("Remaining missing values:\n", data.isna().sum()[data.isna().sum() > 0])

Data cleaning and imputation complete. Saved to 'ovarian_cleaned_dataset.csv'.
Remaining missing values:
 Series([], dtype: int64)
