In [None]:
import pandas as pd
import numpy as np

# Location of the extracted risk-factor CSV (relative to the working directory)
csv_path = "risk_factors_cervical_cancer.csv"

# Load the raw table, report its (rows, columns) shape, and preview the top rows
df = pd.read_csv(filepath_or_buffer=csv_path)
print(df.shape)
df.head(5)


(858, 36)


Unnamed: 0,Age,Number of sexual partners,First sexual intercourse,Num of pregnancies,Smokes,Smokes (years),Smokes (packs/year),Hormonal Contraceptives,Hormonal Contraceptives (years),IUD,...,STDs: Time since first diagnosis,STDs: Time since last diagnosis,Dx:Cancer,Dx:CIN,Dx:HPV,Dx,Hinselmann,Schiller,Citology,Biopsy
0,18,4.0,15.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
1,15,1.0,14.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
2,34,1.0,?,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,?,?,0,0,0,0,0,0,0,0
3,52,5.0,16.0,4.0,1.0,37.0,37.0,1.0,3.0,0.0,...,?,?,1,0,1,0,0,0,0,0
4,46,3.0,21.0,4.0,0.0,0.0,0.0,1.0,15.0,0.0,...,?,?,0,0,0,0,0,0,0,0


In [None]:
# Normalise the header names to snake_case: trim surrounding whitespace and
# lowercase first, then collapse every run of characters outside [a-z0-9]
# into a single underscore and drop underscores left at either end.
df.columns = df.columns.str.strip().str.lower()
df.columns = df.columns.str.replace(r"[^a-z0-9]+", "_", regex=True).str.strip("_")
list(df.columns)


['age',
 'number_of_sexual_partners',
 'first_sexual_intercourse',
 'num_of_pregnancies',
 'smokes',
 'smokes_years',
 'smokes_packs_year',
 'hormonal_contraceptives',
 'hormonal_contraceptives_years',
 'iud',
 'iud_years',
 'stds',
 'stds_number',
 'stds_condylomatosis',
 'stds_cervical_condylomatosis',
 'stds_vaginal_condylomatosis',
 'stds_vulvo_perineal_condylomatosis',
 'stds_syphilis',
 'stds_pelvic_inflammatory_disease',
 'stds_genital_herpes',
 'stds_molluscum_contagiosum',
 'stds_aids',
 'stds_hiv',
 'stds_hepatitis_b',
 'stds_hpv',
 'stds_number_of_diagnosis',
 'stds_time_since_first_diagnosis',
 'stds_time_since_last_diagnosis',
 'dx_cancer',
 'dx_cin',
 'dx_hpv',
 'dx',
 'hinselmann',
 'schiller',
 'citology',
 'biopsy']

In [None]:
# Treat the literal "?" placeholder as missing, then coerce every column to a
# numeric dtype. Because of errors="coerce", any other value that cannot be
# parsed as a number is turned into NaN as well.
df = df.replace("?", np.nan).apply(pd.to_numeric, errors="coerce")

# Quick check: missing-value count per column, largest first (top 10 shown)
missing_per_col = df.isna().sum().sort_values(ascending=False)
missing_per_col.head(10)


Unnamed: 0,0
stds_time_since_first_diagnosis,787
stds_time_since_last_diagnosis,787
iud,117
iud_years,117
hormonal_contraceptives,108
hormonal_contraceptives_years,108
stds_hpv,105
stds_aids,105
stds_hepatitis_b,105
stds_hiv,105


In [None]:
# Discard columns in which more than 70% of the values are missing
thresh = 0.30  # minimum fraction of non-missing values a column needs to be kept
keep_cols = df.columns[df.notna().mean().ge(thresh)]
df = df.loc[:, keep_cols].copy()

# Remove rows that are entirely empty, then exact duplicate rows, and finally
# renumber the index from zero
df = df.dropna(how="all")
df = df.drop_duplicates()
df = df.reset_index(drop=True)

# Resulting shape and the total number of missing cells that remain
(df.shape, df.isna().sum().sum())


((835, 34), np.int64(1957))

In [None]:
# Impute missing values column by column.
num_cols = df.columns  # every column is expected to be numeric at this point
medians = df[num_cols].median()

# A median is a reasonable fill value for counts and durations, but for a 0/1
# indicator column an even split of observed zeros and ones gives a median of
# 0.5, which would inject a non-binary value into the indicator. For those
# columns use the most frequent observed value instead. Whenever there is no
# exact tie this equals the median, so results are otherwise unchanged.
fill_values = medians.copy()
for col in num_cols:
    observed = df[col].dropna()
    if not observed.empty and observed.isin([0, 1]).all():
        # Series.mode() returns the modes in sorted order, so a tie resolves to 0
        fill_values[col] = observed.mode().iloc[0]

# fillna with a Series fills each column with the value stored under its label
df[num_cols] = df[num_cols].fillna(fill_values)

# Sanity check: total number of missing cells left after imputation
df.isna().sum().sum()


np.int64(0)

In [None]:
# --- Count rows that exactly repeat an earlier row ---
num_duplicates = df.duplicated(keep="first").sum()
print(f" Number of duplicate rows: {num_duplicates}")

# --- Keep the first occurrence of each row and renumber the index from zero ---
df = df.drop_duplicates(keep="first", ignore_index=True)

# --- Identify binary columns (only 0 and 1 values) ---
def is_binary_series(s: pd.Series) -> bool:
    """Return True when ``s`` has observed values and all of them are 0 or 1.

    Missing values are ignored. A series with no observed values at all
    (empty, or entirely missing) is reported as not binary: the empty set is
    vacuously a subset of {0, 1}, but such a column carries no 0/1 data and
    cannot be cast to an integer dtype.

    Parameters
    ----------
    s : pd.Series
        The column to inspect.

    Returns
    -------
    bool
        True if at least one non-missing value exists and every distinct
        non-missing value equals 0 or 1; False otherwise.
    """
    observed = set(pd.unique(s.dropna()))
    # bool(observed) guards against the vacuous-subset case described above
    return bool(observed) and observed <= {0, 1}

# Split the column labels into 0/1 indicator columns and everything else,
# preserving the original column order within each group
binary_cols = list(filter(lambda name: is_binary_series(df[name]), df.columns))
cont_cols = [name for name in df.columns if name not in binary_cols]

# --- Store the indicator columns as integers ---
df[binary_cols] = df[binary_cols].astype(int)

print(f" Duplicates removed. Final shape: {df.shape}")
print(f"Binary columns: {len(binary_cols)} | Continuous columns: {len(cont_cols)}")


 Number of duplicate rows: 0
 Duplicates removed. Final shape: (830, 34)
Binary columns: 24 | Continuous columns: 10


In [None]:
# High-level overview: dimensions and column labels
print("Dataset shape:", df.shape)
print("\nColumn names:\n", list(df.columns))

# Dtypes and non-null counts (df.info() prints its report directly)
print("\nData types and non-null counts:")
df.info()

# Summary statistics, transposed so that each row describes one column
print("\nDescriptive statistics:")
df.describe().transpose()


Dataset shape: (830, 34)

Column names:
 ['age', 'number_of_sexual_partners', 'first_sexual_intercourse', 'num_of_pregnancies', 'smokes', 'smokes_years', 'smokes_packs_year', 'hormonal_contraceptives', 'hormonal_contraceptives_years', 'iud', 'iud_years', 'stds', 'stds_number', 'stds_condylomatosis', 'stds_cervical_condylomatosis', 'stds_vaginal_condylomatosis', 'stds_vulvo_perineal_condylomatosis', 'stds_syphilis', 'stds_pelvic_inflammatory_disease', 'stds_genital_herpes', 'stds_molluscum_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hepatitis_b', 'stds_hpv', 'stds_number_of_diagnosis', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx', 'hinselmann', 'schiller', 'citology', 'biopsy']

Data types and non-null counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 830 entries, 0 to 829
Data columns (total 34 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   age                                 830 non-nul

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,830.0,27.06988,8.485135,13.0,21.0,26.0,32.0,84.0
number_of_sexual_partners,830.0,2.539759,1.657744,1.0,2.0,2.0,3.0,28.0
first_sexual_intercourse,830.0,17.025301,2.8097,10.0,15.0,17.0,18.0,32.0
num_of_pregnancies,830.0,2.287952,1.410874,0.0,1.0,2.0,3.0,11.0
smokes,830.0,0.148193,0.355505,0.0,0.0,0.0,0.0,1.0
smokes_years,830.0,1.241765,4.122522,0.0,0.0,0.0,0.0,37.0
smokes_packs_year,830.0,0.461333,2.245822,0.0,0.0,0.0,0.0,37.0
hormonal_contraceptives,830.0,0.693976,0.461118,0.0,0.0,1.0,1.0,1.0
hormonal_contraceptives_years,830.0,2.090644,3.609802,0.0,0.0,0.5,3.0,30.0
iud,830.0,0.1,0.300181,0.0,0.0,0.0,0.0,1.0
