In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('/Users/punyashrees/Documents/projects/OvaGuide/data/Cleaned-Data.csv')
df.head(5)

Unnamed: 0,Age,Weight_kg,Height_ft,Marital_Status,PCOS,Family_History_PCOS,Menstrual_Irregularity,Hormonal_Imbalance,Hyperandrogenism,Hirsutism,...,Diet_Multivitamin,Vegetarian,Exercise_Frequency,Exercise_Type,Exercise_Duration,Sleep_Hours,Stress_Level,Smoking,Exercise_Benefit,PCOS_Medication
0,20-25,66.0,157.48,Unmarried,No,No,Yes,No,No,No,...,0,No,Rarely,"Cardio (e.g., running, cycling, swimming)",30 minutes,Less than 6 hours,No,No,Somewhat,No.
1,Less than 20,56.0,165.1,Unmarried,No,No,No,No,No,No,...,0,No,Daily,No Exercise,Less than 30 minutes,6-8 hours,No,No,Somewhat,No.
2,Less than 20,89.0,167.64,Unmarried,No,Yes,No,No,No,Yes,...,0,No,Rarely,"Cardio (e.g., running, cycling, swimming)",Less than 30 minutes,6-8 hours,Yes,No,Somewhat,No.
3,20-25,55.0,160.02,Unmarried,No,Yes,No,Yes,No,Yes,...,1,No,Never,No Exercise,Not Applicable,6-8 hours,Yes,No,Somewhat,No.
4,Less than 20,55.0,160.02,Unmarried,No,No,No,No,No,No,...,0,No,Daily,"Cardio (e.g., running, cycling, swimming)",30 minutes to 1 hour,6-8 hours,Yes,No,Not at All,No.


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

In [5]:
df["Age"].unique()

array(['20-25', 'Less than 20', '45 and above', '30-35', '35-44', '25-30'],
      dtype=object)

In [6]:
age_map = {'20-25': '20-25', 'Less than 20': '12-19', '45 and above': '45-100', '30-35': '31-35', '35-44': '36-44', '25-30': '26-30'}
df['Age'] = df['Age'].map(age_map)

In [7]:
data = df.copy()
data["Sleep_Hours"].unique()

array(['Less than 6 hours', '6-8 hours', '9-12 hours',
       'More than 12 hours'], dtype=object)

In [8]:
#mapping sleep hours to adjacent numeric labels
sleep_map = {"Less than 6 hours":5.5, "6-8 hours":7.0, "9-12 hours":11.5, "More than 12 hours": 12}
if "Sleep_Hours" in data.columns:
    data["Sleep_Hours_num"] = data["Sleep_Hours"].map(sleep_map)

# Show mapping between Sleep_Hours and Sleep_Hours_num
data.groupby("Sleep_Hours")["Sleep_Hours_num"].unique()

Sleep_Hours
6-8 hours              [7.0]
9-12 hours            [11.5]
Less than 6 hours      [5.5]
More than 12 hours    [12.0]
Name: Sleep_Hours_num, dtype: object

In [9]:
data["Exercise_Duration"].unique()

array(['30 minutes', 'Less than 30 minutes', 'Not Applicable',
       '30 minutes to 1 hour', 'More than 30 minutes'], dtype=object)

In [10]:
#mapping exercise duration to minutes
exercise_map = {
    "Less than 30 minutes": 15, "30 minutes": 30, "30 minutes to 1 hour": 55,
    "Not Applicable": 0, "More than 30 minutes": 45
}
if "Exercise_Duration" in data.columns:
    data["Exercise_Minutes"] = data["Exercise_Duration"].map(exercise_map)

data.groupby("Exercise_Duration")["Exercise_Minutes"].unique()


Exercise_Duration
30 minutes              [30]
30 minutes to 1 hour    [55]
Less than 30 minutes    [15]
More than 30 minutes    [45]
Not Applicable           [0]
Name: Exercise_Minutes, dtype: object

In [11]:
#ensuring diet columns are numeric
diet_cols = [c for c in data.columns if c.startswith("Diet_")]
for c in diet_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce")

In [12]:
#detecting yes/no and mapping them to binary
def is_yes_no(series):
    vals = set([str(x).strip().lower().rstrip('.') for x in series.dropna().unique()])
    return vals.issubset({"yes","no","y","n","true","false"})

binary_cols = []
for col in data.select_dtypes(include=["object"]).columns:
    if is_yes_no(data[col]):
        binary_cols.append(col)
for c in binary_cols:
    data[c] = data[c].astype(str).str.strip().str.lower().str.rstrip('.').map({"yes":1,"no":0,"y":1,"n":0,"true":1,"false":0})

In [13]:
data.drop(columns=["Sleep_Hours", "Exercise_Duration"], inplace=True, axis=1)

In [14]:
#setting feature columns
target_candidates = ["PCOS"]
target_col = next((t for t in target_candidates if t in data.columns), None)
if target_col:
    X = data.drop(columns=[target_col]).copy()
else:
    X = data.copy()

In [15]:
#input / feature columns
X.head(2)

Unnamed: 0,Age,Weight_kg,Height_ft,Marital_Status,Family_History_PCOS,Menstrual_Irregularity,Hormonal_Imbalance,Hyperandrogenism,Hirsutism,Mental_Health,...,Diet_Multivitamin,Vegetarian,Exercise_Frequency,Exercise_Type,Stress_Level,Smoking,Exercise_Benefit,PCOS_Medication,Sleep_Hours_num,Exercise_Minutes
0,20-25,66.0,157.48,Unmarried,No,1,No,0,No,1,...,0,0,Rarely,"Cardio (e.g., running, cycling, swimming)",0,0,Somewhat,No.,5.5,30
1,12-19,56.0,165.1,Unmarried,No,0,No,0,No,0,...,0,0,Daily,No Exercise,0,0,Somewhat,No.,7.0,15


In [16]:
#encoding target columns - PCOS
print(data["PCOS"].unique())
PCOS_map = {"No": 0, "Yes": 1, "No, Yes, not diagnosed by a doctor": 1}
data["PCOS"] = data["PCOS"].map(PCOS_map)
print(data["PCOS"].unique())
data["PCOS"].dtype

['No' 'Yes' 'No, Yes, not diagnosed by a doctor']
[0 1]


dtype('int64')

In [17]:
#encoding target columns - PCOS_Medication
print(data["PCOS_Medication"].unique())
medication_map = {'No.': 0, 'Yes.. but rn i stopped medication and start organic seeds that helps me a lot': 1, 'Glucophage ': 1, "I used to take medicine for almost two years, but it had no significant effects. One thing I added to my daily routine is walking for half an hour or more, which has significantly improved my periods.Also recommended my doctors.': 1, 'Yes, hormonal therapy (e.g., birth control pills)': 1, 'I took as endocrinologist prescribed me but she said that I don't have PCOS ": 0, 'Multivitamins, Inositol': 1, 'Herbal': 1, 'Option 2': 0}
data['PCOS_Medication'] = data['PCOS_Medication'].map(medication_map)
#droping rows where PCOS_Medication is NaN
data = data.dropna(subset=['PCOS_Medication']).reset_index(drop=True)
print(data["PCOS_Medication"].unique())


['No.'
 'Yes.. but rn i stopped medication and start organic seeds that helps me a lot'
 'Glucophage '
 'I used to take medicine for almost two years, but it had no significant effects. One thing I added to my daily routine is walking for half an hour or more, which has significantly improved my periods.Also recommended my doctors.'
 'Yes, hormonal therapy (e.g., birth control pills)'
 "I took as endocrinologist prescribed me but she said that I don't have PCOS "
 'Multivitamins, Inositol' 'Herbal' 'Option 2']
[0. 1.]


In [18]:
X.size

6055

In [19]:
#dropping target column from entering the pipeline
X = data.drop(columns='PCOS')
X.size

5950

In [25]:
#getting numerical and categorical columns for pipeline
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
categorical_cols

['Age',
 'Marital_Status',
 'Family_History_PCOS',
 'Hormonal_Imbalance',
 'Hirsutism',
 'Conception_Difficulty',
 'Insulin_Resistance',
 'Diabetes',
 'Childhood_Trauma',
 'Cardiovascular_Disease',
 'Exercise_Frequency',
 'Exercise_Type',
 'Exercise_Benefit']

In [21]:
#building pipelines for numeric and categorical columns
numeric_transformer = Pipeline([("scaler", StandardScaler()), ("num_imputer", SimpleImputer(strategy="median"))])
categorical_transformer = Pipeline([("onehot", OneHotEncoder(handle_unknown="ignore")), ("cat_imputer", SimpleImputer(strategy="median"))])

In [22]:
#applying different transformers to different type of columns
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
], remainder="drop")

#pipeline for preprocessing
pipeline = Pipeline([("preprocessor", preprocessor)])

#fitting the data into the pipeline and saving it to X_trans
pipeline.fit(X)
X_trans = pipeline.transform(X)
print("Transformed shape:", X_trans.shape)

Transformed shape: (170, 78)


In [23]:
joblib.dump(pipeline, "pcos_preprocessing_pipeline.joblib")
joblib.dump({"numeric_cols": numeric_cols, "categorical_cols": categorical_cols}, "pcos_feature_columns.joblib")
print("Saved pipeline and feature lists to pcos_preprocessing_pipeline.joblib")

Saved pipeline and feature lists to pcos_preprocessing_pipeline.joblib


In [24]:
import joblib
joblib.dump(data, "pcos_preprocessed.pkl")      

['pcos_preprocessed.pkl']