In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [8]:
df = pd.read_csv('/Users/punyashrees/Documents/projects/OvaGuide/data/Cleaned-Data.csv')
df.head(5)

Unnamed: 0,Age,Weight_kg,Height_ft,Marital_Status,PCOS,Family_History_PCOS,Menstrual_Irregularity,Hormonal_Imbalance,Hyperandrogenism,Hirsutism,...,Diet_Multivitamin,Vegetarian,Exercise_Frequency,Exercise_Type,Exercise_Duration,Sleep_Hours,Stress_Level,Smoking,Exercise_Benefit,PCOS_Medication
0,20-25,66.0,157.48,Unmarried,No,No,Yes,No,No,No,...,0,No,Rarely,"Cardio (e.g., running, cycling, swimming)",30 minutes,Less than 6 hours,No,No,Somewhat,No.
1,Less than 20,56.0,165.1,Unmarried,No,No,No,No,No,No,...,0,No,Daily,No Exercise,Less than 30 minutes,6-8 hours,No,No,Somewhat,No.
2,Less than 20,89.0,167.64,Unmarried,No,Yes,No,No,No,Yes,...,0,No,Rarely,"Cardio (e.g., running, cycling, swimming)",Less than 30 minutes,6-8 hours,Yes,No,Somewhat,No.
3,20-25,55.0,160.02,Unmarried,No,Yes,No,Yes,No,Yes,...,1,No,Never,No Exercise,Not Applicable,6-8 hours,Yes,No,Somewhat,No.
4,Less than 20,55.0,160.02,Unmarried,No,No,No,No,No,No,...,0,No,Daily,"Cardio (e.g., running, cycling, swimming)",30 minutes to 1 hour,6-8 hours,Yes,No,Not at All,No.


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib

In [11]:
data = df.copy()
data["Sleep_Hours"].unique()

array(['Less than 6 hours', '6-8 hours', '9-12 hours',
       'More than 12 hours'], dtype=object)

In [17]:
#mapping sleep hours to adjacent numeric labels
sleep_map = {"Less than 6 hours":5.5, "6-8 hours":7.0, "9-12 hours":11.5, "More than 12 hours": 12}
if "Sleep_Hours" in data.columns:
    data["Sleep_Hours_num"] = data["Sleep_Hours"].map(sleep_map)

# Show mapping between Sleep_Hours and Sleep_Hours_num
data.groupby("Sleep_Hours")["Sleep_Hours_num"].unique()

Sleep_Hours
6-8 hours              [7.0]
9-12 hours            [11.5]
Less than 6 hours      [5.5]
More than 12 hours    [12.0]
Name: Sleep_Hours_num, dtype: object

In [18]:
data["Exercise_Duration"].unique()

array(['30 minutes', 'Less than 30 minutes', 'Not Applicable',
       '30 minutes to 1 hour', 'More than 30 minutes'], dtype=object)

In [22]:
#mapping exercise duration to minutes
exercise_map = {
    "Less than 30 minutes": 15, "30 minutes": 30, "30 minutes to 1 hour": 55,
    "Not Applicable": 0, "More than 30 minutes": 45
}
if "Exercise_Duration" in data.columns:
    data["Exercise_Minutes"] = data["Exercise_Duration"].map(exercise_map)

data.groupby("Exercise_Duration")["Exercise_Minutes"].unique()


Exercise_Duration
30 minutes              [30]
30 minutes to 1 hour    [55]
Less than 30 minutes    [15]
More than 30 minutes    [45]
Not Applicable           [0]
Name: Exercise_Minutes, dtype: object

In [None]:


# Ensure diet columns are numeric
diet_cols = [c for c in data.columns if c.startswith("Diet_")]
for c in diet_cols:
    data[c] = pd.to_numeric(data[c], errors="coerce")

# Detect simple Yes/No columns and map to 0/1
def is_yes_no(series):
    vals = set([str(x).strip().lower().rstrip('.') for x in series.dropna().unique()])
    return vals.issubset({"yes","no","y","n","true","false"})

binary_cols = []
for col in data.select_dtypes(include=["object"]).columns:
    if is_yes_no(data[col]):
        binary_cols.append(col)
for c in binary_cols:
    data[c] = data[c].astype(str).str.strip().str.lower().str.rstrip('.').map({"yes":1,"no":0,"y":1,"n":0,"true":1,"false":0})

# Drop original textual columns we replaced (if present)
for c in ["Sleep_Hours","Exercise_Duration"]:
    if c in data.columns:
        data.drop(columns=[c], inplace=True)

# Identify target column (PCOS)
target_candidates = ["PCOS","PCOS_Label","PCOS (Y/N)","PCOS_Diagnosis"]
target_col = next((t for t in target_candidates if t in data.columns), None)
if target_col:
    X = data.drop(columns=[target_col]).copy()
else:
    X = data.copy()

# Numeric and categorical columns for pipeline
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

# Build numeric and categorical transformers
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
categorical_transformer = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("onehot", OneHotEncoder(handle_unknown="ignore", sparse=False))])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
], remainder="drop")

pipeline = Pipeline([("preprocessor", preprocessor)])

# Fit the preprocessing pipeline and save it
pipeline.fit(X)
X_trans = pipeline.transform(X)
print("Transformed shape:", X_trans.shape)

joblib.dump(pipeline, "/mnt/data/pcos_preprocessing_pipeline.joblib")
joblib.dump({"numeric_cols": numeric_cols, "categorical_cols": categorical_cols}, "/mnt/data/pcos_feature_columns.joblib")
print("Saved pipeline and feature lists to /mnt/data/")

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse'