In [None]:
# Essential Libraries for PCOS Feature Harmonization
import pandas as pd  # For data manipulation and analysis
import numpy as np   # For numerical operations and handling missing values
from sklearn.preprocessing import StandardScaler  # For standardizing features (if needed in your pipeline)

# Optional but useful libraries for visualization and additional processing
import matplotlib.pyplot as plt  # For creating visualizations
import seaborn as sns            # For enhanced visualizations
from scipy import stats          # For statistical operations (if needed)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

class PCOSFeatureHarmonizer:
    """
    Harmonize PCOS datasets from different geographic regions.

    The harmonizer renames columns, converts units, encodes categorical
    answers as integers and imputes missing values so that every cohort
    ends up with the same set of columns (``common_features``).
    """

    def __init__(self):
        # Define the 18 common features across all cohorts
        self.common_features = [
            'Follicle No. (R)', 'Follicle No. (L)', 'Skin darkening (Y/N)',
            'Hair growth (Y/N)', 'Weight gain (Y/N)', 'Cycle (R/I)',
            'Pimples (Y/N)', 'AMH (ng/mL)', 'Weight (Kg)', 'Cycle length (days)',
            'Age (yrs)', 'Hip (inch)', 'BMI', 'Avg. F size (L) (mm)',
            'Pulse rate (bpm)', 'Hb (g/dl)', 'Vit D3 (ng/mL)', 'FSH/LH ratio'
        ]

        # Define unit conversion factors
        self.conversion_factors = {
            'Testosterone': 28.84,  # nmol/L to ng/dL (multiply)
            'Waist': 2.54,  # cm per inch (divide cm by this)
            'Hip': 2.54,    # cm per inch (divide cm by this)
        }

    def load_dataset(self, file_path, dataset_name):
        """
        Load a dataset from a CSV file and print its shape.

        Args:
            file_path: Path (or any source accepted by ``pd.read_csv``).
            dataset_name: Label used only in the printed message.

        Returns:
            The loaded DataFrame.
        """
        df = pd.read_csv(file_path)
        print(f"Loaded {dataset_name} dataset with shape: {df.shape}")
        return df

    def rename_features(self, df, mapping_dict):
        """
        Rename columns according to ``mapping_dict`` (old name -> new name).

        Returns a new DataFrame; columns absent from the mapping are kept.
        """
        return df.rename(columns=mapping_dict)

    def convert_units(self, df, cohort_name):
        """
        Convert units to the standard format used across all cohorts.

        For every cohort except ``"Kerala"``, a source column is replaced by
        its converted counterpart when present:

        * ``Testosterone (nmol/L)`` -> ``Testosterone (ng/dL)``
        * ``Waist (cm)`` -> ``Waist (inch)``
        * ``Hip (cm)`` -> ``Hip (inch)``

        Args:
            df: Input DataFrame (not modified).
            cohort_name: Cohort label; ``"Kerala"`` is returned unconverted.

        Returns:
            A converted copy of ``df``.
        """
        df_converted = df.copy()

        # Kerala dataset typically already uses standard units
        if cohort_name == "Kerala":
            return df_converted

        if 'Testosterone (nmol/L)' in df_converted.columns:
            df_converted['Testosterone (ng/dL)'] = (
                df_converted['Testosterone (nmol/L)']
                * self.conversion_factors['Testosterone']
            )
            df_converted = df_converted.drop(columns='Testosterone (nmol/L)')

        if 'Waist (cm)' in df_converted.columns:
            df_converted['Waist (inch)'] = (
                df_converted['Waist (cm)'] / self.conversion_factors['Waist']
            )
            df_converted = df_converted.drop(columns='Waist (cm)')

        if 'Hip (cm)' in df_converted.columns:
            df_converted['Hip (inch)'] = (
                df_converted['Hip (cm)'] / self.conversion_factors['Hip']
            )
            df_converted = df_converted.drop(columns='Hip (cm)')

        return df_converted

    def encode_categorical_variables(self, df):
        """
        Encode categorical answers consistently across datasets.

        Recognised labels (``Yes``/``No``, ``Present``/``Absent``,
        ``Regular``/``Irregular``, ``R``/``I``) are replaced by 0/1. Values
        that are not recognised labels -- in particular columns that are
        already numeric -- are left untouched instead of being turned into
        NaN.

        Args:
            df: Input DataFrame (not modified).

        Returns:
            An encoded copy of ``df``.
        """
        df_encoded = df.copy()

        yes_no = {'Yes': 1, 'No': 0, 'Present': 1, 'Absent': 0}
        binary_mappings = {
            'Skin darkening (Y/N)': yes_no,
            'Hair growth (Y/N)': yes_no,
            'Weight gain (Y/N)': yes_no,
            'Pimples (Y/N)': yes_no,
            'Cycle (R/I)': {'Regular': 0, 'Irregular': 1, 'R': 0, 'I': 1},
        }

        for feature, mapping in binary_mappings.items():
            if feature not in df_encoded.columns:
                continue

            column = df_encoded[feature]
            recognized = column.isin(list(mapping))
            if not recognized.any():
                # Nothing to translate (e.g. already 0/1): keep values and dtype.
                continue

            mapped = column.map(mapping)
            # Non-missing values the mapping does not know must survive as-is;
            # plain ``map`` would silently replace them with NaN.
            keep_original = column.notna() & ~recognized
            if keep_original.any():
                mapped = mapped.where(~keep_original, column)
            df_encoded[feature] = mapped

        return df_encoded

    @staticmethod
    def _imputation_value(target, sources):
        """Return the first usable fill value for ``target`` from ``sources``, or None."""
        target_is_numeric = pd.api.types.is_numeric_dtype(target)
        for source in sources:
            if target_is_numeric:
                # A mean is only meaningful when the source is numeric too.
                if not pd.api.types.is_numeric_dtype(source):
                    continue
                value = source.mean()
                if not pd.isna(value):
                    return value
            else:
                modes = source.mode()
                if not modes.empty:
                    return modes.iloc[0]
        return None

    def handle_missing_values(self, df, reference_df=None):
        """
        Fill missing values using mean (numeric) / mode (other) imputation.

        When ``reference_df`` is provided and contains the column, its
        statistics are used; if it cannot supply a value (column absent,
        all-missing, or non-numeric for a numeric target) the dataset's own
        statistics are used instead. Columns for which no value can be
        computed at all (e.g. entirely missing) are left unchanged.

        Args:
            df: Input DataFrame (not modified).
            reference_df: Optional DataFrame providing imputation statistics.

        Returns:
            An imputed copy of ``df``.
        """
        df_imputed = df.copy()

        for column in df_imputed.columns:
            series = df_imputed[column]
            if not series.isnull().any():
                continue

            sources = []
            if reference_df is not None and column in reference_df.columns:
                sources.append(reference_df[column])
            sources.append(series)

            impute_value = self._imputation_value(series, sources)
            if impute_value is None:
                continue

            # Assign back explicitly: ``df[col].fillna(..., inplace=True)``
            # acts on a temporary and is not guaranteed to update the frame.
            df_imputed[column] = series.fillna(impute_value)

        return df_imputed

    def _select_common_features(self, df, cohort_name):
        """Return a copy of ``df`` restricted to ``common_features``; KeyError if any are absent."""
        missing = [name for name in self.common_features if name not in df.columns]
        if missing:
            raise KeyError(
                f"{cohort_name} dataset is missing required features: {missing}"
            )
        return df[self.common_features].copy()

    def _prepare_cohort(self, df, cohort_name):
        """Convert units, keep the common features and encode categoricals for one cohort."""
        # Units are converted BEFORE column selection so that e.g. 'Hip (cm)'
        # can be turned into the required 'Hip (inch)' column.
        converted = self.convert_units(df, cohort_name)
        selected = self._select_common_features(converted, cohort_name)
        return self.encode_categorical_variables(selected)

    def harmonize_datasets(self, kerala_df, sudan_df, iran_df):
        """
        Harmonize all three datasets.

        Each cohort is unit-converted, reduced to ``common_features`` and
        encoded. Kerala is imputed from its own statistics; Sudan and Iran
        are imputed from the encoded Kerala data so that the statistics are
        on the same scale as the columns being filled.

        Args:
            kerala_df: Kerala cohort (reference).
            sudan_df: Sudan cohort.
            iran_df: Iran cohort.

        Returns:
            Dict mapping ``'Kerala'``, ``'Sudan'`` and ``'Iran'`` to their
            processed DataFrames.

        Raises:
            KeyError: If a cohort lacks any of the common features after
                unit conversion.
        """
        print("Starting dataset harmonization...")

        # Use Kerala (encoded, not yet imputed) as reference for imputation
        reference_df = self._prepare_cohort(kerala_df, "Kerala")

        processed_datasets = {
            'Kerala': self.handle_missing_values(reference_df),
        }

        for cohort_name, cohort_df in (("Sudan", sudan_df), ("Iran", iran_df)):
            encoded = self._prepare_cohort(cohort_df, cohort_name)
            processed_datasets[cohort_name] = self.handle_missing_values(
                encoded, reference_df
            )

        print("Dataset harmonization completed successfully!")
        return processed_datasets

    def generate_summary_report(self, processed_datasets):
        """
        Print a summary report of the harmonized datasets.

        For each cohort the number of samples, features and remaining
        missing values is printed, plus the PCOS case count and percentage
        when a ``'PCOS (Y/N)'`` column is present.

        Args:
            processed_datasets: Dict mapping cohort name to DataFrame.
        """
        print("\n" + "="*50)
        print("HARMONIZATION SUMMARY REPORT")
        print("="*50)

        for cohort, df in processed_datasets.items():
            print(f"\n{cohort} Dataset:")
            print(f"  Samples: {len(df)}")
            print(f"  Features: {len(df.columns)}")
            print(f"  Missing values: {df.isnull().sum().sum()}")

            # Count PCOS cases if target variable exists
            if 'PCOS (Y/N)' in df.columns:
                pcos_cases = df['PCOS (Y/N)'].sum()
                # Guard against an empty cohort to avoid dividing by zero.
                pcos_share = pcos_cases / len(df) * 100 if len(df) else 0.0
                print(f"  PCOS cases: {pcos_cases} ({pcos_share:.1f}%)")

# Example usage
def main():
    """
    Demonstrate the PCOSFeatureHarmonizer workflow end to end.

    Loads the three cohort CSV files, applies the per-cohort column
    renames, harmonizes the cohorts, prints the summary report and writes
    one harmonized CSV per cohort.
    """
    pipeline = PCOSFeatureHarmonizer()

    # Input files (replace with the actual locations of your datasets).
    source_files = {
        "Kerala": "/content/kerala_pcos.csv",
        "Sudan": "/content/sudan_pcos.csv",
        "Iran": "/content/iran_pcos.csv",
    }
    raw = {
        name: pipeline.load_dataset(path, name)
        for name, path in source_files.items()
    }

    # Column aliases for cohorts whose feature names differ from the
    # common ones. Extend these as needed.
    column_aliases = {
        "Sudan": {
            'right_follicle_count': 'Follicle No. (R)',
            'left_follicle_count': 'Follicle No. (L)',
        },
        "Iran": {
            'Follicle_R': 'Follicle No. (R)',
            'Follicle_L': 'Follicle No. (L)',
        },
    }
    for name, aliases in column_aliases.items():
        raw[name] = pipeline.rename_features(raw[name], aliases)

    harmonized = pipeline.harmonize_datasets(
        raw["Kerala"], raw["Sudan"], raw["Iran"]
    )
    pipeline.generate_summary_report(harmonized)

    # Persist each harmonized cohort next to the inputs.
    for name, frame in harmonized.items():
        stem = name.lower()
        frame.to_csv(f"/content/{stem}_harmonized.csv", index=False)
        print(f"Saved harmonized {name} dataset to {stem}_harmonized.csv")

    print("\nHarmonization process completed!")

# Run the example pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()