In [3]:
import pandas as pd
import numpy as np

# Load the imputed dataset from the Excel workbook (relative path, so it is
# resolved against the current working directory) into a DataFrame.
imputed_data = pd.read_excel('imputed_female_farmers_data.xlsx')

# Code -> label tables. An integer code and its float twin hash alike
# (1 == 1.0), so a single float key per code serves both representations.
_BINARY_LABELS = {0.0: 'non', 1.0: 'oui'}
_EQUIPMENT_LABELS = {0.0: 'jamais', 1.0: 'parfois', 2.0: 'souvent', 3.0: 'toujours'}

# Column name -> code table for every single-valued coded column.
_CODED_COLUMNS = {
    'Neffa': _BINARY_LABELS,
    'Fumées de Tabouna': _BINARY_LABELS,
    'AT en milieu agricole': _BINARY_LABELS,
    'Ménopause': _BINARY_LABELS,
    'Tabagisme': {0.0: 'non', 1.0: 'passif', 2.0: 'oui'},
    'Masque pour pesticides': _EQUIPMENT_LABELS,
    'Bottes': _EQUIPMENT_LABELS,
    'Gants': _EQUIPMENT_LABELS,
    'Casquette/Mdhalla': _EQUIPMENT_LABELS,
    'Manteau imperméable': _EQUIPMENT_LABELS,
    'Catégorie professionnelle': {
        0.0: 'agricultrice indépendante',
        1.0: 'ouvrière',
        2.0: 'ouvrière, agricultrice indépendante',
        3.0: 'pêcheur indépendante',
    },
    'Situation maritale': {0.0: 'célibataire', 1.0: 'mariée', 2.0: 'divorcée', 3.0: 'veuve'},
    'Domicile': {0.0: 'monastir', 1.0: 'sfax', 2.0: 'mahdia'},
    'Niveau socio-économique': {0.0: 'bas', 1.0: 'moyen', 2.0: 'bon'},
    'Statut': {0.0: 'permanente', 1.0: 'saisonnière'},
    'Niveau scolaire': {
        0.0: 'analphabète',
        1.0: 'primaire',
        2.0: 'secondaire',
        3.0: 'supérieur',
    },
}

# Consolidated column name -> {indicator column: label}. Labels are joined in
# the order listed here.
_INDICATOR_GROUPS = {
    'Produits chimiques utilisés': {
        'Chemical_engrais_chimiques': 'engrais chimiques',
        'Chemical_pesticides': 'pesticides',
    },
    'Produits biologiques utilisés': {
        'Bio_animaux': 'animaux',
        'Bio_engrais_naturels': 'engrais naturels',
    },
    'Engrais utilisés': {
        'Fertilizer_chimique': 'chimique',
        'Fertilizer_organique': 'organique',
        'Fertilizer_organo_minéral': 'organo minéral',
    },
    'Contraintes thermiques': {
        'Thermal_chaleur': 'chaleur',
        'Thermal_froid': 'froid',
    },
    'Moyen de transport': {
        'Transport_a_pieds': 'a pieds',
        'Transport_bus___transport_public': 'bus transport public',
        'Transport_camion_non_protégé': 'camion non protégé',
        'Transport_charette': 'charette',
        'Transport_charrette': 'charrette',
        'Transport_voiture': 'voiture',
    },
}

_HUSBAND_COLUMN = 'Profession du mari'
_HUSBAND_PREFIX = 'Profession du mari_'
_HUSBAND_MISSING = 'Profession du mari_nan'


def _decode_codes(series, mapping):
    """Return *series* with numeric codes replaced by their labels from *mapping*."""
    dtype = series.dtype

    if pd.api.types.is_object_dtype(dtype):
        # Object columns may mix raw codes with already-decoded text: decode
        # the real numbers that are known codes and keep everything else.
        def lookup(value):
            if isinstance(value, (bool, np.bool_)):
                return value
            if isinstance(value, (int, float, np.integer, np.floating)):
                return mapping.get(value, value)
            return value

        return series.map(lookup)

    if pd.api.types.is_string_dtype(dtype):
        # Dedicated string dtypes cannot hold numeric codes, so the column is
        # already decoded; mapping it would wipe every value to NaN.
        return series

    # Numeric (or other non-text) column: codes missing from the table become
    # NaN, which is the behaviour of Series.map with a dict.
    return series.map(mapping)


def _consolidate_indicators(df, indicators):
    """Join the labels of all active indicator columns of one group, row by row."""
    available = [(col, label) for col, label in indicators.items() if col in df.columns]
    labels = [label for _, label in available]
    # An indicator counts as present when its value exceeds 0.5; missing
    # values count as absent.
    flags = [(df[col].fillna(0) > 0.5).to_numpy() for col, _ in available]

    combined = [
        ', '.join(label for label, active in zip(labels, row) if active) or None
        for row in zip(*flags)
    ]
    return pd.Series(combined, index=df.index, dtype=object)


def _consolidate_husband_profession(df, onehot_cols):
    """Collapse the one-hot husband-profession columns into one object Series."""
    professions = np.full(len(df), None, dtype=object)
    undecided = np.ones(len(df), dtype=bool)

    # Rows flagged as missing keep an empty profession.
    if _HUSBAND_MISSING in df.columns:
        missing = df[_HUSBAND_MISSING].eq(1).fillna(False).to_numpy(dtype=bool)
        undecided &= ~missing

    # The first one-hot column (in column order) equal to 1 wins for each row.
    for col in onehot_cols:
        hit = undecided & df[col].eq(1).fillna(False).to_numpy(dtype=bool)
        professions[hit] = col[len(_HUSBAND_PREFIX):]
        undecided &= ~hit

    # Positional masks plus a fresh object Series avoid writing strings into
    # an existing float column and are safe with duplicate index labels.
    return pd.Series(professions, index=df.index, dtype=object)


def flexible_decode_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Decode the numerically encoded survey columns of *df* into text labels.

    The input frame is not modified; a decoded copy is returned.

    Steps:
      1. Single-valued coded columns (binary answers, 'Tabagisme', equipment
         usage, professional category, marital status, ...) are translated
         with their code tables. Text columns are left as they are; in
         object columns that mix codes and text only the codes are replaced.
         In numeric columns a value that is not a known code becomes NaN.
      2. Each group of indicator columns (value > 0.5 means "present") is
         merged into one comma-separated text column, or None when no
         indicator is active. The indicator columns are dropped. If the
         consolidated column already holds data it is kept untouched.
      3. The one-hot 'Profession du mari_*' columns are merged into
         'Profession du mari' and dropped, unless that column already holds
         data, in which case the one-hot columns are left in place.

    Args:
        df: DataFrame holding the encoded dataset. Columns that are absent
            are simply skipped.

    Returns:
        A new DataFrame with decoded and consolidated columns.
    """
    decoded_df = df.copy()

    # 1. Single-valued coded columns
    for column, mapping in _CODED_COLUMNS.items():
        if column in decoded_df.columns:
            decoded_df[column] = _decode_codes(decoded_df[column], mapping)

    columns_to_drop = []

    # 2. Multi-value indicator groups
    for group_name, indicators in _INDICATOR_GROUPS.items():
        present = [col for col in indicators if col in decoded_df.columns]
        if not present:
            continue

        # Indicators are dropped even when the consolidated column is kept.
        columns_to_drop.extend(present)

        if group_name in decoded_df.columns and not decoded_df[group_name].isna().all():
            continue

        decoded_df[group_name] = _consolidate_indicators(decoded_df, indicators)

    # 3. One-hot encoded husband profession
    onehot_cols = [
        col for col in decoded_df.columns
        if isinstance(col, str)
        and col.startswith(_HUSBAND_PREFIX)
        and col != _HUSBAND_MISSING
    ]
    already_decoded = (
        _HUSBAND_COLUMN in decoded_df.columns
        and not decoded_df[_HUSBAND_COLUMN].isna().all()
    )
    if onehot_cols and not already_decoded:
        columns_to_drop.extend(onehot_cols)
        if _HUSBAND_MISSING in decoded_df.columns:
            columns_to_drop.append(_HUSBAND_MISSING)
        decoded_df[_HUSBAND_COLUMN] = _consolidate_husband_profession(decoded_df, onehot_cols)

    return decoded_df.drop(columns=columns_to_drop, errors='ignore')

# Run the decoder on the imputed data loaded earlier in the file
properly_decoded_data = flexible_decode_dataset(imputed_data)

# Write the decoded frame to a new workbook, leaving out the index column
properly_decoded_data.to_excel(
    'properly_decoded_female_farmers_data.xlsx',
    index=False,
)

# Report what was written: confirmation, dimensions, then the column names
print("Dataset has been properly decoded and saved as 'properly_decoded_female_farmers_data.xlsx'")
print("\nDecoded dataset has {} rows and {} columns".format(*properly_decoded_data.shape))
print(
    "\nColumns in decoded dataset:",
    properly_decoded_data.columns.tolist(),
    sep="\n",
)

Dataset has been properly decoded and saved as 'properly_decoded_female_farmers_data.xlsx'

Decoded dataset has 80 rows and 37 columns

Columns in decoded dataset:
['N°', 'Age', 'Situation maritale', 'Nb enfants', 'Nb pers à charge', 'Domicile', 'Niveau socio-économique', 'Tabagisme', 'Neffa', 'Fumées de Tabouna', 'AT en milieu agricole', 'H travail / jour', 'Mécanisme AT', 'Ménopause', 'Age ménopause', 'Antécédents gynéco', 'Ancienneté agricole', 'Catégorie professionnelle', 'Statut', 'J travail / Sem', 'Masque pour pesticides', 'Bottes', 'Niveau scolaire', 'Gants', 'Casquette/Mdhalla', 'Manteau imperméable', 'Poids', 'Taille', 'TAS', 'TAD', 'GAD', 'Produits chimiques utilisés', 'Produits biologiques utilisés', 'Engrais utilisés', 'Contraintes thermiques', 'Moyen de transport', 'Profession du mari']
