In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Health Outcomes Analysis for Female Farmers
This script explores relationships between various factors and health outcomes
in the female farmers dataset, focusing on aspects beyond protection equipment usage.

Author: [Your Name]
Date: April 2, 2025
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import glm
from scipy import stats
import os
from sklearn.preprocessing import StandardScaler
import logging

# Configure logging once for the whole script: INFO level, timestamped lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Root folder for every result file; created up front so later writes succeed
output_dir = 'results/health_analysis'
os.makedirs(output_dir, exist_ok=True)

def load_data(path='fixed_female_farmers_data.xlsx'):
    """Load the preprocessed dataset.

    Args:
        path: Location of the Excel workbook to read. Defaults to the file
            name this script has always used, resolved against the current
            working directory.

    Returns:
        The pandas.DataFrame produced by ``pd.read_excel``.
    """
    logger.info("Loading the dataset...")
    data = pd.read_excel(path)
    n_rows, n_cols = data.shape
    logger.info(f"Dataset loaded with {n_rows} rows and {n_cols} columns")
    return data

def prepare_data_for_analysis(data):
    """
    Prepare data for health analysis.

    Works on a copy of ``data`` and, for every source column that is present,
    adds:
    - ``has_<category>`` 0/1 flags (1 when the complaint column is non-null)
      and ``has_any_health_complaint``
    - ``BMI`` (from ``Poids`` and ``Taille``) and ``BMI_Category``
    - 0/1 exposure flags ``is_smoker``, ``exposed_to_tabouna``, ``uses_neffa``
    - ``Age_Group``, ``Hours_Per_Week`` and ``Work_Intensity``

    Grouping columns are finally cast to the pandas ``category`` dtype.
    Missing values are not imputed: a missing complaint or exposure answer
    yields a 0 flag, and values outside the binning ranges yield NaN groups.

    Args:
        data: pandas.DataFrame with the raw survey columns.

    Returns:
        A new DataFrame containing the original columns plus the derived ones.
    """
    logger.info("Preparing data for health analysis...")

    # Create a copy to avoid modifying the original
    df = data.copy()

    # Create binary indicators for each health complaint category
    health_categories = [
        'Troubles cardio-respiratoires',
        'Troubles cognitifs',
        'Troubles neurologiques',
        'Troubles cutanés/phanères'
    ]

    # Filter to those that exist in the data
    available_categories = [cat for cat in health_categories if cat in df.columns]

    # A non-null entry in a complaint column is treated as "has the complaint"
    for category in available_categories:
        binary_var = f"has_{category.lower().replace('-', '_').replace(' ', '_').replace('/', '_')}"
        df[binary_var] = df[category].notna().astype(int)
        logger.info(f"Created binary variable: {binary_var}")

    # Create a variable for any health complaint
    if available_categories:
        df['has_any_health_complaint'] = df[available_categories].notna().any(axis=1).astype(int)
        logger.info("Created 'has_any_health_complaint' variable")

    # Compute BMI only when it is not already supplied.
    # NOTE(review): the /100 assumes 'Taille' is in centimetres and 'Poids'
    # in kilograms - confirm against the data dictionary.
    bmi_computed = False
    if 'BMI' not in df.columns and all(var in df.columns for var in ['Poids', 'Taille']):
        df['BMI'] = df['Poids'] / ((df['Taille'] / 100) ** 2)
        bmi_computed = True
        logger.info("Calculated BMI from height and weight")

    # Categorise whenever a BMI column exists. Previously this only happened
    # when BMI had just been computed, so datasets shipping their own BMI
    # column ended up without BMI_Category and the BMI analysis was skipped.
    if 'BMI' in df.columns and (bmi_computed or 'BMI_Category' not in df.columns):
        # pd.cut intervals are right-closed by default: (0, 18.5], (18.5, 25], ...
        df['BMI_Category'] = pd.cut(
            df['BMI'],
            bins=[0, 18.5, 25, 30, 100],
            labels=['Underweight', 'Normal', 'Overweight', 'Obese']
        )
        logger.info("Created BMI categories")

    # Create binary variables for key exposures
    exposure_vars = {
        'Tabagisme': 'is_smoker',
        'Fumées de Tabouna': 'exposed_to_tabouna',
        'Neffa': 'uses_neffa'
    }

    for var, new_var in exposure_vars.items():
        if var in df.columns:
            # Normalise case and surrounding whitespace so 'Oui' / ' oui '
            # are counted as exposed; anything else (including missing) is 0.
            answers = df[var].astype(str).str.strip().str.lower()
            df[new_var] = (answers == 'oui').astype(int)
            logger.info(f"Created binary variable: {new_var}")

    # Create age groups if not already present
    if 'Age' in df.columns and 'Age_Group' not in df.columns:
        df['Age_Group'] = pd.cut(
            df['Age'],
            bins=[0, 30, 40, 50, 60, 100],
            labels=['<30', '30-40', '40-50', '50-60', '>60']
        )
        logger.info("Created age group categories")

    # Create work intensity variable (hours per week)
    if all(var in df.columns for var in ['H travail / jour', 'J travail / Sem']):
        df['Hours_Per_Week'] = df['H travail / jour'] * df['J travail / Sem']
        logger.info("Calculated weekly work hours")

        # Right-closed bins: 0 hours or more than 100 hours fall outside and become NaN
        df['Work_Intensity'] = pd.cut(
            df['Hours_Per_Week'],
            bins=[0, 30, 40, 50, 100],
            labels=['Part-time', 'Regular', 'Intensive', 'Very Intensive']
        )
        logger.info("Created work intensity categories")

    # Convert important categorical variables to proper categorical type
    categorical_vars = [
        'BMI_Category', 'Age_Group', 'Work_Intensity',
        'Statut', 'Situation maritale', 'Niveau socio-économique'
    ]

    for var in categorical_vars:
        if var in df.columns:
            df[var] = df[var].astype('category')

    return df

def bmi_health_analysis(data):
    """Analyze relationships between BMI and health outcomes.

    Writes its results under ``<output_dir>/bmi_analysis``:
    1. Chi-square tests of ``BMI_Category`` against every ``has_*`` column
       (CSV of all tests, stacked bar chart for each test with p < 0.05).
    2. Pearson correlations of ``BMI`` with ``TAS`` and ``TAD`` (CSV plus one
       scatter plot per measure), when both columns exist.
    3. Box plots of ``BMI`` by demographic group with a one-way ANOVA
       annotation.

    Args:
        data: DataFrame that must contain ``BMI`` and ``BMI_Category``;
            otherwise a warning is logged and nothing is produced.

    Returns:
        None. All output goes to files and the log.
    """
    logger.info("Analyzing BMI and health relationships...")

    if 'BMI' not in data.columns or 'BMI_Category' not in data.columns:
        logger.warning("BMI data not available for analysis")
        return

    # Create directory for BMI analysis
    bmi_dir = os.path.join(output_dir, "bmi_analysis")
    os.makedirs(bmi_dir, exist_ok=True)

    # 1. Analyze relationship between BMI categories and health complaints
    binary_health_vars = [col for col in data.columns if col.startswith('has_')]

    # Store chi-square results
    chi_square_results = []

    for health_var in binary_health_vars:
        logger.info(f"  Testing BMI_Category vs {health_var}")

        # Create contingency table
        contingency = pd.crosstab(data['BMI_Category'], data[health_var])

        # A chi-square test needs at least a 2x2 table
        if contingency.shape[0] > 1 and contingency.shape[1] > 1:
            chi2, p, dof, _ = stats.chi2_contingency(contingency)

            chi_square_results.append({
                'Health_Variable': health_var,
                'Chi_Square': chi2,
                'P_value': p,
                'DoF': dof,
                'Significant': p < 0.05
            })

            # Create visualization if significant
            if p < 0.05:
                # DataFrame.plot opens its own figure unless given an axes,
                # so draw on an explicit axes to honour figsize and to make
                # sure the figure we close is the one we created.
                fig, ax = plt.subplots(figsize=(10, 6))

                # Row-wise percentages for each BMI category
                props = pd.crosstab(
                    data['BMI_Category'],
                    data[health_var],
                    normalize='index'
                ) * 100

                props.plot(kind='bar', stacked=True, ax=ax)

                plt.title(f'Relationship between BMI and {health_var} (p={p:.4f})', fontsize=14)
                plt.xlabel('BMI Category', fontsize=12)
                plt.ylabel('Percentage', fontsize=12)
                plt.legend(title='Has Condition', labels=['No', 'Yes'])
                plt.grid(axis='y', linestyle='--', alpha=0.7)

                plt.tight_layout()

                # Save figure
                fig_path = os.path.join(bmi_dir, f"bmi_vs_{health_var}.png")
                plt.savefig(fig_path, dpi=300)
                plt.close(fig)
                logger.info(f"    Saved visualization to {fig_path}")

    # Save chi-square results
    if chi_square_results:
        chi_results_df = pd.DataFrame(chi_square_results)
        chi_results_path = os.path.join(bmi_dir, "bmi_health_chi_square.csv")
        chi_results_df.to_csv(chi_results_path, index=False)
        logger.info(f"Saved BMI chi-square results to {chi_results_path}")

    # 2. Analyze relationship between continuous BMI and blood pressure
    if all(var in data.columns for var in ['BMI', 'TAS', 'TAD']):
        logger.info("  Analyzing BMI vs. blood pressure")

        bp_labels = [('TAS', 'Systolic'), ('TAD', 'Diastolic')]

        # Compute each correlation once, on rows where both values are
        # present; the result is reused for the CSV and the plot annotation.
        bp_stats = {}
        for bp_var, _ in bp_labels:
            complete = data[['BMI', bp_var]].dropna()
            if len(complete) >= 2:
                corr, p = stats.pearsonr(complete['BMI'], complete[bp_var])
            else:
                # pearsonr needs at least two observations
                corr, p = float('nan'), float('nan')
            bp_stats[bp_var] = (corr, p)

        correlations = pd.DataFrame({
            'Variable': [f'{bp_name} BP' for _, bp_name in bp_labels],
            'Correlation': [bp_stats[bp_var][0] for bp_var, _ in bp_labels],
            'P_value': [bp_stats[bp_var][1] for bp_var, _ in bp_labels],
            'Significant': [bp_stats[bp_var][1] < 0.05 for bp_var, _ in bp_labels]
        })

        # Save correlations
        corr_path = os.path.join(bmi_dir, "bmi_bp_correlations.csv")
        correlations.to_csv(corr_path, index=False)
        logger.info(f"    Saved blood pressure correlations to {corr_path}")

        # Create visualizations
        for bp_var, bp_name in bp_labels:
            corr, p = bp_stats[bp_var]

            plt.figure(figsize=(10, 6))

            # Create scatter plot with regression line
            sns.regplot(
                x='BMI',
                y=bp_var,
                data=data,
                scatter_kws={'alpha': 0.6},
                line_kws={'color': 'red'}
            )

            # Add correlation text
            plt.annotate(
                f'r = {corr:.2f}, p = {p:.4f}',
                xy=(0.05, 0.95),
                xycoords='axes fraction',
                ha='left',
                va='top',
                bbox=dict(boxstyle='round', fc='white', alpha=0.8)
            )

            plt.title(f'Relationship between BMI and {bp_name} Blood Pressure', fontsize=14)
            plt.xlabel('BMI (kg/m²)', fontsize=12)
            plt.ylabel(f'{bp_name} Blood Pressure (mmHg)', fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)

            plt.tight_layout()

            # Save figure
            fig_path = os.path.join(bmi_dir, f"bmi_vs_{bp_var}.png")
            plt.savefig(fig_path, dpi=300)
            plt.close()
            logger.info(f"    Saved BMI vs {bp_name} BP visualization to {fig_path}")

    # 3. Compare BMI across different demographic groups
    for group_var in ['Age_Group', 'Statut', 'Situation maritale', 'Niveau socio-économique']:
        if group_var in data.columns:
            logger.info(f"  Analyzing BMI by {group_var}")

            # Works whether or not the column was already cast to category
            categories = data[group_var].astype('category').cat.categories

            plt.figure(figsize=(12, 6))

            # Create boxplot
            sns.boxplot(x=group_var, y='BMI', data=data)

            # Add individual points
            sns.stripplot(
                x=group_var,
                y='BMI',
                data=data,
                color='black',
                alpha=0.3,
                jitter=True
            )

            # One-way ANOVA over the groups that actually have BMI values
            groups = []
            for category in categories:
                group_data = data.loc[data[group_var] == category, 'BMI'].dropna()
                if len(group_data) > 0:
                    groups.append(group_data)

            if len(groups) >= 2:
                f_stat, p_value = stats.f_oneway(*groups)

                # Add ANOVA result
                plt.annotate(
                    f'ANOVA: F={f_stat:.2f}, p={p_value:.4f}',
                    xy=(0.5, 0.97),
                    xycoords='axes fraction',
                    ha='center',
                    va='top',
                    bbox=dict(boxstyle='round', fc='white', alpha=0.8)
                )

            plt.title(f'BMI Distribution by {group_var}', fontsize=14)
            plt.xlabel(group_var, fontsize=12)
            plt.ylabel('BMI (kg/m²)', fontsize=12)
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Tilt tick labels when there are many groups
            if len(categories) > 3:
                plt.xticks(rotation=45, ha='right')

            plt.tight_layout()

            # Save figure
            fig_path = os.path.join(bmi_dir, f"bmi_by_{group_var.replace(' ', '_')}.png")
            plt.savefig(fig_path, dpi=300)
            plt.close()
            logger.info(f"    Saved BMI by {group_var} visualization to {fig_path}")

def blood_pressure_analysis(data):
    """Analyze factors affecting blood pressure.

    Writes figures under ``<output_dir>/blood_pressure``:
    1. Scatter plots of ``TAS`` / ``TAD`` against ``Age`` with Pearson r.
    2. Box plots of ``TAS`` / ``TAD`` by each available grouping variable,
       annotated with a Welch t-test (exactly two non-empty groups) or a
       one-way ANOVA (more than two).
    3. Scatter plots of ``TAS`` / ``TAD`` against ``Hours_Per_Week``.

    Args:
        data: DataFrame that must contain ``TAS`` and ``TAD``; otherwise a
            warning is logged and nothing is produced.

    Returns:
        None. All output goes to files and the log.
    """
    logger.info("Analyzing blood pressure relationships...")

    bp_vars = ['TAS', 'TAD']
    if not all(var in data.columns for var in bp_vars):
        logger.warning("Blood pressure data not available for analysis")
        return

    # Create directory for BP analysis
    bp_dir = os.path.join(output_dir, "blood_pressure")
    os.makedirs(bp_dir, exist_ok=True)

    bp_labels = [('TAS', 'Systolic'), ('TAD', 'Diastolic')]

    def _plot_bp_against(x_var, x_title, x_label, file_stem):
        """Scatter both BP measures against ``x_var`` with a fit line and Pearson r."""
        for bp_var, bp_name in bp_labels:
            # Correlate only rows where both values are present
            complete = data[[x_var, bp_var]].dropna()
            if len(complete) >= 2:
                corr, p = stats.pearsonr(complete[x_var], complete[bp_var])
            else:
                # pearsonr needs at least two observations
                corr, p = float('nan'), float('nan')

            plt.figure(figsize=(10, 6))

            sns.regplot(
                x=x_var,
                y=bp_var,
                data=data,
                scatter_kws={'alpha': 0.6},
                line_kws={'color': 'red'}
            )

            # Add correlation text
            plt.annotate(
                f'r = {corr:.2f}, p = {p:.4f}',
                xy=(0.05, 0.95),
                xycoords='axes fraction',
                ha='left',
                va='top',
                bbox=dict(boxstyle='round', fc='white', alpha=0.8)
            )

            plt.title(f'Relationship between {x_title} and {bp_name} Blood Pressure', fontsize=14)
            plt.xlabel(x_label, fontsize=12)
            plt.ylabel(f'{bp_name} Blood Pressure (mmHg)', fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)

            plt.tight_layout()

            # Save figure
            fig_path = os.path.join(bp_dir, f"{file_stem}_vs_{bp_var}.png")
            plt.savefig(fig_path, dpi=300)
            plt.close()
            logger.info(f"    Saved {x_title} vs {bp_name} BP visualization to {fig_path}")

    # 1. Analyze relationship between blood pressure and age
    if 'Age' in data.columns:
        logger.info("  Analyzing blood pressure vs. age")
        _plot_bp_against('Age', 'Age', 'Age (years)', 'age')

    # 2. Analyze blood pressure by different categorical factors
    for group_var in ['Age_Group', 'BMI_Category', 'Statut', 'Work_Intensity', 'exposed_to_tabouna']:
        if group_var in data.columns:
            logger.info(f"  Analyzing blood pressure by {group_var}")

            # isinstance check replaces the deprecated is_categorical_dtype
            is_categorical = isinstance(data[group_var].dtype, pd.CategoricalDtype)

            # Levels to compare: the fixed 0/1 coding for the exposure flag,
            # the declared categories otherwise. Non-categorical columns
            # other than the exposure flag get no test.
            if group_var == 'exposed_to_tabouna':
                levels = [0, 1]
            elif is_categorical:
                levels = list(data[group_var].cat.categories)
            else:
                levels = []

            for bp_var, bp_name in bp_labels:
                plt.figure(figsize=(12, 6))

                # Create boxplot
                sns.boxplot(x=group_var, y=bp_var, data=data)

                # Add individual points
                sns.stripplot(
                    x=group_var,
                    y=bp_var,
                    data=data,
                    color='black',
                    alpha=0.3,
                    jitter=True
                )

                # Keep only the groups that have at least one BP reading
                samples = [
                    data.loc[data[group_var] == level, bp_var].dropna()
                    for level in levels
                ]
                samples = [sample for sample in samples if len(sample) > 0]

                if len(samples) == 2:
                    # Two groups: Welch t-test on the groups actually present.
                    # The earlier code always looked for values 0 and 1, so
                    # two-level categorical variables never got a test.
                    t_stat, p_value = stats.ttest_ind(samples[0], samples[1], equal_var=False)

                    plt.annotate(
                        f't-test: t={t_stat:.2f}, p={p_value:.4f}',
                        xy=(0.5, 0.97),
                        xycoords='axes fraction',
                        ha='center',
                        va='top',
                        bbox=dict(boxstyle='round', fc='white', alpha=0.8)
                    )
                elif len(samples) > 2:
                    f_stat, p_value = stats.f_oneway(*samples)

                    plt.annotate(
                        f'ANOVA: F={f_stat:.2f}, p={p_value:.4f}',
                        xy=(0.5, 0.97),
                        xycoords='axes fraction',
                        ha='center',
                        va='top',
                        bbox=dict(boxstyle='round', fc='white', alpha=0.8)
                    )

                plt.title(f'{bp_name} Blood Pressure by {group_var}', fontsize=14)
                plt.xlabel(group_var, fontsize=12)
                plt.ylabel(f'{bp_name} Blood Pressure (mmHg)', fontsize=12)
                plt.grid(axis='y', linestyle='--', alpha=0.7)

                # Tilt tick labels when there are many groups
                if is_categorical and len(data[group_var].cat.categories) > 3:
                    plt.xticks(rotation=45, ha='right')

                plt.tight_layout()

                # Save figure
                fig_path = os.path.join(bp_dir, f"{bp_var}_by_{group_var.replace(' ', '_')}.png")
                plt.savefig(fig_path, dpi=300)
                plt.close()
                logger.info(f"    Saved {bp_name} BP by {group_var} visualization to {fig_path}")

    # 3. Analyze the relationship between work hours and blood pressure
    if 'Hours_Per_Week' in data.columns:
        logger.info("  Analyzing blood pressure vs. work hours")
        _plot_bp_against('Hours_Per_Week', 'Work Hours', 'Hours Worked Per Week', 'hours')

def _risk_ratio_from_table(contingency):
    """Return (risk_exposed, risk_unexposed, risk_ratio) from a 2x2 table.

    ``contingency`` must have rows 0/1 (exposure) and columns 0/1 (outcome).
    The ratio is NaN when no unexposed subject has the outcome, because the
    ratio is undefined there.
    """
    risk_exposed = contingency.loc[1, 1] / (contingency.loc[1, 0] + contingency.loc[1, 1])
    risk_unexposed = contingency.loc[0, 1] / (contingency.loc[0, 0] + contingency.loc[0, 1])
    risk_ratio = risk_exposed / risk_unexposed if risk_unexposed > 0 else np.nan
    return risk_exposed, risk_unexposed, risk_ratio


def traditional_practices_analysis(data):
    """Analyze the impact of traditional practices on health.

    Writes its results under ``<output_dir>/traditional_practices``:
    1. Chi-square test and risk ratio of ``exposed_to_tabouna`` against
       ``has_troubles_cardio_respiratoires`` (CSV and bar chart).
    2. The same statistics for every other ``has_*`` column (one CSV).

    Args:
        data: DataFrame that must contain ``exposed_to_tabouna``; otherwise a
            warning is logged and nothing is produced.

    Returns:
        None. All output goes to files and the log.
    """
    logger.info("Analyzing traditional practices and health...")

    # Check if we have traditional practice data
    if 'exposed_to_tabouna' not in data.columns:
        logger.warning("Traditional practices data not available for analysis")
        return

    # Create directory for traditional practices analysis
    trad_dir = os.path.join(output_dir, "traditional_practices")
    os.makedirs(trad_dir, exist_ok=True)

    def _is_full_binary_table(table):
        """True when both 0 and 1 occur as row labels and as column labels."""
        return all(level in table.index and level in table.columns for level in (0, 1))

    # 1. Analyze relationship between Tabouna exposure and respiratory issues
    if 'has_troubles_cardio_respiratoires' in data.columns:
        logger.info("  Analyzing Tabouna exposure vs. respiratory issues")

        # Create contingency table
        contingency = pd.crosstab(
            data['exposed_to_tabouna'],
            data['has_troubles_cardio_respiratoires']
        )

        if not _is_full_binary_table(contingency):
            # crosstab only lists observed values, so a missing level (for
            # example nobody exposed) would make the .loc lookups below fail.
            logger.warning(
                "    Skipping Tabouna vs. respiratory issues: "
                "exposure or outcome has a single observed level"
            )
        else:
            # Perform chi-square test
            chi2, p, _, _ = stats.chi2_contingency(contingency)

            # Calculate risk ratio
            risk_exposed, risk_unexposed, risk_ratio = _risk_ratio_from_table(contingency)

            # Create results dataframe
            results = pd.DataFrame({
                'Test': ['Tabouna exposure vs. Respiratory issues'],
                'Chi_Square': [chi2],
                'P_value': [p],
                'Risk_Ratio': [risk_ratio],
                'Risk_Exposed': [risk_exposed],
                'Risk_Unexposed': [risk_unexposed],
                'Significant': [p < 0.05]
            })

            # Save results
            results_path = os.path.join(trad_dir, "tabouna_respiratory_results.csv")
            results.to_csv(results_path, index=False)
            logger.info(f"    Saved results to {results_path}")

            # Create visualization
            fig, ax = plt.subplots(figsize=(10, 6))

            # Row-wise percentages per exposure group
            props = pd.crosstab(
                data['exposed_to_tabouna'],
                data['has_troubles_cardio_respiratoires'],
                normalize='index'
            ) * 100

            # Bar chart of the share with the outcome (column 1)
            props[1].plot(kind='bar', ax=ax)

            plt.title(f'Respiratory Issues by Tabouna Exposure (p={p:.4f}, RR={risk_ratio:.2f})', fontsize=14)
            plt.xlabel('Exposed to Tabouna Smoke', fontsize=12)
            plt.ylabel('Percentage with Respiratory Issues', fontsize=12)
            plt.xticks([0, 1], ['No', 'Yes'])
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Add percentage labels
            for i, v in enumerate(props[1]):
                ax.text(i, v + 1, f'{v:.1f}%', ha='center')

            plt.tight_layout()

            # Save figure
            fig_path = os.path.join(trad_dir, "tabouna_respiratory_issues.png")
            plt.savefig(fig_path, dpi=300)
            plt.close(fig)
            logger.info(f"    Saved visualization to {fig_path}")

    # 2. Analyze Tabouna exposure and other health complaints
    other_health_vars = [
        var for var in data.columns
        if var.startswith('has_') and var != 'has_troubles_cardio_respiratoires'
    ]

    results = []

    for health_var in other_health_vars:
        # Create contingency table
        contingency = pd.crosstab(data['exposed_to_tabouna'], data[health_var])

        # A chi-square test needs at least a 2x2 table
        if contingency.shape[0] > 1 and contingency.shape[1] > 1:
            chi2, p, _, _ = stats.chi2_contingency(contingency)

            # The risk ratio needs all four 0/1 cells to be addressable
            if _is_full_binary_table(contingency):
                _, _, risk_ratio = _risk_ratio_from_table(contingency)
            else:
                risk_ratio = np.nan

            results.append({
                'Health_Outcome': health_var,
                'Chi_Square': chi2,
                'P_value': p,
                'Risk_Ratio': risk_ratio,
                'Significant': p < 0.05
            })

    # Save all results
    if results:
        all_results_df = pd.DataFrame(results)
        all_results_path = os.path.join(trad_dir, "tabouna_all_health_outcomes.csv")
        all_results_df.to_csv(all_results_path, index=False)
        logger.info(f"  Saved all Tabouna health outcomes to {all_results_path}")

def work_patterns_analysis(data):
    """Analyze relationships between work patterns and health.

    Writes figures under ``<output_dir>/work_patterns``:
    1. For each ``has_*`` column, a chi-square test against
       ``Work_Intensity``; a bar chart is saved when p < 0.05.
    2. Scatter plots of ``Hours_Per_Week`` against each available metric in
       ``BMI``, ``TAS``, ``TAD``, annotated with Pearson r.

    Args:
        data: DataFrame that must contain at least one work-pattern column;
            otherwise a warning is logged and nothing is produced.

    Returns:
        None. All output goes to files and the log.
    """
    logger.info("Analyzing work patterns and health...")

    # Check if we have work pattern data
    work_vars = ['H travail / jour', 'J travail / Sem', 'Hours_Per_Week', 'Work_Intensity']
    if not any(var in data.columns for var in work_vars):
        logger.warning("Work pattern data not available for analysis")
        return

    # Create directory for work pattern analysis
    work_dir = os.path.join(output_dir, "work_patterns")
    os.makedirs(work_dir, exist_ok=True)

    # 1. Analyze health complaints by work intensity
    if 'Work_Intensity' in data.columns:
        logger.info("  Analyzing health complaints by work intensity")

        health_vars = [var for var in data.columns if var.startswith('has_')]

        for health_var in health_vars:
            # Create contingency table
            contingency = pd.crosstab(data['Work_Intensity'], data[health_var])

            # A chi-square test needs at least a 2x2 table
            if contingency.shape[0] > 1 and contingency.shape[1] > 1:
                _, p, _, _ = stats.chi2_contingency(contingency)

                # Create visualization if significant
                if p < 0.05:
                    logger.info(f"    Significant relationship: Work_Intensity vs {health_var} (p={p:.4f})")

                    fig, ax = plt.subplots(figsize=(12, 6))

                    # Row-wise percentages per intensity level
                    props = pd.crosstab(
                        data['Work_Intensity'],
                        data[health_var],
                        normalize='index'
                    ) * 100

                    # Bar chart of the "Yes" column (health complaint present)
                    props[1].plot(kind='bar', ax=ax)

                    plt.title(f'{health_var} by Work Intensity (p={p:.4f})', fontsize=14)
                    plt.xlabel('Work Intensity', fontsize=12)
                    plt.ylabel(f'Percentage with {health_var}', fontsize=12)
                    plt.grid(axis='y', linestyle='--', alpha=0.7)

                    # Add percentage labels
                    for i, v in enumerate(props[1]):
                        ax.text(i, v + 1, f'{v:.1f}%', ha='center')

                    plt.tight_layout()

                    # Save figure
                    fig_path = os.path.join(work_dir, f"work_intensity_vs_{health_var}.png")
                    plt.savefig(fig_path, dpi=300)
                    plt.close(fig)
                    logger.info(f"      Saved visualization to {fig_path}")

    # 2. Analyze relationship between hours worked and health metrics
    if 'Hours_Per_Week' in data.columns:
        logger.info("  Analyzing hours worked vs. health metrics")

        # Check if we have health metrics
        health_metrics = ['BMI', 'TAS', 'TAD']
        available_metrics = [var for var in health_metrics if var in data.columns]

        for metric in available_metrics:
            # Correlate only rows where both values are present
            complete = data[['Hours_Per_Week', metric]].dropna()
            if len(complete) >= 2:
                corr, p = stats.pearsonr(complete['Hours_Per_Week'], complete[metric])
            else:
                # pearsonr needs at least two observations
                corr, p = float('nan'), float('nan')

            # Create scatter plot
            plt.figure(figsize=(10, 6))

            sns.regplot(
                x='Hours_Per_Week',
                y=metric,
                data=data,
                scatter_kws={'alpha': 0.6},
                line_kws={'color': 'red'}
            )

            # Add correlation text
            plt.annotate(
                f'r = {corr:.2f}, p = {p:.4f}',
                xy=(0.05, 0.95),
                xycoords='axes fraction',
                ha='left',
                va='top',
                bbox=dict(boxstyle='round', fc='white', alpha=0.8)
            )

            plt.title(f'Relationship between Work Hours and {metric}', fontsize=14)
            plt.xlabel('Hours Worked Per Week', fontsize=12)
            plt.ylabel(metric, fontsize=12)
            plt.grid(True, linestyle='--', alpha=0.7)

            plt.tight_layout()

            # Save figure
            fig_path = os.path.join(work_dir, f"hours_vs_{metric}.png")
            plt.savefig(fig_path, dpi=300)
            plt.close()
            logger.info(f"    Saved Hours vs {metric} visualization to {fig_path}")

# Script entry point: load the workbook, derive the analysis columns, then
# run each analysis in turn on the prepared frame.
if __name__ == "__main__":
    data = load_data()
    prepared_data = prepare_data_for_analysis(data)
    analysis_steps = (
        bmi_health_analysis,
        blood_pressure_analysis,
        traditional_practices_analysis,
        work_patterns_analysis,
    )
    for run_analysis in analysis_steps:
        run_analysis(prepared_data)

2025-04-01 14:47:17,568 - INFO - Loading the dataset...
2025-04-01 14:47:18,251 - INFO - Dataset loaded with 80 rows and 37 columns
2025-04-01 14:47:18,252 - INFO - Preparing data for health analysis...
2025-04-01 14:47:18,259 - INFO - Calculated BMI from height and weight
2025-04-01 14:47:18,275 - INFO - Created BMI categories
2025-04-01 14:47:18,277 - INFO - Created binary variable: is_smoker
2025-04-01 14:47:18,280 - INFO - Created binary variable: exposed_to_tabouna
2025-04-01 14:47:18,283 - INFO - Created binary variable: uses_neffa
2025-04-01 14:47:18,288 - INFO - Created age group categories
2025-04-01 14:47:18,292 - INFO - Calculated weekly work hours
2025-04-01 14:47:18,296 - INFO - Created work intensity categories
2025-04-01 14:47:18,307 - INFO - Analyzing BMI and health relationships...
2025-04-01 14:47:18,310 - INFO -   Analyzing BMI vs. blood pressure
2025-04-01 14:47:18,342 - INFO -     Saved blood pressure correlations to results/health_analysis\bmi_analysis\bmi_bp_corr

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Complete Missing Health Analyses for Female Farmers
This script focuses on completing the missing analyses:
1. Traditional practices analysis
2. Transportation health analysis
3. Experience health analysis
4. Multivariate models
5. Summary report

Author: [Your Name]
Date: April 2, 2025
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import glm
from scipy import stats
import os
from sklearn.preprocessing import StandardScaler
import logging

# Configure logging once for the whole script: INFO level, timestamped lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Root folder that every analysis writes its results under
output_dir = 'C:/Users/SelmaB/Desktop/Project/Analysis/2.Anova part two all determinants/results/health_analysis'

def load_data():
    """Load the preprocessed dataset.

    Tries the project's absolute path first and then the same file name in
    the current working directory. Each failed attempt is logged so the
    reason for a fallback is visible.

    Returns:
        The pandas.DataFrame produced by ``pd.read_excel``.

    Raises:
        Exception: the error from the last attempted path when none of the
            candidate paths can be read.
    """
    logger.info("Loading the dataset...")

    candidate_paths = (
        'C:/Users/SelmaB/Desktop/Project/Analysis/fixed_female_farmers_data.xlsx',
        'fixed_female_farmers_data.xlsx',
    )

    last_error = None
    for path in candidate_paths:
        try:
            data = pd.read_excel(path)
        except Exception as exc:
            # Deliberately broad: any failure on one path (missing file,
            # unreadable workbook, ...) should fall through to the next.
            logger.warning(f"Could not load data from {path}: {exc}")
            last_error = exc
            continue

        logger.info(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns")
        return data

    logger.error(f"Error loading data: {str(last_error)}")
    raise last_error

def prepare_data_for_analysis(data):
    """
    Prepare data for health analysis.

    Works on a copy of ``data`` (the input frame is not modified) and adds
    the following columns whenever their source columns are present:

    - ``has_<category>``: 1 when the health-complaint column is non-missing,
      0 otherwise, plus ``has_any_health_complaint`` across all of them.
    - ``BMI`` (from ``Poids`` in kg and ``Taille`` in cm, only if ``BMI`` is
      absent) and ``BMI_Category``.
    - ``is_smoker``, ``exposed_to_tabouna``, ``uses_neffa``: 1 when the
      answer is "oui" (ignoring case and surrounding whitespace), else 0.
      Missing answers are therefore coded 0.
    - ``Age_Group`` (only if absent), ``Hours_Per_Week`` and
      ``Work_Intensity``.

    Selected grouping columns are finally converted to ``category`` dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw survey data.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the derived columns added.
    """
    logger.info("Preparing data for health analysis...")

    # Create a copy to avoid modifying the original
    df = data.copy()

    # Create binary indicators for each health complaint category
    health_categories = [
        'Troubles cardio-respiratoires',
        'Troubles cognitifs',
        'Troubles neurologiques',
        'Troubles cutanés/phanères'
    ]

    # Filter to those that exist in the data
    available_categories = [cat for cat in health_categories if cat in df.columns]

    # A complaint is considered present when the cell holds any value
    for category in available_categories:
        binary_var = f"has_{category.lower().replace('-', '_').replace(' ', '_').replace('/', '_')}"
        df[binary_var] = df[category].notna().astype(int)
        logger.info(f"Created binary variable: {binary_var}")

    # Create a variable for any health complaint
    if available_categories:
        df['has_any_health_complaint'] = df[available_categories].notna().any(axis=1).astype(int)
        logger.info("Created 'has_any_health_complaint' variable")

    # Create BMI variable if height and weight are available
    bmi_computed = False
    if 'BMI' not in df.columns and all(var in df.columns for var in ['Poids', 'Taille']):
        df['BMI'] = df['Poids'] / ((df['Taille'] / 100) ** 2)
        logger.info("Calculated BMI from height and weight")
        bmi_computed = True

    # Categorise BMI whenever it is available, including when the column was
    # already in the input (previously that case was left without categories).
    if 'BMI' in df.columns and (bmi_computed or 'BMI_Category' not in df.columns):
        # right=False makes the bins [0, 18.5), [18.5, 25), [25, 30), [30, 100)
        # so a BMI of exactly 25 is 'Overweight' and exactly 30 is 'Obese'.
        df['BMI_Category'] = pd.cut(
            df['BMI'],
            bins=[0, 18.5, 25, 30, 100],
            labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
            right=False
        )
        logger.info("Created BMI categories")

    # Create binary variables for key exposures
    exposure_vars = {
        'Tabagisme': 'is_smoker',
        'Fumées de Tabouna': 'exposed_to_tabouna',
        'Neffa': 'uses_neffa'
    }

    for var, new_var in exposure_vars.items():
        if var in df.columns:
            # Normalise before comparing so 'Oui' or ' oui ' still count as yes.
            answers = df[var].astype(str).str.strip().str.lower()
            df[new_var] = (answers == 'oui').astype(int)
            logger.info(f"Created binary variable: {new_var}")

    # Create age groups if not already present
    if 'Age' in df.columns and 'Age_Group' not in df.columns:
        df['Age_Group'] = pd.cut(
            df['Age'],
            bins=[0, 30, 40, 50, 60, 100],
            labels=['<30', '30-40', '40-50', '50-60', '>60']
        )
        logger.info("Created age group categories")

    # Create work intensity variable (hours per week)
    if all(var in df.columns for var in ['H travail / jour', 'J travail / Sem']):
        df['Hours_Per_Week'] = df['H travail / jour'] * df['J travail / Sem']
        logger.info("Calculated weekly work hours")

        # Open-ended top bin and include_lowest keep 0 h and >100 h weeks
        # inside a category instead of silently turning them into NaN.
        df['Work_Intensity'] = pd.cut(
            df['Hours_Per_Week'],
            bins=[0, 30, 40, 50, np.inf],
            labels=['Part-time', 'Regular', 'Intensive', 'Very Intensive'],
            include_lowest=True
        )
        logger.info("Created work intensity categories")

    # Convert important categorical variables to proper categorical type
    categorical_vars = [
        'BMI_Category', 'Age_Group', 'Work_Intensity',
        'Statut', 'Situation maritale', 'Niveau socio-économique'
    ]

    for var in categorical_vars:
        if var in df.columns:
            df[var] = df[var].astype('category')

    return df

def _trad_association_stats(exposure, outcome):
    """Chi-square test and risk ratio for one exposure/outcome pair.

    Returns ``None`` when the cross-tabulation has fewer than two rows or
    two columns, because no group comparison is possible then. Otherwise
    returns a dict with keys ``chi2``, ``p``, ``risk_exposed``,
    ``risk_unexposed`` and ``risk_ratio``. The three risk fields stay NaN
    unless both series are coded 0/1 with every level observed.
    """
    contingency = pd.crosstab(exposure, outcome)
    if contingency.shape[0] < 2 or contingency.shape[1] < 2:
        return None

    chi2, p, _dof, _expected = stats.chi2_contingency(contingency)

    risk_exposed = risk_unexposed = risk_ratio = np.nan
    binary_rows = 0 in contingency.index and 1 in contingency.index
    binary_cols = 0 in contingency.columns and 1 in contingency.columns
    if binary_rows and binary_cols:
        risk_exposed = contingency.loc[1, 1] / (contingency.loc[1, 0] + contingency.loc[1, 1])
        risk_unexposed = contingency.loc[0, 1] / (contingency.loc[0, 0] + contingency.loc[0, 1])
        # Avoid a divide-by-zero warning: no cases among the unexposed gives
        # an infinite ratio (or NaN when the exposed have no cases either).
        if risk_unexposed > 0:
            risk_ratio = risk_exposed / risk_unexposed
        elif risk_exposed > 0:
            risk_ratio = np.inf

    return {
        'chi2': chi2,
        'p': p,
        'risk_exposed': risk_exposed,
        'risk_unexposed': risk_unexposed,
        'risk_ratio': risk_ratio,
    }


def _trad_plot_outcome_rate(exposure, outcome, title, xlabel, ylabel, fig_path, tick_labels=None):
    """Save a bar chart of the percentage with ``outcome == 1`` per exposure level.

    ``tick_labels`` optionally maps exposure levels to x-axis labels.
    Returns ``True`` when the figure was written and ``False`` when the
    outcome has no column coded 1, in which case nothing is drawn.
    """
    props = pd.crosstab(exposure, outcome, normalize='index') * 100
    if 1 not in props.columns:
        return False

    rates = props[1]
    plt.figure(figsize=(10, 6))
    try:
        ax = rates.plot(kind='bar')

        plt.title(title, fontsize=14)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        if tick_labels is not None:
            plt.xticks(
                range(len(rates)),
                [tick_labels.get(level, str(level)) for level in rates.index]
            )
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add percentage labels just above each bar
        for i, v in enumerate(rates):
            ax.text(i, v + 1, f'{v:.1f}%', ha='center')

        plt.tight_layout()
        plt.savefig(fig_path, dpi=300)
    finally:
        # Always release the figure, even if drawing or saving failed.
        plt.close()
    return True


def traditional_practices_analysis(data):
    """Analyze the impact of traditional practices on health.

    Three analyses are written to ``<output_dir>/traditional_practices``:

    1. Tabouna smoke exposure vs. a respiratory outcome (CSV + bar chart).
    2. Neffa use vs. every ``has_*`` outcome (CSV, plus a bar chart for each
       association with p < 0.05).
    3. A 0-2 exposure score (Tabouna + Neffa) vs. every ``has_*`` outcome
       (bar charts).

    Outcomes without enough variation for a comparison are skipped with a
    warning instead of raising.

    Parameters
    ----------
    data : pandas.DataFrame
        Prepared survey data. It is modified in place: the columns
        ``exposed_to_tabouna`` and ``uses_neffa`` are created from the raw
        answers when missing, and ``trad_exposure_score`` is added when both
        exposures are available.

    Returns
    -------
    None
    """
    logger.info("Analyzing traditional practices and health...")

    # Build the binary exposure columns from the raw answers if needed
    fallback_sources = {
        'exposed_to_tabouna': 'Fumées de Tabouna',
        'uses_neffa': 'Neffa',
    }
    for binary_var, source in fallback_sources.items():
        if binary_var not in data.columns and source in data.columns:
            # Normalise so 'Oui' or ' oui ' are still read as yes.
            answers = data[source].astype(str).str.strip().str.lower()
            data[binary_var] = (answers == 'oui').astype(int)
            logger.info(f"Created binary variable: {binary_var}")

    if 'exposed_to_tabouna' not in data.columns and 'uses_neffa' not in data.columns:
        logger.warning("Traditional practices data not available for analysis")
        return

    # Create directory for traditional practices analysis
    trad_dir = os.path.join(output_dir, "traditional_practices")
    os.makedirs(trad_dir, exist_ok=True)

    health_vars = [var for var in data.columns if str(var).startswith('has_')]
    yes_no = {0: 'No', 1: 'Yes'}

    # 1. Analyze relationship between Tabouna exposure and respiratory issues
    if 'exposed_to_tabouna' in data.columns:
        # Prefer the derived binary indicator; otherwise fall back to the
        # first column whose name mentions respiratory/cardio problems.
        resp_vars = [
            col for col in data.columns
            if 'respir' in str(col).lower() or 'cardio' in str(col).lower()
        ]

        resp_var = None
        if 'has_troubles_cardio_respiratoires' in resp_vars:
            resp_var = 'has_troubles_cardio_respiratoires'
        elif resp_vars:
            resp_var = resp_vars[0]

        if resp_var:
            logger.info(f"  Analyzing Tabouna exposure vs. {resp_var}")

            assoc = _trad_association_stats(data['exposed_to_tabouna'], data[resp_var])
            if assoc is None:
                logger.warning(f"    Not enough variation to test Tabouna exposure vs. {resp_var}")
            else:
                p = assoc['p']
                risk_ratio = assoc['risk_ratio']

                # Create results dataframe
                results = pd.DataFrame({
                    'Test': ['Tabouna exposure vs. Respiratory issues'],
                    'Chi_Square': [assoc['chi2']],
                    'P_value': [p],
                    'Risk_Ratio': [risk_ratio],
                    'Risk_Exposed': [assoc['risk_exposed']],
                    'Risk_Unexposed': [assoc['risk_unexposed']],
                    'Significant': [p < 0.05]
                })

                # Save results
                results_path = os.path.join(trad_dir, "tabouna_respiratory_results.csv")
                results.to_csv(results_path, index=False)
                logger.info(f"    Saved results to {results_path}")

                # Create visualization
                fig_path = os.path.join(trad_dir, "tabouna_respiratory_issues.png")
                plotted = _trad_plot_outcome_rate(
                    data['exposed_to_tabouna'],
                    data[resp_var],
                    title=f'Respiratory Issues by Tabouna Exposure (p={p:.4f}, RR={risk_ratio:.2f})',
                    xlabel='Exposed to Tabouna Smoke',
                    ylabel='Percentage with Respiratory Issues',
                    fig_path=fig_path,
                    tick_labels=yes_no
                )
                if plotted:
                    logger.info(f"    Saved visualization to {fig_path}")
                else:
                    logger.warning(f"    {resp_var} is not coded 0/1; skipped visualization")

    # 2. Analyze Neffa usage and health complaints
    if 'uses_neffa' in data.columns:
        logger.info("  Analyzing Neffa usage and health complaints")

        results = []
        for health_var in health_vars:
            assoc = _trad_association_stats(data['uses_neffa'], data[health_var])
            if assoc is None:
                # Not enough data for a comparison on this outcome
                continue
            results.append({
                'Health_Outcome': health_var,
                'Chi_Square': assoc['chi2'],
                'P_value': assoc['p'],
                'Risk_Ratio': assoc['risk_ratio'],
                'Significant': assoc['p'] < 0.05
            })

        # Save results
        if results:
            results_df = pd.DataFrame(results)
            results_path = os.path.join(trad_dir, "neffa_health_outcomes.csv")
            results_df.to_csv(results_path, index=False)
            logger.info(f"    Saved Neffa usage vs. health outcomes to {results_path}")

            # Create visualizations for significant relationships
            for entry in results:
                if not entry['Significant']:
                    continue
                health_var = entry['Health_Outcome']
                p = entry['P_value']
                rr = entry['Risk_Ratio']

                fig_path = os.path.join(trad_dir, f"neffa_vs_{health_var}.png")
                plotted = _trad_plot_outcome_rate(
                    data['uses_neffa'],
                    data[health_var],
                    title=f'{health_var} by Neffa Usage (p={p:.4f}, RR={rr:.2f})',
                    xlabel='Uses Neffa',
                    ylabel=f'Percentage with {health_var}',
                    fig_path=fig_path,
                    tick_labels=yes_no
                )
                if plotted:
                    logger.info(f"      Saved visualization to {fig_path}")

    # 3. Create combined traditional practice score
    if 'exposed_to_tabouna' in data.columns and 'uses_neffa' in data.columns:
        logger.info("  Creating combined traditional practice exposure score")

        # Create exposure score (0-2)
        data['trad_exposure_score'] = data['exposed_to_tabouna'] + data['uses_neffa']

        # Analyze health outcomes by exposure score
        for health_var in health_vars:
            fig_path = os.path.join(trad_dir, f"trad_exposure_score_vs_{health_var}.png")
            plotted = _trad_plot_outcome_rate(
                data['trad_exposure_score'],
                data[health_var],
                title=f'{health_var} by Traditional Practice Exposure Score',
                xlabel='Exposure Score (0=None, 1=One practice, 2=Both practices)',
                ylabel=f'Percentage with {health_var}',
                fig_path=fig_path
            )
            if plotted:
                logger.info(f"    Saved Traditional Practices Score vs {health_var} visualization")
            else:
                # Previously this case raised KeyError on props[1].
                logger.warning(f"    No positive cases for {health_var}; skipped exposure score plot")

def transportation_health_analysis(data):
    """Relate walking (versus any other transport mode) to health indicators.

    Adds a 0/1 ``transport_walking`` column to ``data`` (1 when
    'Moyen de transport' equals 'a pieds') and writes to
    ``<output_dir>/transportation``:

    - a CSV with a chi-square test and risk ratio for each ``has_*`` column,
      plus one bar chart per association with p < 0.05;
    - a box plot of BMI by walking status, when a 'BMI' column exists;
    - box plots of TAS and TAD by walking status, when both columns exist.

    Returns nothing; if the transport column is missing it only logs a warning.
    """
    logger.info("Analyzing transportation method and health...")

    transport_col = 'Moyen de transport'
    if transport_col not in data.columns:
        logger.warning("Transportation data not available for analysis")
        return

    # Output folder for everything produced here
    trans_dir = os.path.join(output_dir, "transportation")
    os.makedirs(trans_dir, exist_ok=True)

    # Walking on foot (1) against every other means of transport (0)
    walks = data[transport_col] == 'a pieds'
    data['transport_walking'] = walks.astype(int)

    note_box = dict(boxstyle='round', fc='white', alpha=0.8)

    def draw_group_boxplot(column, title, ylabel, file_name):
        """Box + strip plot of ``column`` by walking status, annotated with a Welch t-test."""
        plt.figure(figsize=(10, 6))

        sns.boxplot(x='transport_walking', y=column, data=data)
        sns.stripplot(
            x='transport_walking',
            y=column,
            data=data,
            color='black',
            alpha=0.3,
            jitter=True
        )

        walkers = data.loc[data['transport_walking'] == 1, column].dropna()
        others = data.loc[data['transport_walking'] == 0, column].dropna()

        # The test is only meaningful when both groups have observations
        if not walkers.empty and not others.empty:
            t_stat, p_value = stats.ttest_ind(walkers, others, equal_var=False)

            plt.annotate(
                f't-test: t={t_stat:.2f}, p={p_value:.4f}',
                xy=(0.5, 0.97),
                xycoords='axes fraction',
                ha='center',
                va='top',
                bbox=note_box
            )

            mean_walking = walkers.mean()
            mean_other = others.mean()

            plt.annotate(
                f'Walking mean: {mean_walking:.1f}\nOther mean: {mean_other:.1f}',
                xy=(0.5, 0.85),
                xycoords='axes fraction',
                ha='center',
                va='top',
                bbox=note_box
            )

        plt.title(title, fontsize=14)
        plt.xlabel('Walking as Transportation', fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.xticks([0, 1], ['No', 'Yes'])
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.tight_layout()

        target = os.path.join(trans_dir, file_name)
        plt.savefig(target, dpi=300)
        plt.close()
        return target

    # --- 1. Walking vs. each binary health complaint -----------------------
    health_vars = [name for name in data.columns if name.startswith('has_')]

    results = []
    for health_var in health_vars:
        table = pd.crosstab(data['transport_walking'], data[health_var])

        # Need at least two groups and two outcome levels to test anything
        n_rows, n_cols = table.shape
        if n_rows <= 1 or n_cols <= 1:
            continue

        chi2, p, dof, expected = stats.chi2_contingency(table)

        both_groups_present = 1 in table.index and 0 in table.index
        if both_groups_present and 1 in table.columns:
            risk_walking = table.loc[1, 1] / (table.loc[1, 0] + table.loc[1, 1])
            risk_other = table.loc[0, 1] / (table.loc[0, 0] + table.loc[0, 1])
            risk_ratio = risk_walking / risk_other
        else:
            risk_ratio = np.nan

        results.append({
            'Health_Outcome': health_var,
            'Chi_Square': chi2,
            'P_value': p,
            'Risk_Ratio': risk_ratio,
            'Significant': p < 0.05
        })

    if results:
        results_df = pd.DataFrame(results)
        results_path = os.path.join(trans_dir, "walking_health_results.csv")
        results_df.to_csv(results_path, index=False)
        logger.info(f"  Saved walking vs. health results to {results_path}")

        # One bar chart per significant association
        significant = results_df.loc[results_df['Significant']]
        for health_var, p, rr in zip(
            significant['Health_Outcome'],
            significant['P_value'],
            significant['Risk_Ratio']
        ):
            plt.figure(figsize=(10, 6))

            # Row-wise percentages; column 1 is "complaint present"
            shares = pd.crosstab(
                data['transport_walking'],
                data[health_var],
                normalize='index'
            ) * 100
            yes_share = shares[1]

            ax = yes_share.plot(kind='bar')

            plt.title(f'{health_var} by Transportation Method (p={p:.4f}, RR={rr:.2f})', fontsize=14)
            plt.xlabel('Walking as Transportation', fontsize=12)
            plt.ylabel(f'Percentage with {health_var}', fontsize=12)
            plt.xticks([0, 1], ['No', 'Yes'])
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Print each percentage just above its bar
            for position, pct in enumerate(yes_share):
                ax.text(position, pct + 1, f'{pct:.1f}%', ha='center')

            plt.tight_layout()

            fig_path = os.path.join(trans_dir, f"walking_vs_{health_var}.png")
            plt.savefig(fig_path, dpi=300)
            plt.close()
            logger.info(f"    Saved visualization to {fig_path}")

    # --- 2. Walking vs. BMI ------------------------------------------------
    if 'BMI' in data.columns:
        logger.info("  Analyzing transportation method vs. BMI")
        fig_path = draw_group_boxplot(
            'BMI',
            'BMI by Transportation Method',
            'BMI (kg/m²)',
            "walking_vs_bmi.png"
        )
        logger.info(f"    Saved visualization to {fig_path}")

    # --- 3. Walking vs. blood pressure --------------------------------------
    if 'TAS' in data.columns and 'TAD' in data.columns:
        logger.info("  Analyzing transportation method vs. blood pressure")

        for bp_var, bp_name in (('TAS', 'Systolic'), ('TAD', 'Diastolic')):
            fig_path = draw_group_boxplot(
                bp_var,
                f'{bp_name} Blood Pressure by Transportation Method',
                f'{bp_name} Blood Pressure (mmHg)',
                f"{bp_var}_by_transportation.png"
            )
            logger.info(f"    Saved {bp_name} BP by transportation visualization to {fig_path}")

def experience_health_analysis(data):
    """Analyze relationship between agricultural experience and health.

    Writes three sets of figures to ``<output_dir>/experience``:

    1. Box plots of 'Ancienneté agricole' by each ``has_*`` outcome, with a
       Welch t-test annotation when both groups have observations.
    2. Scatter/regression plots of experience against BMI, TAS and TAD (when
       present), annotated with the Pearson correlation computed on rows
       where both values are available.
    3. Bar charts of the percentage with each outcome per experience group.

    Parameters
    ----------
    data : pandas.DataFrame
        Prepared survey data. It is modified in place: an
        ``Experience_Group`` column is added.

    Returns
    -------
    None
    """
    logger.info("Analyzing agricultural experience and health...")

    exp_col = 'Ancienneté agricole'
    if exp_col not in data.columns:
        logger.warning("Agricultural experience data not available for analysis")
        return

    # Create directory for experience analysis
    exp_dir = os.path.join(output_dir, "experience")
    os.makedirs(exp_dir, exist_ok=True)

    # 1. Compare experience levels across health complaint groups
    health_vars = [var for var in data.columns if str(var).startswith('has_')]

    for health_var in health_vars:
        logger.info(f"  Analyzing experience vs. {health_var}")

        plt.figure(figsize=(10, 6))

        # Create boxplot
        sns.boxplot(x=health_var, y=exp_col, data=data)

        # Add individual points
        sns.stripplot(
            x=health_var,
            y=exp_col,
            data=data,
            color='black',
            alpha=0.3,
            jitter=True
        )

        # Perform t-test (Welch: unequal variances)
        health_exp = data[data[health_var] == 1][exp_col].dropna()
        no_health_exp = data[data[health_var] == 0][exp_col].dropna()

        if len(health_exp) > 0 and len(no_health_exp) > 0:
            t_stat, p_value = stats.ttest_ind(health_exp, no_health_exp, equal_var=False)

            # Add t-test result
            plt.annotate(
                f't-test: t={t_stat:.2f}, p={p_value:.4f}',
                xy=(0.5, 0.97),
                xycoords='axes fraction',
                ha='center',
                va='top',
                bbox=dict(boxstyle='round', fc='white', alpha=0.8)
            )

            # Add means
            mean_health = health_exp.mean()
            mean_no_health = no_health_exp.mean()

            plt.annotate(
                f'With complaint: {mean_health:.1f} years\nWithout complaint: {mean_no_health:.1f} years',
                xy=(0.5, 0.85),
                xycoords='axes fraction',
                ha='center',
                va='top',
                bbox=dict(boxstyle='round', fc='white', alpha=0.8)
            )

        plt.title(f'Agricultural Experience by {health_var}', fontsize=14)
        plt.xlabel(health_var, fontsize=12)
        plt.ylabel('Agricultural Experience (years)', fontsize=12)
        plt.xticks([0, 1], ['No', 'Yes'])
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        plt.tight_layout()

        # Save figure
        fig_path = os.path.join(exp_dir, f"experience_vs_{health_var}.png")
        plt.savefig(fig_path, dpi=300)
        plt.close()
        logger.info(f"    Saved visualization to {fig_path}")

    # 2. Analyze correlation between experience and health metrics
    health_metrics = ['BMI', 'TAS', 'TAD']
    available_metrics = [var for var in health_metrics if var in data.columns]

    for metric in available_metrics:
        logger.info(f"  Analyzing experience vs. {metric}")

        # Correlate only rows where both values are present; a single
        # missing value would otherwise make the whole correlation NaN.
        paired = data[[exp_col, metric]].dropna()
        if len(paired) < 2:
            logger.warning(f"    Not enough paired observations for experience vs. {metric}")
            continue

        corr, p = stats.pearsonr(paired[exp_col], paired[metric])

        # Create scatter plot
        plt.figure(figsize=(10, 6))

        sns.regplot(
            x=exp_col,
            y=metric,
            data=data,
            scatter_kws={'alpha': 0.6},
            line_kws={'color': 'red'}
        )

        # Add correlation text
        plt.annotate(
            f'r = {corr:.2f}, p = {p:.4f}',
            xy=(0.05, 0.95),
            xycoords='axes fraction',
            ha='left',
            va='top',
            bbox=dict(boxstyle='round', fc='white', alpha=0.8)
        )

        plt.title(f'Relationship between Agricultural Experience and {metric}', fontsize=14)
        plt.xlabel('Agricultural Experience (years)', fontsize=12)
        plt.ylabel(metric, fontsize=12)
        plt.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # Save figure
        fig_path = os.path.join(exp_dir, f"experience_vs_{metric}.png")
        plt.savefig(fig_path, dpi=300)
        plt.close()
        logger.info(f"    Saved Experience vs {metric} visualization to {fig_path}")

    # 3. Create experience groups and analyze health outcomes
    logger.info("  Analyzing health outcomes by experience groups")

    # Open-ended top bin and include_lowest keep 0 years and >50 years of
    # experience inside a group instead of turning them into NaN.
    data['Experience_Group'] = pd.cut(
        data[exp_col],
        bins=[0, 5, 10, 20, np.inf],
        labels=['<5 yrs', '5-10 yrs', '10-20 yrs', '>20 yrs'],
        include_lowest=True
    )

    # Compare health outcomes across experience groups
    for health_var in health_vars:
        # Calculate percentages
        props = pd.crosstab(
            data['Experience_Group'],
            data[health_var],
            normalize='index'
        ) * 100

        # No column for "complaint present" means nothing to plot
        if 1 not in props.columns:
            logger.warning(f"    No positive cases for {health_var}; skipped experience group plot")
            continue

        plt.figure(figsize=(12, 6))

        # Create bar chart for the "Yes" column (health complaint present)
        ax = props[1].plot(kind='bar')

        plt.title(f'{health_var} by Agricultural Experience', fontsize=14)
        plt.xlabel('Years of Agricultural Experience', fontsize=12)
        plt.ylabel(f'Percentage with {health_var}', fontsize=12)
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add percentage labels
        for i, v in enumerate(props[1]):
            ax.text(i, v + 1, f'{v:.1f}%', ha='center')

        plt.tight_layout()

        # Save figure
        fig_path = os.path.join(exp_dir, f"experience_group_vs_{health_var}.png")
        plt.savefig(fig_path, dpi=300)
        plt.close()
        logger.info(f"    Saved Experience Groups vs {health_var} visualization")

def multivariate_health_model(data):
    """Build multivariate models to predict health outcomes.

    For every ``has_*`` column a binomial GLM (logistic regression) is
    fitted on the available predictors among Age, BMI, Hours_Per_Week,
    exposed_to_tabouna, is_smoker, uses_neffa and 'Ancienneté agricole',
    using only rows with no missing value in those columns. For each model
    the text summary, a CSV of odds ratios with confidence intervals, and a
    forest plot are written to ``<output_dir>/multivariate_models``.

    An outcome is skipped with a warning when fewer than three predictors
    exist, fewer than 20 complete rows remain, or the outcome has a single
    class in those rows. Fitting errors are logged and do not stop the loop.

    Parameters
    ----------
    data : pandas.DataFrame
        Prepared survey data.

    Returns
    -------
    None
    """
    logger.info("Building multivariate models for health outcomes...")

    # Check if we have any health outcome variables
    health_vars = [var for var in data.columns if var.startswith('has_')]

    if not health_vars:
        logger.warning("No health outcome variables available for modeling")
        return

    # Create directory for models
    models_dir = os.path.join(output_dir, "multivariate_models")
    os.makedirs(models_dir, exist_ok=True)

    def _as_term(name):
        """Return ``name`` as a formula term, quoting it with Q(...) if needed."""
        # Column names with spaces or accents (e.g. 'Ancienneté agricole')
        # are not valid bare formula terms and must be wrapped in Q().
        is_plain = name.isidentifier() and all(ord(ch) < 128 for ch in name)
        return name if is_plain else f"Q({name!r})"

    # Define potential predictors; only those present in the data are used
    predictors = [
        'Age', 'BMI', 'Hours_Per_Week', 'exposed_to_tabouna',
        'is_smoker', 'uses_neffa', 'Ancienneté agricole'
    ]
    available_predictors = [var for var in predictors if var in data.columns]

    # Formula term -> original column name, to report readable variable names
    term_to_name = {_as_term(name): name for name in available_predictors}

    # For each health outcome, create a logistic regression model
    for health_var in health_vars:
        logger.info(f"  Building model for {health_var}")

        if len(available_predictors) < 3:
            logger.warning(f"    Insufficient predictors for {health_var}")
            continue

        # Prepare data for model (drop missing values)
        model_data = data.dropna(subset=available_predictors + [health_var])

        if len(model_data) < 20:
            logger.warning(f"    Insufficient data for {health_var} model")
            continue

        if model_data[health_var].nunique() < 2:
            logger.warning(f"    {health_var} has a single class in the model data; skipping model")
            continue

        # Build formula for statsmodels
        formula = f"{_as_term(health_var)} ~ " + " + ".join(term_to_name)

        try:
            # Fit logistic regression model
            model = glm(formula=formula, data=model_data, family=sm.families.Binomial()).fit()

            # Save model summary (explicit encoding: names may contain accents)
            summary_path = os.path.join(models_dir, f"{health_var}_model_summary.txt")
            with open(summary_path, 'w', encoding='utf-8') as f:
                f.write(model.summary().as_text())

            # Extract odds ratios and confidence intervals
            params = model.params
            conf_int = model.conf_int()

            odds_ratios = pd.DataFrame({
                'Variable': [term_to_name.get(term, term) for term in params.index],
                'Coefficient': params.values,
                'Odds_Ratio': np.exp(params.values),
                'CI_Lower': np.exp(conf_int[0].values),
                'CI_Upper': np.exp(conf_int[1].values),
                'P_value': model.pvalues.values,
                'Significant': model.pvalues.values < 0.05
            })

            # Save odds ratios
            odds_path = os.path.join(models_dir, f"{health_var}_odds_ratios.csv")
            odds_ratios.to_csv(odds_path, index=False)
            logger.info(f"    Saved odds ratios to {odds_path}")

            # Create forest plot of odds ratios
            # Filter out intercept for visualization
            plot_odds = odds_ratios[odds_ratios['Variable'] != 'Intercept'].copy()

            if not plot_odds.empty:
                plt.figure(figsize=(10, 6))
                try:
                    positions = range(len(plot_odds))

                    # Create forest plot
                    plt.errorbar(
                        x=plot_odds['Odds_Ratio'],
                        y=positions,
                        xerr=[plot_odds['Odds_Ratio'] - plot_odds['CI_Lower'],
                              plot_odds['CI_Upper'] - plot_odds['Odds_Ratio']],
                        fmt='o',
                        capsize=5
                    )

                    # Add vertical line at OR=1 (no effect)
                    plt.axvline(1, color='gray', linestyle='--', alpha=0.7)

                    # Format y-axis with variable names
                    plt.yticks(positions, plot_odds['Variable'])

                    plt.title(f'Odds Ratios for {health_var}', fontsize=14)
                    plt.xlabel('Odds Ratio (log scale)', fontsize=12)
                    plt.xscale('log')  # Use log scale for better visualization

                    # Clamp the x-axis to [0.1, 10] for readability, but only
                    # when the resulting limits are finite and ordered.
                    max_upper = min(plot_odds['CI_Upper'].max(), 10)
                    min_lower = max(plot_odds['CI_Lower'].min(), 0.1)
                    if np.isfinite(min_lower) and np.isfinite(max_upper) and min_lower < max_upper:
                        plt.xlim(min_lower, max_upper)

                    plt.grid(axis='x', linestyle='--', alpha=0.7)

                    plt.tight_layout()

                    # Save figure
                    fig_path = os.path.join(models_dir, f"{health_var}_odds_ratios_plot.png")
                    plt.savefig(fig_path, dpi=300)
                    logger.info(f"    Saved odds ratios plot to {fig_path}")
                finally:
                    # Release the figure even when plotting or saving fails
                    plt.close()

        except Exception as e:
            # Best effort: one failing model must not stop the others
            logger.error(f"    Error building model for {health_var}: {str(e)}")

def _append_significant_findings(findings, csv_path, label, formatter,
                                 row_filter=None, first_only=False):
    """Append one formatted finding per significant row of a results CSV.

    Args:
        findings: List that receives the finding strings (mutated in place).
        csv_path: Path of the results CSV; a missing file is skipped silently.
        label: Short description of the file, used in the error log message.
        formatter: Callable turning one result row into the finding text.
        row_filter: Optional callable that receives the significant rows and
            returns a boolean mask selecting the rows to keep.
        first_only: When True, report at most the first significant row.
    """
    if not os.path.exists(csv_path):
        return

    try:
        results = pd.read_csv(csv_path)
        # eq(True) keeps only rows flagged True; blank cells count as
        # "not significant" instead of producing an invalid boolean mask.
        significant = results[results['Significant'].eq(True)]
        if row_filter is not None:
            significant = significant[row_filter(significant)]
        if first_only:
            significant = significant.head(1)

        for _, row in significant.iterrows():
            findings.append(formatter(row))
    except Exception as e:
        # Best effort: one unreadable or malformed results file (including a
        # missing 'Significant' column) must not abort the whole report.
        logger.error(f"Error reading {label}: {str(e)}")


def create_summary_report(data):
    """Create a summary report of key health findings.

    Collects the significant rows from the result CSVs found under
    ``output_dir`` (BMI, blood pressure, traditional practices, work patterns,
    transportation, multivariate models) and writes them as a numbered list to
    ``health_analysis_summary.txt`` in ``output_dir``. Result files that do not
    exist are skipped.

    Args:
        data: The analysed dataset; only its length is used (sample size).
    """
    logger.info("Creating health analysis summary report...")

    # Create list to store findings
    findings = []

    # 1. Check for BMI analysis results
    _append_significant_findings(
        findings,
        os.path.join(output_dir, "bmi_analysis/bmi_health_chi_square.csv"),
        "BMI results",
        lambda row: (
            f"BMI category is significantly associated with {row['Health_Variable']} "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f})"
        ),
    )

    # 2. Check for blood pressure correlations
    _append_significant_findings(
        findings,
        os.path.join(output_dir, "blood_pressure/bmi_bp_correlations.csv"),
        "BP correlation results",
        lambda row: (
            f"BMI is significantly correlated with {row['Variable']} "
            f"(r={row['Correlation']:.2f}, p={row['P_value']:.4f})"
        ),
    )

    # 3. Check for traditional practices results (a single finding at most)
    _append_significant_findings(
        findings,
        os.path.join(output_dir, "traditional_practices/tabouna_respiratory_results.csv"),
        "traditional practices results",
        lambda row: (
            "Tabouna smoke exposure is significantly associated with respiratory issues "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f}, "
            f"Risk Ratio={row['Risk_Ratio']:.2f})"
        ),
        first_only=True,
    )

    # 4. Check for work pattern results
    _append_significant_findings(
        findings,
        os.path.join(output_dir, "work_patterns/employment_status_health.csv"),
        "work pattern results",
        lambda row: (
            "Employment status (permanent vs. seasonal) is significantly associated with "
            f"{row['Health_Outcome']} (Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f})"
        ),
    )

    # 5. Check for transportation results
    _append_significant_findings(
        findings,
        os.path.join(output_dir, "transportation/walking_health_results.csv"),
        "transportation results",
        lambda row: (
            f"Walking as transportation is significantly associated with {row['Health_Outcome']} "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f}, "
            f"Risk Ratio={row['Risk_Ratio']:.2f})"
        ),
    )

    # 6. Check for multivariate model results
    models_dir = os.path.join(output_dir, "multivariate_models")
    if os.path.isdir(models_dir):
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # numbering of findings stable from one run to the next.
        for filename in sorted(os.listdir(models_dir)):
            if not filename.endswith("_odds_ratios.csv"):
                continue

            health_outcome = filename.split('_odds_ratios.csv')[0]

            def format_model_row(row, health_outcome=health_outcome):
                direction = "increase" if row['Odds_Ratio'] > 1 else "decrease"
                return (
                    f"{row['Variable']} is significantly associated with a {direction} in {health_outcome} "
                    f"(OR={row['Odds_Ratio']:.2f}, 95% CI: {row['CI_Lower']:.2f}-{row['CI_Upper']:.2f}, "
                    f"p={row['P_value']:.4f})"
                )

            _append_significant_findings(
                findings,
                os.path.join(models_dir, filename),
                "model results",
                format_model_row,
                # The intercept is not a predictor, so it is never reported
                row_filter=lambda rows: rows['Variable'] != 'Intercept',
            )

    # Create summary report. UTF-8 is explicit because outcome names can carry
    # accented characters and the platform default encoding varies.
    report_path = os.path.join(output_dir, "health_analysis_summary.txt")
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# Health Analysis Summary Report\n\n")
        f.write(f"Date: {pd.Timestamp.now().strftime('%Y-%m-%d')}\n")
        f.write("Dataset: Female Farmers Health Study\n")
        f.write(f"Sample Size: {len(data)}\n\n")

        f.write("## Key Findings\n\n")

        if findings:
            for i, finding in enumerate(findings, 1):
                f.write(f"{i}. {finding}\n")
        else:
            f.write("No significant health relationships were identified.\n")

        f.write("\n## Analysis Methods\n\n")
        f.write("The following health analyses were performed:\n")
        f.write("- Relationships between BMI and health outcomes\n")
        f.write("- Blood pressure analysis and its determinants\n")
        f.write("- Impact of traditional practices (Tabouna smoke, Neffa) on health\n")
        f.write("- Work patterns and health relationships\n")
        f.write("- Transportation method and health outcomes\n")
        f.write("- Agricultural experience and health metrics\n")
        f.write("- Multivariate models to predict specific health outcomes\n")

    logger.info("Health analysis summary report created")

def main():
    """Run the remaining health analyses in sequence, then build the summary report.

    Any exception raised along the way is logged (message plus traceback)
    rather than propagated.
    """
    try:
        # Load the dataset and derive the analysis-ready frame from it
        prepared = prepare_data_for_analysis(load_data())

        # Individual analyses, in the same order as before:
        # traditional practices, transportation, experience, multivariate models
        traditional_practices_analysis(prepared)
        transportation_health_analysis(prepared)
        experience_health_analysis(prepared)
        multivariate_health_model(prepared)

        # The report step runs last, after the analyses above
        create_summary_report(prepared)

        logger.info(f"All missing health analyses completed and saved to {output_dir}")

    except Exception as err:
        logger.error(f"Error in health analysis: {err}")
        logger.exception("Detailed error information:")

if __name__ == "__main__":
    # Run the pipeline only when executed as a script. The former trailing
    # comparison against an undefined name was removed: it was evaluated after
    # main() returned and served no purpose.
    main()

In [8]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Fix Transportation and Traditional Practices Analyses for Female Farmers
This script focuses on completing the problematic analyses:
1. Transportation health analysis
2. Traditional practices analysis
3. Update summary report

Author: [Your Name]
Date: April 2, 2025
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import os
import logging

# Logging configuration: INFO and above, each record prefixed with
# timestamp and level
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

# Root folder for the result files (CSV tables, figures) written by this script
output_dir = 'C:/Users/SelmaB/Desktop/Project/Analysis/2.Anova part two all determinants/results/health_analysis'

def load_data():
    """Load the preprocessed dataset with detailed error logging.

    Tries a fixed list of candidate locations for
    'fixed_female_farmers_data.xlsx' in order and returns the first workbook
    that can be read.

    Returns:
        The dataset as a pandas DataFrame.

    Raises:
        FileNotFoundError: If none of the candidate paths could be read. The
            individual read errors are logged before raising.
    """
    logger.info("Loading the dataset...")

    # List potential file paths
    potential_paths = [
        'C:/Users/SelmaB/Desktop/Project/Analysis/fixed_female_farmers_data.xlsx',
        'fixed_female_farmers_data.xlsx',
        'C:/Users/SelmaB/Desktop/Project/fixed_female_farmers_data.xlsx',
        'C:/Users/SelmaB/Desktop/fixed_female_farmers_data.xlsx'
    ]

    errors = []

    # Try each path
    for path in potential_paths:
        # Only the read itself is guarded, so a failure in the diagnostics
        # below can never be mistaken for a failed load.
        try:
            data = pd.read_excel(path)
        except Exception as e:
            errors.append(f"Error loading from {path}: {str(e)}")
            continue

        logger.info(f"Successfully loaded data from {path}")
        logger.info(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns")

        # Print column names to debug. Labels are converted with str() because
        # headers are not guaranteed to be strings (e.g. numeric headers).
        logger.info(f"Columns in dataset: {', '.join(map(str, data.columns))}")

        return data

    # If we reach here, all paths failed
    for error in errors:
        logger.error(error)

    raise FileNotFoundError("Could not find or load the dataset from any expected location")

def debug_data(data):
    """Log diagnostics for the transportation, traditional-practice and health columns.

    Purely informational: nothing is modified and nothing is returned.
    """
    logger.info("Debugging key variables...")

    def counts_of(column):
        # Frequency of each distinct value in the column, as a plain dict
        return data[column].value_counts().to_dict()

    # --- Transportation ---------------------------------------------------
    named_transport = [name for name in data.columns if 'transport' in name.lower()]
    if not named_transport:
        logger.warning("No transportation column found with 'transport' in the name")
        # Fall back to columns starting with 'Mo' or mentioning 'déplacement'
        candidates = [
            name for name in data.columns
            if name.startswith('Mo') or 'déplacement' in name.lower()
        ]
        if candidates:
            logger.info(f"Potential transportation columns: {candidates}")
            for name in candidates:
                logger.info(f"Values in {name}: {counts_of(name)}")
    for name in named_transport:
        logger.info(f"Column found: {name}")
        logger.info(f"Values: {counts_of(name)}")

    # --- Traditional practices ---------------------------------------------
    for name in ('Fumées de Tabouna', 'Tabagisme', 'Neffa'):
        if name not in data.columns:
            logger.warning(f"Traditional practice column not found: {name}")
            continue
        logger.info(f"Column found: {name}")
        logger.info(f"Values: {counts_of(name)}")

    # --- Health outcomes ---------------------------------------------------
    trouble_columns = [name for name in data.columns if 'Troubles' in name]
    for name in trouble_columns:
        logger.info(f"Health column: {name}")
        # Report how many rows carry any value for this column
        filled = data[name].notna().sum()
        logger.info(f"Non-null values: {filled} ({filled/len(data)*100:.1f}%)")

def prepare_data_for_analysis(data):
    """
    Prepare data for health analysis, with additional checks and diagnostics.

    Works on a copy of ``data`` and adds, when the source columns exist:
    - transport_walking: 1 where 'Moyen de transport' holds a value containing
      'pied' (any case); falls back to the most common value if none matches,
      or to the most common value of an alternative transport column.
    - exposed_to_tabouna / is_smoker / uses_neffa: 1 where the source column
      holds an affirmative value ('oui', 'yes', '1', 'true'); falls back to
      the most common value if none is found.
    - has_<name>: 1 where a column containing 'Troubles' is not null.
    - has_any_health_complaint: 1 where any 'Troubles' column is not null.

    Missing source values are coded 0 in the binary indicators.

    Args:
        data: Raw dataset as a pandas DataFrame.

    Returns:
        A new DataFrame with the derived columns added.
    """
    logger.info("Preparing data for analysis...")

    # Create a copy to avoid modifying the original
    df = data.copy()

    # Check for transportation variable and create if needed
    if 'Moyen de transport' in df.columns:
        logger.info("Found 'Moyen de transport' column")

        # Show value counts
        counts = df['Moyen de transport'].value_counts()
        logger.info(f"Transportation mode counts: {counts.to_dict()}")

        if counts.empty:
            # Nothing recorded at all: there is no value to build a reference from
            logger.error("'Moyen de transport' has no values, cannot create transport_walking variable")
        else:
            # Collect every spelling/case variant of walking, not just the most
            # frequent one, so that e.g. 'A pieds' and 'a pieds' are coded alike.
            walking_values = [val for val in counts.index if 'pied' in str(val).lower()]
            if walking_values:
                logger.info(f"Using {walking_values} as walking value(s)")
                df['transport_walking'] = df['Moyen de transport'].isin(walking_values).astype(int)
            else:
                logger.warning("Could not find 'a pieds' value, using most common value as reference")
                most_common = counts.index[0]
                df['transport_walking'] = (df['Moyen de transport'] == most_common).astype(int)

            logger.info(f"Created binary transportation variable: {df['transport_walking'].value_counts().to_dict()}")
    else:
        logger.warning("'Moyen de transport' column not found, checking alternatives")
        # Try to find alternative transportation column (string labels only)
        transport_cols = [
            col for col in df.columns
            if isinstance(col, str)
            and ('transport' in col.lower() or 'déplacement' in col.lower())
        ]
        if transport_cols:
            transport_col = transport_cols[0]
            logger.info(f"Using alternative column: {transport_col}")
            # Count values
            counts = df[transport_col].value_counts()
            logger.info(f"Values: {counts.to_dict()}")

            if counts.empty:
                logger.error(f"'{transport_col}' has no values, cannot create transport_walking variable")
            else:
                # Create binary variable using most common value
                most_common = counts.index[0]
                df['transport_walking'] = (df[transport_col] == most_common).astype(int)
        else:
            logger.error("No transportation column found, cannot create transport_walking variable")

    # Create binary indicators for traditional practices
    trad_practices = {
        'Fumées de Tabouna': 'exposed_to_tabouna',
        'Tabagisme': 'is_smoker',
        'Neffa': 'uses_neffa'
    }
    # '1.0' covers a numeric 1 that was read as a float
    affirmative_tokens = {'oui', 'yes', '1', '1.0', 'true'}

    for orig_col, new_col in trad_practices.items():
        if orig_col not in df.columns:
            logger.warning(f"Column not found: {orig_col}")
            continue

        # Check value counts
        counts = df[orig_col].value_counts()
        logger.info(f"{orig_col} values: {counts.to_dict()}")

        if counts.empty:
            logger.warning(f"{orig_col} has no values, skipping {new_col}")
            continue

        # Look for 'oui' or similar affirmative values, ignoring case and
        # surrounding whitespace; all matching variants count as exposed.
        affirmative = [
            val for val in counts.index
            if str(val).strip().lower() in affirmative_tokens
        ]
        if affirmative:
            logger.info(f"Using {affirmative} as affirmative value(s) for {orig_col}")
            df[new_col] = df[orig_col].isin(affirmative).astype(int)
        else:
            # NOTE(review): the most common value may well be the negative
            # answer, in which case this indicator is inverted; check the log.
            logger.warning(f"Could not find affirmative value for {orig_col}, using most common value")
            most_common = counts.index[0]
            df[new_col] = (df[orig_col] == most_common).astype(int)

        logger.info(f"Created binary variable {new_col}: {df[new_col].value_counts().to_dict()}")

    # Create binary health indicators
    health_categories = [col for col in df.columns if isinstance(col, str) and 'Troubles' in col]

    for category in health_categories:
        binary_var = f"has_{category.lower().replace('-', '_').replace(' ', '_').replace('/', '_')}"
        # Any recorded (non-null) entry counts as the complaint being present
        df[binary_var] = df[category].notna().astype(int)
        logger.info(f"Created binary health variable: {binary_var}: {df[binary_var].value_counts().to_dict()}")

    # Create a variable for any health complaint
    if health_categories:
        df['has_any_health_complaint'] = df[health_categories].notna().any(axis=1).astype(int)
        logger.info(f"Created 'has_any_health_complaint' variable: {df['has_any_health_complaint'].value_counts().to_dict()}")

    return df

def transportation_health_analysis(data):
    """Analyze relationship between transportation method and health with enhanced debugging.

    For every column whose name starts with 'has_', cross-tabulates it against
    'transport_walking', runs a chi-square test on the table, derives the risk
    ratio (walking vs. not walking) and saves a bar chart of the percentage
    with the complaint in each group. The test results are written to
    'transportation/walking_health_results.csv' under ``output_dir``.

    Args:
        data: DataFrame with 'transport_walking' and 'has_*' indicator columns
            (expected to be coded 0/1).

    Returns:
        True if at least one chart was saved, otherwise False (also when the
        required columns are missing).
    """
    logger.info("Analyzing transportation method and health...")

    # Check for transportation variables
    if 'transport_walking' not in data.columns:
        logger.error("transport_walking variable not found, cannot proceed with analysis")
        return False

    # Log counts
    logger.info(f"transport_walking counts: {data['transport_walking'].value_counts().to_dict()}")

    # Create directory for transportation analysis
    trans_dir = os.path.join(output_dir, "transportation")
    os.makedirs(trans_dir, exist_ok=True)

    # Get health variables (non-string column labels cannot be 'has_' columns)
    health_vars = [var for var in data.columns if isinstance(var, str) and var.startswith('has_')]
    logger.info(f"Found {len(health_vars)} health variables: {health_vars}")

    if not health_vars:
        logger.error("No health variables found, cannot proceed with analysis")
        return False

    # 1. Analyze relationship between walking and health complaints
    results = []
    created_visualizations = False

    for health_var in health_vars:
        logger.info(f"  Analyzing transportation vs. {health_var}")

        # Count values in the health variable
        logger.info(f"  {health_var} counts: {data[health_var].value_counts().to_dict()}")

        try:
            # Create contingency table
            contingency = pd.crosstab(data['transport_walking'], data[health_var])
            logger.info(f"  Contingency table:\n{contingency}")

            # The test needs at least two groups and two outcome levels
            if contingency.shape[0] < 2 or contingency.shape[1] < 2:
                logger.warning(f"  Insufficient data for chi-square test for {health_var}")
                continue

            # Check for zero counts
            if (contingency == 0).any().any():
                logger.warning("  Contingency table contains zeros, which may affect chi-square test")

            # Perform chi-square test
            chi2, p, _, _ = stats.chi2_contingency(contingency)

            # Calculate risk ratio if possible
            try:
                if 1 in contingency.index and 0 in contingency.index and 1 in contingency.columns:
                    risk_walking = contingency.loc[1, 1] / (contingency.loc[1, 0] + contingency.loc[1, 1])
                    risk_other = contingency.loc[0, 1] / (contingency.loc[0, 0] + contingency.loc[0, 1])
                    risk_ratio = risk_walking / risk_other
                else:
                    risk_ratio = np.nan
            except Exception as e:
                logger.error(f"  Error calculating risk ratio: {str(e)}")
                risk_ratio = np.nan

            results.append({
                'Health_Outcome': health_var,
                'Chi_Square': chi2,
                'P_value': p,
                'Risk_Ratio': risk_ratio,
                'Significant': p < 0.05
            })

            # Create visualization for this health variable regardless of significance
            try:
                fig = plt.figure(figsize=(10, 6))
                try:
                    # Calculate percentages
                    props = pd.crosstab(
                        data['transport_walking'],
                        data[health_var],
                        normalize='index'
                    ) * 100

                    # Create bar chart for the "Yes" column (health complaint present)
                    ax = props[1].plot(kind='bar')

                    plt.title(f'{health_var} by Transportation Method (p={p:.4f})', fontsize=14)
                    plt.xlabel('Walking as Transportation', fontsize=12)
                    plt.ylabel(f'Percentage with {health_var}', fontsize=12)
                    plt.xticks([0, 1], ['No', 'Yes'])
                    plt.grid(axis='y', linestyle='--', alpha=0.7)

                    # Add percentage labels
                    for i, v in enumerate(props[1]):
                        ax.text(i, v + 1, f'{v:.1f}%', ha='center')

                    plt.tight_layout()

                    # Save figure
                    fig_path = os.path.join(trans_dir, f"walking_vs_{health_var}.png")
                    plt.savefig(fig_path, dpi=300)
                finally:
                    # Always release the figure, even when plotting or saving
                    # fails, so failed charts do not pile up in memory.
                    plt.close(fig)

                logger.info(f"    Saved visualization to {fig_path}")
                created_visualizations = True
            except Exception as e:
                logger.error(f"  Error creating visualization: {str(e)}")
        except Exception as e:
            logger.error(f"  Error analyzing {health_var}: {str(e)}")

    # Save results
    if results:
        try:
            results_df = pd.DataFrame(results)
            results_path = os.path.join(trans_dir, "walking_health_results.csv")
            results_df.to_csv(results_path, index=False)
            logger.info(f"  Saved walking vs. health results to {results_path}")
        except Exception as e:
            logger.error(f"  Error saving results: {str(e)}")
    else:
        logger.warning("  No valid results to save for transportation analysis")

    return created_visualizations

def _exposure_risks(contingency):
    """Return (risk_exposed, risk_unexposed, risk_ratio) from a 0/1-labelled table.

    Rows are the exposure groups and columns the outcome. All three values are
    NaN when the table lacks row 0, row 1 or outcome column 1.
    """
    if 1 in contingency.index and 0 in contingency.index and 1 in contingency.columns:
        risk_exposed = contingency.loc[1, 1] / (contingency.loc[1, 0] + contingency.loc[1, 1])
        risk_unexposed = contingency.loc[0, 1] / (contingency.loc[0, 0] + contingency.loc[0, 1])
        return risk_exposed, risk_unexposed, risk_exposed / risk_unexposed
    return np.nan, np.nan, np.nan


def _save_outcome_rate_chart(data, group_var, outcome_var, title, xlabel, ylabel, fig_path):
    """Save a bar chart of the percentage with ``outcome_var`` == 1 per group.

    The figure is closed whether or not plotting or saving succeeds; any
    exception is propagated to the caller.
    """
    fig = plt.figure(figsize=(10, 6))
    try:
        # Calculate percentages within each group
        props = pd.crosstab(data[group_var], data[outcome_var], normalize='index') * 100

        # Create bar chart for the outcome-present column
        ax = props[1].plot(kind='bar')

        plt.title(title, fontsize=14)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel(ylabel, fontsize=12)
        plt.xticks([0, 1], ['No', 'Yes'])
        plt.grid(axis='y', linestyle='--', alpha=0.7)

        # Add percentage labels
        for i, v in enumerate(props[1]):
            ax.text(i, v + 1, f'{v:.1f}%', ha='center')

        plt.tight_layout()
        plt.savefig(fig_path, dpi=300)
    finally:
        # Release the figure even on failure so figures do not accumulate
        plt.close(fig)


def traditional_practices_analysis(data):
    """Analyze the impact of traditional practices on health with enhanced debugging.

    For each available practice indicator ('exposed_to_tabouna', 'uses_neffa')
    and each 'has_*' health indicator, runs a chi-square test on their
    cross-tabulation, derives the risk ratio and saves a bar chart. Results
    per practice go to 'traditional_practices/<practice>_health_outcomes.csv'
    under ``output_dir``. When Tabouna exposure is available, a dedicated
    analysis against the first cardio/respiratory indicator writes
    'tabouna_respiratory_results.csv' and its own chart.

    Args:
        data: DataFrame with the practice and 'has_*' indicator columns
            (expected to be coded 0/1).

    Returns:
        True if at least one chart was saved, otherwise False (also when the
        required columns are missing).
    """
    logger.info("Analyzing traditional practices and health...")

    # Check for traditional practice variables
    trad_vars = ['exposed_to_tabouna', 'uses_neffa']
    available_vars = [var for var in trad_vars if var in data.columns]

    if not available_vars:
        logger.error("No traditional practice variables found, cannot proceed with analysis")
        return False

    # Log counts for available variables
    for var in available_vars:
        logger.info(f"{var} counts: {data[var].value_counts().to_dict()}")

    # Create directory for traditional practices analysis
    trad_dir = os.path.join(output_dir, "traditional_practices")
    os.makedirs(trad_dir, exist_ok=True)

    # Get health variables (non-string column labels cannot be 'has_' columns)
    health_vars = [var for var in data.columns if isinstance(var, str) and var.startswith('has_')]
    logger.info(f"Found {len(health_vars)} health variables for traditional practices analysis")

    if not health_vars:
        logger.error("No health variables found, cannot proceed with analysis")
        return False

    created_visualizations = False

    # Analyze each traditional practice
    for practice_var in available_vars:
        logger.info(f"  Analyzing {practice_var} and health outcomes")

        results = []

        for health_var in health_vars:
            try:
                # Create contingency table
                contingency = pd.crosstab(data[practice_var], data[health_var])
                logger.info(f"  Contingency table for {practice_var} vs {health_var}:\n{contingency}")

                # The test needs at least two groups and two outcome levels
                if contingency.shape[0] < 2 or contingency.shape[1] < 2:
                    logger.warning(f"  Insufficient data for chi-square test for {practice_var} vs {health_var}")
                    continue

                # Perform chi-square test
                chi2, p, _, _ = stats.chi2_contingency(contingency)

                # Calculate risk ratio if possible
                try:
                    _, _, risk_ratio = _exposure_risks(contingency)
                except Exception as e:
                    logger.error(f"  Error calculating risk ratio: {str(e)}")
                    risk_ratio = np.nan

                results.append({
                    'Health_Outcome': health_var,
                    'Chi_Square': chi2,
                    'P_value': p,
                    'Risk_Ratio': risk_ratio,
                    'Significant': p < 0.05
                })

                # Create visualization (regardless of significance)
                try:
                    fig_path = os.path.join(trad_dir, f"{practice_var}_vs_{health_var}.png")
                    _save_outcome_rate_chart(
                        data, practice_var, health_var,
                        title=f'{health_var} by {practice_var} (p={p:.4f})',
                        xlabel=practice_var,
                        ylabel=f'Percentage with {health_var}',
                        fig_path=fig_path,
                    )
                    logger.info(f"    Saved visualization to {fig_path}")
                    created_visualizations = True
                except Exception as e:
                    logger.error(f"  Error creating visualization: {str(e)}")
            except Exception as e:
                logger.error(f"  Error analyzing {practice_var} vs {health_var}: {str(e)}")

        # Save results
        if results:
            try:
                results_df = pd.DataFrame(results)
                results_path = os.path.join(trad_dir, f"{practice_var}_health_outcomes.csv")
                results_df.to_csv(results_path, index=False)
                logger.info(f"  Saved {practice_var} health outcomes to {results_path}")
            except Exception as e:
                logger.error(f"  Error saving results: {str(e)}")

    # Special analysis for respiratory issues with Tabouna smoke if available
    if 'exposed_to_tabouna' in available_vars:
        logger.info("  Performing specific analysis of Tabouna exposure and respiratory issues")

        # Find respiratory issues variable
        resp_vars = [col for col in health_vars if 'respir' in col.lower() or 'cardio' in col.lower()]

        if resp_vars:
            resp_var = resp_vars[0]
            logger.info(f"  Using {resp_var} for respiratory analysis")

            try:
                # Create contingency table
                contingency = pd.crosstab(data['exposed_to_tabouna'], data[resp_var])
                logger.info(f"  Contingency table for Tabouna vs respiratory issues:\n{contingency}")

                # Same sufficiency rule as the per-practice loop above: without
                # two groups and two outcome levels there is nothing to test.
                if contingency.shape[0] > 1 and contingency.shape[1] > 1:
                    # Perform chi-square test
                    chi2, p, _, _ = stats.chi2_contingency(contingency)

                    # Calculate risk ratio
                    risk_exposed, risk_unexposed, risk_ratio = _exposure_risks(contingency)

                    # Create results dataframe
                    resp_results = pd.DataFrame({
                        'Test': ['Tabouna exposure vs. Respiratory issues'],
                        'Chi_Square': [chi2],
                        'P_value': [p],
                        'Risk_Ratio': [risk_ratio],
                        'Risk_Exposed': [risk_exposed],
                        'Risk_Unexposed': [risk_unexposed],
                        'Significant': [p < 0.05]
                    })

                    # Save results (before plotting, so a chart failure does
                    # not lose the table)
                    results_path = os.path.join(trad_dir, "tabouna_respiratory_results.csv")
                    resp_results.to_csv(results_path, index=False)
                    logger.info(f"    Saved respiratory results to {results_path}")

                    # Create special visualization
                    fig_path = os.path.join(trad_dir, "tabouna_respiratory_issues.png")
                    _save_outcome_rate_chart(
                        data, 'exposed_to_tabouna', resp_var,
                        title=f'Respiratory Issues by Tabouna Exposure (p={p:.4f}, RR={risk_ratio:.2f})',
                        xlabel='Exposed to Tabouna Smoke',
                        ylabel='Percentage with Respiratory Issues',
                        fig_path=fig_path,
                    )
                    logger.info(f"    Saved respiratory visualization to {fig_path}")
                    created_visualizations = True
                else:
                    logger.warning(f"  Insufficient data for chi-square test for exposed_to_tabouna vs {resp_var}")
            except Exception as e:
                logger.error(f"  Error in respiratory analysis: {str(e)}")
        else:
            logger.warning("  No respiratory variable found for Tabouna smoke analysis")

    return created_visualizations

def update_summary_report(sample_size=80):
    """Rebuild the health analysis summary report from the saved result CSVs.

    Looks for result tables under ``output_dir`` (BMI, blood pressure,
    traditional practices, transportation), turns every row flagged
    ``Significant`` into a one-line finding, and rewrites
    ``health_analysis_summary.txt`` in ``output_dir``. When a table has no
    ``Significant`` column, all of its rows are reported.

    Args:
        sample_size: Sample size printed in the report header. Defaults to 80,
            the value that used to be hard-coded.

    Returns:
        True if the report was written, False if writing it failed. Missing or
        unreadable CSV files are logged and skipped; they do not cause a
        False return.
    """
    logger.info("Updating health analysis summary report...")

    # Findings collected across all result files, in report order
    findings = []

    def format_risk_ratio(row):
        """Return the row's risk ratio with two decimals, or 'N/A' if absent/NaN."""
        # The fallback must be decided outside the format spec: everything
        # after ':' in an f-string field is passed to format() verbatim, so a
        # conditional written there raises "Invalid format specifier".
        if 'Risk_Ratio' in row and pd.notna(row['Risk_Ratio']):
            return f"{row['Risk_Ratio']:.2f}"
        return 'N/A'

    def extract_findings_from_csv(file_path, formatter):
        """Append one finding per significant row of file_path; return how many."""
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            return 0

        try:
            results = pd.read_csv(file_path)
            if 'Significant' in results.columns:
                # Element-wise comparison on a pandas column, not an identity test
                sig_results = results[results['Significant'] == True]  # noqa: E712
            else:
                sig_results = results
            # Format everything first so a bad row cannot leave a partial
            # set of findings from this file in the report.
            new_findings = [formatter(row) for _, row in sig_results.iterrows()]
        except Exception as e:
            logger.error(f"Error reading {file_path}: {str(e)}")
            return 0

        findings.extend(new_findings)
        return len(new_findings)

    # 1. Check for BMI analysis results
    count = extract_findings_from_csv(
        os.path.join(output_dir, "bmi_analysis/bmi_health_chi_square.csv"),
        lambda row: (
            f"BMI category is significantly associated with {row['Health_Variable']} "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f})"
        ),
    )
    logger.info(f"Found {count} significant BMI findings")

    # 2. Check for blood pressure correlations
    count = extract_findings_from_csv(
        os.path.join(output_dir, "blood_pressure/bmi_bp_correlations.csv"),
        lambda row: (
            f"BMI is significantly correlated with {row['Variable']} "
            f"(r={row['Correlation']:.2f}, p={row['P_value']:.4f})"
        ),
    )
    logger.info(f"Found {count} significant blood pressure correlations")

    # 3. Check for traditional practices results
    count = extract_findings_from_csv(
        os.path.join(output_dir, "traditional_practices/tabouna_respiratory_results.csv"),
        lambda row: (
            f"Tabouna smoke exposure is significantly associated with respiratory issues "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f}, "
            f"Risk Ratio={row['Risk_Ratio']:.2f})"
        ),
    )
    logger.info(f"Found {count} significant traditional practices findings")

    # 4. Check for additional traditional practices results
    trad_dir = os.path.join(output_dir, "traditional_practices")
    if os.path.exists(trad_dir):
        # Sorted so the order of findings does not depend on the filesystem
        for file in sorted(os.listdir(trad_dir)):
            if file.endswith("_health_outcomes.csv") and "tabouna_respiratory_results" not in file:
                count = extract_findings_from_csv(
                    os.path.join(trad_dir, file),
                    lambda row: (
                        f"Traditional practice is significantly associated with {row['Health_Outcome']} "
                        f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f}, "
                        f"Risk Ratio={format_risk_ratio(row)})"
                    ),
                )
                logger.info(f"Found {count} significant findings in {file}")

    # 5. Check for transportation results
    count = extract_findings_from_csv(
        os.path.join(output_dir, "transportation/walking_health_results.csv"),
        lambda row: (
            f"Walking as transportation is significantly associated with {row['Health_Outcome']} "
            f"(Chi-square={row['Chi_Square']:.2f}, p={row['P_value']:.4f}, "
            f"Risk Ratio={format_risk_ratio(row)})"
        ),
    )
    logger.info(f"Found {count} significant transportation findings")

    # Create updated summary report
    try:
        # Explicit UTF-8: finding text can contain accented variable names
        report_path = os.path.join(output_dir, "health_analysis_summary.txt")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("# Health Analysis Summary Report\n\n")
            f.write(f"Date: {pd.Timestamp.now().strftime('%Y-%m-%d')}\n")
            f.write("Dataset: Female Farmers Health Study\n")
            f.write(f"Sample Size: {sample_size}\n\n")

            f.write("## Key Findings\n\n")

            if findings:
                for i, finding in enumerate(findings, 1):
                    f.write(f"{i}. {finding}\n")
            else:
                f.write("No significant health relationships were identified. This might be due to the small sample size or data quality issues.\n")

            f.write("\n## Analysis Methods\n\n")
            f.write("The following health analyses were performed:\n")
            f.write("- Relationships between BMI and health outcomes\n")
            f.write("- Blood pressure analysis and its determinants\n")
            f.write("- Impact of traditional practices (Tabouna smoke, Neffa) on health\n")
            f.write("- Work patterns and health relationships\n")
            f.write("- Transportation method and health outcomes\n")
            f.write("- Agricultural experience and health metrics\n")

            f.write("\n## Data Limitations\n\n")
            f.write("- Small sample size limiting statistical power\n")
            f.write("- Missing values in some key variables\n")
            f.write("- Self-reported health complaints may be subject to recall bias\n")

        logger.info("Summary report updated successfully")
        return True
    except Exception as e:
        # Callers rely on a boolean result rather than an exception
        logger.error(f"Error updating summary report: {str(e)}")
        return False

def main():
    """Run the transportation and traditional practices analyses end to end.

    Loads the dataset, logs debugging information about it, prepares it for
    analysis, runs both analyses, and then refreshes the summary report.
    Any exception raised along the way is logged (with traceback) instead of
    being propagated.
    """
    try:
        # Load, inspect, and prepare the dataset
        raw_data = load_data()
        debug_data(raw_data)
        prepared = prepare_data_for_analysis(raw_data)

        # Transportation analysis
        logger.info("Running transportation health analysis with enhanced diagnostics...")
        transport_ok = transportation_health_analysis(prepared)
        if transport_ok:
            transport_status = 'completed successfully'
        else:
            transport_status = 'failed'
        logger.info(f"Transportation analysis {transport_status}")

        # Traditional practices analysis
        logger.info("Running traditional practices analysis with enhanced diagnostics...")
        practices_ok = traditional_practices_analysis(prepared)
        if practices_ok:
            practices_status = 'completed successfully'
        else:
            practices_status = 'failed'
        logger.info(f"Traditional practices analysis {practices_status}")

        # Refresh the summary report from whatever results were saved
        logger.info("Updating summary report...")
        update_summary_report()

        logger.info("Analysis fixes completed")

    except Exception as exc:
        logger.error(f"Unexpected error in analysis: {str(exc)}")
        logger.exception("Detailed error information:")

# Run the analysis pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

2025-04-01 15:07:24,193 - INFO - Loading the dataset...
2025-04-01 15:07:24,284 - INFO - Successfully loaded data from fixed_female_farmers_data.xlsx
2025-04-01 15:07:24,285 - INFO - Dataset loaded with 80 rows and 37 columns
2025-04-01 15:07:24,286 - INFO - Columns in dataset: N°, Age, Situation maritale, Nb enfants, Nb pers à charge, Domicile, Niveau socio-économique, Tabagisme, Neffa, Fumées de Tabouna, AT en milieu agricole, H travail / jour, Mécanisme AT, Ménopause, Age ménopause, Antécédents gynéco, Ancienneté agricole, Catégorie professionnelle, Statut, J travail / Sem, Masque pour pesticides, Bottes, Niveau scolaire, Gants, Casquette/Mdhalla, Manteau imperméable, Poids, Taille, TAS, TAD, GAD, Produits chimiques utilisés, Produits biologiques utilisés, Engrais utilisés, Contraintes thermiques, Moyen de transport, Profession du mari
2025-04-01 15:07:24,288 - INFO - Debugging key variables...
2025-04-01 15:07:24,290 - INFO - Column found: Moyen de transport
2025-04-01 15:07:24,294