In [89]:
import pandas as pd

In [91]:
def preprocess_fips_life(year):
    """
    Preprocesses and merges FIPS data with life expectancy data for a given year.

    Reads the FIPS table from '../data/raw/state_fips.csv' and the single-year
    life expectancy table from
    '../data/processed/le_single_year/life_expectancy_{year}.csv', splits each
    'location_name' of the form "<county> (<state>)" into its county and state
    parts, and left-joins the FIPS codes on (state name, county name).

    Parameters:
    year (int): The year of the life expectancy data (e.g., 2011).

    Returns:
    pd.DataFrame: DataFrame with columns 'County', 'State',
        'mean_life_expectancy', 'State_FIPS' and 'County_FIPS'. Because the
        join is a left join, life expectancy rows without a matching county
        are kept with NaN in the two FIPS columns.
    """
    # Fixed file paths
    fips_path = '../data/raw/state_fips.csv'
    life_expectancy_path = f'../data/processed/le_single_year/life_expectancy_{year}.csv'

    # Load and preprocess FIPS codes (read as strings so leading zeros survive)
    fips_df = pd.read_csv(fips_path, dtype={'fips': str})
    fips_df['fips'] = fips_df['fips'].str.zfill(5)  # Ensure all FIPS codes are 5 characters
    fips_df['State_FIPS'] = fips_df['fips'].str[:2]
    fips_df['County_FIPS'] = fips_df['fips'].str[2:]
    # Abbreviations missing from this mapping become NaN in 'state_full'.
    # NOTE(review): 'DC' is not listed, so District of Columbia rows cannot match
    # a state name from the life expectancy file - confirm this is intended.
    fips_df['state_full'] = fips_df['state'].map({
        'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas',
        'CA': 'California', 'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware',
        'FL': 'Florida', 'GA': 'Georgia', 'HI': 'Hawaii', 'ID': 'Idaho',
        'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
        'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
        'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi',
        'MO': 'Missouri', 'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada',
        'NH': 'New Hampshire', 'NJ': 'New Jersey', 'NM': 'New Mexico', 'NY': 'New York',
        'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
        'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
        'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah',
        'VT': 'Vermont', 'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia',
        'WI': 'Wisconsin', 'WY': 'Wyoming'
    })

    # Load and preprocess life expectancy data
    life_df = pd.read_csv(life_expectancy_path)
    life_df[['County', 'State']] = life_df['location_name'].str.extract(r'^(.*) \((.*)\)$')
    # Keep the life expectancy values next to the join keys; selecting only the
    # keys here would return FIPS codes with no life expectancy attached.
    life_df = life_df[['County', 'State', 'mean_life_expectancy']].copy()
    for key_column in ('County', 'State'):
        life_df[key_column] = life_df[key_column].str.strip()

    # Merge FIPS and life expectancy data
    merged_df = pd.merge(
        life_df,
        fips_df[['State_FIPS', 'County_FIPS', 'name', 'state_full']],
        left_on=['State', 'County'],
        right_on=['state_full', 'name'],
        how='left'
    )

    # Drop the right-hand join keys, which duplicate 'State' and 'County'
    return merged_df.drop(columns=['name', 'state_full'])

In [93]:
# Example usage: build the 2012 table, discard rows containing any NaN
# (e.g. locations that found no FIPS match), and write the result to CSV.
preprocessed_df = preprocess_fips_life(2012).dropna()
preprocessed_df.to_csv(
    '../data/processed/preprocessed_fips_life_expectancy/preprocessed_life_fips_2012.csv',
    index=False,
)
print("Preprocessed file saved.")

Preprocessed file saved.


In [95]:
# Export one CSV per year from 2010 through 2019 (range end is exclusive),
# keeping only rows with no missing values.
for year in range(2010, 2020):
    df = preprocess_fips_life(year).dropna()
    df.to_csv(
        f'../data/processed/preprocessed_fips_life_expectancy/preprocessed_life_fips_{year}.csv',
        index=False,
    )

In [102]:
def preprocess_fips_life(year):
    """
    Join one year of life expectancy records to their state and county FIPS codes.

    The FIPS table is read from '../data/raw/state_fips.csv' and the life
    expectancy table from
    '../data/processed/le_single_year/life_expectancy_{year}.csv'. Each
    'location_name' of the form "<county> (<state>)" is split into county and
    state, which are then matched against the FIPS table's county name and
    full state name with a left join.

    Parameters:
    year (int): The year of the life expectancy data (e.g., 2011).

    Returns:
    pd.DataFrame: Columns 'County', 'State', 'mean_life_expectancy',
        'State_FIPS' and 'County_FIPS'. Rows that found no FIPS match keep
        NaN in the two FIPS columns.
    """
    # Postal abbreviation -> full state name. Abbreviations not listed here
    # become NaN when mapped.
    # NOTE(review): 'DC' is absent, so District of Columbia rows cannot match
    # a state name from the life expectancy file - confirm this is intended.
    state_names = {
        'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
        'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
        'HI': 'Hawaii', 'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa',
        'KS': 'Kansas', 'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland',
        'MA': 'Massachusetts', 'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri',
        'MT': 'Montana', 'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New Hampshire', 'NJ': 'New Jersey',
        'NM': 'New Mexico', 'NY': 'New York', 'NC': 'North Carolina', 'ND': 'North Dakota', 'OH': 'Ohio',
        'OK': 'Oklahoma', 'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode Island', 'SC': 'South Carolina',
        'SD': 'South Dakota', 'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont',
        'VA': 'Virginia', 'WA': 'Washington', 'WV': 'West Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming',
    }

    # FIPS side: read codes as text, left-pad to five characters, then split
    # into the leading two characters (state) and the remainder (county).
    fips_table = pd.read_csv('../data/raw/state_fips.csv', dtype={'fips': str})
    padded_codes = fips_table['fips'].str.zfill(5)
    fips_table['fips'] = padded_codes
    fips_table['State_FIPS'] = padded_codes.str[:2]
    fips_table['County_FIPS'] = padded_codes.str[2:]
    fips_table['state_full'] = fips_table['state'].map(state_names)
    fips_lookup = fips_table[['State_FIPS', 'County_FIPS', 'name', 'state_full']]

    # Life expectancy side: capture group 0 is the county, group 1 the state
    # found inside the trailing parentheses.
    records = pd.read_csv(f'../data/processed/le_single_year/life_expectancy_{year}.csv')
    location_parts = records['location_name'].str.extract(r'^(.*) \((.*)\)$')
    life_table = pd.DataFrame({
        'County': location_parts[0].str.strip(),
        'State': location_parts[1].str.strip(),
        'mean_life_expectancy': records['mean_life_expectancy'],
    })

    # Left join keeps every life expectancy row, matched or not.
    joined = life_table.merge(
        fips_lookup,
        how='left',
        left_on=['State', 'County'],
        right_on=['state_full', 'name'],
    )

    # The right-hand join keys duplicate 'State' and 'County'; drop them.
    return joined.drop(columns=['name', 'state_full'])

In [104]:
# Regenerate the per-year CSVs for 2010-2019 using the function defined above;
# rows containing any NaN are removed before each file is written.
for year in range(2010, 2020):
    df = preprocess_fips_life(year).dropna()
    df.to_csv(
        f'../data/processed/preprocessed_fips_life_expectancy/preprocessed_life_fips_{year}.csv',
        index=False,
    )