In [7]:
! pip install xlrd==2.0.1 openpyxl


Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hDownloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [22]:
import os
import pandas as pd

def extract_and_combine_data(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    data_by_year = {}

    for file_name in os.listdir(input_folder):
        if file_name.endswith(".xls") or file_name.endswith(".xlsx"):
            # Extract the year from the file name
            year = file_name.split()[0]
            
            # Only process files from 2014 and above
            try:
                year_int = int(year)
                if year_int < 2014:
                    continue
            except ValueError:
                print(f"Could not extract a valid year from {file_name}. Skipping this file.")
                continue
            
            file_path = os.path.join(input_folder, file_name)
            try:
                # Load the data from the 'Additional Measure Data' sheet
                if file_name.endswith(".xls"):
                    df = pd.read_excel(file_path, sheet_name='Additional Measure Data', engine='xlrd', skiprows=1)
                else:
                    df = pd.read_excel(file_path, sheet_name='Additional Measure Data', skiprows=1)
            except Exception as e:
                print(f"Failed to read 'Additional Measure Data' from {file_name}: {e}")
                continue

            # Determine which sheet to use for df2
            try:
                if file_name.endswith(".xls"):
                    df2 = pd.read_excel(file_path, sheet_name='Ranked Measure Data', engine='xlrd', skiprows=1)
                else:
                    df2 = pd.read_excel(file_path, sheet_name='Ranked Measure Data', skiprows=1)
                sheet_used = 'Ranked Measure Data'
            except:
                try:
                    if file_name.endswith(".xls"):
                        df2 = pd.read_excel(file_path, sheet_name='Select Measure Data', engine='xlrd', skiprows=1)
                    else:
                        df2 = pd.read_excel(file_path, sheet_name='Select Measure Data', skiprows=1)
                    sheet_used = 'Select Measure Data'
                except Exception as e:
                    print(f"Failed to read both 'Ranked Measure Data' and 'Select Measure Data' from {file_name}: {e}")
                    continue


            # Check for both possible column names
            if '% Limited Access to Healthy Foods' in df.columns:
                column_name = '% Limited Access to Healthy Foods'
            elif '% Limited Access' in df.columns:
                column_name = '% Limited Access'
            else:
                print(f"Neither '% Limited Access to Healthy Foods' nor '% Limited Access' found in {file_name}")
                continue
            print(file_name)

            if '% Diabetic' in df.columns:
                diabetes_column = '% Diabetic'
            elif '% diabetic' in df.columns:
                diabetes_column = '% diabetic'
            elif '% Adults with Diabetes' in df.columns:
                diabetes_column = '% Adults with Diabetes'
        
            else:
                print(f"Did not find diabetes column in {file_name}")
                continue

            

            if '% Rural' in df.columns:
                rural_column = '% Rural'
            elif '% rural' in df.columns:
                rural_column = '% rural'
            elif 'Rural' in df.columns:
                rural_column = 'Rural'
            else:
                print(f"Did not find rural column in {file_name}")
                continue

            if 'Household Income' in df.columns:
                income_column = 'Household Income'
            elif 'Median Household Income' in df.columns:
                income_column = 'Median Household Income'
            else:
                print(f"Did not find household income column in {file_name}")
                continue

            if 'Average Number of Mentally Unhealthy Days' in df2.columns:
                mentally_unhealthy_column = 'Average Number of Mentally Unhealthy Days'
            elif 'Mentally Unhealthy Days' in df2.columns:
                mentally_unhealthy_column = 'Mentally Unhealthy Days'
            else:
                print(f"Did not find life expectancy column in {file_name}")
                continue

            if 'Food Environment Index' in df2.columns:
                food_env_column = 'Food Environment Index'
            # elif 'Mentally Unhealthy Days' in df2.columns:
            #     mentally_unhealthy_column = 'Mentally Unhealthy Days'
            else:
                print(f"Did not find food env column in {file_name}")
                continue

            
            if 'Income Ratio' in df2.columns:
                income_ratio_column = 'Income Ratio'
            else:
                print(f"Did not find income ratio column in {file_name}")
                continue
            
          

            if 'County' in df.columns:
                region_column = 'County'
            elif 'Parish' in df.columns:
                region_column = 'Parish'
            elif 'Borough' in df.columns:
                region_column = 'Borough'
            else:
                print(f"Neither 'County' nor 'Parish' found in {file_name}")
                continue

            merged_df = pd.merge(df, df2[['FIPS', mentally_unhealthy_column, food_env_column, income_ratio_column]], on='FIPS', how='inner')

            # Select only the columns of interest
            merged_df = merged_df[['FIPS', 'State', region_column, column_name, diabetes_column, rural_column, income_column, mentally_unhealthy_column, food_env_column, income_ratio_column]]


            if year not in data_by_year:
                data_by_year[year] = merged_df
            else:
                data_by_year[year] = pd.concat([data_by_year[year], merged_df], ignore_index=True)

    # Write each year's data to a new Excel file
    for year, data in data_by_year.items():
        output_file_path = os.path.join(output_folder, f"{year}_Limited_Access_to_Healthy_Foods.csv")
        data.to_csv(output_file_path, index=False)
        print(f"Data for {year} saved to {output_file_path}")


In [23]:
input_folder = "County Health Rankings"  
output_folder = "health-regression"
    
extract_and_combine_data(input_folder, output_folder)

2019 County Health Rankings New Jersey Data - v1_0.xls
2022 County Health Rankings Alabama Data - v2.xlsx
2023 County Health Rankings New York Data - v3.xlsx
2019 County Health Rankings New Mexico Data - v1_0.xls
2023 County Health Rankings North Dakota Data - v2.xlsx
2018 County Health Rankings Kentucky Data - v3.xls
2019 County Health Rankings Kentucky Data - v1_0.xls
2022 County Health Rankings South Dakota Data - v2.xlsx
2024 County Health Rankings Delaware Data - v2.xlsx
2023 County Health Rankings Wisconsin Data - v3.xlsx
2022 County Health Rankings Georgia Data - v2.xlsx
2019 County Health Rankings Maine Data - v1_0.xls
2022 County Health Rankings Nebraska Data - v1_0.xlsx
2018 County Health Rankings Arkansas Data - v3.xls
2024 County Health Rankings Ohio Data - v2.xlsx
2021 County Health Rankings Arizona Data - v1.xlsx
2018 County Health Rankings Maine Data - v3.xls
2015 County Health Rankings Colorado Data - v3.xls
2015 County Health Rankings Iowa Data - v3.xls
2024 County Hea