# Combine Feature Files by Year

## Purpose
This notebook combines individual feature CSV files for each year (2012–2019) into consolidated datasets. It merges feature files horizontally for each year and stacks the yearly datasets vertically into a single file for all years.

## Output
- Combined feature files for each year saved in `data/processed/combined_by_year`.
- A single dataset for all years saved in `data/processed/combined_by_year/combined_all_years.csv`.

## Notes
- The merging is based on common columns: `County`, `State`, `State_FIPS`, `County_FIPS`, and `mean_life_expectancy`.
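- Because the common columns are used as merge keys, each of them appears only once in the merged output (no duplicate columns), and an **outer join** is used so that rows present in only some feature files are still kept.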

In [2]:
import pandas as pd
from pathlib import Path
import os

In [4]:
# Destination folder for the combined outputs; create it up front if it is missing
output_directory = Path("../data/processed/combined_by_year")
output_directory.mkdir(parents=True, exist_ok=True)

# Source folder holding the CSV files to be combined
directory = Path("../data/processed/final_dataset/all_features_2012_to_2019")

In [6]:
# Function to combine all features for a single year horizontally
def combine_features_for_year(year, input_dir=None, output_dir=None):
    """
    Combine all feature files for a single year by merging them horizontally.

    Every CSV in the input directory whose name ends with ``{year}.csv`` is
    read and outer-merged on the shared key columns. ``mean_life_expectancy``
    is part of the merge key, so it is retained as a single column. A ``Year``
    column is added and the result is written to
    ``combined_features_{year}.csv`` in the output directory.

    Parameters
    ----------
    year : int or str
        Year used both to select the input files and to fill the ``Year`` column.
    input_dir : path-like, optional
        Folder to read feature files from. Defaults to the module-level
        ``directory``.
    output_dir : path-like, optional
        Folder to write the combined file to (created if missing). Defaults to
        the module-level ``output_directory``.

    Returns
    -------
    pandas.DataFrame
        The merged features for the year, including the ``Year`` column.

    Raises
    ------
    FileNotFoundError
        If no file matching ``*{year}.csv`` exists in the input directory.
    """
    source_dir = Path(directory if input_dir is None else input_dir)
    target_dir = Path(output_directory if output_dir is None else output_dir)

    # glob() yields paths in an arbitrary, filesystem-dependent order; sorting
    # makes the merge order (and therefore the column order) reproducible.
    files = sorted(source_dir.glob(f"*{year}.csv"))
    if not files:
        raise FileNotFoundError(
            f"No feature files matching '*{year}.csv' found in {source_dir}"
        )
    dataframes = [pd.read_csv(file) for file in files]  # Read all files for the year

    # NOTE(review): mean_life_expectancy is used as a join key; its values must
    # match exactly across files or the outer join will produce extra rows.
    merge_keys = ["County", "State", "State_FIPS", "County_FIPS", "mean_life_expectancy"]

    # Merge all files horizontally on common keys
    combined_year_df = dataframes[0]
    for df in dataframes[1:]:
        combined_year_df = pd.merge(combined_year_df, df, on=merge_keys, how="outer")

    # Add the Year column for identification
    combined_year_df["Year"] = year

    # Save the combined file for the year
    target_dir.mkdir(parents=True, exist_ok=True)
    combined_year_filepath = target_dir / f"combined_features_{year}.csv"
    combined_year_df.to_csv(combined_year_filepath, index=False)
    print(f"Saved combined features for {year} at {combined_year_filepath}")

    return combined_year_df

In [8]:
# Stack the per-year combined tables into one long DataFrame
def combine_all_years(start_year, end_year):
    """
    Build a single DataFrame covering start_year through end_year (inclusive).

    Each year's features are first combined horizontally via
    combine_features_for_year (which also saves that year's file), then the
    yearly frames are concatenated row-wise with a fresh index. The stacked
    result is written to "combined_all_years.csv" in the output directory
    and returned.
    """
    # One horizontally merged frame per year, in chronological order
    yearly_frames = [
        combine_features_for_year(yr)
        for yr in range(start_year, end_year + 1)
    ]

    # Row-wise concatenation; the original per-year indices are discarded
    stacked = pd.concat(yearly_frames, axis=0, ignore_index=True)

    # Persist the stacked dataset next to the per-year files
    destination = output_directory / "combined_all_years.csv"
    stacked.to_csv(destination, index=False)
    print(f"Saved combined file for all years at {destination}")

    return stacked

In [10]:
# Example: Combine all files from 2012 to 2019
final_combined_df = combine_all_years(2012, 2019)

# Verify the final combined DataFrame.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
final_combined_df.info()

Saved combined features for 2012 at ../data/processed/combined_by_year/combined_features_2012.csv
Saved combined features for 2013 at ../data/processed/combined_by_year/combined_features_2013.csv
Saved combined features for 2014 at ../data/processed/combined_by_year/combined_features_2014.csv
Saved combined features for 2015 at ../data/processed/combined_by_year/combined_features_2015.csv
Saved combined features for 2016 at ../data/processed/combined_by_year/combined_features_2016.csv
Saved combined features for 2017 at ../data/processed/combined_by_year/combined_features_2017.csv
Saved combined features for 2018 at ../data/processed/combined_by_year/combined_features_2018.csv
Saved combined features for 2019 at ../data/processed/combined_by_year/combined_features_2019.csv
Saved combined file for all years at ../data/processed/combined_by_year/combined_all_years.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24888 entries, 0 to 24887
Data columns (total 28 columns):
 #   Column 