# Data Processing

This notebook handles data processing and creates the documented datasets used by our models.

### Dataset #1: (all_states.csv)<br>
This dataset provides data for all states and adds a year feature derived from the original Excel file.

In [7]:
def normalize_election_result(value):
    """
    Normalize an Election Result value to a single-letter party code.

    String values are matched case-insensitively with surrounding whitespace
    ignored: 'Republican' / 'R' become 'R' and 'Democratic' / 'D' become 'D'.

    Args:
    - value: The raw cell value; may be any type.

    Returns:
    - 'R' or 'D' for a recognized party string. Any other value (non-strings
      and unrecognized strings) is returned exactly as it was passed in.
    """
    if not isinstance(value, str):
        return value

    # Match on a cleaned copy so an unrecognized string is handed back
    # untouched instead of stripped and lower-cased.
    cleaned = value.strip().lower()
    if cleaned in ('republican', 'r'):
        return 'R'
    if cleaned in ('democratic', 'd'):
        return 'D'
    return value

def process_excel_to_csv(input_excel_path, output_csv_path, years_to_process):
    """
    Combine selected sheets of an Excel workbook into a single CSV file.

    Each requested sheet is read into a DataFrame, its 'Election Result'
    column (when present) is normalized with normalize_election_result, and a
    'Year' column holding the sheet name converted to int is added. The
    per-sheet frames are concatenated in the order given and written to CSV
    without the index.

    Args:
    - input_excel_path: Path to the raw Excel file.
    - output_csv_path: Path to save the processed CSV file. Its parent
      directory is created if it does not exist.
    - years_to_process: List of sheet names (years) to process. Names that
      are not present in the workbook are skipped with a printed warning.

    Raises:
    - ValueError: If a sheet name that exists in the workbook cannot be
      converted to int.
    """
    # os.path.dirname returns '' for a bare filename and os.makedirs('')
    # raises, so only create the directory when the path actually has one.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Collect per-sheet frames and concatenate once at the end; growing a
    # DataFrame inside the loop re-copies all previous rows on every pass.
    frames = []

    # The context manager closes the workbook handle even if a sheet fails.
    with pd.ExcelFile(input_excel_path) as xls:
        for sheet_name in years_to_process:
            if sheet_name not in xls.sheet_names:
                print(f"Warning: sheet '{sheet_name}' not found in {input_excel_path}; skipping")
                continue

            df = pd.read_excel(xls, sheet_name=sheet_name)

            if 'Election Result' in df.columns:
                df['Election Result'] = df['Election Result'].apply(normalize_election_result)

            df['Year'] = int(sheet_name)
            frames.append(df)

    # pd.concat rejects an empty list; fall back to an empty frame so a CSV
    # is still written when no requested sheet was found.
    combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    combined_data.to_csv(output_csv_path, index=False)
    print(f"Processed data saved to {output_csv_path}")

# Example usage: build all_states.csv from the raw multi-sheet workbook.
input_excel_path = '../data_raw/raw_data.xlsx'  # Relative path to the raw workbook; adjust based on your directory structure
output_csv_path = '../data_processed/all_states/all_states.csv'  # Destination CSV; process_excel_to_csv creates the parent directory if missing
years_to_process = ['2024', '2020', '2016', '2012', '2008', '2004', '2000']  # Sheet names to read; each is converted to int for the 'Year' column

process_excel_to_csv(input_excel_path, output_csv_path, years_to_process)



Processed data saved to ../data_processed/all_states/all_states.csv
