In [2]:
import pandas as pd
import numpy as np


INSTRUCTIONS:
    1. make sure the IncomeStatement.xlsx is in the same folder as this file
    2. The purpose of dataCleaning.py is to clean up the raw IncomeStatement.xlsx
    3. The cleanIncomeData.xlsx is supposed to be the final data we use for the project 
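    4. The indexing has been set up so that you can use .loc[typeOfIncome][year] to get the value (row label first, then the year as a string)
    5. Use incomeDF.loc['gross cash income']['2016'] to get 399353213000 (the raw sheet is in thousands of dollars; the cleaned values are multiplied by 1000)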
    6. For numerical indexing --> incomeDF.iloc[row_idx, col_idx]
    7. NOTE) the numerical indexing starts from 0: row 0 is 'gross cash income' and column 0 is '2016'
    8. SO incomeDF.iloc[0, 0] gets 399353213000

In [7]:
# Load the "United States" sheet. header=1 tells pandas to take the second
# spreadsheet row as the column labels, so the row above it is ignored.
incomeDF = pd.read_excel("IncomeStatement.xlsx", sheet_name="United States", header=1)

# IMPORTANT: column labels before and after normalisation
#   Raw columns        - ['United States', 2016, 2017, ..., 2023, '2024F', '2025F']
#   After truncation   - ['Unit', '2016', '2017', ..., '2023', '2024', '2025']
#   Final (accessible) - ['cat',  '2016', '2017', ..., '2023', '2024', '2025']
# Every label is converted to a string and cut to its first 4 characters, which
# strips the 'F' suffix from the forecast years and turns 'United States' into 'Unit'.
incomeDF.columns = [str(label)[:4] for label in incomeDF.columns]

# Give the label column its working name. The lookup key must be the truncated
# label 'Unit': 'United States' no longer exists once the labels are cut to 4 chars.
incomeDF = incomeDF.rename(columns={'Unit': 'cat'})

# Discard the first two data rows (presumably unit / sub-header rows - confirm
# against the workbook) and renumber the remaining rows from 0.
incomeDF = incomeDF.drop(index=[0, 1]).reset_index(drop=True)

In [10]:
# Number of leading rows kept after cleaning; everything below is cut off
# (presumably notes / footer rows in the sheet - confirm against the workbook).
N_INCOME_ROWS = 31

# Use the category labels as the row index so values can be looked up by name.
# NOTE: this cell consumes the 'cat' column and rescales the values, so it is
# meant to run exactly once after the load cell.
incomeDF = incomeDF.set_index('cat')

# Remove footnote markers like '1/' from the category names, trim surrounding
# whitespace, then force the labels to lowercase strings.
incomeDF.index = (
    incomeDF.index
    .str.replace(r'\d+/', '', regex=True)
    .str.strip()
    .astype(str)
    .str.lower()
)

# Drop rows where every year is NaN, then keep only the first N_INCOME_ROWS rows.
incomeDF = incomeDF.dropna(axis=0, how="all").iloc[:N_INCOME_ROWS, :]

# Convert every column to a numeric dtype (unparseable cells become NaN) and
# multiply by 1000, as the sheet reports values in 1000s of dollars.
# The result is rebound rather than written through `.iloc[:, :] = ...`:
# assigning into a sliced frame can raise SettingWithCopyWarning and may keep
# the columns as object dtype instead of real numbers.
incomeDF = incomeDF.apply(pd.to_numeric, errors='coerce') * 1000

print(incomeDF.head(5))  # Look at the first 5 rows

                                       2016          2017          2018  \
cat                                                                       
gross cash income              399353213000  413155591000  414846669000   
all commodity receipts         358481924000  370436722000  372070460000   
crop receipts                  195751261000  194877004000  195977424000   
animals and products receipts  162730663000  175559718000  176093036000   
cash farm-related income        27891612000   31187258000   29107198000   

                                       2019          2020          2021  \
cat                                                                       
gross cash income              426473417000  447356024000  500618160000   
all commodity receipts         369316702000  367498927000  442416992000   
crop receipts                  193755281000  202491297000  246031209000   
animals and products receipts  175561421000  165007630000  196385783000   
cash farm-related income

In [13]:
# Write the cleaned frame to its own workbook. The index is written as the
# first column so each row keeps its category label.
outputFile = 'cleanIncomeData.xlsx'
incomeDF.to_excel(
    outputFile,
    sheet_name='Cleaned Data',
    index=True,  # keep the category labels
    engine='openpyxl',
)
