In [1]:
import pandas as pd
import re
from collections import defaultdict

In [2]:
# Relative path to the input CSV of income values by state and year.
income_file_path = "resources/state_income_data.csv"

In [3]:
# Load the wide-format table: one row per region, one column per year.
income_df = pd.read_csv(income_file_path)

In [4]:
# Preview the first rows to check the layout of the loaded table.
income_df.head()

Unnamed: 0,State,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
0,United States,68703,63179,61372,59039,56516,53657,53585,51017,50054,49276
1,Alabama,56200,49936,51113,47221,44509,42278,47320,43464,42590,40933
2,Alaska,78394,68734,72231,75723,75112,67629,72472,63648,57431,57848
3,Arizona,70674,62283,61125,57100,52248,49254,52611,47044,48621,46896
4,Arkansas,54539,49781,48829,45907,42798,44922,39376,39018,41302,38587


In [5]:
# List the column labels: one name column followed by one column per year.
income_df.columns

Index(['State', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
       '2011', '2010'],
      dtype='object')

In [6]:
# Lower-case the "State" header so it matches the `state` column name used by
# the later cells. Rebinding the result (instead of inplace=True) keeps the
# cell safe to re-run and avoids mutating a frame that was already displayed.
income_df = income_df.rename(columns={"State": "state"})

In [7]:
# Confirm the name column is now labelled `state`.
income_df.head()

Unnamed: 0,state,2019,2018,2017,2016,2015,2014,2013,2012,2011,2010
0,United States,68703,63179,61372,59039,56516,53657,53585,51017,50054,49276
1,Alabama,56200,49936,51113,47221,44509,42278,47320,43464,42590,40933
2,Alaska,78394,68734,72231,75723,75112,67629,72472,63648,57431,57848
3,Arizona,70674,62283,61125,57100,52248,49254,52611,47044,48621,46896
4,Arkansas,54539,49781,48829,45907,42798,44922,39376,39018,41302,38587


In [8]:
def condense_date_columns(df):
    """Reshape a wide table (one column per date) into long format.

    Columns whose label starts with a letter are treated as identifier
    columns and repeated on every output row. Every other column is treated
    as a date column: its label becomes the ``date`` value and its cells
    become the ``income`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table. Cells in the date columns may be numbers or strings with
        thousands separators (e.g. ``"68,703"``). The index of ``df`` is
        ignored, so filtered or re-indexed frames are handled correctly.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, date column), grouped date column by date
        column in their original order. Columns are the identifier columns
        followed by ``date`` (str) and ``income`` (float); the index is a
        fresh 0..n-1 range. If ``df`` has no date columns, an empty frame
        with those column names is returned.

    Raises
    ------
    ValueError
        If a cell in a date column cannot be parsed as a number.
    """
    # str(label)[:1] tolerates non-string or empty labels, unlike label[0].
    non_date_columns = [col for col in df.columns if str(col)[:1].isalpha()]
    date_columns = [col for col in df.columns if not str(col)[:1].isalpha()]

    if not date_columns:
        # Nothing to unpivot; still hand back the expected schema.
        return pd.DataFrame(columns=[*non_date_columns, "date", "income"])

    # melt stacks the date columns one after another (all rows for the first
    # date column, then all rows for the next, ...) and resets the index.
    long_df = df.melt(
        id_vars=non_date_columns,
        value_vars=date_columns,
        var_name="date",
        value_name="income",
    )

    # Column labels may not be strings (e.g. integer years); normalise them.
    long_df["date"] = long_df["date"].astype(str)

    # Going through str lets numeric cells and "68,703"-style strings share
    # one code path: drop thousands separators, then parse as float.
    long_df["income"] = (
        long_df["income"]
        .astype(str)
        .str.replace(",", "", regex=False)
        .astype(float)
    )

    return long_df

In [9]:
# Reshape from wide (one column per year) to long (one row per state and year).
final_income_df = condense_date_columns(income_df)

In [10]:
# Sanity-check the long table: print (rows, columns), then preview the first rows.
print(final_income_df.shape)
final_income_df.head()

(520, 3)


Unnamed: 0,state,date,income
0,United States,2019,68703.0
1,Alabama,2019,56200.0
2,Alaska,2019,78394.0
3,Arizona,2019,70674.0
4,Arkansas,2019,54539.0


In [11]:
# Create the key column from the state name (lower-cased, spaces removed)
# followed by the date (dashes removed), e.g. "United States" + "2019" ->
# "unitedstates2019". Vectorised string ops avoid a Python call per row.
final_income_df['region_date'] = (
    final_income_df['state'].str.lower().str.replace(" ", "", regex=False)
    + final_income_df['date'].str.replace("-", "", regex=False)
)

In [12]:
# Check that `region_date` was added alongside state, date and income.
final_income_df.columns

Index(['state', 'date', 'income', 'region_date'], dtype='object')

In [13]:
# Output column order, with the `region_date` key column first.
reordered_columns = ['region_date', 'state', 'date', 'income']

In [14]:
# Apply the column order defined in `reordered_columns`, then preview the result.
final_income_df = final_income_df[reordered_columns]
final_income_df.head()

Unnamed: 0,region_date,state,date,income
0,unitedstates2019,United States,2019,68703.0
1,alabama2019,Alabama,2019,56200.0
2,alaska2019,Alaska,2019,78394.0
3,arizona2019,Arizona,2019,70674.0
4,arkansas2019,Arkansas,2019,54539.0


In [15]:
# Write the long-format table to CSV; index=False omits the DataFrame index.
final_income_df.to_csv("resources/final_income_data.csv", index=False)