In [1]:
import pandas as pd

ad = pd.read_csv('datasets/1976-2020-president.csv')
years = set(ad['year'])
states = set(ad['state'])

In [2]:
def to_percent(older, col_key):
    newer = []
    for y in years:
        for s in states:
            try:
                d = older.loc[y, s, 'DEMOCRAT'][col_key]
                r = older.loc[y, s, 'REPUBLICAN'][col_key]
                newer.append([int(y), s, 100*d/(d+r)]) # % share for each state
            except KeyError as e:
                pass
    return pd.DataFrame(data=newer, columns=['year', 'state', col_key, ])

In [3]:
def to_splatter(a, col_name): # from row-column to table-list
    lst = []
    for year in a.columns[2:]:
        for index, row in a.iterrows():
            lst.append([int(year), row['GeoName'].upper(), row[year], ])
    
    return pd.DataFrame(data=lst, columns=['year', 'state', col_name, ])

In [4]:
pop_vote = pd.read_csv('datasets/1976-2020-president.csv')
pop_vote = pop_vote[['year', 'state', 'party_simplified', 'candidatevotes', ]] # drop useless columns
pop_vote = pop_vote.groupby(['year', 'state', 'party_simplified', ]).sum() # drop multiple candidate
pop_vote = to_percent(pop_vote, 'candidatevotes')
pop_vote = pop_vote.rename(columns={"candidatevotes": "y_votes_percent", })
pop_vote.to_csv('datasets/pop_vote.csv', index=False)
pop_vote.head()

Unnamed: 0,year,state,y_votes_percent
0,1984,NEW JERSEY,39.478609
1,1984,DISTRICT OF COLUMBIA,86.147734
2,1984,HAWAII,44.296276
3,1984,PENNSYLVANIA,46.299269
4,1984,MASSACHUSETTS,48.60167


In [5]:
'''It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997. Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year. While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year. Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.'''

'It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997. Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year. While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year. Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.'

In [6]:
gdp_nom_97 = to_splatter(pd.read_csv('datasets/gdp-nominal-63-97.csv'), 'nominal_gdp_mln')
gdp_nom_12 = to_splatter(pd.read_csv('datasets/gdp-nominal-97-20.csv'), 'nominal_gdp_mln')
gdp_real_97 = to_splatter(pd.read_csv('datasets/gdp-real-77-97-chain-97.csv'), 'real_gdp_mln')
gdp_real_12 = to_splatter(pd.read_csv('datasets/gdp-real-97-20-chain-12.csv'), 'real_gdp_mln')

In [7]:
gdp_nom_97 = gdp_nom_97.set_index(['year', 'state'])
gdp_nom_12 = gdp_nom_12.set_index(['year', 'state'])
gdp_real_97 = gdp_real_97.set_index(['year', 'state'])
gdp_real_12 = gdp_real_12.set_index(['year', 'state'])

In [8]:
gdp_def_97 = (gdp_nom_97['nominal_gdp_mln'] / gdp_real_97['real_gdp_mln']).to_frame('gdp_def').dropna()
gdp_def_12 = (gdp_nom_12['nominal_gdp_mln'] / gdp_real_12['real_gdp_mln']).to_frame('gdp_def').dropna()

In [9]:
gdp_def_97.loc[(slice(None),'ALASKA'),:] # gdp_def_12.loc[(slice(None),'ALASKA'),:]

Unnamed: 0_level_0,Unnamed: 1_level_0,gdp_def
year,state,Unnamed: 2_level_1
1977,ALASKA,0.44553
1978,ALASKA,0.494832
1979,ALASKA,0.562546
1980,ALASKA,0.681013
1981,ALASKA,0.843731
1982,ALASKA,0.883878
1983,ALASKA,0.874878
1984,ALASKA,0.885818
1985,ALASKA,0.876271
1986,ALASKA,0.768556


In [10]:
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote['candidatevotes'] = house_vote['candidatevotes'].apply(lambda x: 1 if x < 50 else -1)
house_vote['year'] -= 4
house_vote = house_vote.rename(columns={'candidatevotes':'incumbent'})
house_vote

Unnamed: 0,year,state,incumbent
0,1980,NEW JERSEY,-1
1,1980,HAWAII,-1
2,1980,PENNSYLVANIA,-1
3,1980,MASSACHUSETTS,-1
4,1980,VERMONT,1
...,...,...,...
584,1976,SOUTH DAKOTA,-1
585,1976,OREGON,-1
586,1976,WASHINGTON,-1
587,1976,NORTH CAROLINA,-1


In [11]:
'''
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })
#'''

'\nhouse_vote = pd.read_csv(\'datasets/1976-2020-house-utf8.csv\')\nhouse_vote = house_vote[[\'year\', \'state\', \'party\', \'candidatevotes\', ]]\nhouse_vote = house_vote.groupby([\'year\', \'state\', \'party\', ]).sum()\nhouse_vote = to_percent(house_vote, \'candidatevotes\')\nhouse_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })\n#'

In [12]:
'''
df = pd.merge(pop_vote, nom_gdp, how='inner', left_on=['year','state'], right_on=['year','state'])
df = pd.merge(df, house_vote, how='inner', left_on=['year','state'], right_on=['year','state'])
df.head()
#'''

"\ndf = pd.merge(pop_vote, nom_gdp, how='inner', left_on=['year','state'], right_on=['year','state'])\ndf = pd.merge(df, house_vote, how='inner', left_on=['year','state'], right_on=['year','state'])\ndf.head()\n#"

In [13]:
'''
df.to_csv('datasets/xxx-final-dataset.csv', index=False)
#'''

"\ndf.to_csv('datasets/xxx-final-dataset.csv', index=False)\n#"

In [16]:
gas = pd.read_csv('datasets/gasoline.csv')
gas.date = gas.date.apply(lambda x: time.mktime(datetime.datetime.strptime(x,"%m/%d/%Y").timetuple()))
gas.date = gas.date.apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
gas.head()

In [21]:
import time
import datetime