# and then we go to mario draghi

In [34]:
import pandas as pd
import time
import datetime
import matplotlib
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
import math

# The presidential returns also supply the sets of years and states that the
# helper functions below iterate over.
ad = pd.read_csv('datasets/1976-2020-president.csv')
years, states = set(ad['year']), set(ad['state'])

#### Helper functions

In [35]:
def ecdf_inv(a, q):
    """Return the empirical ``q``-quantile of ``a`` (inverse empirical CDF).

    The values are sorted and the element at position ``int(len(a) * q)`` is
    returned; no interpolation is performed.

    Parameters
    ----------
    a : iterable
        Sample values.
    q : float
        Quantile level, expected in ``[0, 1]``.

    Returns
    -------
    The order statistic of ``a`` selected by ``q``. For ``q == 1`` this is
    the largest value.

    Raises
    ------
    IndexError
        If ``a`` is empty.
    """
    ordered = sorted(a)
    # int(len * q) equals len when q == 1, which is one past the end;
    # clamp to the last valid position so the maximum is returned instead.
    position = min(int(len(ordered) * q), len(ordered) - 1)
    return ordered[position]

In [36]:
def to_percent(older, col_key):
    """Compute the Democratic share of the two-party total per (year, state).

    ``older`` is looked up with ``.loc[year, state, party]``. Every combination
    of the module-level ``years`` and ``states`` is tried; combinations for
    which the DEMOCRAT or the REPUBLICAN lookup raises ``KeyError`` are skipped.

    Returns a DataFrame with columns ``year``, ``state`` and ``col_key``, where
    ``col_key`` holds ``100 * dem / (dem + rep)``.
    """
    shares = []
    for election_year in years:
        for state_name in states:
            try:
                dem = older.loc[election_year, state_name, 'DEMOCRAT'][col_key]
                rep = older.loc[election_year, state_name, 'REPUBLICAN'][col_key]
            except KeyError:
                # no two-party result for this year/state pair
                continue
            # % share for each state
            shares.append([int(election_year), state_name, 100 * dem / (dem + rep)])
    return pd.DataFrame(data=shares, columns=['year', 'state', col_key])

In [37]:
def to_splatter(a, col_name):
    """Reshape a wide table (one column per year) into a long list of rows.

    Columns from the third one onward are treated as year labels. For each of
    them, every row of ``a`` contributes one record made of the year as an
    int, the upper-cased ``GeoName`` and the cell value.

    Returns a DataFrame with columns ``year``, ``state`` and ``col_name``.
    """
    records = [
        [int(period), record['GeoName'].upper(), record[period]]
        for period in a.columns[2:]
        for _, record in a.iterrows()
    ]
    return pd.DataFrame(data=records, columns=['year', 'state', col_name])

In [50]:
def var42(df_res, colname, filename, old_colname, n_years=4):
    """Build a trailing window of ``old_colname`` for each (year, state) and save it.

    For every combination of the module-level ``years`` and ``states``, the
    values of ``old_colname`` at ``year, year-1, ..., year-(n_years-1)`` are
    collected. A combination is skipped when any year of its window is missing
    from ``df_res``.

    Parameters
    ----------
    df_res : DataFrame
        Long table with ``year``, ``state`` and ``old_colname`` columns.
    colname : str
        Prefix of the generated columns.
    filename : str
        Stem of the CSV written to ``datasets-clean/``.
    old_colname : str
        Column of ``df_res`` to read the values from.
    n_years : int, default 4
        Window length. Previously hard-coded to 4.

    Returns
    -------
    DataFrame sorted by ``year`` and ``state`` with columns ``year``, ``state``,
    ``{colname}_{n_years}``, ..., ``{colname}_1``. The highest suffix holds the
    value for ``year`` itself, and ``{colname}_1`` the oldest year of the window.
    """
    indexed = df_res.set_index(['year', 'state'])

    rows = []
    for y in years:
        for s in states:
            try:
                window = [indexed.loc[y - lag, s][old_colname] for lag in range(n_years)]
            except KeyError:
                # incomplete window for this year/state: leave it out
                continue
            rows.append([y, s] + window)

    labels = [f'{colname}_{k}' for k in range(n_years, 0, -1)]
    new_df = pd.DataFrame(data=rows, columns=['year', 'state'] + labels)
    new_df = new_df.sort_values(by=['year', 'state'])
    new_df.to_csv(f'datasets-clean/{filename}.csv', index=False)
    return new_df

#### y_hat, a.k.a. presidential popular vote

In [39]:
# Total candidate votes per (year, state, party): keep only the needed columns,
# then sum over multiple candidates of the same party.
pop_vote = (
    pd.read_csv('datasets/1976-2020-president.csv')
    [['year', 'state', 'party_simplified', 'candidatevotes']]
    .groupby(['year', 'state', 'party_simplified'])
    .sum()
)
# Democratic two-party share, renamed to the target column and ordered for export.
pop_vote = (
    to_percent(pop_vote, 'candidatevotes')
    .rename(columns={"candidatevotes": "y_votes_percent"})
    .sort_values(by=['year', 'state'])
)
pop_vote.to_csv('datasets-clean/popular-vote-y.csv', index=False)
pop_vote

Unnamed: 0,year,state,y_votes_percent
525,1976,ALABAMA,56.666724
532,1976,ALASKA,38.108171
518,1976,ARIZONA,41.386697
546,1976,ARKANSAS,65.048851
558,1976,CALIFORNIA,49.082173
...,...,...,...
174,2020,VIRGINIA,55.154687
178,2020,WASHINGTON,59.925503
181,2020,WEST VIRGINIA,30.201468
154,2020,WISCONSIN,50.319063


#### GDP and friends

It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997.
Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year.
While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year.
~~Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.~~

In [40]:
# Load each GDP table, reshape it to long format and key it by (year, state).
gdp_nom_97 = to_splatter(
    pd.read_csv('datasets/gdp-nominal-63-97.csv'), 'nominal_gdp_mln'
).set_index(['year', 'state'])
gdp_nom_12 = to_splatter(
    pd.read_csv('datasets/gdp-nominal-97-20.csv'), 'nominal_gdp_mln'
).set_index(['year', 'state'])
gdp_real_97 = to_splatter(
    pd.read_csv('datasets/gdp-real-77-97-chain-97.csv'), 'real_gdp_mln'
).set_index(['year', 'state'])
gdp_real_12 = to_splatter(
    pd.read_csv('datasets/gdp-real-97-20-chain-12.csv'), 'real_gdp_mln'
).set_index(['year', 'state'])

In [41]:
# Deflator = nominal / real, aligned on (year, state); pairs missing from
# either table produce NaN and are dropped.
gdp_def_97 = (
    gdp_nom_97['nominal_gdp_mln']
    .div(gdp_real_97['real_gdp_mln'])
    .to_frame('gdp_def')
    .dropna()
)
gdp_def_12 = (
    gdp_nom_12['nominal_gdp_mln']
    .div(gdp_real_12['real_gdp_mln'])
    .to_frame('gdp_def')
    .dropna()
)

In [42]:
gdp_def = []

# Year-over-year % change of the deflator. Each series is processed on its
# own, so a ratio never mixes values from the two tables.
for deflator in (gdp_def_97, gdp_def_12):
    for s in states:
        # years available for this state, in ascending order
        yyyy = sorted({key[0] for key in deflator.loc[(slice(None), s), :].index})
        for y in yyyy[1:]:  # the first available year has no predecessor
            q = deflator.loc[(y, s), :]['gdp_def'] / deflator.loc[(int(y) - 1, s), :]['gdp_def']
            gdp_def.append([y, s, (q - 1) * 100])

gdp_def = pd.DataFrame(data=gdp_def, columns=['year', 'state', 'gdp_def'])
gdp_def = gdp_def.sort_values(by=['year', 'state'])
gdp_def.to_csv('datasets-clean/gdp-def-percent-change-year-ago.csv', index=False)
gdp_def

Unnamed: 0,year,state,gdp_def
300,1978,ALABAMA,7.402566
440,1978,ALASKA,11.065835
160,1978,ARIZONA,7.468516
720,1978,ARKANSAS,8.054511
960,1978,CALIFORNIA,7.094885
...,...,...,...
1525,2020,VIRGINIA,1.950355
1617,2020,WASHINGTON,1.658615
1686,2020,WEST VIRGINIA,-0.257632
1065,2020,WISCONSIN,2.055099


In [51]:
# Trailing window of deflator growth per (year, state); var42 also writes it
# to datasets-clean/def-index.csv.
def_index = var42(
    df_res=gdp_def,
    colname='def',
    filename='def-index',
    old_colname='gdp_def',
)
def_index

Unnamed: 0,year,state,def_4,def_3,def_2,def_1
15,1984,ALABAMA,4.548485,4.185139,6.426162,9.372111
22,1984,ALASKA,1.250442,-1.018197,4.758325,23.893438
8,1984,ARIZONA,4.731524,5.112800,6.966784,8.376158
36,1984,ARKANSAS,4.363585,4.161762,5.603037,8.995826
48,1984,CALIFORNIA,4.737549,4.477916,6.742430,8.871767
...,...,...,...,...,...,...
174,2020,VIRGINIA,1.950355,2.222617,1.866446,1.307918
178,2020,WASHINGTON,1.658615,1.894215,1.658639,1.276351
181,2020,WEST VIRGINIA,-0.257632,0.908538,3.283808,3.326667
154,2020,WISCONSIN,2.055099,2.222565,1.889366,1.230655


In [52]:
gdp_real = []

# Year-over-year % change of real GDP. Each series is processed on its own,
# so a ratio never mixes values from the two tables.
for real_series in (gdp_real_97, gdp_real_12):
    for s in states:
        # years available for this state, in ascending order
        yyyy = sorted({key[0] for key in real_series.loc[(slice(None), s), :].index})
        for y in yyyy[1:]:  # the first available year has no predecessor
            q = real_series.loc[(y, s), :]['real_gdp_mln'] / real_series.loc[(int(y) - 1, s), :]['real_gdp_mln']
            gdp_real.append([y, s, (q - 1) * 100])

gdp_real = pd.DataFrame(data=gdp_real, columns=['year', 'state', 'gdp_real'])
gdp_real = gdp_real.sort_values(by=['year', 'state'])
gdp_real.to_csv('datasets-clean/gdp-real-percent-change-year-ago.csv', index=False)
gdp_real

Unnamed: 0,year,state,gdp_real
300,1978,ALABAMA,6.421827
440,1978,ALASKA,8.958679
160,1978,ARIZONA,10.462245
720,1978,ARKANSAS,6.534163
960,1978,CALIFORNIA,6.881903
...,...,...,...
1525,2020,VIRGINIA,-2.757118
1617,2020,WASHINGTON,-0.581850
1686,2020,WEST VIRGINIA,-3.902932
1065,2020,WISCONSIN,-4.006500


#### z growth index

In [53]:
# Flag each (year, state) with 1 when real GDP growth exceeds 2.67 %, else 0.
z_yearly = gdp_real.assign(
    gdp_real=gdp_real['gdp_real'].apply(lambda growth: 1 if growth > 2.67 else 0)
)

growth_index = var42(
    df_res=z_yearly,
    colname='z',
    filename='z-growth-index',
    old_colname='gdp_real',
)
growth_index

Unnamed: 0,year,state,z_4,z_3,z_2,z_1
15,1984,ALABAMA,1,1,0,0
22,1984,ALASKA,1,0,0,1
8,1984,ARIZONA,1,1,0,1
36,1984,ARKANSAS,1,1,0,1
48,1984,CALIFORNIA,1,1,0,1
...,...,...,...,...,...,...
174,2020,VIRGINIA,0,0,0,0
178,2020,WASHINGTON,0,1,1,1
181,2020,WEST VIRGINIA,0,0,0,0
154,2020,WISCONSIN,0,0,0,0


#### g gdp index

In [54]:
# Trailing window of real GDP growth per (year, state); var42 also writes it
# to datasets-clean/gdp-growth-index.csv.
gdp_index = var42(
    df_res=gdp_real,
    colname='gdp',
    filename='gdp-growth-index',
    old_colname='gdp_real',
)
gdp_index

Unnamed: 0,year,state,gdp_4,gdp_3,gdp_2,gdp_1
15,1984,ALABAMA,4.719412,4.944172,-2.488787,1.840597
22,1984,ALASKA,3.995028,-2.715237,2.618149,15.266624
8,1984,ARIZONA,10.752575,5.715111,-2.352463,2.781647
36,1984,ARKANSAS,7.839078,3.206390,-2.673933,3.580354
48,1984,CALIFORNIA,8.028589,3.578332,0.028616,3.291989
...,...,...,...,...,...,...
174,2020,VIRGINIA,-2.757118,1.973924,2.377204,1.469457
178,2020,WASHINGTON,-0.581850,3.946746,6.794566,5.359424
181,2020,WEST VIRGINIA,-3.902932,-0.781380,2.315721,2.068235
154,2020,WISCONSIN,-4.006500,1.494643,2.430560,0.133906


#### incumbent

In [55]:
incumbent = pd.read_csv('datasets/incumbent-4president-76-20.csv', sep=';')

# Cross every (year, state) pair with the incumbency table, matching on year
# only, so each state receives that year's incumbency values.
tmp = pd.DataFrame(data=[(y, s) for y in years for s in states], columns=['year', 'state'])
incumbent = tmp.merge(incumbent, how='inner', on='year').sort_values(by=['year', 'state'])
incumbent.to_csv('datasets-clean/incumbent-longitudinal-replication.csv', index=False)
incumbent

Unnamed: 0,year,state,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice
525,1976,ALABAMA,-1,0,-1.0,0,0
532,1976,ALASKA,-1,0,-1.0,0,0
518,1976,ARIZONA,-1,0,-1.0,0,0
546,1976,ARKANSAS,-1,0,-1.0,0,0
558,1976,CALIFORNIA,-1,0,-1.0,0,0
...,...,...,...,...,...,...,...
174,2020,VIRGINIA,-1,-1,0.0,0,1
178,2020,WASHINGTON,-1,-1,0.0,0,1
181,2020,WEST VIRGINIA,-1,-1,0.0,0,1
154,2020,WISCONSIN,-1,-1,0.0,0,1


#### merging features

In [56]:
# Inner-join the target with every feature table on (year, state), one table
# at a time, so only pairs present in all of them survive.
df = pop_vote
for feature_table in (gdp_index, def_index, growth_index, incumbent):
    df = pd.merge(df, feature_table, how='inner', on=['year', 'state'])

df = df.sort_values(by=['year', 'state'])
df.to_csv('datasets-clean/xxx-final-dataset.csv', index=False)
df

Unnamed: 0,year,state,y_votes_percent,gdp_4,gdp_3,gdp_2,gdp_1,def_4,def_3,def_2,def_1,z_4,z_3,z_2,z_1,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice
0,1984,ALABAMA,38.736605,4.719412,4.944172,-2.488787,1.840597,4.548485,4.185139,6.426162,9.372111,1,1,0,0,-1,-1,0.0,0,0
1,1984,ALASKA,30.944087,3.995028,-2.715237,2.618149,15.266624,1.250442,-1.018197,4.758325,23.893438,1,0,0,1,-1,-1,0.0,0,0
2,1984,ARIZONA,32.883272,10.752575,5.715111,-2.352463,2.781647,4.731524,5.112800,6.966784,8.376158,1,1,0,1,-1,-1,0.0,0,0
3,1984,ARKANSAS,38.772412,7.839078,3.206390,-2.673933,3.580354,4.363585,4.161762,5.603037,8.995826,1,1,0,1,-1,-1,0.0,0,0
4,1984,CALIFORNIA,41.775465,8.028589,3.578332,0.028616,3.291989,4.737549,4.477916,6.742430,8.871767,1,1,0,1,-1,-1,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,2020,VIRGINIA,55.154687,-2.757118,1.973924,2.377204,1.469457,1.950355,2.222617,1.866446,1.307918,0,0,0,0,-1,-1,0.0,0,1
506,2020,WASHINGTON,59.925503,-0.581850,3.946746,6.794566,5.359424,1.658615,1.894215,1.658639,1.276351,0,1,1,1,-1,-1,0.0,0,1
507,2020,WEST VIRGINIA,30.201468,-3.902932,-0.781380,2.315721,2.068235,-0.257632,0.908538,3.283808,3.326667,0,0,0,0,-1,-1,0.0,0,1
508,2020,WISCONSIN,50.319063,-4.006500,1.494643,2.430560,0.133906,2.055099,2.222565,1.889366,1.230655,0,0,0,0,-1,-1,0.0,0,1


In [57]:
# Deliberately raises ZeroDivisionError — presumably a stop marker so that
# "Run All" halts here instead of reaching the cells below.
0/0
# chernobyl zone: NOTE(review) the cells after this point look like unfinished scratch work — confirm

ZeroDivisionError: division by zero

In [None]:
# NOTE(review): `gdp_boot` is not defined anywhere in the visible notebook, and
# the `tmp` frames built in earlier cells have no `gdp_boot` column — presumably
# this cell refers to a removed or renamed variable; confirm before running.
gdp_exp = gdp_boot.set_index(['year', 'state'])
for s in states:
    # Print exp((x - mean) / sqrt(variance)) for this state's gdp_boot values,
    # with mean and variance taken from tmp.gdp_boot.
    print(gdp_exp.loc[(slice(None),s),:].gdp_boot.apply(lambda x : math.exp((x - tmp.gdp_boot.mean())/ math.sqrt(tmp.gdp_boot.var()))))
    break  # only the first state drawn from `states` is processed

us stuff

In [None]:
gdp_us = pd.read_csv('datasets/gdp-nomina-47-20-chain-12.csv')
# Keep only the text before the first '-' of each DATE value, as an int.
gdp_us['DATE'] = gdp_us['DATE'].map(lambda stamp: int(str(stamp).partition('-')[0]))
# GDPC1_PC1 values for rows dated after 1975.
s = gdp_us.loc[gdp_us['DATE'] > 1975, 'GDPC1_PC1']
s.hist()

In [None]:
ecdf = ECDF(s)
ecdf(2.8)  # not displayed: only the last expression of a cell is shown

qnt = 0.67
std_err = 0.5 * np.sqrt(s.var())
# (empirical quantile at qnt, number of observations times (1 - qnt), half the standard deviation)
(ecdf_inv(s, qnt), (1 - qnt) * len(s), std_err)

#### house

In [None]:
# Total House votes per (year, state, party).
house_vote = (
    pd.read_csv('datasets/1976-2020-house-utf8.csv')
    [['year', 'state', 'party', 'candidatevotes']]
    .groupby(['year', 'state', 'party'])
    .sum()
)
house_vote = to_percent(house_vote, 'candidatevotes')
# Encode the share as 1 when it is below 50 and -1 otherwise, and shift the
# year back by four.
house_vote = house_vote.assign(
    candidatevotes=house_vote['candidatevotes'].apply(lambda share: 1 if share < 50 else -1),
    year=house_vote['year'] - 4,
).rename(columns={'candidatevotes': 'incumbent'})
house_vote.to_csv('datasets-clean/incumbent-house-rep.csv', index=False)
house_vote.head()

In [None]:
# Disabled variant of the house-vote cell: the code below sits inside a string
# literal, so it is never executed (the cell merely evaluates to that string).
'''
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })
#'''

#### gasoline and friends + approval

In [None]:
gas = pd.read_csv('datasets/gasoline-93-21.csv')
# Parse each "%m/%d/%Y" string, convert it to a local-time epoch timestamp and
# rebuild a datetime from the whole-second part of that timestamp.
gas['date'] = gas['date'].apply(
    lambda text: datetime.datetime.fromtimestamp(
        int(time.mktime(datetime.datetime.strptime(text, "%m/%d/%Y").timetuple()))
    )
)
gas

#### house prices and rent + personal income

In [None]:
# Read the '$'-separated table. The context manager closes the file handle,
# which the previous bare open() never did.
with open('datasets/house-chain-00.csv', 'rt') as f:
    lines = f.readlines()
# Split every line on '$' and strip surrounding whitespace from each field.
houses = [[field.strip() for field in line.split(sep='$')] for line in lines]
houses = pd.DataFrame(data=houses, columns=['state', 2000, 1990, 1980, 1970, 1960, 1950, 1940 ])
houses.head()