# and then we go to mario draghi

In [1]:
import pandas as pd
import time
import datetime
import matplotlib
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
import math
import requests
from bs4 import BeautifulSoup as bs
import pickle as pk

In [2]:
# Read the presidential returns once, only to discover which years and
# states the dataset covers.
ad = pd.read_csv('datasets/1976-2020-president.csv')
years, states = set(ad['year']), set(ad['state'])
# Feature tables are accumulated here and merged into one frame at the end.
dflist = []

In [3]:
# Drop the entries this analysis excludes. Both calls mutate the existing
# set objects in place and are no-ops when the value is already gone, so the
# cell can be re-run without raising KeyError.
states.discard('DISTRICT OF COLUMBIA')
years.difference_update({1976, 1980})

#### some function

In [4]:
def foofloat(x):
    """Convert *x* to ``float``, falling back to NaN when it cannot be parsed.

    Args:
        x: Any value accepted by ``float()`` (number, numeric string, ...).

    Returns:
        ``float(x)``, or ``nan`` when the conversion raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # np.nan is the canonical spelling; the np.NaN alias no longer
        # exists in NumPy 2.0.
        return np.nan

In [5]:
def bls_serial(series_id):
    """Fetch the label the BLS output page shows for *series_id*.

    Posts the series id to the BLS ``SurveyOutputServlet`` endpoint, parses
    the returned HTML and reads the first cell of the third row of the first
    table.

    Returns:
        That cell's text, stripped and upper-cased.

    Raises:
        IndexError: if the page lacks the expected table, row or cell.
    """
    response = requests.post(
        "https://data.bls.gov/pdq/SurveyOutputServlet",
        {'series_id': series_id},
    )
    page = bs(response.text)
    first_table = page.findAll('table')[0]
    third_row = first_table.findAll('tr')[2]
    label_cell = third_row.findAll('td')[0]
    return label_cell.text.strip().upper()

In [6]:
def ecdf_inv(a, q):
    """Return the empirical *q*-quantile of *a* (inverse ECDF, no interpolation).

    Args:
        a: Iterable of comparable values.
        q: Quantile level, expected in ``[0, 1]``.

    Returns:
        The element at position ``int(len(a) * q)`` of the sorted values; the
        largest element when ``q == 1``.

    Raises:
        IndexError: if *a* is empty.
    """
    ordered = sorted(a)
    # int(len * q) equals len when q == 1, one past the last valid index,
    # so clamp to the final position.
    position = min(int(len(ordered) * q), len(ordered) - 1)
    return ordered[position]
def count_elem(l):
    """Return a dict mapping each distinct element of *l* to its occurrence count.

    A single pass with ``Counter`` replaces one ``list.count`` scan per
    distinct value, and also gives correct counts when *l* is a one-shot
    iterator (which the two-pass version exhausted before counting).
    """
    from collections import Counter  # local import keeps the cell self-contained

    return dict(Counter(l))
def dpc(a, b):
    """Return the percent change from *b* to *a*, i.e. ``(a / b - 1) * 100``."""
    ratio = a / b
    return (ratio - 1) * 100

In [7]:
def to_percent(older, col_key, shift=0):
    """Compute the DEMOCRAT share of the DEMOCRAT + REPUBLICAN total.

    For every year in the module-level ``years`` and every state in
    ``states``, looks up ``older`` at ``(year - shift, state, party)`` and
    stores ``100 * d / (d + r)`` under the unshifted year.

    Args:
        older: Frame indexed by (year, state, party) that has a ``col_key`` column.
        col_key: Column to read; also the name of the output value column.
        shift: Number of years to look back when reading ``older``.

    Returns:
        DataFrame with columns ``year``, ``state`` and ``col_key``.
        Combinations missing from ``older`` (KeyError) are skipped.
    """
    shares = []
    for election_year in years:
        source_year = election_year - shift
        for state_name in states:
            try:
                dem = older.loc[source_year, state_name, 'DEMOCRAT'][col_key]
                rep = older.loc[source_year, state_name, 'REPUBLICAN'][col_key]
            except KeyError:
                # one of the two parties has no row for this state/year
                continue
            else:
                two_party_share = 100 * dem / (dem + rep)
                shares.append([int(election_year), state_name, two_party_share])
    return pd.DataFrame(data=shares, columns=['year', 'state', col_key])

In [8]:
def to_splatter(a, col_name):
    """Unpivot a wide table (one column per year) into long form.

    Every column from the third one onward is treated as a year. For each of
    them, one output row is produced per input row.

    Args:
        a: Wide frame with a ``GeoName`` column and year-named columns
            starting at position 2.
        col_name: Name given to the value column of the result.

    Returns:
        DataFrame with columns ``year`` (int), ``state`` (upper-cased
        ``GeoName``) and ``col_name``, ordered year by year.
    """
    year_columns = a.columns[2:]
    records = [
        [int(year_col), row['GeoName'].upper(), row[year_col]]
        for year_col in year_columns
        for _, row in a.iterrows()
    ]
    return pd.DataFrame(data=records, columns=['year', 'state', col_name])

In [9]:
def var42(df_res, old_colname, new_colname, jump=1):
    """Spread lagged values of one column into side-by-side columns.

    For each (year, state) drawn from the module-level ``years`` and
    ``states``, collects ``old_colname`` at lags ``0, jump, 2*jump, ...``
    (all lags below 4). Pairs for which any lagged row is missing are skipped.

    Args:
        df_res: Long frame with ``year``, ``state`` and ``old_colname`` columns.
        old_colname: Column to read.
        new_colname: Prefix of the generated columns.
        jump: Step between lags, in years.

    Returns:
        DataFrame with ``year``, ``state`` and columns
        ``{new_colname}_{4 // jump}`` down to ``{new_colname}_1``. The
        highest-numbered column holds lag 0 and ``_1`` holds the largest lag.
    """
    indexed = df_res.set_index(['year', 'state'])

    rows = []
    for year in years:
        for state in states:
            try:
                window = [
                    indexed.at[(year - lag, state), old_colname]
                    for lag in range(0, 4, jump)
                ]
            except KeyError:
                continue
            rows.append([year, state] + window)

    labels = [f'{new_colname}_{i}' for i in range(4 // jump, 0, -1)]
    return pd.DataFrame(data=rows, columns=['year', 'state'] + labels)

In [10]:
def beadiff(rowcol, colname, diff = True, shift=1):
    """Build a long table of per-state values or percent changes.

    Args:
        rowcol: Frame indexed by (year, state) with a ``colname`` column.
        colname: Column to read; also the name of the output value column.
        diff: When true, emit ``dpc(value[y], value[y - shift])``; otherwise
            emit the raw value.
        shift: Lag in years. The first ``shift`` years of every state are
            dropped from the output.

    Returns:
        DataFrame with columns ``year``, ``state`` and ``colname``, covering
        the states in the module-level ``states``.
    """
    records = []

    for state in states:
        state_rows = rowcol.loc[(slice(None), state), :]
        state_years = sorted({key[0] for key in state_rows.index})
        for year in state_years[shift:]:
            current = rowcol.at[(year, state), colname]
            if diff:
                previous = rowcol.at[(int(year) - shift, state), colname]
                value = dpc(current, previous)
            else:
                value = current
            records.append([year, state, value])

    return pd.DataFrame(data=records, columns=['year', 'state', colname])

In [11]:
def whore(df, yy=years, ss=states):
    """List the (year, state) pairs that have no row in *df*.

    Args:
        df: Frame with ``year`` and ``state`` columns.
        yy: Years to check (defaults to the module-level ``years``).
        ss: States to check (defaults to the module-level ``states``).

    Returns:
        List of ``(year, state)`` tuples absent from *df*, iterated state by
        state and then year by year.
    """
    # An index membership test replaces the bare `except:` around a .loc
    # lookup, which also swallowed KeyboardInterrupt and unrelated errors.
    present = df.set_index(['year', 'state']).index
    return [(y, s) for s in ss for y in yy if (y, s) not in present]

In [12]:
def sell(df, filename=None):
    """Register a finished feature table and optionally save it.

    Sorts *df* by year and state, appends the sorted frame to the
    module-level ``dflist`` and, when *filename* is given, writes it to
    ``datasets-clean/<filename>.csv`` without the index.

    Returns:
        The sorted frame.
    """
    ordered = df.sort_values(by=['year', 'state'])
    dflist.append(ordered)
    if filename is None:
        return ordered
    ordered.to_csv(f'datasets-clean/{filename}.csv', index=False)
    return ordered

#### y_hat, a.k.a. the presidential popular vote

In [13]:
# Target variable: DEMOCRAT share of the DEMOCRAT + REPUBLICAN presidential vote.
pop_vote = pd.read_csv('datasets/1976-2020-president.csv')
pop_vote = (
    pop_vote[['year', 'state', 'party_simplified', 'candidatevotes']]  # keep only what is needed
    .groupby(['year', 'state', 'party_simplified'])
    .sum()  # several candidates of one party collapse into a single total
)
pop_vote = (
    to_percent(pop_vote, 'candidatevotes')
    .rename(columns={"candidatevotes": "y_votes_percent"})
    .sort_values(by=['year', 'state'])
)
pop_vote.to_csv('datasets-clean/popular-vote-y.csv', index=False)
dflist.append(pop_vote)
pop_vote

Unnamed: 0,year,state,y_votes_percent
45,1984,ALABAMA,38.736605
7,1984,ALASKA,30.944087
28,1984,ARIZONA,32.883272
48,1984,ARKANSAS,38.772412
46,1984,CALIFORNIA,41.775465
...,...,...,...
171,2020,VIRGINIA,55.154687
161,2020,WASHINGTON,59.925503
164,2020,WEST VIRGINIA,30.201468
162,2020,WISCONSIN,50.319063


#### GDP and friends

It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997.
Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year.
While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year.
~~Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.~~

In [14]:
# Nominal and real state GDP, each delivered as two files, reshaped to long form.
gdp_nom_97, gdp_nom_12 = (
    to_splatter(pd.read_csv(path), 'gdp_nom')
    for path in (
        'datasets/gdp-nominal-63-97.csv',
        'datasets/gdp-nominal-97-20.csv',
    )
)
gdp_real_97, gdp_real_12 = (
    to_splatter(pd.read_csv(path), 'gdp_real')
    for path in (
        'datasets/gdp-real-77-97-chain-97.csv',
        'datasets/gdp-real-97-20-chain-12.csv',
    )
)

In [15]:
# Index all four GDP tables by (year, state) so they align on arithmetic
# and can be addressed with .loc / .at.
gdp_nom_97, gdp_nom_12, gdp_real_97, gdp_real_12 = (
    frame.set_index(['year', 'state'])
    for frame in (gdp_nom_97, gdp_nom_12, gdp_real_97, gdp_real_12)
)

In [16]:
# One-year real GDP growth (percent) above this value is coded 1 in the
# z growth index.
GDP_THRESHOLD = 3.2

# moving to midterms (retrocompatibility)
# k is the lag width in years; any k other than 1 tags column names with '_mt'.
k = 2
sx = '' if k == 1 else '_mt'

#### price indexes

In [17]:
# Implicit GDP deflator = nominal / real, computed per (year, state).
# Pairs present in only one of the two inputs come out as NaN and are dropped.
gdp_def_97 = gdp_nom_97['gdp_nom'].div(gdp_real_97['gdp_real']).to_frame('gdp_def').dropna()
gdp_def_12 = gdp_nom_12['gdp_nom'].div(gdp_real_12['gdp_real']).to_frame('gdp_def').dropna()

In [18]:
# price index FAIR
# Percent change of the GDP deflator over 4 years, computed separately inside
# each of the two deflator series and then stacked.
fair_p = pd.concat([beadiff(gdp_def_12,'gdp_def',shift=4), beadiff(gdp_def_97,'gdp_def',shift=4)])

# fix 97jump
# Year 2000 needs 1996 as its base, which is read from the other series here,
# so the 2000-vs-1996 change is computed by hand across the two series.
# NOTE(review): the two deflators appear to come from different chain bases
# (chain-97 vs chain-12, per the input file names) -- confirm that a ratio
# taken across the two series is comparable.
tmp = dpc(gdp_def_12.loc[2000], gdp_def_97.loc[1996])
tmp['year'] = 2000
fair_p = pd.concat([fair_p, tmp.reset_index(), ])

# recolumning + appending
# With jump=4, var42 keeps only the lag-0 value and names the column fair_p_1.
fair_p = var42(fair_p, new_colname='fair_p', old_colname='gdp_def', jump=4)

# No filename: the table joins dflist but is not written to disk.
sell(fair_p)

Unnamed: 0,year,state,fair_p_1
45,1984,ALABAMA,26.788107
7,1984,ALASKA,30.073594
28,1984,ARIZONA,27.619112
48,1984,ARKANSAS,25.124853
46,1984,CALIFORNIA,27.168417
...,...,...,...
171,2020,VIRGINIA,7.549968
161,2020,WASHINGTON,6.646364
164,2020,WEST VIRGINIA,7.411864
162,2020,WISCONSIN,7.602507


In [19]:
# OUR price index: deflator percent change over k-year windows, computed
# inside each deflator series and stacked.
def_index = pd.concat([
    beadiff(gdp_def_12, 'gdp_def', shift=k),
    beadiff(gdp_def_97, 'gdp_def', shift=k),
])

if k != 1:
    # the window ending in 1996 + k starts in 1996, read from the other
    # series, so that one change is computed across the two series by hand
    tmp = dpc(gdp_def_12.loc[1996 + k], gdp_def_97.loc[1996])
    tmp['year'] = 1996 + k
    def_index = pd.concat([def_index, tmp.reset_index()])

def_index = var42(def_index, old_colname='gdp_def', new_colname='def' + sx, jump=k)

sell(def_index, 'price-deflator')

Unnamed: 0,year,state,def_mt_2,def_mt_1
45,1984,ALABAMA,8.923985,16.400541
7,1984,ALASKA,0.219513,29.788690
28,1984,ARIZONA,10.086238,15.926491
48,1984,ARKANSAS,8.706949,15.102903
46,1984,CALIFORNIA,9.427608,16.212370
...,...,...,...,...
171,2020,VIRGINIA,4.216321,3.198776
161,2020,WASHINGTON,3.584248,2.956160
164,2020,WEST VIRGINIA,0.648566,6.719717
162,2020,WISCONSIN,4.323340,3.143273


#### gdp indexes

In [20]:
# re-generate plain series (see despacito)
# Extend the gdp_real_12 series backwards before 1997 by carrying the
# year-over-year growth of gdp_real_97 back one year at a time, so that
#     plain[y] / plain[y+1] == real97[y] / real97[y+1].
# The previous formula (a*b/c) inverted that ratio, which rescales alternate
# years by opposite factors instead of preserving the growth rate.
gdp_real_plain = gdp_real_12.copy()
for y in range(1996, 1977, -1):
    for s in sorted(states):
        a = gdp_real_97.loc[(y,s),'gdp_real']        # old series, year y
        b = gdp_real_97.loc[(y+1,s),'gdp_real']      # old series, year y+1
        c = gdp_real_plain.loc[(y+1,s),'gdp_real']   # spliced series, year y+1

        gdp_real_plain.at[(y,s),'gdp_real'] = c * a / b

gdp_real_plain = gdp_real_plain.reset_index()

In [21]:
# one-lag FAIR: one-year real GDP growth from both series, keeping only the
# lag-0 value (jump=4) in a single fair_g_1 column.
fair_g = var42(
    pd.concat([beadiff(gdp_real_97, 'gdp_real'), beadiff(gdp_real_12, 'gdp_real')]),
    old_colname='gdp_real',
    new_colname='fair_g',
    jump=4,
)

sell(fair_g)

Unnamed: 0,year,state,fair_g_1
45,1984,ALABAMA,4.719412
7,1984,ALASKA,3.995028
28,1984,ARIZONA,10.752575
48,1984,ARKANSAS,7.839078
46,1984,CALIFORNIA,8.028589
...,...,...,...
171,2020,VIRGINIA,-2.757118
161,2020,WASHINGTON,-0.581850
164,2020,WEST VIRGINIA,-3.902932
162,2020,WISCONSIN,-4.006500


In [22]:
# OUR two-lag + fix 97jump: real GDP growth over k-year windows.
gdp_real_var = pd.concat([
    beadiff(gdp_real_97, 'gdp_real', shift=k),
    beadiff(gdp_real_12, 'gdp_real', shift=k),
])

if k != 1:
    # the window ending in 1996 + k is taken from the spliced series instead
    tmp = gdp_real_plain.set_index(['year', 'state'])
    tmp = dpc(tmp.loc[1996 + k], tmp.loc[1996])
    tmp['year'] = 1996 + k
    gdp_real_var = pd.concat([gdp_real_var, tmp.reset_index()])

gdp_index = var42(gdp_real_var, old_colname='gdp_real', new_colname='gdp' + sx, jump=k)

sell(gdp_index, 'gdp-growth')

Unnamed: 0,year,state,gdp_mt_2,gdp_mt_1
45,1984,ALABAMA,9.896919,-0.693998
7,1984,ALASKA,1.171316,18.284477
28,1984,ARIZONA,17.082207,0.363747
48,1984,ARKANSAS,11.296819,0.810684
46,1984,CALIFORNIA,11.894211,3.321546
...,...,...,...,...
171,2020,VIRGINIA,-0.837617,3.881593
161,2020,WASHINGTON,3.341932,12.518139
164,2020,WEST VIRGINIA,-4.653816,4.431850
162,2020,WISCONSIN,-2.571740,2.567721


#### z growth index

In [23]:
# linear combo Z (fair = m2 + m1)
# maybe k = 1 was better
# One-year real GDP growth, turned into a 0/1 flag (1 when above
# GDP_THRESHOLD) and then spread over four yearly lag columns.
growth_index = pd.concat([
    beadiff(gdp_real_97, 'gdp_real'),
    beadiff(gdp_real_12, 'gdp_real'),
])
growth_index['gdp_real'] = growth_index['gdp_real'].apply(
    lambda growth: int(growth > GDP_THRESHOLD)
)
growth_index = var42(growth_index, old_colname='gdp_real', new_colname='z' + sx, jump=1)

sell(growth_index, 'z-growth')

Unnamed: 0,year,state,z_mt_4,z_mt_3,z_mt_2,z_mt_1
45,1984,ALABAMA,1,1,0,0
7,1984,ALASKA,1,0,0,1
28,1984,ARIZONA,1,1,0,0
48,1984,ARKANSAS,1,1,0,1
46,1984,CALIFORNIA,1,1,0,1
...,...,...,...,...,...,...
171,2020,VIRGINIA,0,0,0,0
161,2020,WASHINGTON,0,1,1,1
164,2020,WEST VIRGINIA,0,0,0,0
162,2020,WISCONSIN,0,0,0,0


#### incumbent

In [24]:
# The incumbency file is joined on year only; merging it with the full
# year x state grid repeats each year's row for every state.
incumbent = pd.read_csv('datasets/incumbent-4president-76-20.csv', sep=';')
tmp = pd.DataFrame([[y, s] for y in years for s in states], columns=['year', 'state'])
incumbent = tmp.merge(incumbent, how='inner', on='year')

sell(incumbent, 'incumbent-longrep')

Unnamed: 0,year,state,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice
45,1984,ALABAMA,-1,-1,0.0,0,0
7,1984,ALASKA,-1,-1,0.0,0,0
28,1984,ARIZONA,-1,-1,0.0,0,0
48,1984,ARKANSAS,-1,-1,0.0,0,0
46,1984,CALIFORNIA,-1,-1,0.0,0,0
...,...,...,...,...,...,...,...
171,2020,VIRGINIA,-1,-1,0.0,0,1
161,2020,WASHINGTON,-1,-1,0.0,0,1
164,2020,WEST VIRGINIA,-1,-1,0.0,0,1
162,2020,WISCONSIN,-1,-1,0.0,0,1


#### house dummy

In [25]:
# House dummy: +1 when the DEMOCRAT share of the DEMOCRAT + REPUBLICAN House
# vote two years earlier (shift=2) was at least 50, -1 otherwise.
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = (
    house_vote[['year', 'state', 'party', 'candidatevotes']]
    .groupby(['year', 'state', 'party'])
    .sum()
)
house_vote = to_percent(house_vote, 'candidatevotes', shift=2)
house_vote['candidatevotes'] = house_vote['candidatevotes'].apply(
    lambda share: -1 if share < 50 else 1
)
# rows from midterm-fixed.csv are appended after the computed ones
house_vote = pd.concat([
    house_vote.rename(columns={'candidatevotes': 'house_midterm'}),
    pd.read_csv('datasets/midterm-fixed.csv'),
])
house_vote['state'] = house_vote['state'].apply(lambda name: name.upper())

sell(house_vote, 'midterm')

Unnamed: 0,year,state,house_midterm
44,1984,ALABAMA,1
7,1984,ALASKA,-1
28,1984,ARIZONA,-1
47,1984,ARKANSAS,1
45,1984,CALIFORNIA,1
...,...,...,...
168,2020,VIRGINIA,1
158,2020,WASHINGTON,1
161,2020,WEST VIRGINIA,-1
159,2020,WISCONSIN,1


#### personal income

In [26]:
# Per-capita personal income: percent change over k-year windows, then
# spread into lag columns.
inc_cap = to_splatter(
    pd.read_csv('datasets/personal-income-per-capita-72-20.csv'), 'avg_inc'
).set_index(['year', 'state'])
inc_cap = var42(beadiff(inc_cap, 'avg_inc', shift=k), 'avg_inc', 'avg_inc' + sx, jump=k)

sell(inc_cap, 'income')

Unnamed: 0,year,state,avg_inc_mt_2,avg_inc_mt_1
45,1984,ALABAMA,16.997412,16.526329
7,1984,ALASKA,1.426071,25.259560
28,1984,ARIZONA,17.224666,15.200083
48,1984,ARKANSAS,16.832890,18.783730
46,1984,CALIFORNIA,15.239614,15.470908
...,...,...,...,...
171,2020,VIRGINIA,8.740216,6.622504
161,2020,WASHINGTON,11.224152,9.655147
164,2020,WEST VIRGINIA,8.665411,10.554562
162,2020,WISCONSIN,8.474146,7.874298


#### unemployment

In [27]:
unn = pd.read_csv('datasets/unemployment-76-21-percent.csv')
furi = 'datasets-clean/serial-id-bsl.pkl'

# `sd` maps each BLS series id to the label returned by bls_serial. The
# scraping branch is switched off; the cached pickle is loaded instead.
if False:
    sd = dict()
    # NOTE(review): `nunn` is not defined anywhere in the visible notebook;
    # this branch raises NameError if it is re-enabled as is.
    for s in set([*unn['Series ID'], *nunn['Series ID']]):
        try:
            sd[s] = bls_serial(s)
        except IndexError:
            sd[s] = f'BADASS_{s}'
    # context managers close the cache file even if pickling fails
    with open(furi, 'wb') as fh:
        pk.dump(sd, fh)
else:
    # NOTE: unpickling can execute arbitrary code; acceptable only because
    # this cache is a locally generated file.
    with open(furi, 'rb') as fh:
        sd = pk.load(fh)

unn = unn.rename(columns={
    "Series ID": "state",
    "Year": "year",
    "Value": "unemp",
    "Period": "month",
    })

# transform
unn.unemp = unn.unemp.apply(foofloat).astype(float)  # unparseable values become NaN
unn.state = unn.state.map(sd)                        # series id -> label
unn.month = unn.month.apply(lambda x: x.split('M')[-1])  # keep the text after the last 'M'

In [28]:
# National unemployment rate, used to fill months where a state value is missing.
glue = pd.read_csv('datasets/unemployment-rate-us-country-47-21.csv')
# Year is parsed to int so the key matches unn.year, which the rest of the
# notebook addresses with integer years; month stays a string as in unn.month.
glue['year'] = glue.DATE.apply(lambda x: int(x.split('-')[0]))
glue['month'] = glue.DATE.apply(lambda x: x.split('-')[1])
glue = glue.set_index(['year','month'])

# `value == np.NaN` is always False (NaN never equals itself), so the
# original fill never ran; isna() is the correct test.
missing = unn['unemp'].isna()
keys = pd.MultiIndex.from_arrays(
    [unn.loc[missing, 'year'].astype(int), unn.loc[missing, 'month']]
)
# reindex leaves NaN where the national series has no matching month
# instead of raising KeyError.
unn.loc[missing, 'unemp'] = glue['UNRATE'].reindex(keys).to_numpy()

In [29]:
dmp = unn.set_index(['year','month','state'])

# Two unemployment changes per (year, state):
#   unemp_mt_1: month '01' of year-3  ->  month '10' of year-2
#   unemp_mt_2: month '11' of year-2  ->  month '10' of year
tmp = []
for y in years:
    for s in states:
        late_end = dmp.at[(y, '10', s), 'unemp']
        late_start = dmp.at[(y - 2, '11', s), 'unemp']
        early_end = dmp.at[(y - 2, '10', s), 'unemp']
        early_start = dmp.at[(y - 3, '01', s), 'unemp']
        tmp.append([y, s, dpc(early_end, early_start), dpc(late_end, late_start)])

mid_unem = pd.DataFrame(data=tmp, columns=['year', 'state', 'unemp_mt_1', 'unemp_mt_2'])

sell(mid_unem)

Unnamed: 0,year,state,unemp_mt_1,unemp_mt_2
45,1984,ALABAMA,51.546392,-28.187919
7,1984,ALASKA,13.186813,-7.619048
28,1984,ARIZONA,85.245902,-56.896552
48,1984,ARKANSAS,18.292683,-15.151515
46,1984,CALIFORNIA,52.112676,-31.818182
...,...,...,...,...
171,2020,VIRGINIA,-30.000000,110.344828
161,2020,WASHINGTON,-8.333333,55.555556
164,2020,WEST VIRGINIA,-7.407407,45.098039
162,2020,WISCONSIN,-14.285714,60.000000


### merging features

In [30]:
# Start from the full year x state grid and inner-join every collected
# feature table; a pair missing from any table drops out of the result.
df = pd.DataFrame([[y, s] for y in years for s in states], columns=['year', 'state'])
for d in dflist:
    df = df.merge(d, how='inner', on=['year', 'state'])

df = df.sort_values(by=['year', 'state'])
df.to_csv('datasets-clean/xxx-final-dataset.csv', index=False)
df

Unnamed: 0,year,state,y_votes_percent,fair_p_1,def_mt_2,def_mt_1,fair_g_1,gdp_mt_2,gdp_mt_1,z_mt_4,...,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice,house_midterm,avg_inc_mt_2,avg_inc_mt_1,unemp_mt_1,unemp_mt_2
45,1984,ALABAMA,38.736605,26.788107,8.923985,16.400541,4.719412,9.896919,-0.693998,1,...,-1,-1,0.0,0,0,1,16.997412,16.526329,51.546392,-28.187919
7,1984,ALASKA,30.944087,30.073594,0.219513,29.788690,3.995028,1.171316,18.284477,1,...,-1,-1,0.0,0,0,-1,1.426071,25.259560,13.186813,-7.619048
28,1984,ARIZONA,32.883272,27.619112,10.086238,15.926491,10.752575,17.082207,0.363747,1,...,-1,-1,0.0,0,0,-1,17.224666,15.200083,85.245902,-56.896552
48,1984,ARKANSAS,38.772412,25.124853,8.706949,15.102903,7.839078,11.296819,0.810684,1,...,-1,-1,0.0,0,0,1,16.832890,18.783730,18.292683,-15.151515
46,1984,CALIFORNIA,41.775465,27.168417,9.427608,16.212370,8.028589,11.894211,3.321546,1,...,-1,-1,0.0,0,0,1,15.239614,15.470908,52.112676,-31.818182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,2020,VIRGINIA,55.154687,7.549968,4.216321,3.198776,-2.757118,-0.837617,3.881593,0,...,-1,-1,0.0,0,1,1,8.740216,6.622504,-30.000000,110.344828
161,2020,WASHINGTON,59.925503,6.646364,3.584248,2.956160,-0.581850,3.341932,12.518139,0,...,-1,-1,0.0,0,1,1,11.224152,9.655147,-8.333333,55.555556
164,2020,WEST VIRGINIA,30.201468,7.411864,0.648566,6.719717,-3.902932,-4.653816,4.431850,0,...,-1,-1,0.0,0,1,-1,8.665411,10.554562,-7.407407,45.098039
162,2020,WISCONSIN,50.319063,7.602507,4.323340,3.143273,-4.006500,-2.571740,2.567721,0,...,-1,-1,0.0,0,1,1,8.474146,7.874298,-14.285714,60.000000


In [31]:
# Report every feature table that lacks some (year, state) pair.
for d in dflist:
    w = whore(d)
    if not w:
        continue
    print(d.head())
    print('-+'*30)
    print(w)
    print('-+'*30)

In [32]:
# Raises ZeroDivisionError; presumably a deliberate stop so that a full
# notebook run halts here, before the experimental cells below.
0/0
# chernobyl zone

ZeroDivisionError: division by zero

exp fail

In [None]:
# Median unemployment per (year, state) over two windows:
#   unemp_12: years y-3 .. y-2      unemp_34: years y-1 .. y
# One combined boolean mask replaces the chained unn[...][...][...] form,
# whose second and third masks were built on the unfiltered frame and had
# to be reindexed by pandas (with a warning) at every step.
tmp = []
for y in years:
    for s in states:
        of_state = unn.state == s
        m2 = unn.loc[of_state & unn.year.between(y-1, y), 'unemp'].median()
        m1 = unn.loc[of_state & unn.year.between(y-3, y-2), 'unemp'].median()
        tmp.append([y,s,m1,m2])
mid_unem = pd.DataFrame(data=tmp, columns=['year', 'state', 'unemp_12', 'unemp_34'])
dflist.append(mid_unem)
mid_unem

In [None]:
# NOTE(review): var42's fourth positional parameter is `jump`, so 'unemp' is
# passed as the range step here and the call fails with TypeError. The
# argument order also looks like it belongs to an older signature -- confirm
# the intent before re-enabling this cell.
unn = var42(unn, 'unemp', 'unemployment-index', 'unemp')
dflist.append(unn)
unn

In [None]:
# Collapse the four yearly lag columns into two pair means.
# NOTE(review): expects gdp_1 .. gdp_4 columns, i.e. a gdp_index built with
# jump=1 and an empty suffix -- not what the k=2 run above produces.
# gdp_34 previously averaged gdp_3 with itself (copy-paste slip).
gdp_index['gdp_34'] = (gdp_index['gdp_3'] + gdp_index['gdp_4']) / 2
gdp_index['gdp_12'] = (gdp_index['gdp_1'] + gdp_index['gdp_2']) / 2

gdp_index.drop(columns=['gdp_1', 'gdp_2', 'gdp_3', 'gdp_4'], inplace=True)

In [None]:
# NOTE(review): `gdp_real` is not defined in the visible notebook; this cell
# presumably relied on an earlier version of the pipeline.
gdp_boot = gdp_real.copy()
gdp_exp = gdp_boot.set_index(['year', 'state'])
# Mean and standard deviation are the same for every element, so compute
# them once instead of inside the per-element lambda.
gdp_mean = gdp_exp.gdp_real.mean()
gdp_std = math.sqrt(gdp_exp.gdp_real.var())
for s in states:
    # exp of the z-score of real GDP; printed for the first state only
    print(gdp_exp.loc[(slice(None),s),:].gdp_real.apply(lambda x : math.exp((x - gdp_mean) / gdp_std)))
    break

us stuff

In [None]:
# National GDP series: reduce DATE to its year and plot the distribution of
# the GDPC1_PC1 column for years after 1975.
gdp_us = pd.read_csv('datasets/gdp-nomina-47-20-chain-12.csv')
gdp_us['DATE'] = gdp_us['DATE'].apply(lambda stamp: int(str(stamp).split('-')[0]))
s = gdp_us.loc[gdp_us['DATE'] > 1975, 'GDPC1_PC1']
s.hist()

In [None]:
# Empirical CDF of the series selected above.
ecdf = ECDF(s)
ecdf(2.8)

qnt = 0.67
std_err = s.std() / 2
tail_count = len(s) * (1 - qnt)
# displayed: the qnt-quantile, the number of points above it, half the std
(ecdf_inv(s, qnt), tail_count, std_err)

#### house

In [None]:
# Disabled code: the statements below sit inside a bare string literal, so
# they are not executed. Unlike the active house-dummy cell, this variant
# calls to_percent without a shift and renames the column to
# houserep_votes_percent.
'''
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })
#'''

#### gasoline and friends + approval

In [None]:
gas = pd.read_csv('datasets/gasoline-93-21.csv')
# Parse the month/day/year strings directly. The previous version converted
# each date to a POSIX timestamp (time.mktime) and back
# (datetime.fromtimestamp), a round trip that depends on the local timezone
# and can shift values around DST changes.
gas.date = pd.to_datetime(gas.date, format="%m/%d/%Y")
gas

#### houses prices and rent + personal income

In [None]:
# The file uses '$' as its field separator; each field is stripped of
# surrounding whitespace. The context manager closes the handle, which the
# bare open() call never did.
with open('datasets/house-chain-00.csv', 'rt') as f:
    lines = f.readlines()
houses = [[i.strip() for i in l.split(sep = '$') ] for l in lines ]
houses = pd.DataFrame(data=houses, columns=['state', 2000, 1990, 1980, 1970, 1960, 1950, 1940 ])
houses.head()