# and then we go to mario draghi

In [None]:
import pandas as pd
import time
import datetime
import matplotlib
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
import math
import requests
from bs4 import BeautifulSoup as bs
import pickle as pk

In [None]:
# Load the raw presidential results to discover which election years and
# states the rest of the notebook loops over.
ad = pd.read_csv('datasets/1976-2020-president.csv')
years = set(ad['year'])    # distinct values of the 'year' column
states = set(ad['state'])  # distinct values of the 'state' column
dflist = list()            # feature tables appended by later cells and merged at the end

In [None]:
# Drop D.C. and the two earliest elections from every later loop.
# set.remove raises KeyError if the value is not present in the data.
# NOTE(review): presumably 1976/1980 are excluded because the lagged series
# used below (up to y-4) do not reach back that far -- confirm.
states.remove('DISTRICT OF COLUMBIA')
years.remove(1976)
years.remove(1980)

#### helper functions

In [None]:
def foofloat(x):
    """Convert ``x`` to ``float``, returning NaN when it cannot be converted.

    Args:
        x: Any value; strings and numbers accepted by ``float()`` are
            converted, everything else maps to NaN.

    Returns:
        The converted float, or ``np.nan`` when ``float(x)`` fails.
    """
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed; a bare ``except`` would
        # also hide KeyboardInterrupt. ``np.nan`` is used because the
        # ``np.NaN`` alias no longer exists in NumPy 2.0.
        return np.nan

In [None]:
def bls_serial(series_id, timeout=30):
    """Query the BLS survey-output page for ``series_id`` and return one cell.

    Posts ``series_id`` to the BLS ``SurveyOutputServlet`` endpoint, parses
    the HTML response and returns the stripped, upper-cased text of the first
    ``td`` in the third ``tr`` of the first ``table``.

    Args:
        series_id: BLS series identifier sent as the ``series_id`` form field.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so an unresponsive server cannot hang the notebook.

    Returns:
        Upper-cased text of the selected table cell.

    Raises:
        IndexError: If the response lacks the expected table/row/cell.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    response = requests.post(
        "https://data.bls.gov/pdq/SurveyOutputServlet",
        data={'series_id': series_id},
        timeout=timeout,
    )
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever is
    # installed; pin one once the expected table layout has been verified.
    bsobj = bs(response.text)
    first_table = bsobj.find_all('table')[0]
    third_row = first_table.find_all('tr')[2]
    return third_row.find_all('td')[0].text.strip().upper()

In [None]:
def ecdf_inv(a, q):
    """Return the order statistic of ``a`` at fraction ``q`` (inverse ECDF).

    Args:
        a: Iterable of comparable values.
        q: Fraction in ``[0, 1]``.

    Returns:
        The element at position ``int(len(a) * q)`` of the sorted values;
        ``q == 1`` returns the maximum.

    Raises:
        IndexError: If ``a`` is empty.
    """
    ordered = sorted(a)
    # int(len * q) equals len(ordered) when q == 1, one past the end:
    # clamp to the last valid position.
    position = min(int(len(ordered) * q), len(ordered) - 1)
    return ordered[position]
def count_elem(l):
    """Return a dict mapping each distinct element of ``l`` to its count.

    Args:
        l: Iterable of hashable values (one-shot iterators are supported).

    Returns:
        ``{element: number_of_occurrences}``.
    """
    # Local import: ``collections`` is not among the notebook's imports.
    from collections import Counter

    # A single pass instead of one ``list.count`` scan per distinct element.
    return dict(Counter(l))

In [None]:
def to_percent(older, col_key, shift=0):
    """Compute the Democratic share of the two-party total per (year, state).

    Iterates the notebook-level ``years`` and ``states`` sets and reads
    ``older.loc[year - shift, state, party][col_key]`` for 'DEMOCRAT' and
    'REPUBLICAN'.

    Args:
        older: Frame indexed by (year, state, party) holding ``col_key``.
        col_key: Column to read; also the name of the output share column.
        shift: Number of years subtracted from each year before the lookup.

    Returns:
        DataFrame with columns ``year``, ``state`` and ``col_key``, the last
        holding ``100 * dem / (dem + rep)``. Pairs for which either lookup
        raises ``KeyError`` are left out.
    """
    rows = []
    for year in years:
        lookup_year = year - shift
        for state in states:
            try:
                dem = older.loc[lookup_year, state, 'DEMOCRAT'][col_key]
                rep = older.loc[lookup_year, state, 'REPUBLICAN'][col_key]
            except KeyError:
                # one of the two parties has no row here: skip the pair
                continue
            rows.append([int(year), state, 100*dem/(dem+rep)])
    return pd.DataFrame(data=rows, columns=['year', 'state', col_key])

In [None]:
def to_splatter(a, col_name):
    """Reshape a wide table (one column per year) into long form.

    Every column of ``a`` from the third onwards is treated as a year; its
    label is converted with ``int()``. State names come from the 'GeoName'
    column, upper-cased.

    Args:
        a: Wide DataFrame with a 'GeoName' column and year columns from
            position 2 onwards.
        col_name: Name given to the value column of the result.

    Returns:
        DataFrame with columns ``year``, ``state`` and ``col_name``, ordered
        year by year and, within a year, in the row order of ``a``.
    """
    year_columns = a.columns[2:]
    records = [
        [int(year), record['GeoName'].upper(), record[year]]
        for year in year_columns
        for _, record in a.iterrows()
    ]
    return pd.DataFrame(data=records, columns=['year', 'state', col_name])

In [None]:
def var42(df_res, colname, filename, old_colname, jump=1):
    """Build per-election lag columns from a long (year, state) table.

    For every pair in the notebook-level ``years`` x ``states`` the values of
    ``old_colname`` at ``year - i`` for ``i in range(0, 4, jump)`` are placed
    side by side. Output columns are named ``{colname}_{k}`` with ``k``
    counting down, so the highest suffix holds the election-year value and
    suffix 1 the oldest one. The result is also written to
    ``datasets-clean/{filename}.csv``.

    Args:
        df_res: Long DataFrame with 'year', 'state' and ``old_colname``.
        colname: Prefix for the generated lag columns.
        filename: Stem of the CSV file written under ``datasets-clean/``.
        old_colname: Column of ``df_res`` to read.
        jump: Step in years between consecutive lags.

    Returns:
        The lag table sorted by year and state. A (year, state) pair with any
        missing lag is dropped.
    """
    offsets = list(range(0, 4, jump))
    # Name columns from the number of lags actually produced; ``4 // jump``
    # disagrees with ``range(0, 4, jump)`` when jump does not divide 4.
    lag_columns = [f'{colname}_{i}' for i in range(len(offsets), 0, -1)]

    indexed = df_res.set_index(['year', 'state'])
    rows = []
    for y in years:
        for s in states:
            try:
                rows.append([y, s] + [indexed.loc[y - i, s][old_colname] for i in offsets])
            except KeyError:
                # at least one lag is missing for this pair: leave it out
                pass

    new_df = pd.DataFrame(data=rows, columns=['year', 'state'] + lag_columns)
    new_df = new_df.sort_values(by=['year', 'state'])
    new_df.to_csv(f'datasets-clean/{filename}.csv', index=False)
    return new_df

In [None]:
def beadiff(rowcol, colname, diff = True, shift=1):
    """Return per-state values or percent changes of ``colname``.

    For each state in the notebook-level ``states`` set, the years present
    for that state are sorted and the first ``shift`` of them are skipped.

    Args:
        rowcol: DataFrame indexed by (year, state) containing ``colname``.
        colname: Column to read; also the name of the output value column.
        diff: When true, emit ``(value[y] / value[y - shift] - 1) * 100``;
            otherwise emit ``value[y]`` unchanged.
        shift: Lag in years, and the number of leading years skipped.

    Returns:
        DataFrame with columns ``year``, ``state`` and ``colname``.
    """
    records = []

    for state in states:
        state_rows = rowcol.loc[(slice(None), state), :]
        seen_years = sorted({key[0] for key in state_rows.index})
        for year in seen_years[shift:]:
            value = rowcol.loc[(year, state), :][colname]
            if diff:
                base = rowcol.loc[(int(year) - shift, state), :][colname]
                # percent change relative to the value ``shift`` years earlier
                value = (value / base - 1)*100
            records.append([year, state, value])

    return pd.DataFrame(data=records, columns=['year', 'state', colname])

In [None]:
def whore(df, yy=years, ss=states):
    """List the (year, state) pairs that have no row in ``df``.

    Args:
        df: DataFrame with 'year' and 'state' columns, or one already indexed
            by exactly those two levels.
        yy: Years to check; defaults to the notebook-level ``years``.
        ss: States to check; defaults to the notebook-level ``states``.

    Returns:
        List of ``(year, state)`` tuples absent from ``df``, grouped by state.

    Raises:
        KeyError: If ``df`` is not indexed by (year, state) and lacks those
            columns.
    """
    # Accept frames already indexed by (year, state); calling set_index on
    # them again would raise KeyError.
    if list(df.index.names) != ['year', 'state']:
        df = df.set_index(['year', 'state'])
    present = df.index
    # Membership test instead of a bare ``except`` around ``.loc``.
    return [(y, s) for s in ss for y in yy if (y, s) not in present]

#### y_hat, a.k.a. presidential popular vote

In [None]:
# Target variable: Democratic share of the two-party presidential vote per
# (year, state), written to datasets-clean/popular-vote-y.csv.
pop_vote = pd.read_csv('datasets/1976-2020-president.csv')
pop_vote = pop_vote[['year', 'state', 'party_simplified', 'candidatevotes', ]] # drop useless columns
# sum votes within each (year, state, party) so each party has a single row
pop_vote = pop_vote.groupby(['year', 'state', 'party_simplified', ]).sum() # drop multiple candidate
pop_vote = to_percent(pop_vote, 'candidatevotes')
pop_vote = pop_vote.rename(columns={"candidatevotes": "y_votes_percent", })
pop_vote = pop_vote.sort_values(by=['year', 'state'])
pop_vote.to_csv('datasets-clean/popular-vote-y.csv', index=False)
dflist.append(pop_vote)
pop_vote

#### GDP and friends

It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997.
Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year.
While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year.
~~Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.~~

In [None]:
# Load the four state GDP tables and reshape each from wide (one column per
# year) to long (year, state, value).
# NOTE(review): going by the file names, *_97 are the pre-1997 series and
# *_12 the 1997-2020 series, the real ones chained to 1997 and 2012
# respectively -- confirm against the data source.
gdp_nom_97 = to_splatter(pd.read_csv('datasets/gdp-nominal-63-97.csv'), 'gdp_nom')
gdp_nom_12 = to_splatter(pd.read_csv('datasets/gdp-nominal-97-20.csv'), 'gdp_nom')
gdp_real_97 = to_splatter(pd.read_csv('datasets/gdp-real-77-97-chain-97.csv'), 'gdp_real')
gdp_real_12 = to_splatter(pd.read_csv('datasets/gdp-real-97-20-chain-12.csv'), 'gdp_real')

In [None]:
# Index every GDP table by (year, state): beadiff and the deflator division
# below both rely on this index.
gdp_nom_97 = gdp_nom_97.set_index(['year', 'state'])
gdp_nom_12 = gdp_nom_12.set_index(['year', 'state'])
gdp_real_97 = gdp_real_97.set_index(['year', 'state'])
gdp_real_12 = gdp_real_12.set_index(['year', 'state'])

#### price index

In [None]:
# Implicit price deflator = nominal / real, computed separately for each
# series. The division aligns on (year, state); pairs missing from either
# side come out as NaN and are dropped.
gdp_def_97 = (gdp_nom_97['gdp_nom'] / gdp_real_97['gdp_real']).to_frame('gdp_def').dropna()
gdp_def_12 = (gdp_nom_12['gdp_nom'] / gdp_real_12['gdp_real']).to_frame('gdp_def').dropna()

In [None]:
# Year-over-year percent change of the deflator, computed within each series
# and then stacked. beadiff skips the first year of every series.
gdp_def = pd.concat([beadiff(gdp_def_12,'gdp_def'), beadiff(gdp_def_97,'gdp_def')])
gdp_def = gdp_def.sort_values(by=['year', 'state'])
gdp_def

In [None]:
# Lay the election-year value and the three preceding years of inflation side
# by side (columns def_4 .. def_1); also written to
# datasets-clean/price-index.csv.
def_index = var42(gdp_def, colname='def', filename='price-index', old_colname='gdp_def')
dflist.append(def_index)
def_index

#### gdp dummies and margin

In [None]:
# Back-cast the gdp_real_12 series to the years before its first one by
# carrying the year-to-year growth of gdp_real_97 backwards, one year at a
# time, from the overlapping year 1997.
# NOTE(review): the loop stops at 1978; if gdp_real_97 also holds 1977 that
# year is left out -- confirm whether that is intended.
gdp_real_plain = gdp_real_12.copy()
for y in range(1996, 1977, -1):
    for s in sorted(states):
        a = gdp_real_97.loc[(y,s),'gdp_real']       # old-basis level, year y
        b = gdp_real_97.loc[(y+1,s),'gdp_real']     # old-basis level, year y+1
        c = gdp_real_plain.loc[(y+1,s),'gdp_real']  # spliced level, year y+1

        # Preserve the old-basis growth rate: plain[y] / plain[y+1] == a / b,
        # hence plain[y] = c * a / b (the previous a*b/c inverted the ratio).
        gdp_real_plain.at[(y,s),'gdp_real'] = c*a/b

gdp_real_plain = gdp_real_plain.reset_index()

In [None]:
# Year-over-year percent growth of real GDP, computed within each series
# (so no growth rate mixes the two chain bases) and then stacked.
gdp_real_var = pd.concat([beadiff(gdp_real_97,'gdp_real'), beadiff(gdp_real_12,'gdp_real')])

In [None]:
# Two-year percent growth of real GDP, computed within each series.
gdp_real_var_2 = pd.concat([beadiff(gdp_real_97,'gdp_real', shift=2), beadiff(gdp_real_12,'gdp_real', shift=2)])

# Patch in 1998 from the spliced series.
# NOTE(review): presumably neither series alone contains both 1996 and 1998,
# which is why this year needs the spliced levels -- confirm.
tmp = gdp_real_plain.set_index(['year','state'])
# Same unit as beadiff's output (percent change); the plain ratio used
# before put values around 1.0x next to percentages.
tmp = (tmp.loc[1998] / tmp.loc[1996] - 1) * 100
tmp['year'] = 1998

gdp_real_var_2 = pd.concat([gdp_real_var_2, tmp.reset_index(), ])

#### z growth index

In [None]:
# Yearly real GDP growth (percent) that must be exceeded for a year to be
# flagged 1 in the z growth index computed in the next cell.
GDP_THRESHOLD = 3.6

In [None]:
# Flag every (year, state) whose yearly real GDP growth exceeds
# GDP_THRESHOLD (1 = above, 0 = otherwise, NaN counts as 0), then lay the
# four years of each presidential term side by side.
z_yearly = gdp_real_var.copy()
z_yearly['gdp_real'] = (z_yearly['gdp_real'] > GDP_THRESHOLD).astype('int64')

growth_index = var42(
    z_yearly,
    colname='z',
    filename='z-growth-index',
    old_colname='gdp_real',
)
dflist.append(growth_index)
growth_index

#### g gdp index

In [None]:
# Two-year growth at the election year (gdp_mt_2) and two years earlier
# (gdp_mt_1); also written to datasets-clean/gdp-growth-index.csv.
gdp_index = var42(gdp_real_var_2, colname='gdp_mt', filename='gdp-growth-index', old_colname='gdp_real', jump=2)
dflist.append(gdp_index)
gdp_index

In [None]:
# Disabled cell: the code below sits inside a string literal and never runs.
# NOTE(review): before re-enabling, 'gdp_34' averages gdp_3 with itself
# (presumably gdp_3 and gdp_4 were meant), and gdp_index as built above has
# columns gdp_mt_1 / gdp_mt_2, not gdp_1 .. gdp_4.
'''
gdp_index['gdp_34'] = (gdp_index['gdp_3']+gdp_index['gdp_3'])/2
gdp_index['gdp_12'] = (gdp_index['gdp_1']+gdp_index['gdp_2'])/2

del gdp_index['gdp_1']
del gdp_index['gdp_2']
del gdp_index['gdp_3']
del gdp_index['gdp_4']
#'''

#### incumbent

In [None]:
# Incumbency information is per election year only; replicate it for every
# state by joining it onto the full year x state grid.
incumbent = pd.read_csv('datasets/incumbent-4president-76-20.csv', sep=';')

tmp = pd.DataFrame(data=[[y,s] for y in years for s in states], columns=['year', 'state'])
# inner join on 'year': years missing from the incumbent file drop out
incumbent = pd.merge(tmp, incumbent, how='inner', left_on='year', right_on='year')
incumbent = incumbent.sort_values(by=['year', 'state'])
incumbent.to_csv('datasets-clean/incumbent-longitudinal-replication.csv', index=False)
dflist.append(incumbent)
incumbent

#### house dummy

In [None]:
# House dummy: sign of the two-party House vote two years before each
# presidential election (shift=2), per state.
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes', shift=2)
# -1 when the Democratic share is below 50, +1 otherwise
house_vote['candidatevotes'] = house_vote['candidatevotes'].apply(lambda x: -1 if x < 50 else 1)
house_vote = house_vote.rename(columns={'candidatevotes':'house_midterm'})
# NOTE(review): midterm-fixed.csv presumably supplies hand-filled rows for
# pairs that to_percent skipped -- confirm; duplicates are not checked here.
house_vote = pd.concat([house_vote, pd.read_csv('datasets/midterm-fixed.csv')])
house_vote.state = house_vote.state.apply(lambda x : x.upper())
house_vote = house_vote.sort_values(by=['year', 'state'])
house_vote.to_csv('datasets-clean/incumbent-house-rep.csv', index=False)
dflist.append(house_vote)
house_vote

#### personal income

In [None]:
# Per-capita personal income: levels (not growth) for the election year and
# the three years before it, as columns avg_inc_4 .. avg_inc_1.
inc_cap = to_splatter(pd.read_csv('datasets/personal-income-per-capita-72-20.csv'), 'avg_inc')
#inc_cap.avg_inc = inc_cap.avg_inc.apply(lambda x: math.log(x))
inc_cap = inc_cap.set_index(['year', 'state'])
# diff=False: values are copied unchanged, but beadiff still skips the first
# year available for each state
inc_cap = beadiff(inc_cap, 'avg_inc', False)
inc_cap = var42(inc_cap, 'avg_inc', 'income-index', 'avg_inc')
dflist.append(inc_cap)
inc_cap

#### unemployment

In [None]:
# Monthly state unemployment rates keyed by BLS series id.
unn = pd.read_csv('datasets/unemployment-76-21-percent.csv')
furi = 'datasets-clean/serial-id-bsl.pkl'

# Series-id -> name lookup. The scraping branch is switched off; the cached
# pickle written by an earlier run is loaded instead.
# NOTE(review): pickle executes code on load -- only use a cache file this
# notebook produced itself.
if False:
    sd = dict()
    # NOTE(review): `nunn` is not defined in any visible cell; this branch
    # raises NameError if re-enabled as is.
    for s in set([*unn['Series ID'], *nunn['Series ID']]):
        try:
            sd[s] = bls_serial(s)
        except IndexError:
            # response lacked the expected table cell: keep a marker value
            sd[s] = f'BADASS_{s}'
    # context manager so the cache file is flushed and closed
    with open(furi, 'wb') as fh:
        pk.dump(sd, fh)
else:
    with open(furi, 'rb') as fh:
        sd = pk.load(fh)

unn = unn.rename(columns={
    "Series ID": "state",
    "Year": "year",
    "Value": "unemp",
    "Period": "month",
    })

# transform
unn.unemp = unn.unemp.apply(foofloat).astype(float)      # unparsable values become NaN
unn.state = unn.state.map(sd)                            # series id -> cached name
unn.month = unn.month.apply(lambda x: x.split('M')[-1])  # keep the text after the last 'M'

In [None]:
# Fill missing state unemployment values with the national rate of the same
# (year, month).
# NOTE(review): assumes DATE looks like 'YYYY-MM-DD' -- confirm.
glue = pd.read_csv('datasets/unemployment-rate-us-country-47-21.csv')
# The year is cast to int because later cells look the state table up with
# integer years; months stay as the text between the dashes.
glue['year'] = glue.DATE.apply(lambda x: int(x.split('-')[0]))
glue['month'] = glue.DATE.apply(lambda x: x.split('-')[1])
glue = glue.set_index(['year','month'])

# `x == np.NaN` is always False (NaN never equals anything), so the previous
# check never filled a single value; isna() finds the real gaps.
for i in unn[unn.unemp.isna()].index:
    key = (int(unn.at[i, "year"]), unn.at[i, "month"])
    # leave the gap as NaN when the national series has no value either
    if key in glue.index:
        unn.at[i, "unemp"] = glue.at[key, 'UNRATE']

In [None]:
# Percent change of the unemployment rate over two windows per election:
#   unemp_mt_2: month '11' of y-2 -> month '10' of y
#   unemp_mt_1: month '01' of y-4 -> month '10' of y-2
# NOTE(review): the two windows have different lengths (m1 starts in
# January of y-4, m2 in November of y-2) -- confirm this is intended.
# A missing (year, month, state) key raises KeyError; nothing is skipped.
dmp = unn.set_index(['year','month','state'])

tmp = []
for y in years:
    for s in states:
        m2 = (dmp.loc[(y, '10', s), 'unemp'] / dmp.loc[(y-2, '11', s), 'unemp'])*100 - 100
        m1 = (dmp.loc[(y-2, '10', s), 'unemp'] / dmp.loc[(y-4, '01', s), 'unemp'])*100 - 100
        tmp.append([y,s,m1,m2])
        
mid_unem = pd.DataFrame(data=tmp, columns=['year', 'state', 'unemp_mt_1', 'unemp_mt_2'])
dflist.append(mid_unem)
mid_unem

In [None]:
# Disabled alternative (string literal, never runs): median unemployment
# over the two halves of the term instead of the percent changes above.
'''
tmp = []
for y in years:
    for s in states:
        m2 = unn[unn.year >= y-1][unn.year <= y][unn.state == s].unemp.median()
        m1 = unn[unn.year >= y-3][unn.year <= y-2][unn.state == s].unemp.median()
        tmp.append([y,s,m1,m2])
mid_unem = pd.DataFrame(data=tmp, columns=['year', 'state', 'unemp_12', 'unemp_34'])
dflist.append(mid_unem)
mid_unem
#'''

In [None]:
# Disabled alternative (string literal, never runs): yearly lag columns of
# unemployment built with var42.
'''
unn = var42(unn, 'unemp', 'unemployment-index', 'unemp')
dflist.append(unn)
unn
#'''

### merging features

In [None]:
# Start from the full year x state grid and inner-join every feature table
# onto it: a (year, state) pair missing from any table drops out of the
# final dataset.
df = pd.DataFrame(
    data=[[y, s] for y in years for s in states],
    columns=['year', 'state'],
)
for d in dflist:
    df = pd.merge(df, d, how='inner', on=['year', 'state'])

df = df.sort_values(by=['year', 'state'])
df.to_csv('datasets-clean/xxx-final-dataset.csv', index=False)
df

In [None]:
# Sanity check: show the head of every feature table that lacks at least one
# (year, state) pair.
for d in dflist:
    # whore() indexes the frame by (year, state) itself; handing it an
    # already-indexed frame made its set_index call raise KeyError.
    w = whore(d)
    if w:
        print('-+'*30)
        print(d.head())

In [None]:
# Raises ZeroDivisionError -- presumably on purpose, so that "run all" stops
# here and the scratch cells below are not executed (confirm).
0/0
# chernobyl zone

exp fail

In [None]:
# Scratch experiment: exponential of the standardised real GDP, printed for
# the first state only (the loop breaks after one iteration).
# NOTE(review): `gdp_real` is not defined in any visible cell; as written
# this raises NameError. The mean and variance are also recomputed for
# every element inside the lambda.
gdp_boot = gdp_real.copy()
gdp_exp = gdp_boot.set_index(['year', 'state'])
for s in states:
    print(gdp_exp.loc[(slice(None),s),:].gdp_real.apply(lambda x : math.exp((x - gdp_exp.gdp_real.mean())/ math.sqrt(gdp_exp.gdp_real.var()))))
    break

us stuff

In [None]:
# National GDP series: reduce DATE to its integer year and plot a histogram
# of GDPC1_PC1 for the years after 1975.
# NOTE(review): GDPC1_PC1 is presumably the percent change of real GDP --
# confirm against the source file.
gdp_us = pd.read_csv('datasets/gdp-nomina-47-20-chain-12.csv')
gdp_us.DATE = gdp_us.DATE.apply(lambda x: int(str(x).split('-')[0]))
s = gdp_us[gdp_us.DATE > 1975].GDPC1_PC1  # note: rebinds the global `s`
s.hist()

In [None]:
# Empirical distribution of the national series selected above.
ecdf = ECDF(s)
ecdf(2.8)  # result is discarded: only the last expression of a cell is displayed

qnt = 0.67
std_err = np.sqrt(s.var())/2  # half of the sample standard deviation
# displayed tuple: (value at fraction qnt, len(s) * (1 - qnt), std_err)
ecdf_inv(s, qnt), len(s)*(1-qnt), std_err

#### house

In [None]:
# Disabled cell (string literal, never runs): House two-party share for the
# same year as the presidential election, kept as a percentage.
'''
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })
#'''

#### gasoline and friends + approval

In [None]:
# Gasoline prices: parse the 'date' column (month/day/year text) into
# timestamps.
gas = pd.read_csv('datasets/gasoline-93-21.csv')
# Parse once with pandas. The earlier strptime -> mktime -> fromtimestamp
# round trip went through local-time epoch seconds, which depends on the
# machine's time zone and added nothing.
gas.date = pd.to_datetime(gas.date, format="%m/%d/%Y")
gas

#### house prices and rent + personal income

In [None]:
# House values by state: the file is '$'-separated, one row per line; every
# field is kept as stripped text.
# Context manager so the file handle is closed after reading.
with open('datasets/house-chain-00.csv', 'rt') as f:
    lines = f.readlines()
houses = [[i.strip() for i in l.split(sep = '$') ] for l in lines ]
# NOTE(review): assumes exactly eight fields per line, matching the column
# list below -- confirm.
houses = pd.DataFrame(data=houses, columns=['state', 2000, 1990, 1980, 1970, 1960, 1950, 1940 ])
houses.head()