# and then we go to mario draghi

In [1]:
import pandas as pd
import time
import datetime
import matplotlib
import numpy as np
from statsmodels.distributions.empirical_distribution import ECDF
import math

# Raw presidential returns; used here only to enumerate the years and states present.
ad = pd.read_csv('datasets/1976-2020-president.csv')

# Distinct years and state names found in the returns file.
# The helper functions defined further down read these two sets as globals.
years, states = set(ad.year), set(ad.state)

# Feature tables are appended here one by one and merged in the last step.
dflist = []

#### helper functions

In [2]:
def ecdf_inv(a, q):
    """Return the empirical ``q``-quantile of ``a`` (inverse of the ECDF).

    The sample is sorted and the element at position ``int(len(a) * q)`` is
    returned.

    Parameters
    ----------
    a : iterable of comparable values (list, Series, ...).
    q : float, expected in ``[0, 1]``.

    Returns
    -------
    The element of ``a`` at the requested rank.

    Raises
    ------
    IndexError
        If ``a`` is empty.
    """
    ordered = sorted(a)
    # int(len * q) equals len when q == 1, which is one past the last element;
    # clamp so that the top quantile returns the maximum instead of raising.
    rank = min(int(len(ordered) * q), len(ordered) - 1)
    return ordered[rank]

In [3]:
def to_percent(older, col_key, shift=0):
    """Compute the DEMOCRAT share of the DEMOCRAT + REPUBLICAN total, in percent.

    ``older`` is looked up with ``(year, state, party)`` keys and ``col_key``
    is read from the matching rows. For every year in the global ``years``
    and every state in the global ``states`` the value is taken from
    ``year - shift`` but stored under ``year``. Combinations for which either
    party row is missing are skipped.

    Returns a DataFrame with columns ``year``, ``state`` and ``col_key``.
    """
    shares = []
    for yr in years:
        lookup_year = yr - shift
        for st in states:
            try:
                dem = older.loc[lookup_year, st, 'DEMOCRAT'][col_key]
                rep = older.loc[lookup_year, st, 'REPUBLICAN'][col_key]
            except KeyError:
                # no two-party data for this (year, state): leave it out
                continue
            shares.append([int(yr), st, 100 * dem / (dem + rep)])  # % share for each state
    return pd.DataFrame(data=shares, columns=['year', 'state', col_key])

In [4]:
def to_splatter(a, col_name):
    """Reshape a wide table (one column per year) into long year/state rows.

    Every column from the third one onward is treated as a year label. For
    each of them, one output row per input row is produced, holding the year
    as ``int``, the upper-cased ``GeoName`` and the cell value.

    Returns a DataFrame with columns ``year``, ``state`` and ``col_name``,
    ordered by year column first and input row second.
    """
    year_labels = a.columns[2:]
    records = [
        [int(label), entry['GeoName'].upper(), entry[label]]
        for label in year_labels
        for _, entry in a.iterrows()
    ]
    return pd.DataFrame(data=records, columns=['year', 'state', col_name])

In [5]:
def var42(df_res, colname, filename, old_colname):
    """Build a four-year window of ``old_colname`` for each (year, state) and save it.

    For every year in the global ``years`` and state in the global ``states``
    the values of ``old_colname`` at ``year``, ``year-1``, ``year-2`` and
    ``year-3`` are collected, in that order, into the columns
    ``{colname}_4``, ``{colname}_3``, ``{colname}_2``, ``{colname}_1``.
    A (year, state) pair is dropped entirely if any of the four lookups fails.

    The result is sorted by year and state, written to
    ``datasets-clean/{filename}.csv`` and returned.
    """
    indexed = df_res.set_index(['year', 'state'])
    window_columns = [f'{colname}_{k}' for k in range(4, 0, -1)]

    rows = []
    for yr in years:
        for st in states:
            try:
                window = [indexed.loc[yr - lag, st][old_colname] for lag in range(4)]
            except KeyError:
                # at least one of the four years is missing for this state
                continue
            rows.append([yr, st] + window)

    result = pd.DataFrame(data=rows, columns=['year', 'state'] + window_columns)
    result = result.sort_values(by=['year', 'state'])
    result.to_csv(f'datasets-clean/{filename}.csv', index=False)
    return result

In [6]:
def beadiff(rowcol, colname):
    """Year-over-year percentage change of ``colname`` for each state.

    ``rowcol`` must be indexed by ``(year, state)``. For every state in the
    global ``states`` the years present for that state are sorted, the first
    one is skipped, and for each remaining year the value is divided by the
    value stored under ``year - 1``; the result is ``(ratio - 1) * 100``.

    Note that the previous value is looked up by calendar year, not by
    position, so the years of each state are expected to be consecutive.

    Returns a DataFrame with columns ``year``, ``state`` and ``colname``.
    """
    changes = []
    for st in states:
        state_rows = rowcol.loc[(slice(None), st), :]
        observed_years = sorted({key[0] for key in state_rows.index})
        for yr in observed_years[1:]:
            current = rowcol.loc[(yr, st), :][colname]
            previous = rowcol.loc[(int(yr) - 1, st), :][colname]
            changes.append([yr, st, (current / previous - 1) * 100])

    return pd.DataFrame(data=changes, columns=['year', 'state', colname])

#### y_hat aka president popular vote

In [7]:
# Democratic share of the two-party presidential vote per (year, state).
pop_vote = pd.read_csv('datasets/1976-2020-president.csv')
pop_vote = (
    pop_vote[['year', 'state', 'party_simplified', 'candidatevotes']]  # drop useless columns
    .groupby(['year', 'state', 'party_simplified'])
    .sum()  # collapse multiple candidates of the same party
)
pop_vote = (
    to_percent(pop_vote, 'candidatevotes')
    .rename(columns={"candidatevotes": "y_votes_percent"})
    .sort_values(by=['year', 'state'])
)
pop_vote.to_csv('datasets-clean/popular-vote-y.csv', index=False)
dflist.append(pop_vote)
pop_vote

Unnamed: 0,year,state,y_votes_percent
516,1976,ALABAMA,56.666724
513,1976,ALASKA,38.108171
542,1976,ARIZONA,41.386697
548,1976,ARKANSAS,65.048851
545,1976,CALIFORNIA,49.082173
...,...,...,...
169,2020,VIRGINIA,55.154687
180,2020,WASHINGTON,59.925503
196,2020,WEST VIRGINIA,30.201468
190,2020,WISCONSIN,50.319063


#### GDP and friends

It was necessary to generate an adjusted series of state GDP because of a change in BEA’s estimation procedure from a Standard Industrial Classification (SIC) basis to a North American Industry Classification System (NAICS) basis in 1997.
Data prior to 1997 were adjusted to avoid any erratic shifts in GDP that year.
While the change to NAICS basis occurred in 1997, BEA also provides estimates under a SIC basis in that year.
~~Our adjustment involved calculating the 1997 ratio of NAICS-based GDP to SIC-based GDP for each state, and multiplying it by SIC-based GDP in all years prior to 1997 to obtain our adjusted series of state-level GDP.~~

In [8]:
# Load the four state GDP tables and reshape each from one-column-per-year
# to long (year, state, value) format.
gdp_nom_97 = pd.read_csv('datasets/gdp-nominal-63-97.csv').pipe(to_splatter, 'nominal_gdp_mln')
gdp_nom_12 = pd.read_csv('datasets/gdp-nominal-97-20.csv').pipe(to_splatter, 'nominal_gdp_mln')
gdp_real_97 = pd.read_csv('datasets/gdp-real-77-97-chain-97.csv').pipe(to_splatter, 'gdp_real')
gdp_real_12 = pd.read_csv('datasets/gdp-real-97-20-chain-12.csv').pipe(to_splatter, 'gdp_real')

In [9]:
# Key every GDP table by (year, state) so they can be aligned and looked up by label.
gdp_nom_97, gdp_nom_12, gdp_real_97, gdp_real_12 = (
    table.set_index(['year', 'state'])
    for table in (gdp_nom_97, gdp_nom_12, gdp_real_97, gdp_real_12)
)

In [10]:
# Deflator = nominal GDP / real GDP, aligned on (year, state);
# pairs present in only one of the two tables yield NaN and are dropped.
gdp_def_97 = gdp_nom_97['nominal_gdp_mln'].div(gdp_real_97['gdp_real']).to_frame('gdp_def').dropna()
gdp_def_12 = gdp_nom_12['nominal_gdp_mln'].div(gdp_real_12['gdp_real']).to_frame('gdp_def').dropna()

In [11]:
# Yearly % change of the deflator, computed separately on each series and stacked.
gdp_def = pd.concat(
    [beadiff(gdp_def_12, 'gdp_def'), beadiff(gdp_def_97, 'gdp_def')]
).sort_values(by=['year', 'state'])
gdp_def

Unnamed: 0,year,state,gdp_def
120,1978,ALABAMA,7.402566
60,1978,ALASKA,11.065835
640,1978,ARIZONA,7.468516
760,1978,ARKANSAS,8.054511
700,1978,CALIFORNIA,7.094885
...,...,...,...
390,2020,VIRGINIA,1.950355
643,2020,WASHINGTON,1.658615
1011,2020,WEST VIRGINIA,-0.257632
873,2020,WISCONSIN,2.055099


In [12]:
# Four-year window of deflator growth per (year, state); also saved as price-index.csv.
def_index = var42(
    df_res=gdp_def,
    colname='def',
    filename='price-index',
    old_colname='gdp_def',
)
dflist.append(def_index)
def_index

Unnamed: 0,year,state,def_4,def_3,def_2,def_1
6,1984,ALABAMA,4.548485,4.185139,6.426162,9.372111
3,1984,ALASKA,1.250442,-1.018197,4.758325,23.893438
32,1984,ARIZONA,4.731524,5.112800,6.966784,8.376158
38,1984,ARKANSAS,4.363585,4.161762,5.603037,8.995826
35,1984,CALIFORNIA,4.737549,4.477916,6.742430,8.871767
...,...,...,...,...,...,...
169,2020,VIRGINIA,1.950355,2.222617,1.866446,1.307918
180,2020,WASHINGTON,1.658615,1.894215,1.658639,1.276351
196,2020,WEST VIRGINIA,-0.257632,0.908538,3.283808,3.326667
190,2020,WISCONSIN,2.055099,2.222565,1.889366,1.230655


In [13]:
# Yearly % change of real GDP, computed separately on each series and stacked.
gdp_real = pd.concat(
    [beadiff(gdp_real_97, 'gdp_real'), beadiff(gdp_real_12, 'gdp_real')]
).sort_values(by=['year', 'state'])
gdp_real

Unnamed: 0,year,state,gdp_real
120,1978,ALABAMA,6.421827
60,1978,ALASKA,8.958679
640,1978,ARIZONA,10.462245
760,1978,ARKANSAS,6.534163
700,1978,CALIFORNIA,6.881903
...,...,...,...
390,2020,VIRGINIA,-2.757118
643,2020,WASHINGTON,-0.581850
1011,2020,WEST VIRGINIA,-3.902932
873,2020,WISCONSIN,-4.006500


#### z growth index

In [14]:
# Cut-off applied to yearly real GDP growth (in percent): growth strictly above
# this value is coded 1 in the z growth index, everything else 0.
# NOTE(review): the origin of 2.67 is not shown here — presumably a quantile of
# US real GDP growth; confirm and record the source.
GDP_THRESHOLD = 2.67

In [15]:
# Binary "strong growth" flag per (year, state): 1 when growth exceeds the threshold.
z_yearly = gdp_real.copy()
z_yearly['gdp_real'] = (z_yearly['gdp_real'] > GDP_THRESHOLD).astype('int64')

growth_index = var42(
    df_res=z_yearly,
    colname='z',
    filename='z-growth-index',
    old_colname='gdp_real',
)
dflist.append(growth_index)
growth_index

Unnamed: 0,year,state,z_4,z_3,z_2,z_1
6,1984,ALABAMA,1,1,0,0
3,1984,ALASKA,1,0,0,1
32,1984,ARIZONA,1,1,0,1
38,1984,ARKANSAS,1,1,0,1
35,1984,CALIFORNIA,1,1,0,1
...,...,...,...,...,...,...
169,2020,VIRGINIA,0,0,0,0
180,2020,WASHINGTON,0,1,1,1
196,2020,WEST VIRGINIA,0,0,0,0
190,2020,WISCONSIN,0,0,0,0


#### g gdp index

In [16]:
# Four-year window of real GDP growth per (year, state); also saved as gdp-growth-index.csv.
gdp_index = var42(
    df_res=gdp_real,
    colname='gdp',
    filename='gdp-growth-index',
    old_colname='gdp_real',
)
dflist.append(gdp_index)
gdp_index

Unnamed: 0,year,state,gdp_4,gdp_3,gdp_2,gdp_1
6,1984,ALABAMA,4.719412,4.944172,-2.488787,1.840597
3,1984,ALASKA,3.995028,-2.715237,2.618149,15.266624
32,1984,ARIZONA,10.752575,5.715111,-2.352463,2.781647
38,1984,ARKANSAS,7.839078,3.206390,-2.673933,3.580354
35,1984,CALIFORNIA,8.028589,3.578332,0.028616,3.291989
...,...,...,...,...,...,...
169,2020,VIRGINIA,-2.757118,1.973924,2.377204,1.469457
180,2020,WASHINGTON,-0.581850,3.946746,6.794566,5.359424
196,2020,WEST VIRGINIA,-3.902932,-0.781380,2.315721,2.068235
190,2020,WISCONSIN,-4.006500,1.494643,2.430560,0.133906


#### incumbent

In [17]:
incumbent = pd.read_csv('datasets/incumbent-4president-76-20.csv', sep=';')

# The incumbency file is joined on year only, so its columns are repeated
# for every state of the (year, state) grid.
tmp = pd.DataFrame(data=[[y, s] for y in years for s in states], columns=['year', 'state'])
incumbent = (
    tmp.merge(incumbent, how='inner', on='year')
    .sort_values(by=['year', 'state'])
)
incumbent.to_csv('datasets-clean/incumbent-longitudinal-replication.csv', index=False)
dflist.append(incumbent)
incumbent

Unnamed: 0,year,state,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice
516,1976,ALABAMA,-1,0,-1.0,0,0
513,1976,ALASKA,-1,0,-1.0,0,0
542,1976,ARIZONA,-1,0,-1.0,0,0
548,1976,ARKANSAS,-1,0,-1.0,0,0
545,1976,CALIFORNIA,-1,0,-1.0,0,0
...,...,...,...,...,...,...,...
169,2020,VIRGINIA,-1,-1,0.0,0,1
180,2020,WASHINGTON,-1,-1,0.0,0,1
196,2020,WEST VIRGINIA,-1,-1,0.0,0,1
190,2020,WISCONSIN,-1,-1,0.0,0,1


#### house dummy

In [18]:
# House votes summed per (year, state, party).
house_vote = (
    pd.read_csv('datasets/1976-2020-house-utf8.csv')[['year', 'state', 'party', 'candidatevotes']]
    .groupby(['year', 'state', 'party'])
    .sum()
)
# shift=2: each year is paired with the House result from two years earlier.
house_vote = to_percent(house_vote, 'candidatevotes', shift=2)
# Dummy: +1 when the Democratic two-party share is below 50, -1 otherwise.
house_vote['candidatevotes'] = house_vote['candidatevotes'].map(
    lambda share: 1 if share < 50 else -1
)
house_vote = (
    house_vote.rename(columns={'candidatevotes': 'house_midterm'})
    .sort_values(by=['year', 'state'])
)
house_vote.to_csv('datasets-clean/incumbent-house-rep.csv', index=False)
dflist.append(house_vote)
house_vote

Unnamed: 0,year,state,house_midterm
495,1980,ALABAMA,-1
492,1980,ALASKA,1
520,1980,ARIZONA,-1
526,1980,ARKANSAS,1
523,1980,CALIFORNIA,-1
...,...,...,...
162,2020,VIRGINIA,-1
172,2020,WASHINGTON,-1
187,2020,WEST VIRGINIA,1
181,2020,WISCONSIN,-1


#### personal income

In [19]:
# Per-capita personal income in long format, log-transformed.
inc_cap = to_splatter(pd.read_csv('datasets/personal-income-per-capita-72-20.csv'), 'avg_inc')
inc_cap['avg_inc'] = inc_cap['avg_inc'].map(math.log)
inc_cap = inc_cap.set_index(['year', 'state'])

# NOTE(review): beadiff divides each value by the previous year's value, so on
# logged data this yields (log_t / log_{t-1} - 1) * 100, not a log difference —
# confirm this is the intended growth measure.
inc_cap = beadiff(inc_cap, 'avg_inc').sort_values(by=['year', 'state'])
inc_cap.to_csv('datasets-clean/avg-income.csv', index=False)
dflist.append(inc_cap)
inc_cap

Unnamed: 0,year,state,avg_inc
288,1973,ALABAMA,1.307419
144,1973,ALASKA,1.428610
1536,1973,ARIZONA,1.041957
1824,1973,ARKANSAS,1.831638
1680,1973,CALIFORNIA,0.926445
...,...,...,...
815,2020,VIRGINIA,0.486027
1343,2020,WASHINGTON,0.571009
2111,2020,WEST VIRGINIA,0.535082
1823,2020,WISCONSIN,0.453397


#### merging features

In [20]:
# Start from the full (year, state) grid and inner-join every feature table on it;
# only pairs present in all tables survive.
df = pd.DataFrame(data=[[y, s] for y in years for s in states], columns=['year', 'state'])
for d in dflist:
    df = df.merge(d, how='inner', on=['year', 'state'])

df = df.sort_values(by=['year', 'state'])
df.to_csv('datasets-clean/xxx-final-dataset.csv', index=False)
df

Unnamed: 0,year,state,y_votes_percent,def_4,def_3,def_2,def_1,z_4,z_3,z_2,...,gdp_3,gdp_2,gdp_1,incumbent,former_president_again,former_party_morethan_2,sudden_vice,lag_vice,house_midterm,avg_inc
6,1984,ALABAMA,38.736605,4.548485,4.185139,6.426162,9.372111,1,1,0,...,4.944172,-2.488787,1.840597,-1,-1,0.0,0,0,-1,0.987296
3,1984,ALASKA,30.944087,1.250442,-1.018197,4.758325,23.893438,1,0,0,...,-2.715237,2.618149,15.266624,-1,-1,0.0,0,0,1,0.115255
30,1984,ARIZONA,32.883272,4.731524,5.112800,6.966784,8.376158,1,1,0,...,5.715111,-2.352463,2.781647,-1,-1,0.0,0,0,1,0.954184
36,1984,ARKANSAS,38.772412,4.363585,4.161762,5.603037,8.995826,1,1,0,...,3.206390,-2.673933,3.580354,-1,-1,0.0,0,0,-1,1.122251
33,1984,CALIFORNIA,41.775465,4.737549,4.477916,6.742430,8.871767,1,1,0,...,3.578332,0.028616,3.291989,-1,-1,0.0,0,0,-1,0.899258
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162,2020,VIRGINIA,55.154687,1.950355,2.222617,1.866446,1.307918,0,0,0,...,1.973924,2.377204,1.469457,-1,-1,0.0,0,1,-1,0.486027
172,2020,WASHINGTON,59.925503,1.658615,1.894215,1.658639,1.276351,0,1,1,...,3.946746,6.794566,5.359424,-1,-1,0.0,0,1,-1,0.571009
187,2020,WEST VIRGINIA,30.201468,-0.257632,0.908538,3.283808,3.326667,0,0,0,...,-0.781380,2.315721,2.068235,-1,-1,0.0,0,1,1,0.535082
181,2020,WISCONSIN,50.319063,2.055099,2.222565,1.889366,1.230655,0,0,0,...,1.494643,2.430560,0.133906,-1,-1,0.0,0,1,-1,0.453397


In [22]:
# Raises ZeroDivisionError on purpose.
# NOTE(review): presumably a barrier so that "Run All" stops here and the
# scratch cells further down are not executed — confirm.
0/0
# chernobyl zone: everything after this cell is experimental

ZeroDivisionError: division by zero

exp fail

In [None]:
gdp_boot = gdp_real.copy()
gdp_exp = gdp_boot.set_index(['year', 'state'])

# The column-wide mean and standard deviation do not depend on the element being
# transformed, so compute them once instead of once per element inside the lambda.
gdp_mean = gdp_exp.gdp_real.mean()
gdp_std = math.sqrt(gdp_exp.gdp_real.var())

# Print exp(z-score) of real GDP growth for a single state only (the loop
# stops after its first iteration).
for s in states:
    print(gdp_exp.loc[(slice(None), s), :].gdp_real.apply(lambda x: math.exp((x - gdp_mean) / gdp_std)))
    break

us stuff

In [None]:
gdp_us = pd.read_csv('datasets/gdp-nomina-47-20-chain-12.csv')
# Keep only the part of DATE before the first '-' and store it as an integer.
gdp_us['DATE'] = gdp_us['DATE'].map(lambda stamp: int(str(stamp).split('-')[0]))
# GDPC1_PC1 values for rows whose DATE is after 1975.
s = gdp_us.loc[gdp_us['DATE'] > 1975, 'GDPC1_PC1']
s.hist()

In [None]:
# Empirical CDF of the series `s` built in the previous cell.
ecdf = ECDF(s)
ecdf(2.8)  # result is discarded: only the last expression of a cell is displayed

# Displayed tuple: the qnt-quantile of s, the number of observations expected
# above it, and half the sample standard deviation.
qnt = 0.67
std_err = np.sqrt(s.var())/2
ecdf_inv(s, qnt), len(s)*(1-qnt), std_err

#### house

In [None]:
# Disabled variant of the house-vote cell (no shift, raw percentage kept).
# Everything between the triple quotes is a string literal and is not executed.
'''
house_vote = pd.read_csv('datasets/1976-2020-house-utf8.csv')
house_vote = house_vote[['year', 'state', 'party', 'candidatevotes', ]]
house_vote = house_vote.groupby(['year', 'state', 'party', ]).sum()
house_vote = to_percent(house_vote, 'candidatevotes')
house_vote = house_vote.rename(columns={"candidatevotes": "houserep_votes_percent", })
#'''

#### gasoline and friends + approval

In [None]:
gas = pd.read_csv('datasets/gasoline-93-21.csv')
# Parse the month/day/year strings straight into datetimes. The previous
# round trip through time.mktime()/fromtimestamp() went via local-time epoch
# seconds, which is unnecessary and depends on the machine's time zone rules.
gas.date = pd.to_datetime(gas.date, format="%m/%d/%Y")
gas

#### house prices and rent + personal income

In [None]:
# Read the '$'-separated file; the context manager closes the handle even if
# reading fails (it was previously left open).
with open('datasets/house-chain-00.csv', 'rt') as f:
    lines = f.readlines()

# One row per line, fields split on '$' and stripped of surrounding whitespace.
houses = [[i.strip() for i in l.split(sep='$')] for l in lines]
houses = pd.DataFrame(data=houses, columns=['state', 2000, 1990, 1980, 1970, 1960, 1950, 1940])
houses.head()