# Introduction

We want our model to estimate election outcomes based on the fundamentals and poll data. Thus, we need a "poll feature" for each district upon which to train. Unfortunately, many districts do not have poll data. We overcome this by interpolating poll results for districts that do not have polls from districts that do have polls based on districts' similarity in demographics and historical election results. We achieve this using [radial basis function interpolation](http://num.math.uni-goettingen.de/schaback/teaching/sc.pdf) (RBF interpolation).

# First Estimate

Before polls are released, we may validate this interpolation method by interpolating the 2016 presidential races' outcome based on the demographic and historical data that we will use to interpolate polls.

In [1]:
import pandas as pd
import numpy as np
import scipy.cluster
import scipy.interpolate
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# ins = ['White (non-hispanic)',
#        'BachOrHigher',
#        'MedHouseholdIncome',
#        'Obama 2012 Percent',
#        'Dem 2014 Congressional Percent',
#        'Dem 2016 Congressional Percent',
#        'Clinton 2016 Percent']
# outs = ['Most recent adjusted margin']

# # Open the file, drop rows that are missing relevant data
# df_orig = pd.read_excel('2018 data sheet.xlsx', sheetname='Input data')
# df_orig = df_orig.dropna(subset = ins + outs)

## Training and Validation

**TODO**: get the correct number of training points corresponding to the number of available polls

We partition our data into training and validation sets. We train the interpolation on most of the available data points, leaving out a set of validation points. We then predict the *Clinton 2016 Percent* of the validation points and compare to the true *Clinton 2016 Percent*, calculating the error in our interpolation.

In [2]:
# # Simulate interpolating the Obama 2012 percents given several random
# # validating sets
# totcor = 0
# y_pred_all = []
# y_true_all = []
# N = 1000
# for k in range(N):
#     # Randomly select 50 validation rows; we will try to interpolate
#     # Clinton 2016 Percent without running the interpolator on these rows
#     tidx = [np.random.randint(0, len(df_orig)) for i in range(5)]
#     trows = [df_orig.iloc[idx] for idx in tidx]
#     df = df_orig.drop(df_orig.index[tidx])

#     # Calculate RBF interpolations
#     inrep = [np.array(df[rep]).astype(float) for rep in ins]
#     outrep = [np.array(df[rep]).astype(float) for rep in outs]
#     features = inrep + outrep
#     rbfi = scipy.interpolate.Rbf(*features)

#     y_pred = [float(rbfi(*[row[rep] for rep in ins])) for row in trows]
#     y_true = [row[outs[0]] for row in trows]

#     correct = []
#     for i in range(len(y_pred)):
#         correct.append((y_pred[i] > 0.5) == (y_true[i] > 0.5))
#     totcor += sum(correct) / len(y_pred)
#     y_pred_all.extend(y_pred)
#     y_true_all.extend(y_true)

# # This represents the expected rate at which the interpolator correctly predicts the outcome of the race
# print('Fraction of Races Predicted Correctly: ' + str(totcor / N))
# print('Mean Squared Prediction Error: ' + str(mse(y_true_all, y_pred_all)))
# print('Mean Absolute Prediction Error: ' + str(mae(y_true_all, y_pred_all)))
# print('Stdev Prediction Error: ' + str(np.std(np.array(y_true_all) - np.array(y_pred_all))))

# plt.hist(np.array(y_true_all) - np.array(y_pred_all))
# plt.title('y_true - y_pred')
# plt.show()

# Real Interpolation

We will now interpolate real polls based on a large range of input features. We predict based on ProximityOne's [demographic data by congressional district](http://proximityone.com/cd.htm).

## Duplicates

RBF interpolation cannot handle multiple points that any of the same coordinate. ~~Thus we must drop rows that have any duplicated values.~~

In [46]:
ins = ['S' + str(rep).zfill(3) for rep in range(1, 100)]
outs = ['MRAM']

# Load the raw data
df = pd.read_excel('sdata.xlsx', sheet='Sheet 1')
df.columns = [rep.strip() for rep in df.columns]    # fix col names

# Strip non-data columns (i.e. district name)
for column in df.columns:
    if not (column in ins or column in outs):
        df.drop(column, axis=1, inplace=True)

# drop rows that have non-numerical data
df.dropna(inplace=True)
df = df[df.applymap(np.isreal).any(1)]

df_orig = df
df.to_csv('tmp.csv', sep=',')

In [57]:
def get_repeat_idxs(records_array):
    idx_sort = np.argsort(records_array)
    sorted_records_array = records_array[idx_sort]
    vals, idx_start, count = np.unique(sorted_records_array, return_counts=True,
                                    return_index=True)

    # sets of indices
    res = np.split(idx_sort, idx_start[1:])
    #filter them with respect to their size, keeping only items occurring more than once

    vals = vals[count > 1]
    return filter(lambda x: x.size > 1, res)

for cni in range(len(ins)):
    cn = ins[cni]
    col = np.array(df_orig[cn])
    repeated = list(get_repeat_idxs(col))
    print('repeated: ' + str(repeated))
    
    for r in repeated:
        for ki in range(1, len(r)):
#             print('df.iloc[r[ki], cni]: ' + str(df.iloc[r[ki], cni + 1]))
#             print('r[ki]: ' + str(r[ki]) + ', cni: ' + str(cni))
            df.iloc[r[ki], cni + 1] += ki * 0.003
#             print('df.iloc[r[ki], cni]: ' + str(df.iloc[r[ki], cni + 1]))
#         print('Repeated Vals (modded): ' + str(np.array(df[cn])[r[0]]) + ', ' + str(np.array(df[cn])[r[1]]))

for cni in range(len(ins)):
    cn = ins[cni]
    col = np.array(df_orig[cn])
    repeated = list(get_repeat_idxs(col))
#     print('repeated: ' + str(repeated))
    
    for r in repeated:
        for ki in range(1, len(r)):
#             print('df.iloc[r[ki], cni]: ' + str(df.iloc[r[ki], cni + 1]))
#             print('r[ki]: ' + str(r[ki]) + ', cni: ' + str(cni))
            df.iloc[r[ki], cni + 1] += ki
#             print('df.iloc[r[ki], cni]: ' + str(df.iloc[r[ki], cni + 1]))
#         print('Repeated Vals (modded): ' + str(np.array(df[cn])[r[0]]) + ', ' + str(np.array(df[cn])[r[1]]))
        

repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: [array([37, 43]), array([48, 45]), array([58, 22]), array([50, 79]), array([ 41, 103]), array([66, 87]), array([53, 96]), array([38, 67])]
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: []
repeated: [array([79, 54]), array([60, 13]), array([66, 14]), array([65,  9, 16]), array([25, 33, 11]), array([12, 26, 91]), array([48, 94]), array([ 83, 100,  95]), array([93, 76, 86]), array([39, 98, 82]), array([89, 55, 87]), array([97, 68, 20]), array([47, 96, 45]), array([92, 80]), array([35, 62]), array([37, 15]), array([44, 99, 90]), array([74,  3, 10]), array([58, 70, 85, 

In [51]:
# remove rows with NAs in relevant input columns (includes districts without polls)
df_orig = df_orig.dropna(subset = ins)
# remove rows with NAs in all relevant columns (doesn't include districts without polls)
df_valued = df_orig.dropna(subset = ins + outs)

# print('len(df_valued): ' + str(len(df_valued)))

# Validate interpolation on many partitions
totcor = 0
y_pred_all = []
y_true_all = []
N = 10
for k in range(N):
    tidx = [np.random.randint(0, len(df_valued)) for i in range(50)]
    trows = [df_valued.iloc[idx] for idx in tidx]
    df_train = df_valued.drop(df_valued.index[tidx])
    
    inrep = [np.array(df_train[rep]).astype(float) for rep in ins]
    outrep = [np.array(df_train[rep]).astype(float) for rep in outs]
    features = inrep + outrep
    rbfi = scipy.interpolate.Rbf(*features)
    
    y_pred = [float(rbfi(*[row[rep] for rep in ins])) for row in trows]
    y_true = [row[outs[0]] for row in trows]

    correct = []
    for i in range(len(y_pred)):
        correct.append((y_pred[i] > 0.5) == (y_true[i] > 0.5))
    totcor += sum(correct) / len(y_pred)
    y_pred_all.extend(y_pred)
    y_true_all.extend(correct)

# This represents the expected rate at which the interpolator correctly predicts the outcome of the race
print('Fraction of Races Predicted Correctly: ' + str(totcor / N))
print('Mean Squared Prediction Error: ' + str(mse(y_true_all, y_pred_all)))
print('Mean Absolute Prediction Error: ' + str(mae(y_true_all, y_pred_all)))
print('Stdev Prediction Error: ' + str(np.std(np.array(y_true_all) - np.array(y_pred_all))))

plt.hist(np.array(y_true_all) - np.array(y_pred_all))
plt.title('y_true - y_pred')
plt.show()
np.savetxt('ypred.csv', y_pred, delimiter=',')

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


LinAlgError: Matrix is singular.

In [None]:
# df_train = df_orig.dropna(subset = ins + outs)
# df_calc = df_orig.dropna(subset = ins)

# inrep = [np.array(df_train[rep]).astype(float) for rep in ins]
# outrep = [np.array(df_train[rep]).astype(float) for rep in outs]
# features = inrep + outrep
# rbfi = scipy.interpolate.Rbf(*features)

# y_pred = [float(rbfi(*[row[rep] for rep in ins])) for idx, row in df_calc.iterrows()]

# plt.hist(y_pred)
# plt.show()