In [53]:
import pandas as pd
import numpy as np
import scipy.cluster
import scipy.interpolate
from sklearn.metrics import mean_squared_error as mse, mean_absolute_error as mae

# Columns fed to the interpolator as inputs, and the column it should predict.
ins = ['White (non-hispanic)',
       'BachOrHigher',
       'MedHouseholdIncome',
       'Obama 2012 Percent',
       'Dem 2014 Congressional Percent',
       'Dem 2016 Congressional Percent']
outs = ['Clinton 2016 Percent']

N_TRIALS = 20       # number of random train/validation splits to average over
N_VALIDATION = 50   # rows held out of the interpolator in each split
N_CLUSTERS = 30     # number of k-means centroids requested
# A predicted/actual value above this is treated as the same "outcome".
# NOTE(review): assumes the percent columns are stored as fractions in
# [0, 1] -- confirm against the data sheet.
WIN_THRESHOLD = 0.5


def sample_validation_positions(n_rows, n_validation):
    """Return `n_validation` distinct row positions drawn from range(n_rows).

    Sampling is done without replacement so that exactly `n_validation` rows
    are held out and no row is scored twice.

    Raises:
        ValueError: if more positions are requested than rows exist.
    """
    if n_validation > n_rows:
        raise ValueError(
            'Cannot hold out %d rows from a table of %d rows'
            % (n_validation, n_rows))
    return np.random.choice(n_rows, size=n_validation, replace=False)


def fraction_correct(y_pred, y_true, threshold=0.5):
    """Return the fraction of pairs that fall on the same side of `threshold`.

    A pair counts as correct when the prediction and the true value are both
    above `threshold` or both not above it.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    if y_pred.shape != y_true.shape or y_pred.size == 0:
        raise ValueError('y_pred and y_true must be non-empty and equal length')
    return float(np.mean((y_pred > threshold) == (y_true > threshold)))


# The guard is true when this runs as a notebook cell or as a script, so the
# analysis still executes there; it only keeps the spreadsheet from being
# read when the helpers above are imported from elsewhere.
if __name__ == "__main__":
    # Open the file, drop rows that are missing relevant data
    df_orig = pd.read_excel('2018 data sheet.xlsx', sheet_name='Input data')
    df_orig = df_orig.dropna(subset=ins + outs)

    # Simulate interpolating the Clinton 2016 percents given several random
    # validating sets
    totcor = 0
    y_pred_all = []
    y_true_all = []
    for k in range(N_TRIALS):
        # Randomly select distinct validation rows; we will try to
        # interpolate Clinton 2016 Percent without running the interpolator
        # on these rows
        tidx = sample_validation_positions(len(df_orig), N_VALIDATION)
        trows = df_orig.iloc[tidx]
        df = df_orig.drop(df_orig.index[tidx])

        # Calculate K-means and RBF interpolations
        features = [df[rep] for rep in ins + outs]
        # kmeans expects one observation per ROW, so transpose the
        # (n_columns, n_rows) stack. The result is not used further in this
        # cell; it is kept in case later cells read `km`.
        km = scipy.cluster.vq.kmeans(
            np.array(features, dtype=float).T, N_CLUSTERS)
        # Rbf takes each coordinate as a separate argument, with the values
        # to interpolate as the last one.
        # NOTE(review): the inputs are passed unscaled; if the columns are on
        # very different scales the largest one will dominate the distance
        # used by the interpolator -- consider normalising.
        rbfi = scipy.interpolate.Rbf(*features)

        # Evaluate all held-out rows in a single call.
        held_out_inputs = [np.asarray(trows[rep], dtype=float) for rep in ins]
        y_pred = [float(p) for p in rbfi(*held_out_inputs)]
        y_true = trows[outs[0]].tolist()

        totcor += fraction_correct(y_pred, y_true, WIN_THRESHOLD)
        y_pred_all.extend(y_pred)
        y_true_all.extend(y_true)

    # This represents the expected rate at which the interpolator correctly
    # predicts the outcome of the race
    print('Fraction of Races Predicted Correctly: ' + str(totcor / N_TRIALS))
    print('Mean Squared Prediction Error: ' + str(mse(y_true_all, y_pred_all)))
    print('Mean Absolute Prediction Error: ' + str(mae(y_true_all, y_pred_all)))
    print('Stdev Prediction Error: ' + str(np.std(np.array(y_true_all) - np.array(y_pred_all))))


Fraction of Races Predicted Correctly: 0.5479999999999999
Mean Squared Prediction Error: 0.05684779322782409
Mean Absolute Prediction Error: 0.17925785565171162
Stdev Prediction Error: 0.2380419082019354
