In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as MSE
from sklearn.cross_validation import LabelKFold
from IPython.display import display
from itertools import chain, combinations
from collections import Counter

import pandas as pd
import pickle as cp
import numpy as np

## Model creation:
  * Load data
  * Extract/scale features
  * Split into test/train
  * Train Model
  * Evaluate performance

#### Settings

In [2]:
# Load the data into a single dataframe; votes + chosen census data
# Codes: AGE, IPE, RHI, LND, CRM, POP, AGN
census_codes = ['AGE', 'IPE', 'RHI', 'LND', 'CRM', 'POP', 'AGN']

In [10]:
# Load all of the voting data, filtering null vote counts or missing FIPS codes
vote_data = pd.read_csv('Combined_2012_2016_votes.csv').drop(['Unnamed: 0'], axis=1)
vote_data = vote_data.loc[vote_data["Dem"] > 0]
vote_data = vote_data.loc[vote_data["FIPS"] != 0]

# Convert vote data into labels for 2016
#full_data = full_data.loc[full_data["Year"] == 2016]
votes_2012 = vote_data.loc[vote_data["Year"] == 2012]
votes_2016 = vote_data.loc[vote_data["Year"] == 2016]
full_data = pd.DataFrame()#votes_2016[["FIPS", "State", "County"]]
full_data['Dfrac_2012'] = votes_2012['Dem'] / votes_2012['Total Votes']
full_data['Dfrac_2016'] = votes_2016['Dem'] / votes_2016['Total Votes']


#full_data = full_data.drop(['Total Votes', 'Dem', 'Rep', 'Lib', 'Grn'], axis=1)

for code in census_codes:
    curr_data = pd.read_csv('proc_census_data/{}.csv'.format(code)).drop(['Unnamed: 0'], axis=1)
    #full_data = full_data.join(curr_data.set_index("FIPS"), on=["FIPS"], how="left")

In [11]:
full_data[:20]#.head()

Unnamed: 0,Dfrac_2012,Dfrac_2016
0,0.415674,
1,0.265758,
2,0.215666,
3,0.512523,
4,0.262186,
5,0.123478,
6,0.763069,
7,0.460508,
8,0.335208,
9,0.470608,


In [None]:
def column_ratio(df, numerators, denominators, name, to_remove=None):
    num = sum([df[col] for col in numerators])
    den = sum([df[col] for col in denominators])
    df[name] = num / den
    if to_remove is None:
        to_remove = list(set(numerators + denominators))
    df.drop(to_remove, axis=1, inplace=True)

In [None]:

# Do some cross-data-type calculations
column_ratio(full_data, ['Normalized Cropland - total'], ['Normalized Land area'],
             'Agricultural Land Fraction', to_remove=['Normalized Cropland - total'])

column_ratio(full_data, ['property crimes'], ['Resident population'], 'property crime rate', to_remove=['property crimes'])

full_data['Dfrac'] = full_data['Dem'] / full_data['Total Votes']
display(full_data.head())

In [None]:
full_data[full_data['Dfrac'].isnull()]

In [None]:
full_data.columns

In [None]:
def baseline_model(df, label_col, method='avg'):
    labels = df[label_col].values
    if method == 'avg':
        predictions = np.ones(len(labels)) * np.mean(labels)
    elif method == 'random':
        predictions = np.random.uniform(low=0, high=1, size=len(labels))
    rmse = np.sqrt(MSE(labels, predictions))
    print("Baseline model via '{}': {:.5f}".format(method, rmse))

In [None]:
def evaluate_model(df, label_col, feature_cols, fold_col=None, n_folds=10):
    # Create a numpy.array dataset
    if not feature_cols:
        return
    labels = df[label_col].values
    features = df[feature_cols].values
    
    # Identify how to fold: using a column (e.g. separate by state) or random sample
    if not fold_col:
        folds = KFold(len(labels), n_folds)
    else:
        folds = LabelKFold(df[fold_col].values, n_folds)
        
    pred_row = np.zeros(len(labels))
    results = []
    for train_idxs, test_idxs in folds:
        rfr = RandomForestRegressor()
        rfr = rfr.fit(features[train_idxs], labels[train_idxs])
        predictions = rfr.predict(features[test_idxs])
        pred_row[test_idxs] = predictions
        rmse = np.sqrt(MSE(labels[test_idxs], predictions))
        results.append(rmse)
    df['predictions'] = pred_row
    feat_str = ', '.join([feature.replace('Normalized', '')[:8] for feature in feature_cols])
    print("Performance:  RMSE: {:.4f} +- {:.4f} for {}".format(np.mean(results), np.std(results), feat_str))

In [None]:
def random_sample(df, cols, size=20):
    sample = np.random.choice(len(df), size=size)
    display(df.iloc[sample][cols])

def powerset(iterable, min_ct=0, max_ct=None):
    if max_ct is None:
        max_ct = len(iterable)
    return list(map(list, chain.from_iterable(combinations(list(iterable),n) for n in range(min_ct, max_ct + 1))))

In [None]:
# Feature types | 0: base features, always used, 1: additional features, tested by adding one at a time, 2: unused features 
features = [(0, 'Asian'),
            (0, 'Black'),
            (0, 'Hispanic or Latino Origin'),
            (0, 'White'),
            (0, 'Voting Age Fraction'),
            (0, 'Normalized Median Income'),
            (0, 'Normalized Land area'),
            (0, 'Urban Fraction'),
            (0, 'Agricultural Land Fraction'),
            (2, 'Poverty Fraction'),
            (2, 'property crime rate')
           ]
base_feature_cols = [feat[1] for feat in features if feat[0] == 0]
add_feature_cols = [feat[1] for feat in features if feat[0] == 1]

# Try to predict the fraction of democratic voters
label_col = 'Dfrac'

# Fold by a column to try testing transfer predictability across values of that column (e.g. can a subset of states predict others)
fold_columns = [(0, 'State'),
                (0, 'FIPS'),
                (0, None)
                ]

fold_col = 'State'
fold_col = 'FIPS'
fold_col = None
sample_cols = [2, 3, 4] + list(range(9, len(full_data.columns)))

baseline_model(full_data, label_col)
for add_feat_set in powerset(add_feature_cols, min_ct=0, max_ct=1):
    for fold_col in [fc[1] for fc in fold_columns if fc[0] == 0]:
        print("Folding by {}".format(fold_col))
        feature_cols = base_feature_cols + add_feat_set
        evaluate_model(full_data, label_col, feature_cols, fold_col)
random_sample(full_data, sample_cols)

In [None]:
#full_data.loc[full_data['FIPS'] < 10000]

In [None]:
for item in Counter(full_data['FIPS'].values).items():
    if item[1] > 1:
        print(item)

In [None]:
list(map(list, list(powerset(range(4)))))