## Imports and Functions

#### The code in the file ML.py is modified from material for Rayid Ghani's Spring 2018 ML for Public Policy course.

In [1]:
from data_functions import *
from ML_continuous import *

import warnings
warnings.filterwarnings('ignore')

In [2]:
def temporal_train_test_data_split(df, features, filter_var, outcome_var,
                                   start_date=1992, training_end_date=2014,
                                   testing_end_date=2018):
    """Split ``df`` into an earlier (training) and a later (testing) window.

    Rows are selected with ``filter_df_by_date_range`` on ``filter_var``:
    the training window is ``start_date``..``training_end_date`` and the
    testing window is ``training_end_date``..``testing_end_date``.

    Parameters
    ----------
    df : DataFrame holding ``filter_var``, ``outcome_var`` and ``features``.
    features : column label(s) used to index the filtered frame for X.
    filter_var : name of the column the date filter is applied to.
    outcome_var : name of the outcome column returned as y.
    start_date, training_end_date, testing_end_date : window bounds passed
        straight to ``filter_df_by_date_range``; the defaults reproduce the
        previously hard-coded 1992 / 2014 / 2018.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` -- earlier-window features,
        later-window features, earlier-window outcome, later-window outcome.
        The positional contents are unchanged from the previous version; only
        the local names were corrected (they used to label the earlier window
        "test" and the later one "train").

    NOTE(review): ``training_end_date`` is the upper bound of one window and
    the lower bound of the other. Whether rows at exactly that value end up in
    both windows depends on how ``filter_df_by_date_range`` treats its bounds
    -- confirm against that helper.
    """
    # Outcome frame keeps the filter column so it can be windowed the same way.
    y_df = df[[filter_var, outcome_var]]

    # Helper calls are kept in the same order as before (later window first).
    x_late = filter_df_by_date_range(df, filter_var, training_end_date, testing_end_date)
    x_early = filter_df_by_date_range(df, filter_var, start_date, training_end_date)

    y_late = filter_df_by_date_range(y_df, filter_var, training_end_date, testing_end_date)
    y_early = filter_df_by_date_range(y_df, filter_var, start_date, training_end_date)

    x_train = x_early[features]
    x_test = x_late[features]

    y_train = y_early[outcome_var]
    y_test = y_late[outcome_var]

    return x_train, x_test, y_train, y_test

### Open and reformat data

In [8]:
# Load the Texas election spreadsheet into `data`.
# NOTE(review): `read` is not defined in this notebook; presumably it comes
# from the star import of data_functions -- confirm.
data = read('texas_info.xlsx')
# Last expression in the cell, so the notebook echoes the column index.
data.columns

Index(['election_year', 'candidate_name', 'vote_share', 'president',
       'governor', 'senate', 'pres_elec_year', 'current_presidents_dem',
       'unemployment', 'turnout', 'candidate_white', 'candidate_hispanic',
       'candidate_male'],
      dtype='object')

#### Add one indicator (dummy) column per election year.

In [10]:
# Build one indicator column per distinct election_year value, then attach
# those columns to the right of the original data (column-wise concat).
year_dummies = pd.get_dummies(data['election_year'])
df = pd.concat([data, year_dummies], axis=1)

In [36]:
# Relabel every column with a string name: the 13 original columns keep their
# names, and the 14 year-indicator columns become '1992', '1994', ..., '2018'.
# The assignment requires df to have exactly 27 columns.
base_columns = [
    'election_year', 'candidate_name', 'vote_share', 'president',
    'governor', 'senate', 'pres_elec_year', 'current_presidents_dem',
    'unemployment', 'turnout', 'candidate_white', 'candidate_hispanic',
    'candidate_male',
]
# Every second year from 1992 through 2018 inclusive.
year_columns = [str(year) for year in range(1992, 2019, 2)]
df.columns = base_columns + year_columns

In [53]:
# Candidate predictors for the models below: every loaded column except
# election_year, candidate_name and the outcome vote_share.
all_feats = [
    'president',
    'governor',
    'senate',
    'pres_elec_year',
    'current_presidents_dem',
    'unemployment',
    'turnout',
    'candidate_white',
    'candidate_hispanic',
    'candidate_male',
]

In [65]:
import random

# Draw a random subset of the candidate predictors for a trial model.
group_of_items = set(all_feats)  # de-duplicated pool of feature names
num_to_select = 4
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11. Sorting also gives the pool a stable order,
# so a fixed random seed reproduces the same draw.
feats = random.sample(sorted(group_of_items), num_to_select)

### Models

In [66]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [76]:
# Start the regression formula: outcome on the left of '~', then one quoted
# Q("...") term per sampled feature. Each term carries a trailing ' + ', so
# the string is intentionally left open for the next cell to finish.
formula_terms = []
for f in feats:
    formula_terms.append('Q("{}") + '.format(str(f)))
build_model = 'vote_share ~ ' + ''.join(formula_terms)

In [82]:
# Close the formula with election_year as a categorical term.
# Re-running this cell alone appends the term again, so rebuild the prefix first.
build_model += "C(election_year)"

In [88]:
# Fit ordinary least squares for the assembled formula and report the fit.
trial = smf.ols(build_model, data=df).fit()
print(trial.summary())
# A bare `trial.rsquared` in the middle of a cell is evaluated and discarded
# (only a cell's last expression is echoed), so print it explicitly.
print(trial.rsquared)
print(build_model)

                            OLS Regression Results                            
Dep. Variable:             vote_share   R-squared:                       0.834
Model:                            OLS   Adj. R-squared:                  0.255
Method:                 Least Squares   F-statistic:                     1.439
Date:                Tue, 13 Nov 2018   Prob (F-statistic):              0.392
Time:                        13:44:40   Log-Likelihood:                 48.076
No. Observations:                  19   AIC:                            -66.15
Df Residuals:                       4   BIC:                            -51.99
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [89]:
# Random search over 4-feature subsets: fit one OLS model per draw and keep
# the formula with the highest R-squared seen so far.
# NOTE(review): raw R-squared is the selection criterion; the summary output
# earlier in this notebook shows R-squared 0.834 against adjusted R-squared
# 0.255, so consider comparing trial.rsquared_adj instead.
best_model = ''
best_rsquared = 0

# Loop invariants, computed once instead of on every iteration.
group_of_items = set(all_feats)
num_to_select = 4
num_trials = 5
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11. Sorting gives a stable order under a seed.
candidate_pool = sorted(group_of_items)

for i in range(num_trials):
    feats = random.sample(candidate_pool, num_to_select)

    # 'vote_share ~ Q("a") + Q("b") + ... + C(election_year)'
    terms = ['Q("' + str(f) + '")' for f in feats] + ["C(election_year)"]
    build_model = 'vote_share ~ ' + ' + '.join(terms)

    trial = smf.ols(build_model, data=df).fit()
    # Strict '>' keeps the earliest formula when two draws tie.
    if trial.rsquared > best_rsquared:
        best_model = build_model
        best_rsquared = trial.rsquared

## This model is decent

vote_share ~ Q("president") + Q("turnout") + Q("pres_elec_year") + Q("unemployment") + Q("current_presidents_dem") + C(election_year)

In [90]:
# Echo the formula string that scored the highest R-squared in the search above.
best_model

'vote_share ~ Q("governor") + Q("candidate_white") + Q("current_presidents_dem") + Q("president") + C(election_year)'

In [91]:
# Echo the R-squared of the best formula found by the search above.
best_rsquared

0.8894968750684702