## Imports and Functions

#### The code in the file ML.py is modified from material for Rayid Ghani's Spring 2018 ML for Public Policy course.

In [1]:
from data_functions import *
from ML_continuous import *

import warnings
warnings.filterwarnings('ignore')

import statsmodels.api as sm
import statsmodels.formula.api as smf

import random

In [2]:
def temporal_train_test_data_split(df, features, filter_var, outcome_var):
    start_date = 1992
    training_end_date = 2014
    testing_end_date = 2018

    y_df = df[[filter_var, outcome_var]]

    x_train_filter_by_date = filter_df_by_date_range(df, filter_var, training_end_date, testing_end_date)
    x_test_filter_by_date = filter_df_by_date_range(df, filter_var, start_date, training_end_date)

    y_train_filter_by_date = filter_df_by_date_range(y_df, filter_var, training_end_date, testing_end_date)
    y_test_filter_by_date = filter_df_by_date_range(y_df, filter_var, start_date, training_end_date)

    x_train = x_train_filter_by_date[features]
    x_test = x_test_filter_by_date[features]

    y_train = y_train_filter_by_date[outcome_var]
    y_test = y_test_filter_by_date[outcome_var]

    return x_test, x_train, y_test, y_train

### Open and reformat data

In [3]:
data = read('texas_info.xlsx')
data.columns

Index(['election_year', 'candidate_name', 'vote_share', 'president',
       'governor', 'senate', 'pres_elec_year', 'current_presidents_dem',
       'unemployment', 'turnout', 'candidate_white', 'candidate_hispanic',
       'candidate_male'],
      dtype='object')

In [4]:
data.iloc[11]['candidate_hispanic'] = 1
# make tony sanchez hispanic

In [5]:
data

Unnamed: 0,election_year,candidate_name,vote_share,president,governor,senate,pres_elec_year,current_presidents_dem,unemployment,turnout,candidate_white,candidate_hispanic,candidate_male
0,2018,Lupe Valdez,0.4234,0,1,0,0,0,4.0,0.53,0,1,0
1,2018,Beto O'Rourke,0.483,0,0,1,0,0,4.0,0.53,1,0,1
2,2016,Hillary Clinton,0.4324,1,0,0,1,1,4.6,0.4645,1,0,0
3,2014,Wendy Davis,0.389,0,1,0,0,1,5.1,0.2499,1,0,0
4,2012,Barack Obama,0.4138,1,0,0,1,1,6.7,0.4373,0,0,1
5,2012,Paul Sadler,0.4062,0,0,1,1,1,6.7,0.4373,1,0,1
6,2010,Bill White,0.423,0,1,0,0,1,8.1,0.27,1,0,1
7,2008,Barack Obma,0.4366,1,0,0,1,0,4.8,0.4555,1,0,1
8,2006,Chris Bell,0.2979,0,1,0,0,0,4.9,0.2644,1,0,1
9,2006,Barbara Ann Radnofsky,0.3605,0,0,1,0,0,4.9,0.2644,1,0,0


#### Add columns that could be useful.

In [6]:
df = pd.concat([data, pd.get_dummies(data['election_year'])], axis=1)

In [7]:
df.columns = [ 'election_year','candidate_name','vote_share', 'president','governor','senate','pres_elec_year', 
              'current_presidents_dem','unemployment',  'turnout','candidate_white',     'candidate_hispanic',
               'candidate_male','1992','1994',                     '1996',
                           '1998',                     '2000',
                           '2002',                     '2004',
                           '2006',                     '2008',
                           '2010',                     '2012',
                           '2014',                     '2016',
                           '2018']

In [8]:
all_feats = ['pres_elec_year', 'current_presidents_dem',
                 'unemployment',                'turnout',
              'candidate_white',     'candidate_hispanic',
               'candidate_male']

### Models

#### Model with all factors

In [9]:
offices = ['president', 'governor','senate',]
    
build_model = 'vote_share ~ ' + 'Q("' + str(random.choice(offices)) + '")' + ' + '

for f in all_feats:
    build_model = build_model + 'Q("' + str(f) + '")' + ' + '
    
build_model = build_model + "C(election_year)"

In [14]:
build_model

'vote_share ~ Q("president") + Q("current_presidents_dem") + Q("candidate_hispanic") + Q("candidate_male") + Q("candidate_white") + C(election_year)'

In [10]:
trial = smf.ols(build_model, data=df).fit()
print(trial.summary())

                            OLS Regression Results                            
Dep. Variable:             vote_share   R-squared:                       0.980
Model:                            OLS   Adj. R-squared:                  0.646
Method:                 Least Squares   F-statistic:                     2.936
Date:                Tue, 13 Nov 2018   Prob (F-statistic):              0.433
Time:                        15:35:45   Log-Likelihood:                 68.330
No. Observations:                  19   AIC:                            -100.7
Df Residuals:                       1   BIC:                            -83.66
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

#### Testing 4 factor models 

In [11]:
best_name = ''
best_model = ''
best_rsquared = 0

for i in range(5):
    offices = ['president', 'governor','senate',]
    
    group_of_items = set(all_feats)             
    num_to_select = 4                          
    feats = random.sample(group_of_items, num_to_select)
    
    build_model = 'vote_share ~ ' + 'Q("' + str(random.choice(offices)) + '")' + ' + '
    for f in feats:
        build_model = build_model + 'Q("' + str(f) + '")' + ' + '
    build_model = build_model + "C(election_year)"
    
    trial = smf.ols(build_model, data=df).fit()
    if trial.rsquared > best_rsquared:
        best_model = trial
        best_name = build_model
        best_rsquared = trial.rsquared_adj

In [12]:
best_name

'vote_share ~ Q("president") + Q("current_presidents_dem") + Q("candidate_hispanic") + Q("candidate_male") + Q("candidate_white") + C(election_year)'

In [13]:
print(best_model.summary())

                            OLS Regression Results                            
Dep. Variable:             vote_share   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.986
Method:                 Least Squares   F-statistic:                     77.06
Date:                Tue, 13 Nov 2018   Prob (F-statistic):             0.0894
Time:                        15:35:45   Log-Likelihood:                 99.191
No. Observations:                  19   AIC:                            -162.4
Df Residuals:                       1   BIC:                            -145.4
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                                  coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept         

In [18]:
pd.read_excel('hstpov9.xls')

Unnamed: 0,Table with row headers in column A and column headers in rows 4 to 6.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,"Table 9. Poverty of People, by Region: 1959 to...",,,,,,,,,,,,,,,
1,NOTE: Numbers in thousands. People as of March...,,,,,,,,,,,,,,,
2,Year,Total,,,Northeast,,,Midwest,,,South,,,West,,
3,,Total,Below poverty,,Total,Below poverty,,Total,Below poverty,,Total,Below poverty,,Total,Below poverty,
4,,,Number,Percent,,Number,Percent,,Number,Percent,,Number,Percent,,Number,Percent
5,2017,322549,39698,12.3,55972,6373,11.4,67345,7647,11.4,122250,16609,13.6,76982,9069,11.8
6,2016,319911,40616,12.7,55470,5969,10.8,66897,7809,11.7,121166,17028,14.1,76377,9810,12.8
7,2015,318454,43123,13.5,55779,6891,12.4,67030,7849,11.7,119955,18305,15.3,75690,10079,13.3
8,2014,315804,46657,14.8,55725,7020,12.6,67130,8714,13,118193,19531,16.5,74756,11391,15.2
9,2013 (19),313096,46269,14.8,55529,7205,13,66732,9269,13.9,116956,19040,16.3,73879,10754,14.6
