In [43]:
import os
import pandas as pd
import re
from collections import OrderedDict

In [2]:
pd.set_option('display.max_rows', 1000)

## Hierarchical Models

- Multiple parameters that can be regarded as related or connected by the structure of the problem itself.
- The population and thus the observed data is grouped over an index j.

In [3]:
ddir = '../data/priors'
polls_dir = os.path.join(ddir, 'gallup_polls')

In [4]:
def clean_df_cols(df):
        df.columns = clean_cols(df.columns)
        return df

def clean_cols(cs):
    pattern = re.compile('[\W]')
    return map(lambda c: pattern.sub('', c.strip(' ').replace(' ', '_').replace('__', '_').lower()), cs)

In [5]:
def get_polls(polls_dir):

    prefix = 'Gallup_Opinion_Polls_2013_2018_'
    suffix = '.csv'

    poll_csvs = filter(lambda f: f.startswith(prefix) and f.endswith(suffix), os.listdir(polls_dir))
    ref = lambda p: os.path.join(polls_dir, p)
        
    def clean_df_dict(df_dict):
        return {k: clean_df_cols(df) for k, df in df_dict.iteritems()}

    polls = clean_df_dict({f.replace(prefix, '').replace(suffix, '').lower(): pd.read_csv(ref(f)) for f in poll_csvs})
    return polls

In [50]:
def get_electoral_history(ddir):
    path = 'Previous_Elections_2002-2013.csv'
    base_df = clean_df_cols(pd.read_csv(os.path.join(ddir, path)))
    base_df = base_df[base_df['assembly'] == 'National']
    NA_by_election_year = OrderedDict(((y, df) for y, df in base_df.groupby('year')))    
    return NA_by_election_year

In [51]:
NA_by_election_year = get_electoral_history(ddir)

In [22]:
def get_candidates(ddir):
    f = 'NA_List.csv'
    df = clean_df_cols(pd.read_csv(os.path.join(ddir, f)))
    return df

In [23]:
candidates = get_candidates(ddir)

In [24]:
candidates.columns

Index([u'serial_number', u'province', u'name_of_candidate',
       u'address_of_the_contestant', u'symbol_alloted', u'party_affiliation',
       u'constituency_number_id', u'constituency_name'],
      dtype='object')

In [31]:
history.columns

Index([u'position', u'candidate', u'party', u'votes', u'_vote_share',
       u'registered_votes', u'valid_votes', u'rejected_votes', u'turnout',
       u'constituency', u'assembly', u'year'],
      dtype='object')

In [38]:
history.assembly.unique()

array(['National', 'Balochistan', 'KPK', 'Punjab', 'Sindh'], dtype=object)

In [56]:
def select_candidate_at_position(df, pos):
    pos = df[df['position'] == pos]
    relevant_features = ['constituency', 'candidate', 'party', '_vote_share', 'turnout']
    return pos[relevant_features]

In [57]:
def constituency_winners(df):
    """
    Per constituency, selects the winner and returns their
    name, party affiliation, _vote_share and turnout. 
    """
    return select_candidate_at_position(df, pos='Winner')

In [84]:
def was_close_contest(df):
    """
    Per constituency, checks whether the vote share difference
    between the top two candidates was less than 10%???
    Returns a bool for the question plus the names and
    party affiliations of the top two.
    """
    winners = constituency_winners(df)
    second_place = select_candidate_at_position(df, '2')
    #assert winners.shape[0] == second_place.shape[0], (winners.shape, second_place.shape)
    #close = winners.pop('_vote_share') - second_place.pop('_vote_share') < 10.0
    compound = pd.merge(winners, 
                        second_place, 
                        how='outer', 
                        on='constituency', suffixes=('_first', '_second'))
    
    #compound['was_close_contest'] = close
    return compound

In [97]:
def constituency_electoral_record(years, timeseries, by='party'):
    assert by in {'party', 'candidate'}
    dfs = zip(years, timeseries)
    print(years)
    df_final = reduce(lambda left, right: pd.merge(left[1],
                                                   right[1],
                                                   suffixes=(left[0], right[0]),
                                                   on='constituency'),
                      dfs)
    return df_final

In [98]:
constituency_winners_by_election_year = [(y, constituency_winners(df)) for y, df in NA_by_election_year.iteritems()]
#constituency_competitiveness = [(y, was_close_contest(df)) for y, df in NA_by_election_year.iteritems()]

In [99]:
#closeness_record_party = constituency_electoral_record(*zip(*constituency_competitiveness), by='party')
record_by_party = constituency_electoral_record(*zip(*constituency_winners_by_election_year), by='party')
record_by_candidate = constituency_electoral_record(*zip(*constituency_winners_by_election_year), by='candidate')

(2002, 2008, 2013)


KeyError: 1

In [None]:
def candidates_electoral_history(constituency_history, candidate_name):
    return