# SixFifty GE2017 Model
## Create modelling datasets
For more information please see [SixFifty.org.uk](https://sixfifty.org.uk) or the [SixFifty Hackathon repo](https://github.com/six50/hackathon).

## Import datasets and pre-flight checks

In [1]:
# Libaries
import feather
import matplotlib
import numpy as np
from pathlib import Path
import pandas as pd

# Config
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
DATA_DIR = Path('../data/')
%matplotlib inline

Running `data/retrieve_data.py` from the `hackathon` repo root will download the following datasets into the required location.

In [2]:
ge_data_dir = DATA_DIR / 'general_election' / 'electoral_commission' / 'results'
ge_2010 = pd.read_feather(ge_data_dir / 'ge_2010_results.feather')
ge_2015 = pd.read_feather(ge_data_dir / 'ge_2015_results.feather')

In [3]:
ge_2010.head(3)

Unnamed: 0,Press Association Reference,Constituency Name,Region,Election Year,Electorate,Votes,AC,AD,AGS,APNI,APP,AWL,AWP,BB,BCP,Bean,Best,BGPV,BIB,BIC,Blue,BNP,BP Elvis,C28,Cam Soc,CG,Ch M,Ch P,CIP,CITY,CNPG,Comm,Comm L,Con,Cor D,CPA,CSP,CTDP,CURE,D Lab,D Nat,DDP,DUP,ED,EIP,EPA,FAWG,FDP,FFR,Grn,GSOT,Hum,ICHC,IEAC,IFED,ILEU,Impact,Ind1,Ind2,Ind3,Ind4,Ind5,IPT,ISGB,ISQM,IUK,IVH,IZB,JAC,Joy,JP,Lab,Land,LD,Lib,Libert,LIND,LLPB,LTT,MACI,MCP,MEDI,MEP,MIF,MK,MPEA,MRLP,MRP,Nat Lib,NCDV,ND,New,NF,NFP,NICF,Nobody,NSPS,PBP,PC,Pirate,PNDP,Poet,PPBF,PPE,PPNV,Reform,Respect,Rest,RRG,RTBP,SACL,Sci,SDLP,SEP,SF,SIG,SJP,SKGP,SMA,SMRA,SNP,Soc,Soc Alt,Soc Dem,Soc Lab,South,Speaker,SSP,TF,TOC,Trust,TUSC,TUV,UCUNF,UKIP,UPS,UV,VCCA,Vote,Wessex Reg,WRP,You,Youth,YRDPL
0,1.0,Aberavon,Wales,2010.0,50838.0,30958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,558.0,0.0,0.0,0.0,0.0,0.0,1276.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4411.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,919.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16073.0,0.0,5034.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2198.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,Aberconwy,Wales,2010.0,44593.0,29966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,137.0,0.0,0.0,0.0,0.0,0.0,10734.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7336.0,0.0,5786.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5341.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,Aberdeen North,Scotland,2010.0,64808.0,37701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,635.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16746.0,0.0,7001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8385.0,0.0,0.0,0.0,0.0,0.0,0.0,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
ge_2015.head(3)

Unnamed: 0,Press Association ID Number,Constituency ID,Constituency Name,Constituency Type,County,Region ID,Region,Country,Election Year,Electorate,Valid Votes,30-50,Above,Active Dem,AD,Alliance,AP,Apni,Atom,AWP,Beer BS,Birthday,BNP,Bournemouth,Bristol,Brit Dem,Brit Ind,C,Campaign,Change,Ch M,Ch P,Christian,Class War,Comm,Comm Brit,Comm Lge,Communist,Community,Consensus,CPA,Croydon,CSA,CSP,Dem Ref,Digital,DP,DUP,Eccentric,Elmo,Eng Dem,EP,FPT,Green,Green Soc,Guildford,Hoi,Hospital,Humanity,IASI,IE,Ind,Ind2,Ind CHC,IPAP,ISWSL,IZB,JACP,JMB,Lab,Lab Co-op,LD,Lib,Lib GB,Lincs Ind,Loony,LP,LU,Magna Carta,Mainstream,Manston,Meb Ker,Nat Lib,ND,NE Party,New IC,NF,NHAP,Northern,Patria,PBP,PC,Peace,PF,Pilgrim,Pirate,Plural,Poole,PPP,PP UK,PSP,Real,Realist,Reality,Rep Soc,Respect,Restore,RFAC,Rochdale,Roman,RTP,Scottish CP,SCP,SDLP,SEP,SF,S New,SNP,Soc Dem,Soc Lab,Song,Southport,Speaker,SPGB,SSP,TEP,Thanet,TSPP,TUSC,TUV,Ubuntu,UKIP,UKPDP,U Party,Uttlesford,UUP,Vapers,VAT,Wessex Reg,Whig,Wigan,Worth,WP,WRP,WVPTFP,Yorks,Young,Zeb
0,1.0,W07000049,Aberavon,County,West Glamorgan,W92000004,Wales,Wales,2015.0,49821.0,31523.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3742.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,711.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1137.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15416.0,0.0,1397.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3663.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,352.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,134.0,0.0,0.0,4971.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,W07000058,Aberconwy,County,Clwyd,W92000004,Wales,Wales,2015.0,45525.0,30148.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12513.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,727.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8514.0,0.0,1391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3536.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3467.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,S14000001,Aberdeen North,Burgh,Scotland,S92000003,Scotland,Scotland,2015.0,67745.0,43936.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5304.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11397.0,2050.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,186.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24793.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Before we get into the modelling, there's a bit of cleanup to do and a few checks to run:
- [Some MPs](https://en.wikipedia.org/wiki/Labour_Co-operative) are members of _both_ the [Labour Party](www.labour.org.uk) and the [Co-operative Party](http://www.party.coop/), which plays havoc with modelling. We will therefore consider them all members of the Labour party.
- Check that there are all constituencies in the 2010 data are also in 2015 data, and vice versa.
- Create `country_lookup`, a dictionary that returns the country of any constituency given its PANO (Press Association ID Number).

In [5]:
parties_lookup = {
    'Con': 'con',
    'Lab': 'lab',
    'LD': 'ld',
    'UKIP': 'ukip',
    'Grn': 'grn',
    'Other': 'other'
}
parties = list(parties_lookup.values())

In [6]:
parties_lookup_2015 = {
    'C': 'con',
    'Lab': 'lab',
    'LD': 'ld',
    'UKIP': 'ukip',
    'Green': 'grn',
    'SNP': 'snp',
    'PC': 'pc',
    'Other': 'other'
}
parties_17 = list(parties_lookup_2015.values())

In [7]:
# Merge Labour and Coop
ge_2015['Lab'] = ge_2015['Lab'] + ge_2015['Lab Co-op']
del ge_2015['Lab Co-op']

In [8]:
# Check constituencies are mergeable
print(set(ge_2010['Press Association Reference']).difference(set(ge_2015['Press Association ID Number'])))  # should be empty set
print(set(ge_2015['Press Association ID Number']).difference(set(ge_2010['Press Association Reference'])))  # should be empty set
print(len(ge_2010), len(ge_2010['Press Association Reference']))  # should both be 650
print(len(ge_2015), len(ge_2015['Press Association ID Number']))  # should both be 650

set()
set()
650 650
650 650


In [9]:
# Make PANO -> geo lookup
geo_lookup = [(x[1][0], x[1][1]) for x in ge_2015[['Press Association ID Number', 'Country']].iterrows()]
geo_lookup = dict(geo_lookup)
print(geo_lookup[14.0])  # should be "Northern Ireland"
# Add London boroughs
london_panos = ge_2015[ge_2015['County'] == 'London']['Press Association ID Number'].values
for pano in london_panos:
    geo_lookup[pano] = 'London'
print(geo_lookup[237.0])  # should be "London"
# Rename other England
for k in geo_lookup:
    if geo_lookup[k] == 'England':
        geo_lookup[k] = 'England_not_london'
    elif geo_lookup[k] == 'Northern Ireland':
        geo_lookup[k] = 'NI'

Northern Ireland
London


## 2015 polling

In [10]:
polls_data_dir = DATA_DIR / 'polls'

In [11]:
# Latest polling data (3 days before election, i.e. if election on 7th May 2015, polls as of 4th May)
polls = pd.read_feather(polls_data_dir / 'polls.feather')
polls.head()

Unnamed: 0,company,client,method,from,to,sample_size,con,lab,ld,ukip,grn,snp,pdf
0,Ipsos MORI,FT,,NaT,2005-05-19,,0.3,0.37,0.26,,,,
1,Ipsos MORI,First Tuesday,,NaT,2005-05-19,,0.27,0.4,0.26,,,,
2,YouGov,Telegraph,,NaT,2005-05-24,,0.31,0.38,0.23,,,,
3,Ipsos MORI,Social Research Institute,,NaT,2005-06-16,,0.29,0.42,0.21,,,,
4,ICM,Guardian,,NaT,2005-06-17,,0.31,0.38,0.23,,,,


In [12]:
pollsters = polls[(polls.to >= '2015-04-04') & (polls.to <= '2015-05-04')].company.unique()
pollsters

array(['YouGov', 'Populus', 'Kantar TNS', 'ComRes', 'Survation',
       'Opinium', 'Panelbase', 'ICM', 'Ashcroft', 'Ipsos MORI', 'BMG'],
      dtype=object)

In [13]:
# Use single last poll from each pollster in final week of polling then average out
polls = polls[(polls.to >= '2015-04-01') & (polls.to <= '2015-05-07')]
pop = polls.loc[:0]
for p in pollsters:
    pop = pop.append(polls[polls.company == p].tail(1))
pop

Unnamed: 0,company,client,method,from,to,sample_size,con,lab,ld,ukip,grn,snp,pdf
2691,YouGov,Sun,,NaT,2015-05-06,,0.34,0.34,0.1,0.12,0.04,,
2680,Populus,FT,,NaT,2015-05-04,,0.34,0.34,0.1,0.13,0.05,,
2678,Kantar TNS,,,NaT,2015-05-04,,0.33,0.32,0.08,0.14,0.06,,
2690,ComRes,Daily Mail/ITV News,,NaT,2015-05-06,,0.35,0.34,0.09,0.12,0.04,,
2689,Survation,Mirror,,NaT,2015-05-06,,0.33,0.33,0.09,0.16,0.04,,
2693,Opinium,,,NaT,2015-05-06,,0.35,0.34,0.08,0.12,0.06,,
2692,Panelbase,,,NaT,2015-05-06,,0.31,0.33,0.08,0.16,0.05,,
2686,ICM,Guardian,,NaT,2015-05-06,,0.34,0.35,0.09,0.11,0.04,,
2688,Ashcroft,,,NaT,2015-05-06,,0.33,0.33,0.1,0.11,0.06,,
2687,Ipsos MORI,Evening Standard,,NaT,2015-05-06,,0.36,0.35,0.08,0.11,0.05,,


In [14]:
# Create new polls dictionary by geo containing simple average across all pollsters
polls = {'UK': {}}
for p in ['con', 'lab', 'ld', 'ukip', 'grn']:
    polls['UK'][p] = pop[p].mean()
polls['UK'] = pd.Series(polls['UK'])
polls['UK']

con     0.338182
grn     0.048182
lab     0.337273
ld      0.090000
ukip    0.127273
dtype: float64

In [15]:
# Scotland, Wales, NI, London not available in 2015 data (we haven't extracted them yet!)
# Add Other
for geo in ['UK']:
    if 'other' not in polls[geo]:
        polls[geo]['other'] = 1 - sum(polls[geo])

In [16]:
# Reweight to 100%
for geo in ['UK']:
    polls[geo] = polls[geo] / polls[geo].sum()
polls

{'UK': con      0.338182
 grn      0.048182
 lab      0.337273
 ld       0.090000
 ukip     0.127273
 other    0.059091
 dtype: float64}

## 2017 polling

In [17]:
# Latest polling data
polls_17 = {'UK': {}}
polls_17_uk = pd.read_feather(polls_data_dir / 'polls.feather')
# Filter to recent data
polls_17_uk = polls_17_uk[polls_17_uk.to >= '2017-06-06']
# Add parties
for p in ['con', 'lab', 'ld', 'ukip', 'grn', 'snp']:
    polls_17['UK'][p] = (polls_17_uk.sample_size * polls_17_uk[p]).sum() / polls_17_uk.sample_size.sum()
polls_17['UK'] = pd.Series(polls_17['UK'], index=['con', 'lab', 'ld', 'ukip', 'snp', 'grn'])
polls_17

{'UK': con     0.426130
 lab     0.371942
 ld      0.070578
 ukip    0.041625
 snp     0.038982
 grn     0.019173
 dtype: float64}

In [18]:
# Repeat for Scotland polling...
polls_17['Scotland'] = {}
polls_17_tmp = pd.read_feather(polls_data_dir / 'polls_scotland.feather')
polls_17_tmp = polls_17_tmp[polls_17_tmp.to >= '2017-06-05']
for p in ['con', 'lab', 'ld', 'ukip', 'snp', 'grn']:
    polls_17['Scotland'][p] = (polls_17_tmp.sample_size * polls_17_tmp[p]).sum() / polls_17_tmp.sample_size.sum()
polls_17['Scotland'] = pd.Series(polls_17['Scotland'], index=['con', 'lab', 'ld', 'ukip', 'snp', 'grn'])
polls_17['Scotland']

con     0.271401
lab     0.236513
ld      0.053923
ukip    0.003895
snp     0.418192
grn     0.010513
dtype: float64

In [19]:
# ...and Wales
polls_17['Wales'] = {}
polls_17_tmp = pd.read_feather(polls_data_dir / 'polls_wales.feather')
polls_17_tmp = polls_17_tmp[polls_17_tmp.to >= '2017-06-07']
for p in ['con', 'lab', 'ld', 'ukip', 'pc', 'grn']:
    polls_17['Wales'][p] = (polls_17_tmp.sample_size * polls_17_tmp[p]).sum() / polls_17_tmp.sample_size.sum()
polls_17['Wales'] = pd.Series(polls_17['Wales'], index=['con', 'lab', 'ld', 'ukip', 'pc', 'grn'])
polls_17['Wales']

con     0.314166
lab     0.430377
ld      0.044139
ukip    0.045792
pc      0.148997
grn     0.000783
dtype: float64

In [20]:
# NI
polls_17['NI'] = (pd.read_feather(polls_data_dir / 'polls_ni_smoothed.feather')
                    .sort_values(by='date', ascending=False).iloc[0])
del polls_17['NI']['date']

# Collate all NI parties under other
for k in polls_17['NI'].index:
    if k not in parties_17:
        del polls_17['NI'][k]

del polls_17['NI']['other']
polls_17['NI']

grn     0.006
ukip    0.001
con     0.002
Name: 5, dtype: object

In [21]:
# London
polls_17['London'] = {}
polls_17_tmp = pd.read_feather(polls_data_dir / 'polls_london.feather')
polls_17_tmp = polls_17_tmp[polls_17_tmp.to >= '2017-05-31']
for p in ['con', 'lab', 'ld', 'ukip', 'grn']:
    polls_17['London'][p] = (polls_17_tmp.sample_size * polls_17_tmp[p]).sum() / polls_17_tmp.sample_size.sum()
polls_17['London'] = pd.Series(polls_17['London'], index=['con', 'lab', 'ld', 'ukip', 'grn'])
polls_17['London']

con     0.376424
lab     0.458455
ld      0.104133
ukip    0.031145
grn     0.021655
dtype: float64

In [22]:
# Estimate polling for England excluding London
survation_wts = {
    # from http://survation.com/wp-content/uploads/2017/06/Final-MoS-Post-BBC-Event-Poll-020617SWCH-1c0d4h9.pdf
    'Scotland': 85,
    'England': 881,
    'Wales': 67,
    'London': 137,
    'NI': 16
}
survation_wts['England_not_london'] = survation_wts['England'] - survation_wts['London']
survation_wts['UK'] = survation_wts['Scotland'] + survation_wts['England'] + survation_wts['Wales'] + survation_wts['NI']

def calculate_england_not_london(party):
    out = polls_17['UK'][party] * survation_wts['UK']
    for geo in ['Scotland', 'Wales', 'NI', 'London']:
        if party in polls_17[geo]:
            out = out - polls_17[geo][party] * survation_wts[geo]
    out = out / survation_wts['England_not_london']
    return out

polls_17['England_not_london'] = {'pc': 0, 'snp': 0}
for party in ['con', 'lab', 'ld', 'ukip', 'grn']:
    polls_17['England_not_london'][party] = calculate_england_not_london(party)

polls_17['England_not_london'] = pd.Series(polls_17['England_not_london'])
polls_17['England_not_london']

con     0.472164
grn     0.021644
lab     0.374221
ld      0.070201
pc      0.000000
snp     0.000000
ukip    0.048364
dtype: float64

In [23]:
# Fill in the gaps
for geo in ['UK', 'Scotland', 'Wales', 'NI', 'London', 'England_not_london']:
    for party in ['con', 'lab', 'ld', 'ukip', 'grn', 'snp', 'pc']:
        if party not in polls_17[geo]:
            print("Adding {} to {}".format(party, geo))
            polls_17[geo][party] = 0

Adding pc to UK
Adding pc to Scotland
Adding snp to Wales
Adding lab to NI
Adding ld to NI
Adding snp to NI
Adding pc to NI
Adding snp to London
Adding pc to London


In [24]:
# Fix PC (Plaid Cymru) for UK
polls_17['UK']['pc'] = polls_17['Wales']['pc'] * survation_wts['Wales'] / survation_wts['UK']

In [25]:
# Add Other
for geo in ['UK', 'Scotland', 'Wales', 'NI', 'London', 'England_not_london']:
    if 'other' not in polls_17[geo]:
        polls_17[geo]['other'] = 1 - sum(polls_17[geo])

# This doesn't work for UK or England_not_london; set current other polling to match 2015 result
polls_17['UK']['other'] = 0.03 # ge.other.sum() / ge['Valid Votes'].sum()
polls_17['England_not_london']['other'] = 0.01 # ge[ge.geo == 'England_not_london'].other.sum() / ge[ge.geo == 'England_not_london']['Valid Votes'].sum()

In [26]:
# Reweight to 100%
for geo in ['UK', 'Scotland', 'Wales', 'NI', 'London', 'England_not_london']:
    polls_17[geo] = polls_17[geo] / polls_17[geo].sum()

In [27]:
# Let's take a look!
polls_17

{'UK': con      0.422770
 lab      0.369010
 ld       0.070021
 ukip     0.041297
 snp      0.038675
 grn      0.019022
 pc       0.009441
 other    0.029763
 dtype: float64, 'Scotland': con      0.271401
 lab      0.236513
 ld       0.053923
 ukip     0.003895
 snp      0.418192
 grn      0.010513
 pc       0.000000
 other    0.005563
 dtype: float64, 'Wales': con      0.314166
 lab      0.430377
 ld       0.044139
 ukip     0.045792
 pc       0.148997
 grn      0.000783
 snp      0.000000
 other    0.015746
 dtype: float64, 'NI': grn      0.006
 ukip     0.001
 con      0.002
 lab          0
 ld           0
 snp          0
 pc           0
 other    0.991
 Name: 5, dtype: object, 'London': con      0.376424
 lab      0.458455
 ld       0.104133
 ukip     0.031145
 grn      0.021655
 snp      0.000000
 pc       0.000000
 other    0.008189
 dtype: float64, 'England_not_london': con      0.473778
 grn      0.021718
 lab      0.375500
 ld       0.070441
 pc       0.000000
 snp      0.0000

## Export polling data

In [28]:
polls_15_csv = pd.DataFrame(columns=['con', 'lab', 'ld', 'ukip', 'grn', 'snp', 'pc', 'other'])
for geo in polls:
    for party in polls[geo].index:
        polls_15_csv.loc[geo, party] = polls[geo].loc[party]
polls_15_csv.to_csv(polls_data_dir / 'final_polls_2015.csv', index=True)
polls_15_csv

Unnamed: 0,con,lab,ld,ukip,grn,snp,pc,other
UK,0.338182,0.337273,0.09,0.127273,0.0481818,,,0.0590909


In [29]:
polls_17_csv = pd.DataFrame(columns=['con', 'lab', 'ld', 'ukip', 'grn', 'snp', 'pc', 'other'])
for geo in polls_17:
    for party in polls_17[geo].index:
        polls_17_csv.loc[geo, party] = polls_17[geo].loc[party]
polls_17_csv.to_csv(polls_data_dir / 'final_polls_2017.csv', index=True)
polls_17_csv

Unnamed: 0,con,lab,ld,ukip,grn,snp,pc,other
UK,0.42277,0.36901,0.0700214,0.041297,0.0190216,0.0386751,0.00944149,0.0297635
Scotland,0.271401,0.236513,0.0539225,0.00389513,0.0105126,0.418192,0.0,0.0055632
Wales,0.314166,0.430377,0.0441389,0.0457916,0.000782789,0.0,0.148997,0.015746
NI,0.002,0.0,0.0,0.001,0.006,0.0,0.0,0.991
London,0.376424,0.458455,0.104133,0.0311447,0.0216548,0.0,0.0,0.00818921
England_not_london,0.473778,0.3755,0.0704406,0.0485293,0.0217184,0.0,0.0,0.0100342


## Reduce ge_2010 dataframe to above results only

In [30]:
# GE 2010 dataset has a lot of parties...
ge_2010.head()

Unnamed: 0,Press Association Reference,Constituency Name,Region,Election Year,Electorate,Votes,AC,AD,AGS,APNI,APP,AWL,AWP,BB,BCP,Bean,Best,BGPV,BIB,BIC,Blue,BNP,BP Elvis,C28,Cam Soc,CG,Ch M,Ch P,CIP,CITY,CNPG,Comm,Comm L,Con,Cor D,CPA,CSP,CTDP,CURE,D Lab,D Nat,DDP,DUP,ED,EIP,EPA,FAWG,FDP,FFR,Grn,GSOT,Hum,ICHC,IEAC,IFED,ILEU,Impact,Ind1,Ind2,Ind3,Ind4,Ind5,IPT,ISGB,ISQM,IUK,IVH,IZB,JAC,Joy,JP,Lab,Land,LD,Lib,Libert,LIND,LLPB,LTT,MACI,MCP,MEDI,MEP,MIF,MK,MPEA,MRLP,MRP,Nat Lib,NCDV,ND,New,NF,NFP,NICF,Nobody,NSPS,PBP,PC,Pirate,PNDP,Poet,PPBF,PPE,PPNV,Reform,Respect,Rest,RRG,RTBP,SACL,Sci,SDLP,SEP,SF,SIG,SJP,SKGP,SMA,SMRA,SNP,Soc,Soc Alt,Soc Dem,Soc Lab,South,Speaker,SSP,TF,TOC,Trust,TUSC,TUV,UCUNF,UKIP,UPS,UV,VCCA,Vote,Wessex Reg,WRP,You,Youth,YRDPL
0,1.0,Aberavon,Wales,2010.0,50838.0,30958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,558.0,0.0,0.0,0.0,0.0,0.0,1276.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4411.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,919.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16073.0,0.0,5034.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2198.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.0,Aberconwy,Wales,2010.0,44593.0,29966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,137.0,0.0,0.0,0.0,0.0,0.0,10734.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7336.0,0.0,5786.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5341.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,632.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,Aberdeen North,Scotland,2010.0,64808.0,37701,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,635.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16746.0,0.0,7001.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8385.0,0.0,0.0,0.0,0.0,0.0,0.0,268.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.0,Aberdeen South,Scotland,2010.0,64031.0,43034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,529.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8914.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,413.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15722.0,0.0,12216.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5102.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,Aberdeenshire West & Kincardine,Scotland,2010.0,66110.0,45195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,513.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13678.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6159.0,0.0,17362.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7086.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,397.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Top 15 parties
ge_2010.iloc[:, 11:].sum().sort_values(ascending=False).head(15)

Con        10703654.0
Lab         8606517.0
LD          6836248.0
UKIP         919471.0
BNP          564321.0
SNP          491386.0
Grn          285612.0
Ind1         175604.0
SF           171942.0
DUP          168216.0
PC           165394.0
SDLP         110970.0
UCUNF        102361.0
ED            64826.0
Respect       33251.0
dtype: float64

In [32]:
# Define other parties
other_parties = list(set(ge_2010.columns) - set(ge_2010.columns[:6]) - set(parties_lookup.keys()))
other_parties_2015 = list(set(ge_2015.columns) - set(ge_2015.columns[:11]) - set(parties_lookup_2015.keys()))

ge_2010['Other'] = ge_2010.loc[:, other_parties].sum(axis=1)
ge_2015['Other'] = ge_2015.loc[:, other_parties_2015].sum(axis=1)

In [33]:
# Filter ge to metadata cols + parties of interest
ge = ge_2010.loc[:, list(ge_2010.columns[:6]) + list(parties_lookup.keys())]
ge15 = ge_2015.loc[:, list(ge_2015.columns[:11]) + list(parties_lookup_2015.keys())]

# Rename parties
ge.columns = [parties_lookup[x] if x in parties_lookup else x for x in ge.columns]
ge15.columns = [parties_lookup_2015[x] if x in parties_lookup_2015 else x for x in ge15.columns]

# Calculate vote share
for party in parties:
    ge[party + '_pc'] = ge[party] / ge['Votes']

for party in parties_17:
    ge15[party + '_pc'] = ge15[party] / ge15['Valid Votes']

ge.head(3)

Unnamed: 0,Press Association Reference,Constituency Name,Region,Election Year,Electorate,Votes,con,lab,ld,ukip,grn,other,con_pc,lab_pc,ld_pc,ukip_pc,grn_pc,other_pc
0,1.0,Aberavon,Wales,2010.0,50838.0,30958,4411.0,16073.0,5034.0,489.0,0.0,4951.0,0.142483,0.519187,0.162607,0.015796,0.0,0.159926
1,2.0,Aberconwy,Wales,2010.0,44593.0,29966,10734.0,7336.0,5786.0,632.0,0.0,5478.0,0.358206,0.244811,0.193085,0.021091,0.0,0.182807
2,3.0,Aberdeen North,Scotland,2010.0,64808.0,37701,4666.0,16746.0,7001.0,0.0,0.0,9288.0,0.123763,0.444179,0.185698,0.0,0.0,0.24636


In [34]:
# Export to disk
ge.to_csv(DATA_DIR / 'model' / 'ge10.csv', index=False)
ge.to_feather(DATA_DIR / 'model' / 'ge10.feather')

ge15.to_csv(DATA_DIR / 'model' / 'ge15.csv', index=False)
ge15.to_feather(DATA_DIR / 'model' / 'ge15.feather')