First, we compute the popularity of policy issues based on recent manifestos.
We build a policy-category classifier (8 classes) trained on all manifestos written in English.
Then we build a popularity index from the target country's recent election manifestos. For each petition, we classify the policy category of every sentence and aggregate the popularity scores at the petition level.

In [None]:
from bs4 import BeautifulSoup
import re

# Clean the manifestos obtained from the Manifesto Project: https://manifesto-project.wzb.eu

def clean_manifesto(text, label):
    """Normalise manifesto sentences and reduce CMP codes to policy domains.

    Args:
        text: Iterable of sentences (e.g. the ``text`` column of a manifesto
            CSV). Entries that are not strings, such as the float NaN pandas
            uses for missing cells, are skipped.
        label: Iterable of CMP codes aligned with ``text``. Numbers and
            numeric strings are accepted.

    Returns:
        A ``(sentence, cmpcode)`` pair of equally long lists. ``sentence``
        holds the lower-cased, whitespace-normalised texts that are longer
        than three characters. ``cmpcode`` holds, for each kept sentence,
        the code divided by 100 and truncated (``305 -> 3``), or 0 when the
        code is missing or not numeric (``'NA'``, NaN, ...).
    """
    sentence = []
    cmpcode = []
    # Pair the two columns positionally so plain lists and pandas Series
    # (whatever their index) are handled alike.
    for raw_text, raw_label in zip(text, label):
        try:
            words = raw_text.lower().split()
        except AttributeError:
            # Not a string, e.g. NaN standing in for a missing cell.
            continue
        if len(raw_text) <= 3:
            continue
        sentence.append(" ".join(words))

        try:
            code = float(raw_label)
        except (TypeError, ValueError):
            # 'NA', None or any other non-numeric marker.
            code = np.nan
        cmpcode.append(int(code / 100) if np.isfinite(code) else 0)
    return sentence, cmpcode

import os
import pandas as pd
import numpy as np

# Training data for the policy-category classifier, gathered from every CSV
# in the English manifesto directory.
sentences = []    # cleaned sentences
cmplabelsc = []   # policy-domain label of each sentence, aligned with `sentences`

for i in os.listdir('EnglishManifestos/'):
    # os.listdir() returns bare file names, so the directory must be
    # prepended before the file can be opened.
    x = pd.read_csv(os.path.join('EnglishManifestos/', i), header=0)
    sentence, cmpcode = clean_manifesto(x.text, x.cmp_code)
    sentences.extend(sentence)
    cmplabelsc.extend(cmpcode)

from sklearn.feature_extraction.text import CountVectorizer
# Unigram bag-of-words counts, limited to the 8000 most frequent terms.
tf_vectorizer = CountVectorizer(max_features = 8000, ngram_range=(1,1), stop_words='english')
sent_vec = tf_vectorizer.fit_transform(np.array(sentences))
# Dense copy of the sparse count matrix, as fed to the Keras model.
sent_vect = np.array(sent_vec.toarray())
# Labels as a column vector, one row per sentence.
labelsc = np.array(cmplabelsc).reshape((len(cmplabelsc),1))

In [None]:
# Popularity score for each policy category 0 to 7
# Based on recent manifestos of the country

# Policy-domain label of every sentence in the recent manifestos.
popularitysc = []
for i in os.listdir('RecentManifestos/'):
    # os.listdir() returns bare file names, so the directory must be
    # prepended before the file can be opened.
    x = pd.read_csv(os.path.join('RecentManifestos/', i), header=0)
    sentence, cmpcode = clean_manifesto(x.text, x.cmp_code)
    popularitysc.extend(cmpcode)

import collections
c = collections.Counter(popularitysc)

# popularity[k] is the share of sentences labelled with category k.
# The list is built by category id (0-7) rather than in Counter iteration
# order, so the position always equals the category and a category that
# never occurs gets 0.0 instead of shifting the later entries.
popularity = []
for k in range(8):
    if popularitysc:
        popularity.append((c[k]*1.0)/len(popularitysc))
    else:
        # No recent manifesto sentences at all: every share is zero.
        popularity.append(0.0)

def popularitys(indx):
    """Return the popularity share of policy category ``indx`` (0-7)."""
    return popularity[indx]

In [None]:
#NN model for classifying sentences into one of 8 policy categories (0-7)

import keras
from keras.models import Sequential 
from keras.layers import Dense, Activation, Dropout, Input
from keras.models import Model

# One hidden ReLU layer over the bag-of-words input, softmax over 8 classes.
# The input width is read from the data because max_features=8000 is only an
# upper bound on the vocabulary size.
modelc = Sequential() 
modelc.add(Dense(300, input_dim=sent_vect.shape[1], activation='relu')) 
modelc.add(Dense(8, activation='softmax', name='main_output'))

batch_size = 128 
nb_epoch = 3

Y = [0,1,2,3,4,5,6,7]

from keras.utils.np_utils import to_categorical
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(Y)

# LabelEncoder works on 1-D label arrays; labelsc is a column vector.
encoded_Y = encoder.transform(labelsc.ravel())
sentlabelsc = np_utils.to_categorical(encoded_Y, num_classes=8)

modelc.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['acc']) 

# Keras 2 (already required by `num_classes=` above) names this argument
# `epochs`; `nb_epoch` is the Keras 1 spelling.
modelc.fit(sent_vect, sentlabelsc, batch_size=batch_size, epochs=nb_epoch, verbose=1) 


In [None]:
#Compute sentence level policy area, and aggregate popularity based on recent election manifestos of the target country
import pandas as pd
petitions = pd.read_csv("WTP_dataset_1K.csv", header=0)
petitions['signs'] = petitions['signs'].astype('int')
petitions['fulltext'] = petitions['title'].astype('str')+' '+petitions['content'].astype('str')


from nltk import sent_tokenize
# cdata[p] is the summed popularity of the policy categories predicted for
# the sentences of petition p.
cdata = []
for i in petitions['fulltext']:
    # Only byte strings (Python 2 `str`) need decoding; Python 3 text has no
    # .decode() method.
    if isinstance(i, bytes):
        i = i.decode('utf-8','ignore')
    sents = sent_tokenize(i)
    count = 0
    if sents:  # a petition without any sentence contributes 0
        x = tf_vectorizer.transform(sents)
        lab = modelc.predict(x.toarray(), verbose=0)
        # Most probable category per sentence; category 0 is not counted.
        for k in np.argmax(lab, axis=1):
            if k > 0:
                count += popularitys(k)
    cdata.append(count)

Here we compute the political bias and the left-right scale. We build a 3-class neural-network classifier for sentences using manifesto text. The mapping of manifesto codes to classes is taken from Volkens et al. (2013). For each petition, every sentence is classified into one of the 3 classes, and the political bias and left-right scale are computed by aggregating the predictions at the petition level.

In [None]:
# Political bias and Left-right score 
# Here we do a similar manifesto cleaning, except we map classes to left/right/neutral classes

from bs4 import BeautifulSoup
import re

def getleftright(labels):
    """Translate CMP codes into left/right/neutral class ids.

    Args:
        labels: Iterable of CMP codes.

    Returns:
        A list with one entry per input code: 1 if the code is in the left
        group, 2 if it is in the right group, 0 otherwise.
    """
    left_codes = (103, 105, 106, 107, 202, 403, 404, 406, 412, 413, 504, 506, 701)
    right_codes = (104, 201, 203, 305, 401, 402, 407, 414, 505, 601, 603, 605, 606)

    def side(code):
        # The left group is checked first, exactly as an if/elif chain would.
        if code in left_codes:
            return 1
        return 2 if code in right_codes else 0

    return [side(code) for code in labels]

def clean_manifesto(text, label):
    """Normalise manifesto sentences and map CMP codes to left/right/neutral.

    Args:
        text: Iterable of sentences (e.g. the ``text`` column of a manifesto
            CSV). Entries that are not strings, such as the float NaN pandas
            uses for missing cells, are skipped.
        label: Iterable of CMP codes aligned with ``text``. Numbers and
            numeric strings are accepted.

    Returns:
        A ``(sentence, labels)`` pair of equally long lists. ``sentence``
        holds the lower-cased, whitespace-normalised texts that are longer
        than three characters. ``labels`` holds the result of
        ``getleftright`` applied to the integer code of each kept sentence,
        where a missing or non-numeric code (``'NA'``, NaN, ...) counts
        as 0.
    """
    sentence = []
    cmpcode = []
    # Pair the two columns positionally so plain lists and pandas Series
    # (whatever their index) are handled alike.
    for raw_text, raw_label in zip(text, label):
        try:
            words = raw_text.lower().split()
        except AttributeError:
            # Not a string, e.g. NaN standing in for a missing cell.
            continue
        if len(raw_text) <= 3:
            continue
        sentence.append(" ".join(words))

        try:
            code = float(raw_label)
        except (TypeError, ValueError):
            # 'NA', None or any other non-numeric marker.
            code = np.nan
        cmpcode.append(int(code) if np.isfinite(code) else 0)
    labels = getleftright(cmpcode)
    return sentence, labels

import os
import pandas as pd
import numpy as np

# Training data for the left/right/neutral classifier, gathered from every
# CSV in the English manifesto directory.
sentences = []    # cleaned sentences
cmplabels = []    # class id of each sentence, aligned with `sentences`

for i in os.listdir('EnglishManifestos/'):
    # os.listdir() returns bare file names, so the directory must be
    # prepended before the file can be opened.
    x = pd.read_csv(os.path.join('EnglishManifestos/', i), header=0)
    sentence, cmpcode = clean_manifesto(x.text, x.cmp_code)
    sentences.extend(sentence)
    cmplabels.extend(cmpcode)

from sklearn.feature_extraction.text import CountVectorizer
# Unigram bag-of-words counts, limited to the 8000 most frequent terms.
tf_vectorizer = CountVectorizer(max_features = 8000, ngram_range=(1,1), stop_words='english')
sent_vec = tf_vectorizer.fit_transform(np.array(sentences))
# Labels as a column vector, one row per sentence.
labels = np.array(cmplabels).reshape((len(cmplabels),1))
# Dense copy of the sparse count matrix, as fed to the Keras model.
sent_vect = np.array(sent_vec.toarray())


In [None]:
import keras
from keras.models import Sequential 
from keras.layers import Dense, Activation, Dropout, Input
from keras.models import Model

# One hidden ReLU layer over the bag-of-words input, softmax over the three
# classes 0, 1 and 2. The input width is read from the data because
# max_features=8000 is only an upper bound on the vocabulary size.
model = Sequential() 
model.add(Dense(300, input_dim=sent_vect.shape[1], activation='relu')) 
model.add(Dense(3, activation='softmax', name='main_output'))

batch_size = 128 
nb_epoch = 3

Y = [0,1,2]

from keras.utils.np_utils import to_categorical
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(Y)

# LabelEncoder works on 1-D label arrays; labels is a column vector.
encoded_Y = encoder.transform(labels.ravel())
sentlabels = np_utils.to_categorical(encoded_Y, num_classes=3)

model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['acc']) 

# Keras 2 (already required by `num_classes=` above) names this argument
# `epochs`; `nb_epoch` is the Keras 1 spelling.
model.fit(sent_vect, sentlabels, batch_size=batch_size, epochs=nb_epoch, verbose=1) 


In [None]:
from nltk import sent_tokenize
# Per-petition aggregates of the sentence-level class probabilities:
#   pbias   = (p1 + p2) / (p0 + p1 + p2)
#   lrscale = (p1 - p2) / (p1 + p2)
# where p0, p1, p2 are the probabilities of classes 0, 1, 2 summed over the
# petition's sentences.
pbias = []
lrscale = []
for i in petitions['fulltext']:
    # Only byte strings (Python 2 `str`) need decoding; Python 3 text has no
    # .decode() method.
    if isinstance(i, bytes):
        i = i.decode('utf-8','ignore')
    sents = sent_tokenize(i)
    if sents:
        x = tf_vectorizer.transform(sents)
        lab = model.predict(x.toarray(), verbose=0)
        # Sum the class probabilities over all sentences of the petition.
        count = lab.sum(axis=0, dtype=np.float64)
    else:
        # Nothing to classify: no probability mass for any class.
        count = np.zeros(3)
    partisan = count[1] + count[2]
    total = partisan + count[0]
    # Fall back to 0.0 instead of dividing by zero when there is no mass.
    lr_ratio = (count[1] - count[2])/partisan if partisan > 0 else 0.0
    bias = partisan/total if total > 0 else 0.0
    lrscale.append(lr_ratio)
    pbias.append(bias)
        