In [None]:
import numpy as np
from sklearn.model_selection import ShuffleSplit

from scipy.stats import norm
from tqdm import tqdm

import gensim
from gensim import utils

from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


import keras
from keras import backend as K
from keras.layers import Input, Dense, Lambda, Layer, Add, Multiply
from keras.models import Model, Sequential
import seaborn as sns

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style='whitegrid', palette='muted', font_scale=1.2)


In [None]:
# Load the bills table and show its schema plus the last few rows.
df_bills = pd.read_csv('data/bill_all.csv')
print(df_bills.columns)
df_bills.tail(5)

In [None]:
# Load the roll-call votes table and show its schema plus the last few rows.
df_votes = pd.read_csv('data/votes_all.csv')
print(df_votes.columns)
df_votes.tail(5)

In [None]:
# Load the previously saved votes-with-sponsor table.
df_final = pd.read_csv('data/df_vote_final.csv')
df_final.tail(5)

In [None]:

# Drop the rows whose legis_num is the literal 'ADJOURN'.
df_votes = df_votes.loc[df_votes['legis_num'].ne('ADJOURN')]



def gen_x():
    """Attach each bill's sponsor fields to the vote rows cast on that bill.

    Reads the notebook-level ``df_votes`` and ``df_bills`` frames. Votes are
    grouped by ``legis_num``; every group receives the ``sponsor``,
    ``sponsor_id``, ``sponsor_party``, ``sponsor_state`` and ``sponsor_uri``
    values of the first ``df_bills`` row with the same ``legis_num``.

    A ``legis_num`` that has votes but no row in ``df_bills`` no longer raises
    ``IndexError``: its sponsor columns are filled with NaN so the votes are
    kept and can be filtered later.

    Returns:
        list[pandas.DataFrame]: one frame per ``legis_num`` (in groupby order).
        Each frame has a fresh 0..n-1 index, the previous index preserved in
        an ``index`` column, and the five sponsor columns.
    """
    sponsor_cols = ['sponsor', 'sponsor_id', 'sponsor_party',
                    'sponsor_state', 'sponsor_uri']

    # Build the bill -> sponsor lookup once instead of re-scanning df_bills for
    # every group; drop_duplicates keeps the first row per bill, which is the
    # row the former `bill.iloc[0]` selected.
    sponsor_by_bill = (
        df_bills.drop_duplicates(subset='legis_num')
        .set_index('legis_num')[sponsor_cols]
    )
    missing_sponsor = dict.fromkeys(sponsor_cols, np.nan)

    df_list = []
    for name, group in tqdm(df_votes.groupby('legis_num')):
        if name in sponsor_by_bill.index:
            sponsor_values = sponsor_by_bill.loc[name].to_dict()
        else:
            sponsor_values = missing_sponsor
        # reset_index()/assign() return new frames, so the groupby slice is
        # never mutated in place (avoids SettingWithCopy surprises).
        df_list.append(group.reset_index().assign(**sponsor_values))
    return df_list
   
# df_list = gen_x()
    
# df_final = pd.concat(df_list)

# Persist the votes-with-sponsor table.
# drop=True / index=False keep this cell idempotent: the file is read back by
# this notebook, and writing the index on every run added one more
# 'index' / 'Unnamed: 0' column each time until reset_index failed on the
# resulting column-name collision.
df_final = df_final.reset_index(drop=True)
df_final.to_csv('data/df_vote_final.csv', index=False)
df_final.tail()

In [None]:
# Keep only votes on bills whose sponsor party is 'D' or 'R'.
major_party_mask = df_final['sponsor_party'].isin(['D', 'R'])
df_final = df_final[major_party_mask]

# Ten sponsors with the most vote rows, printed and drawn as a bar chart.
top_sponsors = df_final['sponsor'].value_counts()[:10]
print(top_sponsors)
top_sponsors.plot(kind='bar', alpha=.5)
plt.show()

# Vote rows per sponsor party.
df_final['sponsor_party'].value_counts()[:10].plot(kind='bar', alpha=.5)

In [None]:
# Pre-trained word2vec vectors in binary format.
# NOTE(review): the path is absolute and machine-specific; consider a configurable location.
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/home/sonic/.keras/datasets/GoogleNews-vectors-negative300.bin',
    binary=True,
)


In [None]:

max_words = 20000
MAX_SEQUENCE_LENGTH = 10000
def process_doc(X):
    """Build several representations of a collection of documents.

    Args:
        X: iterable of document strings (e.g. a pandas Series of bill texts).

    Returns:
        tuple ``(X_seq, word_index, x_emb, X_train_tf, X_train_counts)``:
          * ``X_seq`` - Keras ``Tokenizer`` integer sequences, one per document.
            A 1-D object array of lists when documents differ in length,
            otherwise a regular 2-D array.
          * ``word_index`` - the tokenizer's word -> integer id mapping.
          * ``x_emb`` - array with one ``document_vector`` (word2vec average)
            per document, looked up in the notebook-level ``model``.
          * ``X_train_tf`` - TF-IDF transform of the count matrix.
          * ``X_train_counts`` - ``CountVectorizer`` term counts.
    """
    tokenizer = Tokenizer(num_words=max_words,lower=True, split=' ',
                          filters='"#%&()*+-/<=>@[\\]^_`{|}~\t\n',
                          char_level=False, oov_token=u'<UNK>')
    tokenizer.fit_on_texts(X)
    sequences = tokenizer.texts_to_sequences(X)

    # Documents rarely share a length. NumPy >= 1.24 raises ValueError when
    # asked to build an array from ragged lists unless dtype=object is given,
    # so request an object array explicitly in that case.
    ragged = len({len(seq) for seq in sequences}) > 1
    X_seq = np.array(sequences, dtype=object if ragged else None)

    # Bag-of-words counts and their TF-IDF weighting.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(X)
    tf_transformer = TfidfTransformer().fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)

    # One averaged word2vec vector per document.
    x_emb = [document_vector(model, doc) for doc in X]

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    return X_seq, word_index, np.array(x_emb), X_train_tf, X_train_counts

def document_vector(word2vec_model, doc):
    """Average the word2vec vectors of a document's in-vocabulary words.

    Args:
        word2vec_model: gensim ``KeyedVectors``-like object. It must support
            ``model[list_of_words]`` and expose ``vector_size`` plus either
            ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim < 4).
        doc: a document given either as a raw string or as an iterable of
            word tokens.

    Returns:
        numpy.ndarray: mean vector of the known words, or an all-zero vector
        of length ``vector_size`` when no word of ``doc`` is in the vocabulary.
    """
    # A raw string iterates character by character, which silently averaged
    # the vectors of single letters; split it into whitespace-delimited words.
    if isinstance(doc, str):
        doc = doc.split()

    # gensim 4 replaced `.vocab` with `.key_to_index`; support both.
    vocab = getattr(word2vec_model, 'key_to_index', None)
    if vocab is None:
        vocab = word2vec_model.vocab

    # remove out-of-vocabulary words
    known_words = [word for word in doc if word in vocab]
    if not known_words:
        # Nothing to average: return a neutral vector instead of failing.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.mean(word2vec_model[known_words], axis=0)


def has_vector_representation(word2vec_model, doc):
    """Tell whether at least one word of the document is in the word2vec vocabulary.

    Args:
        word2vec_model: gensim ``KeyedVectors``-like object exposing either
            ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim < 4).
        doc: a document given either as a raw string or as an iterable of
            word tokens.

    Returns:
        bool: True if any word of ``doc`` is known to the model.
    """
    # A raw string would be scanned character by character; split into words.
    if isinstance(doc, str):
        doc = doc.split()

    # gensim 4 replaced `.vocab` with `.key_to_index`; support both.
    vocab = getattr(word2vec_model, 'key_to_index', None)
    if vocab is None:
        vocab = word2vec_model.vocab

    return any(word in vocab for word in doc)

In [None]:
%%time


# Numeric encoding of a recorded vote inside the vote matrix.
AYE = 1
NAY = -1
# Vote label -> matrix entry; any other label leaves the entry at 0.
VOTE_VALUES = {'Yea': AYE, 'Nay': NAY}

# Per-congress outputs, keyed by the value of the 'congress' column.
vote_matrix_all = {}
X_seq_all = {}
word_index_all = {}
X_train_tf_all = {}
X_train_counts_all = {}
X_emb_all = {}
legistlator_dict_all = {}
feature_set_all = {}


grouped_congress = df_bills.groupby('congress')


for name, group in grouped_congress:
    print('Processing congress', name)
    print('congress shape', group.shape)

    df_votes_filtered = df_final[df_final['congress'] == name]

    # Feature set
    feature_set = df_votes_filtered[['sponsor_id', 'sponsor_party', 'sponsor_state', 'vote']]

    legistlator_names = df_votes_filtered['name'].unique()
    num_legistlators = len(legistlator_names)
    print('number of legistlators', num_legistlators)
    # Rows follow the bills of this congress, columns the legislators.
    vote_matrix = np.zeros((group.shape[0], num_legistlators))
    print('vote_matrix.shape', vote_matrix.shape)

    # Legislator name -> column index, in order of first appearance.
    legistlator_dict = {legistlator: column
                        for column, legistlator in enumerate(legistlator_names)}
    # The saved mapping is the inverse: column index -> legislator name.
    legistlator_dict_all[name] = {column: legistlator
                                  for legistlator, column in legistlator_dict.items()}

    print('Processing congress votes')
    # New frame with a 0..n-1 index; the groupby slice itself is left untouched.
    group = group.reset_index()

    # Group the votes by bill once, instead of filtering the whole vote frame
    # again for every bill row.
    votes_by_bill = {legis_num: votes
                     for legis_num, votes in df_votes_filtered.groupby('legis_num')}

    # Process Vote matrix here
    for row_pos, legis_num in enumerate(tqdm(group['legis_num'], total=len(group))):
        bill_votes = votes_by_bill.get(legis_num)
        if bill_votes is None:
            # No recorded votes for this bill: its row stays all zeros.
            continue
        for legistlator, cast in zip(bill_votes['name'], bill_votes['vote']):
            value = VOTE_VALUES.get(cast)
            if value is not None:
                vote_matrix[row_pos, legistlator_dict[legistlator]] = value

    # Process bill Representation Here
    print('processing congress bills')
    X_seq, word_index, x_emb, X_train_tf, X_train_counts = process_doc(group['billText'].apply(str))

    vote_matrix_all[name] = vote_matrix
    X_seq_all[name] = X_seq
    word_index_all[name] = word_index
    X_emb_all[name] = x_emb
    X_train_tf_all[name] = X_train_tf
    X_train_counts_all[name] = X_train_counts
    feature_set_all[name] = feature_set

    print('*' * 50)

print('vote_matrix_all.shape', vote_matrix_all[106].shape)

# np.save stores each dict as a pickled 0-d object array; loading such a file
# requires np.load(..., allow_pickle=True) followed by .item().
np.save('data/vote_matrix_all.npy', vote_matrix_all)
np.save('data/X_seq_all.npy', X_seq_all)
np.save('data/X_word_index_all.npy', word_index_all)
np.save('data/X_train_tf_all.npy', X_train_tf_all)
np.save('data/X_train_counts_all.npy', X_train_counts_all)
np.save('data/X_emb_all.npy', X_emb_all)
np.save('data/legistlator_all.npy', legistlator_dict_all)
np.save('data/feature_set_all.npy', feature_set_all)


In [None]:

# Every file holds a dict that np.save pickled into a 0-d object array, so
# .item() is needed to get the dict back. NumPy >= 1.16.3 refuses to unpickle
# unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code - only load files you produced yourself.
vote_matrix_all = np.load('data/vote_matrix_all.npy', allow_pickle=True)
X_seq_all = np.load('data/X_seq_all.npy', allow_pickle=True)
word_index_all = np.load('data/X_word_index_all.npy', allow_pickle=True)
X_train_tf_all = np.load('data/X_train_tf_all.npy', allow_pickle=True)
X_train_counts_all = np.load('data/X_train_counts_all.npy', allow_pickle=True)
X_emb_all = np.load('data/X_emb_all.npy', allow_pickle=True)
legistlator_all = np.load('data/legistlator_all.npy', allow_pickle=True)
feature_set_all = np.load('data/feature_set_all.npy', allow_pickle=True)

# Sanity check: shapes of the entries stored under key 106.
print('vote_matrix_all',vote_matrix_all.item()[106].shape)
print('X_seq_all', X_seq_all.item()[106].shape)
# print(word_index_all[106].shape)
print('X_train_tf_all', X_train_tf_all.item()[106].shape)
print('X_train_counts_all', X_train_counts_all.item()[106].shape)
print('X_emb_all', X_emb_all.item()[106].shape)
# print('legistlator_all', legistlator_all.item()[106].shape)
print('feature_set_all', feature_set_all.item()[106].shape )



# Collaborative Filtering


In [None]:
#Variational

def nll(y_true, y_pred):
    """Bernoulli negative log-likelihood, summed over the last axis."""
    # keras.losses.binary_crossentropy would average over the last axis;
    # the backend op is used here so the terms can be summed instead.
    crossentropy = K.binary_crossentropy(y_true, y_pred)
    return K.sum(crossentropy, axis=-1)


class KLDivergenceLayer(Layer):
    """Pass-through layer that registers a KL-divergence term as a model loss.

    The layer receives ``[mu, log_var]`` and returns them unchanged; as a side
    effect it adds the batch mean of
    ``-0.5 * sum(1 + log_var - mu**2 - exp(log_var))`` to the model's losses.
    """

    def __init__(self, *args, **kwargs):
        self.is_placeholder = True
        super(KLDivergenceLayer, self).__init__(*args, **kwargs)

    def call(self, inputs):
        mu, log_var = inputs

        # Per-sample KL term, summed over the last axis.
        kl_terms = 1 + log_var - K.square(mu) - K.exp(log_var)
        per_sample_kl = - .5 * K.sum(kl_terms, axis=-1)

        # Averaged over the batch and attached to the model loss.
        self.add_loss(K.mean(per_sample_kl), inputs=inputs)

        # Identity transform: the inputs are handed on untouched.
        return inputs


def get_VAE(original_dim):
    """Build a variational autoencoder together with its encoder and decoder.

    Reads the notebook-level ``intermediate_dim``, ``latent_dim`` and
    ``epsilon_std`` at call time, so they must be defined before calling.

    Args:
        original_dim: size of the input (and reconstructed output) vectors.

    Returns:
        tuple ``(vae, encoder, decoder)``:
          * ``vae`` - model from ``[x, eps]`` to the reconstruction, compiled
            with the adam optimizer and mean squared error loss.
          * ``encoder`` - model mapping ``x`` to the latent mean ``z_mu``.
          * ``decoder`` - ``Sequential`` mapping a latent vector to a
            sigmoid-activated vector of size ``original_dim``.
    """
    # Decoder: latent_dim -> intermediate_dim (relu) -> original_dim (sigmoid).
    decoder = Sequential([
        Dense(intermediate_dim, input_dim=latent_dim, activation='relu'),
        Dense(original_dim, activation='sigmoid')
    ])

    # Encoder trunk.
    x = Input(shape=(original_dim,))
    h = Dense(intermediate_dim, activation='relu')(x)

    # Two linear heads: latent mean and latent log-variance.
    z_mu = Dense(latent_dim)(h)
    z_log_var = Dense(latent_dim)(h)

    # Identity layer whose only effect is to add the KL term to the model loss.
    z_mu, z_log_var = KLDivergenceLayer()([z_mu, z_log_var])
    # sigma = exp(log_var / 2)
    z_sigma = Lambda(lambda t: K.exp(.5*t))(z_log_var)

    # Noise is fed through an extra Input backed by a random-normal tensor whose
    # first dimension follows the batch size of x.
    eps = Input(tensor=K.random_normal(stddev=epsilon_std,
                                       shape=(K.shape(x)[0], latent_dim)))
    # Reparameterisation: z = mu + sigma * eps.
    z_eps = Multiply()([z_sigma, eps])
    z = Add()([z_mu, z_eps])

    x_pred = decoder(z)

    vae = Model(inputs=[x, eps], outputs=x_pred)
    
    # NOTE(review): the nll assignment is immediately overwritten, so the model is
    # trained with mean squared error - confirm which loss is intended.
    loss = nll
    loss = 'mean_squared_error'
    vae.compile(optimizer='adam', loss=loss)

    # The encoder exposes only the latent mean (no sampling).
    encoder = Model(x, z_mu)
    return vae, encoder, decoder


In [None]:
from keras.initializers import glorot_uniform  # Or your initializer of choice

def reinitialize(model):
    """Overwrite every weight array of ``model`` with fresh Glorot-uniform samples.

    Every array returned by ``model.get_weights()`` is replaced by a new
    Glorot-uniform draw of the same shape (this includes bias vectors).

    Args:
        model: a Keras model (anything with ``get_weights`` / ``set_weights``).

    Returns:
        The same ``model`` object, with its weights replaced in place.
    """
    # K.eval goes through the Keras backend, which supplies its own session
    # (or eager context). A bare Tensor.eval() fails unless a default
    # TensorFlow session happens to be registered.
    # A new initializer instance is created per weight so that every draw is
    # independent.
    new_weights = [K.eval(glorot_uniform()(w.shape)) for w in model.get_weights()]
    model.set_weights(new_weights)
    return model
      
    


In [None]:
from sklearn.preprocessing import scale, MinMaxScaler, StandardScaler

# Bill embeddings and vote matrix stored under key 106.
X_emb = X_emb_all.item()[106]
vote_matrix = vote_matrix_all.item()[106]
print('X_emb', X_emb.shape)
print('vote_matrix', vote_matrix.shape)

# Standardise the embedding columns.
X_emb = scale(X_emb.astype(float))

X = []
y = []
i = 0

# Gaussian noise parameters: mean, standard deviation and the scale applied to the draw.
mu, sigma = 0, 0.1
noise_factor = 0.5

X_meta = []
X_train = []

# One sample per legislator (a column of vote_matrix):
#   input  - the unchanged bill embeddings,
#   target - the embeddings with every bill row multiplied by that
#            legislator's vote entry for the bill, plus Gaussian noise.
for idx, legistlator in enumerate(vote_matrix.T):
    vote_column = np.vstack(legistlator)  # column vector, one entry per bill
    meta = X_emb * vote_column
    meta = meta + noise_factor * np.random.normal(mu, sigma, meta.shape)

    X_meta.append(meta)
    X_train.append(X_emb)

X_meta = np.array(X_meta)
X_train = np.array(X_train)
print('X_meta', X_meta.shape)
print('X_train', X_train.shape)

# The arrays are left 3-D here. For the convolutional autoencoders a trailing
# channel axis can be added first, e.g.
#   X_train = X_train.reshape(-1, X_train.shape[1], X_train.shape[2], 1)

# Limit inputs and targets to the range [-1, 1].
X_train = np.clip(X_train, -1., 1.)
X_meta = np.clip(X_meta, -1., 1.)
print('X_train new shape', X_train.shape)
print('X_meta new shape', X_meta.shape)

print(X_train[0].shape)
print(X_meta[0])



In [None]:
def deep_autoencoder(X_train):
    """Build and compile a fully connected autoencoder for signed targets.

    The Dense layers act on the last axis of a 3-D input, so the network maps
    every row of a sample from ``X_train.shape[2]`` features down to a
    32-unit code (layer name ``'encoded'``) and back.

    The output layer uses ``tanh`` and the model is trained with mean squared
    error: the targets in this notebook are clipped to [-1, 1], which a
    sigmoid output cannot reach and for which binary cross-entropy (defined
    for targets in [0, 1]) is not a valid loss.

    Args:
        X_train: 3-D array; only ``shape[1]`` and ``shape[2]`` are read, to
            size the input and output layers.

    Returns:
        The compiled Keras ``Model`` (adam optimizer, mean squared error).
    """
    input_img = Input(shape=(X_train.shape[1], X_train.shape[2]))

    # Encoder: 128 -> 64 -> 32.
    encoded = Dense(128, activation='relu', kernel_initializer='glorot_uniform')(input_img)
    encoded = Dense(64, activation='relu')(encoded)
    encoded = Dense(32, activation='relu',  name='encoded')(encoded)

    # Decoder: 64 -> 128 -> original feature size, squashed to (-1, 1).
    decoded = Dense(64, activation='relu')(encoded)
    decoded = Dense(128, activation='relu')(decoded)
    decoded = Dense(X_train.shape[2], activation='tanh')(decoded)

    autoencoder = Model(input_img, decoded)
    autoencoder.compile(optimizer='adam', loss='mean_squared_error')
    return autoencoder

def denoiser_autoencoder(X_train):
    """Build and compile a small convolutional denoising autoencoder.

    Args:
        X_train: array whose ``shape[1]`` and ``shape[2]`` give the two
            spatial dimensions; the model input is
            ``(shape[1], shape[2], 1)``.

    Returns:
        The compiled Keras ``Model`` (adadelta optimizer, binary cross-entropy).
    """
    # Settings shared by every hidden convolution.
    relu_same = dict(activation='relu', padding='same')

    input_img = Input(shape = (X_train.shape[1], X_train.shape[2], 1 ))

    # Encoder: two conv + 2x2 max-pool stages.
    x = Conv2D(32, (3, 3), kernel_initializer='glorot_uniform', **relu_same)(input_img)
    x = MaxPooling2D((2, 2), padding='same')(x)
    x = Conv2D(32, (3, 3), **relu_same)(x)
    encoded = MaxPooling2D((2, 2), padding='same')(x)

    # Decoder: two conv + 2x2 up-sampling stages, then a single-channel sigmoid map.
    x = Conv2D(32, (3, 3), **relu_same)(encoded)
    x = UpSampling2D((2, 2))(x)
    x = Conv2D(32, (3, 3), **relu_same)(x)
    x = UpSampling2D((2, 2))(x)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(x)

    autoencoder = Model(input_img, decoded)
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
    return autoencoder

    
from keras.layers import Input,Conv2D,MaxPooling2D,UpSampling2D


def conv_autoencoder(X_train):
    """Build and compile a three-level convolutional autoencoder.

    Args:
        X_train: array whose ``shape[1]`` and ``shape[2]`` size the model
            input, declared as ``(1, shape[1], shape[2])``.
            NOTE(review): this looks like a channels-first layout - confirm
            it matches the configured Keras image data format.

    Returns:
        The compiled Keras ``Model`` (RMSprop optimizer, mean squared error).
    """
    # Settings shared by every hidden convolution.
    relu_same = dict(activation='relu', padding='same')

    input_img = Input(shape = (1, X_train.shape[1], X_train.shape[2] ))

    # Encoder: filters grow 32 -> 64 -> 128 with a 2x2 max-pool after the first two convolutions.
    conv1 = Conv2D(32, (3, 3), **relu_same)(input_img)
    pool1 = MaxPooling2D(pool_size=(2, 2))(conv1)
    conv2 = Conv2D(64, (3, 3), **relu_same)(pool1)
    pool2 = MaxPooling2D(pool_size=(2, 2))(conv2)
    conv3 = Conv2D(128, (3, 3), name='encoded', **relu_same)(pool2)

    # Decoder: mirror of the encoder, with 2x2 up-sampling after each convolution.
    conv4 = Conv2D(128, (3, 3), **relu_same)(conv3)
    up1 = UpSampling2D((2,2))(conv4)
    conv5 = Conv2D(64, (3, 3), **relu_same)(up1)
    up2 = UpSampling2D((2,2))(conv5)
    decoded = Conv2D(1, (3, 3), activation='sigmoid', padding='same', name='decoded')(up2)

    autoencoder = Model(input_img, decoded)
    autoencoder.compile(loss='mean_squared_error', optimizer = 'RMSprop')
    return autoencoder

In [None]:
###################
# Hyper-parameters. intermediate_dim, latent_dim and epsilon_std are read by
# get_VAE; batch_size and epochs are used when fitting.
# original_dim = 
intermediate_dim = 256
latent_dim = 128
batch_size = 256
epochs = 20
epsilon_std = 1.0
###################

# Pick the architecture to train (alternatives kept for quick switching).
# autoencoder, encoder, decoder = get_VAE(original_dim)
autoencoder = deep_autoencoder(X_train)
# autoencoder = denoiser_autoencoder(X_train)
# autoencoder = conv_autoencoder(X_train)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that emitted a stray "None" line).
autoencoder.summary()

# Three random 75/25 splits over the first axis of X_train.
rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)

print(rs)

def plot_history(history):
    """Print and plot the per-epoch metrics of a Keras training run.

    Args:
        history: object with a ``history`` attribute mapping each metric name
            to a list with one value per epoch (as returned by ``Model.fit``).
    """
    print(history.history)
    df = pd.DataFrame(history.history)
    # One tick per recorded epoch. Taking the length from the data keeps the
    # plot correct for any run length, instead of depending on the global
    # `epochs` setting.
    df.plot(xticks=range(len(df)))

    
for train_index, test_index in rs.split(X_train):
    # Inputs (bill embeddings) and targets (vote-signed embeddings) for this split.
    X_emb_train, X_emb_test = X_train[train_index], X_train[test_index]
    X_meta_train, X_meta_test = X_meta[train_index], X_meta[test_index]

    print(X_emb_train.shape, X_emb_test.shape)
    print(X_meta_train.shape, X_meta_test.shape)

    # The held-out part of the split was previously built but never used;
    # passing it as validation data adds val_loss to the history so the plot
    # shows how well the model generalises.
    history = autoencoder.fit(X_emb_train,
        X_meta_train,
        validation_data=(X_emb_test, X_meta_test),
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size)
    plot_history(history)

    # List every weight tensor of the trained model with its shape.
    names = [weight.name for layer in autoencoder.layers for weight in layer.weights]
    weights = autoencoder.get_weights()

    for name, weight in zip(names, weights):
        print(name, weight.shape)

    # Only the first split is trained: this loop does not rebuild the model, so
    # further iterations would continue training the same weights.
    break

In [None]:
from sklearn.model_selection import train_test_split

# Sponsor features and vote labels stored under key 106.
feature_set = feature_set_all.item()[106]

X = feature_set[['sponsor_id', 'sponsor_party', 'sponsor_state']]

# Encode the vote labels as integers.
le = preprocessing.LabelEncoder()
y = le.fit_transform(feature_set['vote'])

# split into a training and testing set
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)