In [188]:
import networkx as nx
from networkx.algorithms import bipartite
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from tqdm import tqdm
import numpy as np
import pickle
import dgl
import re
import os
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# Convert Congress dataset into DGL

In [None]:
def convert_nx_to_dgl(graph):
        K = dgl.DGLGraph()
        digraph = nx.DiGraph(graph)
        K.from_networkx(digraph)
        return K

dgl_list = []
nx_list = []
cc_indices = {}
for i in range(93, 95):
    print('Congress ', i)
    path = '~/neu/gml/data/congress/edgelist_Congress'+str(i)+'.csv'
    df = pd.read_csv(path)
    
    G = nx.Graph()
    G.add_nodes_from(list(df['bill_id']), bipartite=0)
    G.add_nodes_from(list(df['thomas_id']), bipartite=1)
    edge_list = list(zip(df['bill_id'], df['thomas_id']))
    G.add_edges_from(edge_list)
    G = bipartite.weighted_projected_graph(G, list(df['bill_id']))
    print(len(G), df['bill_id'].nunique())
    nx_list.append(G)

In [145]:
bill_features = pd.read_csv('/home/sam/neu/gml/data/congress/original/bills_features_label.csv')
congressman_features = pd.read_csv('/home/sam/neu/gml/data/congress/original/congressman_features_v2.csv')
bills_passed = bill_features[bill_features['pass_law']==1]

In [128]:
def add_majority_party(dataframe):
    total = 0
    majority_party_dict = {}
    missing_data = set()
    for bill in list(dataframe['bill_id'].unique()):
        party = []
        for t_id in dataframe[dataframe['bill_id']==bill]['thomas_id'].unique():
            if t_id not in list(congressman_features['thomas_id']):
                missing_data.add(t_id)
                continue
            party_options = congressman_features[congressman_features['thomas_id']==t_id]['party_code']

            # If there are multiple parties, select a random one
            party.append(np.random.choice(party_options))
        # Let the majority party be the most common
        if not len(party):
            majority_party_dict[bill] = None
            continue
        majority_party_dict[bill] = Counter(party).most_common()[0][0]
    print('    Missing data for: ',missing_data)
    return majority_party_dict

In [274]:
nx_passed_list = []
# label_dict_master = {}
for i in range(93, 114):
    print('Congress ', i)
    path = '/home/sam/neu/gml/data/congress/original/edgelist_Congress'+str(i)+'.csv'
    df = pd.read_csv(path)
    df_passed = df[df['bill_id'].isin(bills_passed['bill_id'])]
#     print('    creating labels')
#     labels = add_majority_party(df_passed)
#     label_dict_master[i] = labels
#     path = '/home/sam/neu/gml/data/congress/pruned/labels/'+str(i)+'.json'
#     with open(path, 'wb') as f:
#         pickle.dump(labels, f)
#     print('    labels written')
#     print('    creating graph')
    G = nx.Graph()
    G.add_nodes_from(list(df_passed['bill_id'].unique()), bipartite=0)
    G.add_nodes_from(list(df_passed['thomas_id'].unique()), bipartite=1)
    edge_list = list(zip(df_passed['bill_id'], df_passed['thomas_id']))
    G.add_edges_from(edge_list)
    G = bipartite.weighted_projected_graph(G, list(df_passed['bill_id']))
    # Remove edges with weight < 6
    edge_list = list(G.edges(data=True))
    for edge in edge_list:
        if edge[2]['weight'] < 6:
            G.remove_edge(edge[0], edge[1])
    remove = [node for node,degree in dict(G.degree()).items() if degree < 1]
    G.remove_nodes_from(remove)
    print('   ',len(G), ' nodes,', len(G.edges), 'edges')
    nx_passed_list.append(G)
    path = '/home/sam/neu/gml/data/congress/pruned/graphs'+str(i)+'_graph.edgelist.gz'
    nx.write_edgelist(G, path)
    print()

Congress  93
    131  nodes, 1017 edges

Congress  94
    98  nodes, 289 edges

Congress  95
    156  nodes, 412 edges

Congress  96
    197  nodes, 3120 edges

Congress  97
    94  nodes, 1722 edges

Congress  98
    113  nodes, 4722 edges

Congress  99
    370  nodes, 24718 edges

Congress  100
    403  nodes, 25702 edges

Congress  101
    359  nodes, 22873 edges

Congress  102
    301  nodes, 16459 edges

Congress  103
    183  nodes, 4831 edges

Congress  104
    109  nodes, 1123 edges

Congress  105
    101  nodes, 1303 edges

Congress  106
    249  nodes, 4525 edges

Congress  107
    166  nodes, 1863 edges

Congress  108
    225  nodes, 3501 edges

Congress  109
    210  nodes, 3190 edges

Congress  110
    252  nodes, 5190 edges

Congress  111
    192  nodes, 2615 edges

Congress  112
    116  nodes, 1363 edges

Congress  113
    145  nodes, 2073 edges



# Relabel nodes

In [282]:
def interpret_feature_string(fs):
    return np.array([float(x) for x in fs[1:-1].split(', ')])

path = '/home/sam/neu/gml/data/congress'
dgl_gs = [None]*len(nx_passed_list)
node_to_int = {}
int_to_node = {}
df_features = pd.read_csv(os.path.join(path, 'features_df.csv'))
df_features = df_features.set_index('bill_id')
feature_map = [None]*len(nx_passed_list)
info = {} # labels dictionary
for i, g in tqdm(enumerate(nx_passed_list)):
#     Gcc = sorted(nx.connected_components(G0), key=len, reverse=True)
#     g = G0.subgraph(Gcc[0])
    congress_number = i+93
    features = np.array([interpret_feature_string(
                    df_features['vector'].loc[node]) for node in list(g.nodes)])
    feature_map[i] = features
    node_to_int[congress_number] = dict(zip(list(g.nodes), range(len(g.nodes))))
    int_to_node[congress_number] = dict(zip(range(len(g.nodes)), list(g.nodes)))
    for node in list(g.nodes):
        party = label_dict_master[congress_number][node]
        if party not in [100, 200]:
            party = np.random.choice([100,200])
        info[str(i)+'_'+str(node_to_int[congress_number][node])] = party
    

    g_relabeled = nx.relabel_nodes(g, node_to_int[congress_number])
    digraph = nx.DiGraph(g_relabeled)
    K = dgl.DGLGraph()
    K.from_networkx(digraph)
    dgl_gs[i] = K

with open(path + '/pruned/graph_dgl.pkl', 'wb') as f:
    pickle.dump(dgl_gs, f)

with open(path + '/pruned/label.pkl', 'wb') as f:
    pickle.dump(info, f)
    
with open(path + '/pruned/node_to_int.pkl', 'wb') as f:
    pickle.dump(int_to_node, f)

with open(path + '/pruned/int_to_node.pkl', 'wb') as f:
    pickle.dump(int_to_node, f)
    
    
np.save(path + '/pruned/features.npy', np.array(feature_map, dtype='object'))

21it [00:00, 127.04it/s]


# Create train, val, and test sets

In [277]:
num_of_labels = 2
df = pd.DataFrame.from_dict(info, orient='index').reset_index().rename(columns={"index": "name", 0: "label"})
val_graph = 2
test_graph = 16
train_graphs = list(range(len(dgl_gs)))
train_graphs.remove(val_graph)
train_graphs.remove(test_graph)


val_df = df[df.name.str.contains(str(val_graph)+'_')]
test_df = df[df.name.str.contains(str(test_graph)+'_')]

train_df = df[~df.index.isin(val_df.index)]
train_df = train_df[~train_df.index.isin(test_df.index)]
train_df.reset_index(drop = True).to_csv(path + '/pruned/train.csv')
val_df.reset_index(drop = True).to_csv(path + '/pruned/val.csv')
test_df.reset_index(drop = True).to_csv(path + '/pruned/test.csv')

# Create feature set

In [None]:
nltk.download() # follow the prompts to download "stopwords"

In [None]:
# Create bag o words feature vector
stops = stopwords.words('english')
def text_preprocess(text:str):
    # Ignoring case
    text = text.lower()
    # Ignoring punctuation
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Ignoring frequent words that don’t contain much information, called stop words, like “a,” “of,” etc.
    text = text.split(' ')
    text = [word for word in text if not word in stops]
    text = ' '.join(text)
    return text


def aggregate_features(x):
    return [*x['word_bag'], *x['bill_type_vec'], *x['control_vec'], *x['pass_law_vec']]


def one_hot_encoder(data:list):
    n_types = len(set(data))
    type_key = list(set(data))
    encoded = [None]*len(data)
    for i, x in enumerate(data):
        vec = [0]*n_types
        index = type_key.index(x)
        vec[index] = 1
        encoded[i] = vec
    return encoded

In [None]:
bill_features = pd.read_csv('/home/sam/neu/gml/data/congress/original/bills_features_label.csv')
bill_features = bill_features
clean_title_text = bill_features['title_text'].apply(text_preprocess)
# Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
vectorizer = CountVectorizer(analyzer = 'word',
                            tokenizer = None,
                            preprocessor = None,
                            stop_words = None,
                            max_features = 118)
# fit_transform() does two functions: First, it fits the model
# and learns the vocaulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of strings.
train_data_features = vectorizer.fit_transform(clean_title_text)

# Numpy arrays are easy to work with, so convert the result to an array
train_data_features = train_data_features.toarray()
train_data_features_normed = normalize(train_data_features)
print('Bag of words completed')
bill_features['word_bag'] = list(train_data_features_normed)
#  add party_control to the feature vector
bill_features['control_vec'] = bill_features['party_control_congress'].apply(
                                                    lambda x: [x/100-1])
bill_features['bill_type_vec'] = one_hot_encoder(list(bill_features['bill_type']))
bill_features['pass_law_vec'] = bill_features['pass_law'].apply(lambda x: [x])
bill_features['vector'] = bill_features.apply(aggregate_features, axis=1)

df_features = bill_features[['bill_id', 'vector']]
with open('/home/sam/neu/gml/data/congress/original/features_df.csv', 'wb') as f:
    df_features.to_csv(f)