In [1]:
import spacy
import pandas as pd
import multiprocessing
import numpy as np
# Load English tokenizer, tagger, parser, NER and word vectors
# `nlp` is a module-level global; sentence_tokenizer (defined later in this
# notebook) calls it on every document.
# NOTE(review): 'en' is a shortcut name that relies on a locally linked model;
# newer spaCy releases presumably require a full package name such as
# 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')

In [2]:
# Read the raw bill data from CSV into a DataFrame.
# NOTE(review): the displayed columns include 'Unnamed: 0.1' and 'Unnamed: 0',
# which look like index columns written by earlier to_csv calls — confirm
# whether they are still needed.
df_bill = pd.read_csv('data/bill_all.csv')
# Preview the last rows of the frame.
# NOTE(review): the saved output under this cell shows rows 0-4, which looks
# like head() output rather than tail() — the stored output may be stale.
df_bill.tail()

Unnamed: 0.1,Unnamed: 0,bill_text,congress,legis_num,link
0,0,[Congressional Bills 115th Congress]\n[From th...,115,H R 724,https://www.congress.gov/bill/115th-congress/h...
1,1,[Congressional Bills 115th Congress]\n[From th...,115,H R 3527,https://www.congress.gov/bill/115th-congress/h...
2,2,[Congressional Bills 115th Congress]\n[From th...,115,H R 3628,https://www.congress.gov/bill/115th-congress/h...
3,3,[Congressional Bills 115th Congress]\n[From th...,115,H RES 455,https://www.congress.gov/bill/115th-congress/h...
4,4,[Congressional Bills 115th Congress]\n[From th...,115,H R 2279,https://www.congress.gov/bill/115th-congress/h...


In [3]:
def _apply_df(args):
    df, func, kwargs = args
    df['bill_text_processed'] = df['bill_text'].apply(func, **kwargs)
    return df#df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to ``df['bill_text']`` in parallel worker processes.

    The frame is split row-wise into contiguous chunks, each chunk is
    processed by ``_apply_df`` in a ``multiprocessing.Pool``, and the
    processed chunks are concatenated back in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``bill_text`` column.
    func : callable
        Function applied to every ``bill_text`` value. It is sent to the
        worker processes, so it must be picklable (e.g. a module-level
        function).
    **kwargs
        ``workers`` (int, optional) is the number of worker processes; when
        omitted it defaults to ``cpu_count() - 1``. Values below 1 are
        raised to 1. All remaining keyword arguments are forwarded to
        ``func`` through ``Series.apply``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with an added ``bill_text_processed`` column.
    """
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = multiprocessing.cpu_count() - 1
    # Pool(processes=0) raises ValueError, and more workers than rows would
    # only spawn idle processes that receive empty chunks.
    workers = max(1, min(int(workers), len(df)))

    # Split positional row indices rather than the DataFrame itself:
    # np.array_split on a DataFrame is deprecated in recent pandas, while
    # splitting an index array yields the same contiguous partition.
    index_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[idx], func, kwargs) for idx in index_chunks]

    pool = multiprocessing.Pool(processes=workers)
    try:
        result = pool.map(_apply_df, tasks)
    finally:
        # Always release the worker processes, even when a task raised.
        pool.close()
        pool.join()
    return pd.concat(list(result))

def sentence_tokenizer(raw_text):
    """Lower-case ``raw_text`` and return its tokens joined by single spaces.

    Despite the name, no sentence splitting is performed: the text is
    lower-cased, run through the module-level ``nlp`` pipeline, and the
    ``text`` of every resulting token is joined with ``' '``.

    Parameters
    ----------
    raw_text : str
        Text to tokenize. Values without a ``lower`` method (for example
        the float NaN that pandas uses for missing CSV cells, or ``None``)
        are treated as empty text.

    Returns
    -------
    str
        Space-separated token texts, or an empty string for non-text input.
    """
    # Missing cells arrive as NaN/None; calling .lower() on them would raise
    # AttributeError and abort the whole parallel run.
    if not hasattr(raw_text, 'lower'):
        return u''
    doc = nlp(raw_text.lower())
    return u' '.join([token.text for token in doc])

In [4]:
%%time

num_cores = multiprocessing.cpu_count() - 1
print(num_cores) 

df_bill = apply_by_multiprocessing(df_bill, sentence_tokenizer,  workers=num_cores)
   
df_bill.to_csv('data/bill_all_processed.tsv', sep='\t')

7
CPU times: user 2.38 s, sys: 248 ms, total: 2.63 s
Wall time: 2min 4s


In [6]:
# Preview the last rows to inspect the new bill_text_processed column.
df_bill.tail()

Unnamed: 0.1,Unnamed: 0,bill_text,congress,legis_num,link,bill_text_processed
3785,3785,[115th Congress Public Law 6]\n[From the U.S. ...,115,H R 255,https://www.congress.gov/bill/115th-congress/h...,[ 115th congress public law 6 ] \n [ from the ...
3786,3786,[Congressional Bills 115th Congress]\n[From th...,115,H R 2019,https://www.congress.gov/bill/115th-congress/h...,[ congressional bills 115th congress ] \n [ fr...
3787,3787,[Congressional Bills 115th Congress]\n[From th...,115,H R 2319,https://www.congress.gov/bill/115th-congress/h...,[ congressional bills 115th congress ] \n [ fr...
3788,3788,[Congressional Bills 115th Congress]\n[From th...,115,S 1471,https://www.congress.gov/bill/115th-congress/h...,[ congressional bills 115th congress ] \n [ fr...
3789,3789,[Congressional Bills 115th Congress]\n[From th...,115,H R 3212,https://www.congress.gov/bill/115th-congress/h...,[ congressional bills 115th congress ] \n [ fr...
