In [1]:
import spacy
import pandas as pd
import multiprocessing
import numpy as np


In [2]:
# Load the raw Hansard export. Only the 'subjectOfBusinessId' and 'content'
# columns are used in this cell.
df_hansard = pd.read_csv('data/hansard_all.csv')

# Group the rows by the item of business they belong to.
df_group = df_hansard.groupby('subjectOfBusinessId')

q_a = []
for _, labels in df_group.groups.items():
    # Don't bother with groups holding an odd number of rows: they cannot be
    # split cleanly into (question, answer) pairs.
    if len(labels) % 2 != 0:
        continue

    # Create conversation pairs: rows at even positions are treated as the
    # question and the row that follows each one as its answer.
    # `.groups` maps each key to index *labels*, so select with .loc; .iloc
    # only gives the same rows while the frame keeps its default 0..n-1 index.
    t = df_hansard.loc[labels, 'content'].values
    q_a.extend(zip(t[::2], t[1::2]))

# Keep only the first 100 pairs.
q_a = q_a[:100]

print('number of q & a', len(q_a))

# Passing `columns` up front also yields a well-formed (empty) frame when no
# pair was found, instead of failing on the column assignment.
df_q_a = pd.DataFrame(q_a, columns=['Q', 'A'])
# index=False: otherwise the row index is written as an extra unnamed column
# that comes back as 'Unnamed: 0' on the next read_csv.
df_q_a.to_csv('data/q_a_all.csv', index=False)
df_q_a.tail()

number of q & a 100


Unnamed: 0,Q,A
95,"Mr. Speaker, the people of Moncton are shocked...","Mr. Speaker, first of all I congratulate the m..."
96,"Mr. Speaker, later today we will be voting on ...","Mr. Speaker, I thank the member for the questi..."
97,"Mr. Speaker, yesterday's budget shows no leade...","Mr. Speaker, in our effort to support the prov..."
98,The member for Papineau.,"Mr. Speaker, the agreement with the provinces ..."
99,"Mr. Speaker, as I mentioned earlier, the healt...",The hon. member for York West.


In [None]:
# Load English tokenizer, tagger, parser, NER and word vectors.
# The 'en' shortcut link is tried first so existing environments keep loading
# exactly the model they loaded before. spaCy 3 removed shortcut links and
# raises OSError for them, so fall back to the full package name there.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

def sentence_tokenizer(raw_text):
    """Wrap every sentence of ``raw_text`` in ``BOS`` / ``EOS`` markers.

    The text is parsed with the module-level ``nlp`` pipeline and split into
    sentences using the pipeline's sentence boundaries.

    Args:
        raw_text: The text to split into sentences.

    Returns:
        A single string of the form ``"BOS <sentence> EOS BOS <sentence> EOS"``
        with the marked sentences joined by one space. Returns an empty string
        when the parsed document yields no sentences.
    """
    # Span.text rather than Span.string: `.string` was removed in spaCy 3,
    # and after .strip() both give the same sentence text.
    return u' '.join(
        'BOS ' + sent.text.strip() + ' EOS' for sent in nlp(raw_text).sents
    )

In [None]:
%%time

def _apply_df(args):
    df, func, kwargs = args
    df['Q'] = df['Q'].apply(func, **kwargs)
    df['A'] = df['A'].apply(func, **kwargs)
    return df#df.apply(func, **kwargs)

def apply_by_multiprocessing(df, func, **kwargs):
    """Apply ``func`` to the 'Q' and 'A' columns of ``df`` in parallel.

    The frame is split row-wise into ``workers`` chunks, each chunk is handed
    to ``_apply_df`` in a separate worker process, and the processed chunks
    are concatenated back together in their original order.

    Args:
        df: DataFrame with 'Q' and 'A' columns.
        func: Callable applied to every value of the two columns. It is sent
            to the worker processes, so it must be picklable.
        **kwargs: ``workers`` (int, optional) sets the number of processes and
            chunks; it defaults to ``multiprocessing.cpu_count()``. All other
            keyword arguments are forwarded to ``func``.

    Returns:
        A DataFrame with the rows of ``df`` and transformed 'Q' / 'A' columns.
    """
    workers = kwargs.pop('workers', None)
    if workers is None:
        workers = multiprocessing.cpu_count()

    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame is deprecated in recent pandas versions.
    row_chunks = np.array_split(np.arange(len(df)), workers)
    tasks = [(df.iloc[rows], func, kwargs) for rows in row_chunks]

    # The context manager releases the worker processes even if map() raises.
    with multiprocessing.Pool(processes=workers) as pool:
        result = pool.map(_apply_df, tasks)
    return pd.concat(result)
    
# Use one worker process per available CPU core.
num_cores = multiprocessing.cpu_count()
print(num_cores)

df_q_a = pd.read_csv('data/q_a_all.csv')
# Drop index columns left behind by earlier to_csv calls ('Unnamed: 0',
# 'Unnamed: 0.1', ...) so they do not pile up on every round trip.
df_q_a = df_q_a.loc[:, ~df_q_a.columns.str.startswith('Unnamed:')]

df_q_a = apply_by_multiprocessing(df_q_a, sentence_tokenizer, workers=num_cores)

# NOTE(review): this overwrites the file the cell has just read, so running
# the cell a second time tokenizes text that already carries BOS/EOS markers.
# index=False keeps the row index out of the CSV.
df_q_a.to_csv('data/q_a_all.csv', index=False)
df_q_a.tail()


In [None]:

# Reload the question/answer pairs from disk for a quick visual check.
df_q_a = pd.read_csv('data/q_a_all.csv')

# Spot-check a single question (row label 10).
print(df_q_a.loc[10, 'Q'])

df_q_a.tail()



In [None]:
# Join each question with its answer into one space-separated string.
df_q_a['Q_A'] = df_q_a['Q'].str.cat(df_q_a['A'], sep=u' ')

print(len(df_q_a['Q_A']))
# Write one pair per line. The encoding is explicit because the text contains
# non-ASCII characters (e.g. curly quotes) and the platform default encoding
# is not guaranteed to be able to represent them.
with open('data/Q_A_pairs.txt', 'w', encoding='utf-8') as out_file:
    for row in df_q_a['Q_A']:
        out_file.write(row.strip() + '\n')
