In [1]:
import pandas as pd

In [2]:
# Load the blog rows and attach two simple length statistics per row.
blogs = pd.read_csv('blogs.csv').assign(
    # whitespace-delimited token count
    num_words=lambda frame: frame['text'].str.split().str.len(),
    # number of '.'-delimited pieces; a text ending in '.' therefore
    # counts one more piece than it has sentences
    num_sentences=lambda frame: frame['text'].str.split('.').str.len(),
)

# Keep only rows tagged 'p' whose text has more than 7 words.
is_paragraph = blogs['tag'] == 'p'
has_enough_words = blogs['num_words'] > 7
filtered_blogs = blogs[has_enough_words & is_paragraph]
filtered_blogs.head()

Unnamed: 0.1,Unnamed: 0,text,tag,paragraph,article,num_words,num_sentences
1,1,One of the most helpful things in my own recov...,p,,/blog/nocd-support-groups-finding-help-and-hop...,56,5
2,2,There is something powerful about knowing that...,p,,/blog/nocd-support-groups-finding-help-and-hop...,60,4
3,3,Support is a key piece of your recovery journe...,p,,/blog/nocd-support-groups-finding-help-and-hop...,78,4
5,5,Support groups may help you realize that you a...,p,You are not alone,/blog/nocd-support-groups-finding-help-and-hop...,85,5
6,6,This is one of the main reasons I continue to ...,p,You are not alone,/blog/nocd-support-groups-finding-help-and-hop...,77,4


In [3]:
def get_text(df: pd.DataFrame) -> list:
    """Return the values of the ``text`` column of ``df`` as a list.

    Args:
        df: DataFrame that contains a ``text`` column.

    Returns:
        A new list holding one entry per row, in row order. An empty
        frame yields an empty list.

    Raises:
        KeyError: if ``df`` has no ``text`` column.
    """
    # Pull the column out in one vectorised call rather than building a
    # Series object for every row with iterrows().
    return df['text'].tolist()

In [4]:
passages = get_text(filtered_blogs)

# Preview at most the first three passages, each followed by a blank line.
for passage in passages[:3]:
    print(passage + '\n')

One of the most helpful things in my own recovery journey has been hearing about other people’s experiences with OCD. This might be because we relish in stories of triumph and are drawn toward people with shared experiences. I think this is true for many things in life. Personally, I learn and grow from these stories.

There is something powerful about knowing that someone else has walked the same path as you and that they have not only survived it, but possibly even thrived. I love hearing about the determination and grit of others who have faced similar obstacles. These stories of hope often inspired me to keep going, even when I didn’t think I could. 

Support is a key piece of your recovery journey. That’s why at NOCD, we provide a safe space for people in the OCD community and their families to share their experiences in our virtual support groups, which are available to members doing exposure and response prevention (ERP) therapy with NOCD Therapy. There are over 25 different gro

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint identifier passed to both from_pretrained() calls below.
model_name = 'doc2query/msmarco-t5-base-v1'

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq weights, then move them to the CUDA device
# (this requires a GPU to be available).
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.cuda()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Use the first passage as a single worked example. Indexing fails loudly
# (IndexError) on an empty list instead of silently reusing a stale value.
passage = passages[0]

# tokenize the passage, applying the same truncation limit as the batched
# generation cell so the demo reflects what the full run feeds the model
inputs = tokenizer(
    passage,
    truncation=True,
    max_length=256,
    return_tensors='pt'
)
# generate three sampled queries (nucleus sampling, so results vary per run)
outputs = model.generate(
    input_ids=inputs['input_ids'].cuda(),
    attention_mask=inputs['attention_mask'].cuda(),
    max_length=64,
    do_sample=True,
    top_p=0.95,
    num_return_sequences=3
)

In [7]:
# Decode every generated sequence up front, dropping special tokens.
decoded_queries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print("Paragraph:")
print(passage)

print("\nGenerated Queries:")
# Number the queries starting from 1 for display.
for rank, query in enumerate(decoded_queries, start=1):
    print(f'{rank}: {query}')

Paragraph:
One of the most helpful things in my own recovery journey has been hearing about other people’s experiences with OCD. This might be because we relish in stories of triumph and are drawn toward people with shared experiences. I think this is true for many things in life. Personally, I learn and grow from these stories.

Generated Queries:
1: how do i recover from ocd
2: does ocd affect your health
3: can someone have ocd


In [8]:
from tqdm.auto import tqdm  # this is our progress bar

batch_size = 128
num_queries = 3  # number of queries to generate for each passage
count = 0  # running total of (query, passage) pairs produced
lines = []  # accumulated (query, passage) tuples
passage_batch = []  # cleaned passages waiting to be encoded together


def _generate_pairs(batch):
    """Generate ``num_queries`` sampled queries for each passage in ``batch``.

    Args:
        batch: non-empty list of passage strings, already stripped of
            tab and newline characters.

    Returns:
        A list of ``(query, passage)`` tuples with ``num_queries``
        consecutive entries per passage, in the order of ``batch``.
    """
    # tokenize the passages, padding to the longest and truncating at 256 tokens
    inputs = tokenizer(
        batch,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors='pt'
    )

    # generate several queries per doc/passage
    outputs = model.generate(
        input_ids=inputs['input_ids'].cuda(),
        attention_mask=inputs['attention_mask'].cuda(),
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=num_queries
    )

    # decode queries to human readable text
    decoded_output = tokenizer.batch_decode(
        outputs,
        skip_special_tokens=True
    )

    pairs = []
    for i, query in enumerate(decoded_output):
        query = query.replace('\t', ' ').replace('\n', ' ')  # remove newline + tabs
        # outputs are grouped per input: entries i*num_queries .. (i+1)*num_queries-1
        # belong to passage i, so integer division recovers the passage index
        pairs.append((query, batch[i // num_queries]))
    return pairs


# rebuild the passage list from the filtered frame
passages = get_text(filtered_blogs)

for passage in tqdm(passages):
    # remove tab + newline characters if present
    passage_batch.append(passage.replace('\t', ' ').replace('\n', ' '))

    # we encode in batches
    if len(passage_batch) == batch_size:
        new_pairs = _generate_pairs(passage_batch)
        lines.extend(new_pairs)
        count += len(new_pairs)
        passage_batch = []

# Flush the final partial batch: when len(passages) is not a multiple of
# batch_size, the leftover passages would otherwise never be processed.
if passage_batch:
    new_pairs = _generate_pairs(passage_batch)
    lines.extend(new_pairs)
    count += len(new_pairs)
    passage_batch = []

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3662/3662 [07:29<00:00,  8.15it/s]


In [10]:
# Turn the accumulated (query, passage) tuples into a two-column table.
pair_columns = ["query", "passage"]
query_passage_df = pd.DataFrame(data=lines, columns=pair_columns)

In [11]:
# Show the first five generated (query, passage) rows.
query_passage_df.head(n=5)

Unnamed: 0,query,passage
0,what does ocd mean,One of the most helpful things in my own recov...
1,is it good to talk about an ocd,One of the most helpful things in my own recov...
2,does ocd really help recovery,One of the most helpful things in my own recov...
3,what is being a positive person,There is something powerful about knowing that...
4,who is an inspirational person,There is something powerful about knowing that...


In [12]:
# Persist the pairs without the positional index: writing the index adds an
# unnamed column that resurfaces as "Unnamed: 0" each time the CSV is reloaded.
query_passage_df.to_csv('query_passage.csv', index=False)