In [None]:
!pip install -q transformers tokenizers sentencepiece
!pip install -q git+https://github.com/PrithivirajDamodaran/Parrot_Paraphraser.git

In [None]:
import pandas as pd 
import gzip
import json
import requests
from io import BytesIO, StringIO
import urllib.request
import numpy as np
import torch
from parrot import Parrot
import warnings
# Silences every Python warning for the rest of the session (including library
# deprecation warnings), so problems surfaced only as warnings will not be shown.
warnings.filterwarnings("ignore")
import ssl
import os
from random import shuffle
# NOTE(review): this swaps the default HTTPS context factory for the *unverified*
# one, so HTTPS requests made through urllib after this line skip certificate
# verification for the whole process. Convenient for the dataset download below,
# but it removes protection against tampered downloads.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
def get_data(url,N,downloaded):
  '''
  Load the first N records of a gzipped JSON-lines file into a dataframe.

  Args:
    url: url of the data. Its last path segment names the local archive, and
      the same name without '.gz' names the extracted file (both relative to
      the current working directory).
    N: maximum number of rows to be returned
    downloaded: True if the extracted file already exists locally; when False
      the archive is downloaded and decompressed first.

  Returns a dataframe with at most N rows (fewer only when the file itself is
  shorter). The file is read in chunks of 1000 lines so that not all the data
  is loaded into memory.
  '''

  im_path = url.split('/')[-1]
  final_path = im_path.replace('.gz','')
  if not downloaded:
    urllib.request.urlretrieve(url, im_path)
    with gzip.open(im_path, 'rb') as infile:
      with open(final_path, 'wb') as outfile:
          for line in infile:
              outfile.write(line)

  pieces = []
  remaining = N
  # The reader is used as a context manager so the file handle is closed even
  # when we stop before the end of the file.
  with pd.read_json(final_path, chunksize=1000, lines=True) as reader:
    for chunk in reader:
      # Take only what is still missing; taking head(N) of every chunk would
      # overshoot whenever N is not a multiple of the chunk size.
      piece = chunk.head(remaining)
      pieces.append(piece)
      remaining -= len(piece)
      if remaining <= 0:
        break

  if not pieces:
    return pd.DataFrame()
  # Single concat at the end instead of growing a dataframe inside the loop.
  return pd.concat(pieces)

In [None]:
# Download and extract the Electronics metadata, asking for its first 500000 rows,
# then shuffle the row order (sample with frac=1 keeps every row).
desc = get_data(
    'https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_Electronics.json.gz',
    500000,
    False,
)
desc = desc.sample(frac=1)

In [None]:
def _strip_html_tags(item):
  """Flatten one raw description into plain text with markup tags removed.

  `item` is expected to be a list of text fragments (a plain string is also
  accepted). Anything else, e.g. a missing value, yields an empty string
  instead of raising.
  """
  if isinstance(item, str):
    text = item
  elif isinstance(item, (list, tuple)):
    text = ''.join(item)
  else:
    return ''

  head, *after_lt = text.split('<')
  # Text before the first '<' is never inside a tag, so it is kept whole.
  kept = [head]
  for piece in after_lt:
    # Each piece starts right after a '<'. If it holds a '>', everything up to
    # that '>' is the tag and only the text after it is content. Dropping the
    # whole piece (as before) also threw away the text following the tag.
    kept.append(piece.split('>', 1)[1] if '>' in piece else piece)
  return ' '.join(kept)

desc['description'] = desc['description'].apply(_strip_html_tags)
df_desc = desc[['title','asin', 'description']].drop_duplicates()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Question-generation model: a T5 checkpoint, i.e. an encoder-decoder, so it is
# loaded through AutoModelForSeq2SeqLM (AutoModelWithLMHead is deprecated).
tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
model = AutoModelForSeq2SeqLM.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
# Paraphrasing model used to reword the answer sentences; kept on CPU.
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5", use_gpu=False)

def get_question(answer, context, max_length=64):
  """Generate a question whose answer is `answer`, given the text `context`.

  Uses the module-level `tokenizer` and `model`. The model and the encoded
  prompt are placed on CUDA when it is available, otherwise on the CPU.

  Args:
    answer: the text the generated question should be answered by.
    context: the passage the answer was taken from.
    max_length: value forwarded to `generate` as `max_length`.

  Returns the first generated sequence decoded to a string, with special tokens
  skipped and tokenization spaces cleaned up.
  """
  prompt = "answer: %s  context: %s </s>" % (answer, context)

  if torch.cuda.is_available():
    target = torch.device("cuda")
  else:
    target = torch.device("cpu")

  encoded = tokenizer([prompt], return_tensors='pt').to(target)
  generated = model.to(target).generate(
      input_ids=encoded['input_ids'],
      attention_mask=encoded['attention_mask'],
      max_length=max_length,
  )

  first_sequence = generated[0]
  return tokenizer.decode(
      first_sequence,
      skip_special_tokens=True,
      clean_up_tokenization_spaces=True,
  )

def paraphrase(input):
  """Return a paraphrase of `input` produced by the module-level `parrot`.

  The text of the first candidate returned by `parrot.augment` is used. When
  `augment` yields nothing (None or an empty list, e.g. because no candidate
  was produced for this sentence), the original sentence is returned unchanged
  instead of raising, so one sentence cannot abort a long generation run.
  """
  para_phrases = parrot.augment(input_phrase=input)
  if not para_phrases:
    return input
  # Candidates are (text, score) pairs; keep only the text of the first one.
  return para_phrases[0][0]



IndentationError: ignored

In [None]:
# Keep only the products whose description has more than 70 whitespace-separated words.
has_long_description = df_desc['description'].apply(lambda text: len(text.split()) > 70)
df_filtered = df_desc[has_long_description]

NameError: ignored

In [None]:
# Distinct description texts, in order of first appearance, as a plain Python list.
descriptions = df_filtered['description'].drop_duplicates().tolist()

In [None]:
# How many distinct descriptions will be fed to the question/answer generation loop.
len(descriptions)

In [None]:
# nltk is used below for sentence splitting (nltk.tokenize.sent_tokenize).
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource instead
# of 'punkt' -- confirm against the installed NLTK version.
import nltk
nltk.download('punkt')

In [None]:
# Release the large intermediate dataframes; the cells visible below only use
# the `descriptions` list.
del desc
del df_filtered

In [None]:
# Release the deduplicated title/asin/description frame as well.
del df_desc

In [None]:
# Build the dataset: for every description, pick up to seven of its sentences in
# random order, generate a question for each one and store it together with a
# paraphrased version of the sentence as the answer.
master = []
counter = 0
for counter, context in enumerate(descriptions, start=1):
  print("Parsing:", counter, '       ', end = '\r')
  sentences = nltk.tokenize.sent_tokenize(context)
  shuffle(sentences)
  # The slice keeps the first seven shuffled sentences.
  indiv = [
      {
          'question': get_question(answer, context),
          'answer': paraphrase(answer),
      }
      for answer in sentences[:7]
  ]
  master.append({
      'context': context,
      'result': indiv
  })



Parsing: 1        

In [None]:
import json

# Persist the generated question/answer records as one JSON document.
with open('q_and_a.json', 'w') as out_file:
    out_file.write(json.dumps(master))

In [None]:
# Example sentence for the Parrot demo cells below. It is assigned here because no
# other cell defines `phrase`, so a fresh kernel would otherwise raise NameError;
# the value is the one shown in this cell's recorded output.
phrase = 'The following camera brands and models have been tested for compatibility with GV-Software'
phrase

'The following camera brands and models have been tested for compatibility with GV-Software'

In [None]:
# Ask Parrot for a rephrasing of the example sentence. The recorded output is a
# (text, number) pair -- presumably the number is a ranking score; confirm in the
# Parrot documentation.
parrot.rephrase(phrase)

('the following camera brands and models have been tested for compatibility with the gv-software and software',
 29)

In [None]:
# Same sentence through augment with do_diverse=True. The recorded output is a
# list of (text, number) pairs, listed from the highest number to the lowest.
parrot.augment(phrase, do_diverse = True)

[('gv-software has been tested on the following camera brands and models for compatibility with',
  54),
 ('the following camera brands and models have been tested for compatibility with the gv software',
  17),
 ('the following camera brands and models have been tested for compatibility with the gv-software',
  16),
 ('the following camera brands and models have been tested for compatibility with gv software',
  13),
 ('the following camera brands and models have been tested for compatibility with gv-software',
  12)]