### Exploratory Data Analysis 
This notebook contains code that explores the [Amazon Reviews Dataset](https://jmcauley.ucsd.edu/data/amazon_v2/index.html).

It also implements helper functions that load the training, validation, and test splits of the [Amazon Questions and Answers Dataset](https://github.com/amazonqa/amazonqa) in chunks, so the entire dataset never has to be loaded into memory at once.

In [5]:
# !pip install -U sentence-transformers
# !pip install transformers
# !pip install nltk

In [1]:
import pandas as pd 
import gzip
import json
import requests
from io import BytesIO, StringIO
import urllib.request
import numpy as np
import ssl
import os
# SECURITY: swaps the default HTTPS context factory for the unverified one, so
# HTTPS requests made through the default context skip certificate validation
# for the remainder of the kernel session.
# NOTE(review): presumably a workaround for a certificate error on one of the
# data hosts -- confirm it is still required before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [1]:
# _ = urllib.request.urlretrieve("https://amazon-qa.s3-us-west-2.amazonaws.com/train-qar.jsonl", "../Data/train-qar.jsonl")
# _ = urllib.request.urlretrieve("https://amazon-qa.s3-us-west-2.amazonaws.com/val-qar.jsonl", "../Data/var-qar.jsonl")

In [2]:
def get_data(url,N,downloaded):
  '''
  Load the first N records of a (gzipped) JSON-lines file into a DataFrame.

  Args:
    url: URL of the data file. Its last path component names the local copy
      stored under ../Data/; a trailing '.gz' is dropped for the decompressed
      file that is actually parsed.
    N: maximum number of rows to return.
    downloaded: True if the decompressed file already exists under ../Data/;
      False to download (and decompress) it first.

  Returns:
    A DataFrame with at most N rows (fewer when the file is shorter). The file
    is parsed in chunks of 1000 lines and reading stops as soon as N rows have
    been collected, so the whole file is never loaded into memory.
  '''

  im_path = "../Data/" + url.split('/')[-1]
  # Strip only a trailing '.gz'; a '.gz' elsewhere in the name is left alone.
  final_path = im_path[:-len('.gz')] if im_path.endswith('.gz') else im_path
  if not downloaded:
    os.makedirs(os.path.dirname(im_path), exist_ok=True)
    _ = urllib.request.urlretrieve(url, im_path)
    # Only decompress when the download really is a .gz archive; otherwise the
    # source and destination would be the same file.
    if final_path != im_path:
      with gzip.open(im_path, 'rb') as infile:
        with open(final_path, 'wb') as outfile:
          for line in infile:
            outfile.write(line)

  collected = []
  remaining = N
  reader = pd.read_json(final_path,chunksize = 1000,lines= True)
  try:
    for chunk in reader:
      # Take only as many rows as are still missing so the result never
      # exceeds N rows.
      piece = chunk.head(remaining)
      collected.append(piece)
      remaining -= len(piece)
      if remaining <= 0:
        break
  finally:
    # Stopping early leaves the underlying file handle open unless closed here.
    reader.close()

  if not collected:
    return pd.DataFrame()
  # Concatenate once instead of growing a DataFrame inside the loop.
  return pd.concat(collected)

In [12]:
# First 500000 rows of the Toys and Games metadata; the file is already on disk.
desc = get_data(
    'https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_Toys_and_Games.json.gz',
    500000,
    True,
)

In [13]:
def _drop_tag_fragments(item):
    """Join `item` into one string, split it on '<', and keep only the fragments without '>'."""
    # NOTE(review): a fragment such as 'b>text' is discarded whole, so text that
    # follows a tag (up to the next '<') is removed along with the tag itself --
    # confirm this is the intended clean-up.
    fragments = ''.join(item).split('<')
    return ' '.join(piece for piece in fragments if '>' not in piece)

desc['description'] = desc['description'].apply(_drop_tag_fragments)

In [14]:
# Keep only the text fields plus the product id, one row per distinct combination.
desc_columns = ['title','asin', 'description']
df_desc = desc.loc[:, desc_columns].drop_duplicates()

In [15]:
# Distinct product ids (a plain list, despite the df_ prefix), in order of first appearance.
unique_asins = desc['asin'].unique()
df_ids = unique_asins.tolist()

In [4]:
def parse_jsonl_all(path,savepath,chunksize=10000):
  '''
  Flatten a JSON-lines file into a CSV without holding it all in memory.

  Every line is parsed with json.loads and flattened with pd.json_normalize.
  The resulting one-record frames are buffered and appended to `savepath`
  whenever more than `chunksize` of them have accumulated; whatever is left
  in the buffer at the end of the file is written as a final batch.

  Args:
    path: path to the input jsonl file
    savepath: path of the csv file to write. If the file already exists the
      rows are appended to it without a header, so running the function twice
      with the same savepath duplicates the data.
    chunksize: number of records to buffer between writes

  Returns:
    None. Progress and the number of parsed records are printed.

  NOTE(review): batches after the first are appended with header=False, so a
  batch whose flattened columns differ from those of the first batch will not
  line up with the header row.
  '''

  def _flush(frames):
    # Append one buffered batch; the header is written only when the file is created.
    batch = pd.concat(frames)
    if os.path.exists(savepath):
      batch.to_csv(savepath,header=False,index=False,mode='a')
    else:
      batch.to_csv(savepath,header=True,index=False)

  N = chunksize
  L = 0          # records parsed successfully
  skipped = 0    # non-blank lines that could not be parsed/flattened
  dfs = []
  with open(path) as f:
    for i, line in enumerate(f):
      print('Parsing:', i, '            ',end='\r')
      if not line.strip():
        continue
      # Only parsing is best-effort; write errors below must not be swallowed.
      try:
        record = pd.json_normalize(json.loads(line))
      except Exception:
        skipped += 1
        continue
      dfs.append(record)
      L += 1
      if len(dfs) > N:
        _flush(dfs)
        dfs = []
  # Write the remainder; without this the last partial batch would be lost.
  if dfs:
    _flush(dfs)
  if skipped:
    print('Skipped', skipped, 'unparseable lines in:', path)
  print('Done with',L, 'records from:', path)
  return
# Convert each split of the QA dataset from JSON-lines to CSV, in the same order as before.
for jsonl_path, csv_path in [
    ('../Data/train-qar.jsonl', '../Data/train_processed.csv'),
    ('../Data/var-qar.jsonl', '../Data/var_processed.csv'),
    ('../Data/test-qar_all.jsonl', '../Data/test_processed.csv'),
]:
    parse_jsonl_all(jsonl_path, csv_path)
# from google.colab import files
# files.download('all_processed.csv') 

Done with 738776 records from: ../Data/train-qar.jsonl
Done with 92183 records from: ../Data/var-qar.jsonl
Done with 92726 records from: ../Data/test-qar_all.jsonl
