## Preprocess review & summary texts in Amazon dataset

In [1]:
import pandas as pd
import gzip
import glob
import spacy
from pathlib import Path
import concurrent.futures

# Module-level spaCy pipeline; process_data_with_spacy_df reads this global.
nlp = spacy.load('en_core_web_sm')

In [2]:
def read_2_dataframe(path):
    """Load a gzip-compressed JSON-lines file into a DataFrame.

    Args:
        path: Location of the ``.gz`` file; each line holds one JSON record.

    Returns:
        A DataFrame with one row per JSON line.
    """
    read_options = {'lines': True, 'compression': 'gzip'}
    return pd.read_json(path, **read_options)

In [3]:
def custom_tokenizer(nlp):
    """Build a tokenizer callable that uses custom infix rules.

    The vocabulary, tokenizer exceptions and prefix/suffix patterns are taken
    from ``nlp``; only the infix patterns are replaced by the list below.

    Args:
        nlp: Loaded spaCy pipeline providing ``vocab`` and ``Defaults``.

    Returns:
        A one-argument callable that runs the custom tokenizer on a text.
    """
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    # Raw strings: '\.' is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning; the runtime pattern text is unchanged.
    custom_infixes = [
        r'\.\.\.+',               # three or more consecutive dots
        r'(?<=[0-9])-(?=[0-9])',  # hyphen directly between two digits
        r'[!&:,()]',              # any single one of these characters
    ]
    infix_re = spacy.util.compile_infix_regex(custom_infixes)

    tokenizer = spacy.tokenizer.Tokenizer(nlp.vocab,
                                          nlp.Defaults.tokenizer_exceptions,
                                          prefix_re.search,
                                          suffix_re.search,
                                          infix_re.finditer,
                                          token_match=None)
    return lambda text: tokenizer(text)

def process_data_with_spacy(review_data):
    """Convert every text in ``review_data`` into its filtered token list.

    A fresh 'en_core_web_sm' pipeline is loaded on each call and handed to
    ``text_to_seq`` for every item.

    Args:
        review_data: Iterable of texts.

    Returns:
        A list with one token list per input text, in input order.
    """
    pipeline = spacy.load('en_core_web_sm')
    sequences = []
    for text in review_data:
        sequences.append(text_to_seq(text, pipeline))
    return sequences

def text_to_seq(s, nlp):
    """Turn a text into a list of normalized, filtered tokens.

    Stop words, punctuation, URL-like and e-mail-like tokens are skipped.
    Each remaining token contributes its lower-cased, stripped lemma, except
    when the lemma is the '-PRON-' placeholder, where the lower-cased token
    text is used instead. Tokens that end up empty (e.g. whitespace-only
    tokens) are dropped so joined output has no doubled spaces.

    Args:
        s: Text to process.
        nlp: Callable returning an iterable of token objects exposing
            ``is_stop``, ``is_punct``, ``like_url``, ``like_email``,
            ``lemma_`` and ``lower_``.

    Returns:
        List of non-empty token strings in document order.
    """
    tokens = []
    for tok in nlp(s):
        if tok.is_stop or tok.is_punct or tok.like_url or tok.like_email:
            continue
        if tok.lemma_ == '-PRON-':
            word = tok.lower_
        else:
            word = tok.lemma_.lower().strip()
        # Whitespace-only tokens normalize to '' and carry no content.
        if word:
            tokens.append(word)
    return tokens

def text_to_text(s, nlp):
    """Return the tokens kept by ``text_to_seq`` joined with single spaces.

    Args:
        s: Text to process.
        nlp: Pipeline object forwarded unchanged to ``text_to_seq``.

    Returns:
        One string containing the kept tokens separated by a space.
    """
    kept_tokens = text_to_seq(s, nlp)
    return ' '.join(kept_tokens)

def process_data_with_spacy_df(df):
    """Add cleaned-text columns for the review body and the summary.

    Writes ``reviewTextProc`` and ``summaryProc`` into ``df`` (the frame is
    modified in place) using the module-level ``nlp`` pipeline.

    Args:
        df: DataFrame with ``reviewText`` and ``summary`` columns.

    Returns:
        The same DataFrame object with the two extra columns.
    """
    def clean(text):
        return text_to_text(text, nlp)

    # Column-wise Series.apply always returns a Series, including for a
    # zero-row partition where row-wise DataFrame.apply(axis=1) does not give a
    # per-row result that can be assigned to one column. It also avoids
    # building a Series object for every row.
    df['reviewTextProc'] = df['reviewText'].apply(clean)
    df['summaryProc'] = df['summary'].apply(clean)

    return df


In [4]:
# NOTE(review): machine-specific absolute paths; adjust before running elsewhere.
ds_gzip_path = r'D:\Datasets\amazon_reviews\gzips'
ds_proc_path = r'D:\Datasets\amazon_reviews\processed'

# glob makes no ordering guarantee, and a later cell picks an entry by position,
# so sort explicitly (descending, matching the previous reversed listing).
# Joining through Path keeps the pattern valid regardless of path separator.
files = sorted(
    (Path(match) for match in glob.glob(str(Path(ds_gzip_path) / '*.gz'))),
    reverse=True,
)

In [6]:
import numpy as np
from multiprocessing import cpu_count, Pool
 
# Leave four cores free for the rest of the system, but never go below one
# worker: on machines with four or fewer cores `cpu_count() - 4` would be <= 0,
# which is not a valid partition count or thread-pool size.
cores = max(1, cpu_count() - 4)
partitions = cores  # Number of DataFrame partitions; one per worker by default
 
def parallelize(data, func, n_partitions=None, max_workers=None):
    """Apply ``func`` to row-wise partitions of ``data`` and concatenate the results.

    Args:
        data: DataFrame to split (objects without ``iloc`` are split with
            ``np.array_split``).
        func: Callable taking one partition and returning a pandas object.
        n_partitions: Number of partitions; defaults to the module-level
            ``partitions``.
        max_workers: Thread-pool size; defaults to the module-level ``cores``.

    Returns:
        ``pd.concat`` of the per-partition results, in partition order.

    Raises:
        ValueError: If the partition count is smaller than one.
    """
    n_parts = partitions if n_partitions is None else n_partitions
    n_workers = cores if max_workers is None else max_workers
    if n_parts < 1:
        raise ValueError('number of partitions must be at least 1')

    if hasattr(data, 'iloc'):
        # Positional slicing with the same partition sizes np.array_split
        # produces (the first `len % n` parts get one extra row); this avoids
        # the deprecated DataFrame.swapaxes path np.array_split takes for frames.
        base, extra = divmod(len(data), n_parts)
        data_split, start = [], 0
        for part_idx in range(n_parts):
            stop = start + base + (1 if part_idx < extra else 0)
            data_split.append(data.iloc[start:stop])
            start = stop
    else:
        data_split = np.array_split(data, n_parts)
    print('DF is splitted to {} partitions'.format(n_parts))

    # NOTE(review): threads share the GIL, so a CPU-bound `func` may see little
    # speed-up from this pool - confirm that a thread pool is intended.
    with concurrent.futures.ThreadPoolExecutor(max_workers=n_workers) as executor:
        # executor.map yields results in submission order.
        return pd.concat(list(executor.map(func, data_split)))

In [None]:
# Restrict this run to the Books subset; this rebinds `files` from the glob listing.
files = [Path(ds_gzip_path + '\\reviews_Books_5.json.gz')]

for f in files:
    print("Start processing " + f.stem)

    # Path.stem strips only the trailing ".gz", so the CSV output keeps a ".json" name.
    out_path = ds_proc_path + "/" + f.stem
    chunk_reader = pd.read_json(str(f), lines=True, compression='gzip', chunksize=100000)
    for chunk_num, df_chunky in enumerate(chunk_reader):
        print('Chunk #%s' % chunk_num)

        print("Shape of DF: " + str(df_chunky.shape))
        # Serial alternative: df_proc = process_data_with_spacy_df(df_chunky)
        df_proc = parallelize(df_chunky, process_data_with_spacy_df)

        print("Shape of processed DF: " + str(df_proc.shape))
        # Explicit UTF-8 so non-ASCII review text does not depend on the locale
        # encoding; newline='' because pandas writes its own line terminators
        # to a text handle. The header is written only while the file is empty.
        with open(out_path, mode='a', encoding='utf-8', newline='') as out_file:
            df_proc.to_csv(out_file, header=out_file.tell() == 0, mode='a')

    print("Processing of " + f.stem + " is finished")

Start processing reviews_Books_5.json
Chunk #0
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #1
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #2
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #3
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #4
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #5
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #6
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #7
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #8
Shape of DF: (100000, 9)
DF is splitted to 8 partitions
Shape of processed DF: (100000, 11)
Chunk #9
Shape of DF: (100000, 9)
DF is splitted to 8

### Example

In [4]:
files

[WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Video_Games_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Toys_and_Games_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Sports_and_Outdoors_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Movies_and_TV_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Kindle_Store_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Home_and_Kitchen_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Health_and_Personal_Care_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Electronics_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Clothing_Shoes_and_Jewelry_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Cell_Phones_and_Accessories_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_CDs_and_Vinyl_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_

In [5]:
import gzip
import json

def parse(path):
    """Yield the raw lines of a gzip-compressed file as ``bytes``.

    Args:
        path: Location of the gzip file.

    Yields:
        Each line of the decompressed content, including its line ending.
    """
    # The context manager closes the file when the generator is exhausted or
    # closed, instead of leaving the handle open until garbage collection.
    with gzip.open(path, 'rb') as g:
        yield from g

def get_dataframe(path, limit=10000):
    """Load the first records of a gzip JSON-lines file into a DataFrame.

    Args:
        path: Location of the gzip file; each line is one UTF-8 JSON record.
        limit: Maximum number of records to read (default 10000, the cutoff
            previously hard-coded here). ``None`` reads the whole file.

    Returns:
        A DataFrame indexed by record position (0, 1, 2, ...).
    """
    records = {}
    lines = parse(path)
    try:
        for i, raw in enumerate(lines):
            if limit is not None and i >= limit:
                break
            records[i] = json.loads(raw.decode('utf8'))
    finally:
        # Stopping early leaves the generator suspended; close it explicitly so
        # it can run its cleanup now rather than at garbage collection.
        lines.close()
    return pd.DataFrame.from_dict(records, orient='index')

In [27]:
# NOTE(review): indexes into `files` by position, so this needs the full glob
# listing; the processing cell rebinds `files` to a single-element list, after
# which `files[5]` is out of range - confirm the intended execution order.
df = get_dataframe(str(files[5]))

In [28]:
df.head()

Unnamed: 0,overall,summary,reviewTime,helpful,reviewText,unixReviewTime,reviewerID,asin,reviewerName
0,5.0,Best Price,"10 19, 2013","[0, 0]",My daughter wanted this book and the price on ...,1382140800,APYOBQE6M18AA,615391206,Martin Schwartz
1,5.0,zoku,"06 18, 2014","[0, 0]",I bought this zoku quick pop for my daughterr ...,1403049600,A1JVQTAGHYOL7F,615391206,Michelle Dinh
2,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...","05 5, 2013","[26, 27]",There is no shortage of pop recipes available ...,1367712000,A3UPYGJKZ0XTU4,615391206,mirasreviews
3,5.0,Creative Combos,"08 4, 2011","[14, 18]",This book is a must have if you get a Zoku (wh...,1312416000,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover"""
4,4.0,A must own if you own the Zoku maker...,"06 7, 2014","[0, 0]",This cookbook is great. I have really enjoyed...,1402099200,AHAI85T5C2DH3,615391206,PugLover


In [29]:
# Build one string from the summary and the review text of a single row.
review_num = 5811
row = df.iloc[review_num]
s = ' '.join([row.summary, row.reviewText])
s

'Good - large - some foods &#34;stick&#34; inside diswasher This is a good set - as others have noted each utensil is large - but I like that - they are heavy duty and strudy -  however, I have noticed some foods tend to stick to the unesils, even when run thru a dishwasher cycle'

In [105]:
# NOTE(review): get_dataframe stops after 10000 records, so position 39190 is
# only valid if `df` was built some other way - confirm.
s = df.iloc[39190].summary +' '+ df.iloc[39190].reviewText

In [106]:
s

"I Love this Game! This game is one of the best games for Game Cube. You play as a kid named Billy who goes on an exciting adventure to save a happy world from evil crows who are led by The Dark Raven who is led by The KING CROW. And if that doesn't sound fun enough this game also comes with a really fun two player battle mode where you hatch cute animals from eggs! This fun game is for all ages!"

In [114]:
# Rebinds the global `nlp` to a freshly loaded 'en_core_web_sm' pipeline
# (the same model name that is loaded in the first cell).
nlp = spacy.load('en_core_web_sm')

In [118]:
# Reuse the shared helper instead of repeating its filtering loop inline, so
# the example always shows what the preprocessing functions actually produce.
tokens = text_to_seq(s, nlp)

In [119]:
' '.join(tokens)

'great i like thing it play color battery fast the sound not great quiet screen scratch easily'