## Preprocess review & summary texts in the Amazon reviews dataset

In [1]:
import pandas as pd
import gzip
import glob
import spacy
from pathlib import Path
import concurrent.futures

nlp = spacy.load('en_core_web_sm')

In [2]:
def read_2_dataframe(path):
    """Load a gzip-compressed JSON Lines file into a DataFrame.

    Args:
        path: Location of the gzipped file; each line holds one JSON record.

    Returns:
        pandas.DataFrame with one row per line of the file.
    """
    return pd.read_json(path, lines=True, compression='gzip')

In [3]:
def custom_tokenizer(nlp):
    """Build a tokenizer callable that uses a custom set of infix rules.

    Prefix rules, suffix rules and tokenizer exceptions are taken from
    ``nlp.Defaults``; only the infix patterns are replaced.

    Args:
        nlp: Loaded spaCy language pipeline supplying ``vocab`` and ``Defaults``.

    Returns:
        A one-argument callable that passes a text to the configured tokenizer
        and returns its result.
    """
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes)
    # Raw strings: '\.' inside a normal literal is an invalid escape sequence,
    # which newer Python versions warn about. The regex text itself is unchanged.
    custom_infixes = [
        r'\.\.\.+',              # three or more consecutive dots
        r'(?<=[0-9])-(?=[0-9])', # hyphen between two digits
        r'[!&:,()]',             # selected punctuation characters
    ]
    infix_re = spacy.util.compile_infix_regex(custom_infixes)

    tokenizer = spacy.tokenizer.Tokenizer(
        nlp.vocab,
        nlp.Defaults.tokenizer_exceptions,
        prefix_re.search,
        suffix_re.search,
        infix_re.finditer,
        token_match=None,
    )
    return lambda text: tokenizer(text)

def process_data_with_spacy(review_data):
    """Convert every text in ``review_data`` to a token list.

    Args:
        review_data: Iterable of raw text strings.

    Returns:
        List holding one ``text_to_seq`` result per input text, in input order.
    """
    # Local pipeline, loaded on every call; it shadows the module-level ``nlp``.
    nlp = spacy.load('en_core_web_sm')
    sequences = []
    for text in review_data:
        sequences.append(text_to_seq(text, nlp))
    return sequences

def text_to_seq(s, nlp):
    """Turn a raw text into a list of normalised tokens.

    Stop words, punctuation, URL-like and e-mail-like tokens are dropped.
    Every remaining token is replaced by its lowercased, stripped lemma;
    tokens whose lemma is the placeholder ``'-PRON-'`` keep their lowercased
    surface form instead. Tokens that normalise to an empty string (for
    example pure-whitespace tokens) are skipped so the result never contains
    ``''`` entries.

    Args:
        s: Text to process.
        nlp: Callable (spaCy pipeline) that maps the text to an iterable of tokens.

    Returns:
        List of non-empty normalised token strings, in document order.
    """
    tokens = []

    for tok in nlp(s):
        if tok.is_stop or tok.is_punct or tok.like_url or tok.like_email:
            continue
        # '-PRON-' carries no lexical information, so fall back to the surface form.
        normalized = tok.lower_ if tok.lemma_ == '-PRON-' else tok.lemma_.lower().strip()
        # Whitespace-only lemmas collapse to '' after strip(); do not emit them.
        if normalized:
            tokens.append(normalized)
    return tokens

def text_to_text(s, nlp):
    """Return the tokens of ``text_to_seq(s, nlp)`` joined by single spaces."""
    tokens = text_to_seq(s, nlp)
    return ' '.join(tokens)

def process_data_with_spacy_df(df):
    """Add normalised versions of the review and summary texts to ``df``.

    Writes two new columns, 'reviewTextProc' (from 'reviewText') and
    'summaryProc' (from 'summary'), each holding the ``text_to_text`` result
    computed with the module-level ``nlp`` pipeline.

    Args:
        df: DataFrame with 'reviewText' and 'summary' columns. It is modified
            in place.

    Returns:
        The same DataFrame object, now containing the two extra columns.
    """
    # Column-wise Series.apply instead of row-wise DataFrame.apply(axis=1):
    # it avoids materialising a Series per row and still yields an (empty)
    # Series for an empty partition, so the column assignment always works.
    for source_col, target_col in (('reviewText', 'reviewTextProc'),
                                   ('summary', 'summaryProc')):
        df[target_col] = df[source_col].apply(lambda text: text_to_text(text, nlp))

    return df


In [5]:
# Directory holding the gzipped review dumps, and directory for the processed output.
ds_gzip_path = r'D:\Datasets\amazon_reviews\gzips'
ds_proc_path = r'D:\Datasets\amazon_reviews\processed'

# Every *.gz archive directly inside the source directory, in reversed glob order.
gz_pattern = ds_gzip_path + r"\*.gz"
files = list(reversed([Path(match) for match in glob.glob(gz_pattern, recursive=False)]))

In [6]:
import numpy as np
from multiprocessing import cpu_count, Pool
 
# Worker count: all but two of the reported CPU cores, but never fewer than one.
# Without the clamp a machine with one or two cores yields cores <= 0, and both
# ThreadPoolExecutor(max_workers=...) and np.array_split(..., partitions) raise.
cores = max(1, cpu_count() - 2)
partitions = cores  # one data partition per worker; may be set to any positive integer
 
def parallelize(data, func):
    """Apply ``func`` to consecutive partitions of ``data`` on a thread pool.

    ``data`` is cut into ``partitions`` pieces along its first axis, ``func`` is
    run on each piece using up to ``cores`` worker threads, and the results are
    concatenated with ``pd.concat``.

    Args:
        data: pandas DataFrame/Series (anything exposing ``.iloc``) or another
            array-like accepted by ``np.array_split``.
        func: Callable taking one partition and returning a pandas object.

    Returns:
        The concatenation of the per-partition results, in partition order.
    """
    if hasattr(data, 'iloc'):
        # Split positional indices instead of the pandas object itself:
        # np.array_split on a DataFrame goes through DataFrame.swapaxes, which
        # pandas has deprecated. Selecting by position keeps the same rows per
        # partition.
        index_chunks = np.array_split(np.arange(len(data)), partitions)
        data_split = [data.iloc[chunk] for chunk in index_chunks]
    else:
        data_split = np.array_split(data, partitions)
    print('DF is splitted to {} partitions'.format(partitions))
    with concurrent.futures.ThreadPoolExecutor(max_workers=cores) as executor:
        # executor.map yields results in submission order, so row order is kept.
        return pd.concat(list(executor.map(func, data_split)))

In [11]:
# Restrict this run to two archives (rebinds `files`).
files = [
    Path(ds_gzip_path + '\\reviews_Kindle_Store_5.json.gz'),
    Path(ds_gzip_path + '\\reviews_Home_and_Kitchen_5.json.gz'),
]

for f in files:
    print("Start processing {}".format(f.stem))

    # Load the raw archive.
    df = read_2_dataframe(str(f))
    print("Shape of DF: {}".format(df.shape))

    # Add the processed text columns, one partition per worker.
    df_proc = parallelize(df, process_data_with_spacy_df)

    print("Shape of processed DF: {}".format(df_proc.shape))
    # The stem drops only the trailing ".gz", so the output file is named "<archive>.json".
    df_proc.to_json("{}/{}".format(ds_proc_path, f.stem))

    print("Processing of {} is finished".format(f.stem))

Start processing reviews_Kindle_Store_5.json
Shape of DF: (982619, 9)
DF is splitted to 10 partitions
Shape of processed DF: (982619, 11)
Processing of reviews_Kindle_Store_5.json is finished
Start processing reviews_Home_and_Kitchen_5.json
Shape of DF: (551682, 9)
DF is splitted to 10 partitions
Shape of processed DF: (551682, 11)
Processing of reviews_Home_and_Kitchen_5.json is finished


### Example

In [10]:
# Display the current contents of `files`.
files

[WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Video_Games_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Toys_and_Games_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Sports_and_Outdoors_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Movies_and_TV_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Kindle_Store_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Home_and_Kitchen_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Health_and_Personal_Care_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Electronics_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Clothing_Shoes_and_Jewelry_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_Cell_Phones_and_Accessories_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_CDs_and_Vinyl_5.json.gz'),
 WindowsPath('D:/Datasets/amazon_reviews/gzips/reviews_

In [11]:
# Load the first archive listed in `files` for the examples below (rebinds `df`).
df = read_2_dataframe(str(files[0]))

In [120]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,700099867,"[8, 12]",1,Installing the game was a struggle (because of...,"07 9, 2012",A2HD75EMZR8QLN,123,Pay to unlock content? I don't think so.,1341792000
1,700099867,"[0, 0]",4,If you like rally cars get this game you will ...,"06 30, 2013",A3UR8NLLY1ZHCX,"Alejandro Henao ""Electronic Junky""",Good rally game,1372550400
2,700099867,"[0, 0]",1,1st shipment received a book instead of the ga...,"06 28, 2014",A1INA0F5CWW3J4,"Amazon Shopper ""Mr.Repsol""",Wrong key,1403913600
3,700099867,"[7, 10]",3,"I got this version instead of the PS3 version,...","09 14, 2011",A1DLMTOTHQ4AST,ampgreen,"awesome game, if it did not crash frequently !!",1315958400
4,700099867,"[2, 2]",4,I had Dirt 2 on Xbox 360 and it was an okay ga...,"06 14, 2011",A361M14PU2GUEG,"Angry Ryan ""Ryan A. Forrest""",DIRT 3,1308009600


In [117]:
# Build one sample string from the summary and review text of the row at position 1679.
sample_row = df.iloc[1679]
s = sample_row.summary + ' ' + sample_row.reviewText
s

"GREAT!! I liked this thing. It played in color, but used up batteries fast. The sound isn't all that great (too quiet), and the screen scratches easily."

In [105]:
# Build one sample string from the summary and review text of the row at position 39190.
sample_row = df.iloc[39190]
s = sample_row.summary + ' ' + sample_row.reviewText

In [106]:
# Display the sample string.
s

"I Love this Game! This game is one of the best games for Game Cube. You play as a kid named Billy who goes on an exciting adventure to save a happy world from evil crows who are led by The Dark Raven who is led by The KING CROW. And if that doesn't sound fun enough this game also comes with a really fun two player battle mode where you hatch cute animals from eggs! This fun game is for all ages!"

In [114]:
# Load the 'en_core_web_sm' pipeline; this rebinds `nlp`, which the first cell
# already set to the same model.
nlp = spacy.load('en_core_web_sm')

In [118]:
# Tokenise the sample with the same helper the pipeline uses, instead of a
# hand-copied version of its filtering loop, so the example cannot drift from
# the real preprocessing.
tokens = text_to_seq(s, nlp)

In [119]:
# Show the processed sample as a single space-separated string.
' '.join(tokens)

'great i like thing it play color battery fast the sound not great quiet screen scratch easily'