# Yelp Review Wrangling

In [1]:
%matplotlib inline

import pandas as pd
import datetime
import json
from glob import glob
import math

# When True, the file-writing cells below still log each file they would
# produce but skip the actual `to_csv` call.
DRY_RUN = False

In [2]:
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Inject CSS that sets elements with class `container` to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
# Weekday names, Monday first.
day_labels = [
    'Monday',
    'Tuesday',
    'Wednesday',
    'Thursday',
    'Friday',
    'Saturday',
    'Sunday',
]

def time_marker(text=''):
    """Print a progress message prefixed with the current wall-clock time.

    The output has the form ``[HH:MM:SS.ffffff] message`` where the message
    is `text` converted to lowercase.
    """
    stamp = datetime.datetime.now().time()
    message = text.lower()
    print('[{}] {}'.format(stamp, message))
    
def unpack(df, column, fillna=None):
    """Expand a column of records into separate columns.

    Each value of ``df[column]`` is turned into one row of a new DataFrame,
    which is appended to the right of `df`; the original `column` is then
    removed from the result. `df` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to expand.
    column : str
        Name of the column to expand.
        NOTE(review): assumes every value is a dict (or another record type
        `pd.DataFrame` can expand) -- confirm against callers.
    fillna : scalar, optional
        When not None, missing values in the expanded columns (not in the
        rest of `df`) are replaced with this value.

    Returns
    -------
    pandas.DataFrame
        `df` without `column`, plus the expanded columns.
    """
    # Build the expanded frame on df's own index so the axis=1 concat lines the
    # rows up one-to-one even when df does not carry a default RangeIndex.
    # (Plain iteration replaces Series.iteritems(), which pandas 2.0 removed.)
    expanded = pd.DataFrame(list(df[column]), index=df.index)
    if fillna is not None:
        expanded = expanded.fillna(fillna)

    ret = pd.concat([df, expanded], axis=1)
    del ret[column]
    return ret

# Load Review Data

In [None]:
time_marker(text='Loading Reviews Data...')

# NOTE(review): `data` is not referenced by any visible cell; kept in case a
# later cell relies on it.
data = pd.DataFrame()
source_data_file = '../source_data/review.json'

# The source is parsed as JSON Lines: one JSON document per line.
# The context manager guarantees the file handle is closed, and the encoding is
# explicit (JSON text is UTF-8) so parsing does not depend on the platform
# default. Blank lines are skipped instead of crashing json.loads.
with open(source_data_file, 'r', encoding='utf-8') as source_file:
    reviews_list = [json.loads(line) for line in source_file if line.strip()]

time_marker(text='creating dataframe...')
reviews_df = pd.DataFrame(reviews_list)

time_marker(text='done')

# Append `business_id` prefix column

In [None]:
time_marker('appending bid_prefix column...')
# First character of each business_id, taken with the vectorized string
# accessor instead of a per-row Python lambda. The write cell below groups
# reviews by this value to split them across files.
reviews_df['bid_prefix'] = reviews_df['business_id'].str[:1]
reviews_df.head(8)

In [None]:
time_marker(text='Writing to files...')
file_count = len(reviews_df.bid_prefix.unique())

# One CSV per business_id prefix. groupby yields the groups in sorted key
# order, so the zero-padded file index follows the sorted prefixes, and the
# frame is partitioned once instead of being re-scanned for every prefix.
for i, (prefix, prefix_rows) in enumerate(reviews_df.groupby('bid_prefix', sort=True)):
    # Drop the helper column by name rather than by position, so this keeps
    # working if `bid_prefix` ever stops being the last column.
    df = prefix_rows.drop(columns='bid_prefix').reset_index(drop=True)
    file_name = '../clean_data/reviews/tmp/{}_{}_reviews_clean.csv'.format(str(i).zfill(2), prefix)
    time_marker(text='Writing {:d} records to file {}'.format(df.shape[0], file_name))
    # In a dry run only the log line above is produced.
    if not DRY_RUN:
        df.to_csv(file_name, encoding='utf-8')
time_marker(text='Done!')

# Reload data from CSVs for next steps

In [5]:
import string

# Translation table for str.translate that deletes every ASCII punctuation
# character (each punctuation code point maps to None).
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [12]:
semi_cleaned_files = '../clean_data/reviews/tmp/*.csv'
file_list = glob(semi_cleaned_files)

# Fail early with a clear message instead of letting pd.concat complain about
# an empty list after the loop.
if not file_list:
    raise FileNotFoundError('No CSV files matched {}'.format(semi_cleaned_files))

# Chunk Settings
chunks = list()
chunksize = 10000

for ii, file in enumerate(sorted(file_list)):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))

    # Import the file in chunks. (The previous per-file line count read every
    # file a second time through an unclosed handle, and its result was only
    # used by commented-out progress logging, so it has been removed.)
    for chunk in pd.read_csv(file, chunksize=chunksize, index_col=0, parse_dates=['date']):

        # drop reviews with missing review text
        chunk = chunk[chunk['text'].notnull()].copy()
        if chunk.empty:
            # nothing left to clean in this chunk
            continue

        # remove punctuation and lowercase the text
        chunk['text'] = chunk['text'].str.translate(translator).str.lower()

        # append text length column (length of the cleaned text)
        chunk['review_length'] = chunk['text'].str.len()

        # append business_id prefix column for file sorting
        chunk['bid_prefix'] = chunk['business_id'].str[:1]

        # append chunk to chunks list
        chunks.append(chunk)

time_marker(text='merging to dataframe...')
reviews = pd.concat(chunks)

time_marker('reseting index...')
reviews = reviews.reset_index(drop=True)
time_marker(text='Complete!')

[14:46:31.017799] reading 1 of 64 ../clean_data/reviews/tmp/00_-_reviews_clean.csv...
[14:46:33.796165] reading 2 of 64 ../clean_data/reviews/tmp/01_0_reviews_clean.csv...
[14:46:36.213556] reading 3 of 64 ../clean_data/reviews/tmp/02_1_reviews_clean.csv...
[14:46:38.358417] reading 4 of 64 ../clean_data/reviews/tmp/03_2_reviews_clean.csv...
[14:46:40.717474] reading 5 of 64 ../clean_data/reviews/tmp/04_3_reviews_clean.csv...
[14:46:44.092458] reading 6 of 64 ../clean_data/reviews/tmp/05_4_reviews_clean.csv...
[14:46:46.448280] reading 7 of 64 ../clean_data/reviews/tmp/06_5_reviews_clean.csv...
[14:46:48.986774] reading 8 of 64 ../clean_data/reviews/tmp/07_6_reviews_clean.csv...
[14:46:51.314714] reading 9 of 64 ../clean_data/reviews/tmp/08_7_reviews_clean.csv...
[14:46:53.739564] reading 10 of 64 ../clean_data/reviews/tmp/09_8_reviews_clean.csv...
[14:46:55.832587] reading 11 of 64 ../clean_data/reviews/tmp/10_9_reviews_clean.csv...
[14:46:58.107531] reading 12 of 64 ../clean_data/rev

# Write to File
<p>Split the cleaned reviews into one CSV file per first character of <code>business_id</code>.</p>

In [13]:
time_marker(text='Writing to files...')
file_count = len(reviews.bid_prefix.unique())

# One CSV per business_id prefix. groupby yields the groups in sorted key
# order, so the zero-padded file index follows the sorted prefixes, and the
# frame is partitioned once instead of being re-scanned for every prefix.
for i, (prefix, prefix_rows) in enumerate(reviews.groupby('bid_prefix', sort=True)):
    # Drop the helper column by name rather than by position, so this keeps
    # working if `bid_prefix` ever stops being the last column.
    df = prefix_rows.drop(columns='bid_prefix').reset_index(drop=True)
    file_name = '../clean_data/reviews/{}_{}_reviews_clean.csv'.format(str(i).zfill(2), prefix)
    time_marker(text='Writing {:d} records to file {}'.format(df.shape[0], file_name))
    # In a dry run only the log line above is produced.
    if not DRY_RUN:
        df.to_csv(file_name, encoding='utf-8')
time_marker(text='Done!')

[14:49:40.073639] writing to files...
[14:49:49.526564] writing 84333 records to file ../clean_data/reviews/00_-_reviews_clean.csv
[14:49:53.945826] writing 75066 records to file ../clean_data/reviews/01_0_reviews_clean.csv
[14:49:57.987004] writing 65730 records to file ../clean_data/reviews/02_1_reviews_clean.csv
[14:50:01.609724] writing 68042 records to file ../clean_data/reviews/03_2_reviews_clean.csv
[14:50:05.455760] writing 79967 records to file ../clean_data/reviews/04_3_reviews_clean.csv
[14:50:09.796685] writing 71994 records to file ../clean_data/reviews/05_4_reviews_clean.csv
[14:50:14.388722] writing 76597 records to file ../clean_data/reviews/06_5_reviews_clean.csv
[14:50:19.072267] writing 70897 records to file ../clean_data/reviews/07_6_reviews_clean.csv
[14:50:23.424402] writing 73512 records to file ../clean_data/reviews/08_7_reviews_clean.csv
[14:50:27.782066] writing 64413 records to file ../clean_data/reviews/09_8_reviews_clean.csv
[14:50:31.577857] writing 70055 