# Text Mining
<p><b>Goals:</b></p>
<ul>
    <li>Extract key terms for each star rating: first for every individual star rating, then for reviews grouped as 'bad' and 'good'.</li>
    <li>Extract key terms from different star ratings with reviews grouped by primary Restaurant Type.</li>
</ul>

In [1]:
%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from glob import glob

import seaborn as sns
# Apply seaborn's 'whitegrid' style and 'poster' plotting context.
sns.set_style('whitegrid')
sns.set_context("poster")

In [2]:
def time_marker(text=''):
    """Print ``text`` prefixed with the current time of day in square brackets.

    Parameters
    ----------
    text : str, optional
        Message to print after the timestamp. Defaults to an empty string.
    """
    current_time = datetime.datetime.now().time()
    message = '[{}] {}'.format(current_time, text)
    print(message)

In [3]:
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
import matplotlib

# Widen the notebook container to 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Default matplotlib font size.
font = {'size' : 50}
matplotlib.rc('font', **font)

# Chart font sizes — presumably consumed by plotting cells outside this section.
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

# NOTE(review): presumably a (width, height) figsize and a switch for writing
# charts to disk; neither is used in the visible part of the notebook — confirm.
FIG_SIZE = (15,6)
DO_WRITE_CHARTS = False

In [4]:
# Text file holding the business ids used to filter the business and review
# data below (read as one entry per line).
fname = './clean_data/_analysis/us_restaurant_bids_alt.txt'

# Iterate the file handle directly, stripping surrounding whitespace
# (including the trailing newline) from every line.
with open(fname) as f:
    us_rest_bids = [line.strip() for line in f]

# Load Business Data for US Restaurants in Top Categories

In [5]:
# Load every cleaned business CSV in chunks and keep only the rows whose
# business_id appears in `us_rest_bids`.
time_marker(text='Loading Business Data...')

file_path_slug = './clean_data/business/*.csv'
file_list = sorted(glob(file_path_slug))

# Fail early with a clear message instead of the opaque
# "No objects to concatenate" error that pd.concat raises on an empty list.
if not file_list:
    raise FileNotFoundError('No CSV files matched {}'.format(file_path_slug))

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(file_list):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))

    # import file in chunks so that only the filtered rows are retained
    for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0):
        # only keep businesses that are US Restaurants
        chunks.append(chunk[chunk.business_id.isin(us_rest_bids)].copy())

time_marker(text='merging to dataframe...')
restaurants = pd.concat(chunks)

time_marker('reseting index...')
restaurants = restaurants.reset_index(drop=True)

# drop columns of all zeros
restaurants = restaurants.loc[:, (restaurants != 0).any(axis=0)]

time_marker(text='Complete!')



[18:01:57.238589] Loading Business Data...
[18:01:57.244846] Reading 1 of 64 ./clean_data/business/00_-_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[18:01:58.263452] Reading 2 of 64 ./clean_data/business/01_0_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[18:01:59.165482] Reading 3 of 64 ./clean_data/business/02_1_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[18:02:00.010687] Reading 4 of 64 ./clean_data/business/03_2_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[18:02:00.841921] Reading 5 of 64 ./clean_data/business/04_3_business_clean.csv...
[18:02:01.698099] Reading 6 of 64 ./clean_data/business/05_4_business_clean.csv...
[18:02:02.621791] Reading 7 of 64 ./clean_data/business/06_5_business_clean.csv...


  interactivity=interactivity, compiler=compiler, result=result)


[18:02:03.470425] Reading 8 of 64 ./clean_data/business/07_6_business_clean.csv...
[18:02:04.317703] Reading 9 of 64 ./clean_data/business/08_7_business_clean.csv...
[18:02:05.169816] Reading 10 of 64 ./clean_data/business/09_8_business_clean.csv...
[18:02:05.982795] Reading 11 of 64 ./clean_data/business/10_9_business_clean.csv...
[18:02:06.843434] Reading 12 of 64 ./clean_data/business/11_A_business_clean.csv...
[18:02:07.747171] Reading 13 of 64 ./clean_data/business/12_B_business_clean.csv...
[18:02:08.634209] Reading 14 of 64 ./clean_data/business/13_C_business_clean.csv...
[18:02:09.540081] Reading 15 of 64 ./clean_data/business/14_D_business_clean.csv...
[18:02:10.395404] Reading 16 of 64 ./clean_data/business/15_E_business_clean.csv...
[18:02:11.262073] Reading 17 of 64 ./clean_data/business/16_F_business_clean.csv...
[18:02:12.187885] Reading 18 of 64 ./clean_data/business/17_G_business_clean.csv...
[18:02:13.160458] Reading 19 of 64 ./clean_data/business/18_H_business_clean.c

  interactivity=interactivity, compiler=compiler, result=result)


[18:02:18.338139] Reading 25 of 64 ./clean_data/business/24_N_business_clean.csv...
[18:02:19.200546] Reading 26 of 64 ./clean_data/business/25_O_business_clean.csv...
[18:02:20.103348] Reading 27 of 64 ./clean_data/business/26_P_business_clean.csv...
[18:02:20.910579] Reading 28 of 64 ./clean_data/business/27_Q_business_clean.csv...
[18:02:21.723513] Reading 29 of 64 ./clean_data/business/28_R_business_clean.csv...
[18:02:23.064262] Reading 30 of 64 ./clean_data/business/29_S_business_clean.csv...
[18:02:27.682771] Reading 31 of 64 ./clean_data/business/30_T_business_clean.csv...
[18:02:30.028621] Reading 32 of 64 ./clean_data/business/31_U_business_clean.csv...
[18:02:31.208132] Reading 33 of 64 ./clean_data/business/32_V_business_clean.csv...
[18:02:32.254370] Reading 34 of 64 ./clean_data/business/33_W_business_clean.csv...
[18:02:33.209805] Reading 35 of 64 ./clean_data/business/34_X_business_clean.csv...
[18:02:34.119136] Reading 36 of 64 ./clean_data/business/35_Y_business_clean

  interactivity=interactivity, compiler=compiler, result=result)


[18:02:48.472349] Reading 53 of 64 ./clean_data/business/52_o_business_clean.csv...
[18:02:49.374627] Reading 54 of 64 ./clean_data/business/53_p_business_clean.csv...
[18:02:50.235293] Reading 55 of 64 ./clean_data/business/54_q_business_clean.csv...
[18:02:51.084127] Reading 56 of 64 ./clean_data/business/55_r_business_clean.csv...
[18:02:51.957593] Reading 57 of 64 ./clean_data/business/56_s_business_clean.csv...
[18:02:52.807044] Reading 58 of 64 ./clean_data/business/57_t_business_clean.csv...
[18:02:53.834734] Reading 59 of 64 ./clean_data/business/58_u_business_clean.csv...
[18:02:54.670309] Reading 60 of 64 ./clean_data/business/59_v_business_clean.csv...
[18:02:56.181629] Reading 61 of 64 ./clean_data/business/60_w_business_clean.csv...
[18:02:57.142555] Reading 62 of 64 ./clean_data/business/61_x_business_clean.csv...
[18:02:57.973078] Reading 63 of 64 ./clean_data/business/62_y_business_clean.csv...
[18:02:58.824850] Reading 64 of 64 ./clean_data/business/63_z_business_clean

In [6]:
# Summarise the filtered business frame (entry count, column count, dtypes, memory usage).
restaurants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15500 entries, 0 to 15499
Columns: 227 entries, business_id to wraps
dtypes: float64(28), int64(122), object(77)
memory usage: 26.8+ MB


In [7]:
# Preview the first three rows of the filtered business frame.
restaurants.head(3)

Unnamed: 0,business_id,address,city,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,...,thai,turkish,ukrainian,uzbek,vegan,vegetarian,venezuelan,vietnamese,waffles,wraps
0,-2q4dnUw0gGJniGW2aPamQ,1805 S Neil St,Champaign,0,40.094068,-88.245785,Fiesta Ranchera,,61820,4,...,0,0,0,0,0,0,0,0,0,0
1,-49WY_TEa9ZEcRk_GnuLog,5205 Detroit Rd,Sheffield Village,1,41.425861,-82.080963,Cracker Barrel Old Country Store,,44054,27,...,0,0,0,0,0,0,0,0,0,0
2,-jKhfsXol4FxbRjK8aUsLA,"1615 W Camelback Rd, Ste 108",Phoenix,1,33.508173,-112.094777,Little Caesars Pizza,,85015,10,...,0,0,0,0,0,0,0,0,0,0


# Load Review Data for US Restaurants in Top Categories

In [8]:
# Load every cleaned review CSV in chunks and keep only the reviews whose
# business_id appears in `us_rest_bids`.
time_marker(text='Loading Review Data...')

file_path_slug = './clean_data/reviews/*.csv'
file_list = sorted(glob(file_path_slug))

# Fail early with a clear message instead of the opaque
# "No objects to concatenate" error that pd.concat raises on an empty list.
if not file_list:
    raise FileNotFoundError('No CSV files matched {}'.format(file_path_slug))

# Chunk Settings
chunks = list()
chunksize = 10000
for ii, file in enumerate(file_list):
    time_marker('Reading {} of {} {}...'.format(ii+1, len(file_list), file))

    # import file in chunks so that only the filtered rows are retained
    for chunk in pd.read_csv(file, chunksize=chunksize, iterator=True, index_col=0):
        # only consider reviews for businesses that are US Restaurants
        chunks.append(chunk[chunk.business_id.isin(us_rest_bids)].copy())

time_marker(text='merging to dataframe...')
reviews = pd.concat(chunks)
# NOTE(review): the recorded info() summary of this frame shows `cool` parsed as
# object dtype and a single null in `useful`, `user_id` and `review_length` —
# presumably one malformed row in the source CSVs; confirm before relying on
# those columns.

time_marker('reseting index...')
reviews = reviews.reset_index(drop=True)
time_marker(text='Complete!')

[18:03:02.029024] Loading Review Data...
[18:03:02.100369] Reading 1 of 64 ./clean_data/reviews/00_-_reviews_clean.csv...
[18:03:03.790590] Reading 2 of 64 ./clean_data/reviews/01_0_reviews_clean.csv...
[18:03:05.274054] Reading 3 of 64 ./clean_data/reviews/02_1_reviews_clean.csv...
[18:03:06.563250] Reading 4 of 64 ./clean_data/reviews/03_2_reviews_clean.csv...
[18:03:07.882553] Reading 5 of 64 ./clean_data/reviews/04_3_reviews_clean.csv...
[18:03:09.436862] Reading 6 of 64 ./clean_data/reviews/05_4_reviews_clean.csv...
[18:03:10.902915] Reading 7 of 64 ./clean_data/reviews/06_5_reviews_clean.csv...
[18:03:12.503113] Reading 8 of 64 ./clean_data/reviews/07_6_reviews_clean.csv...
[18:03:13.949773] Reading 9 of 64 ./clean_data/reviews/08_7_reviews_clean.csv...
[18:03:15.391515] Reading 10 of 64 ./clean_data/reviews/09_8_reviews_clean.csv...
[18:03:16.681778] Reading 11 of 64 ./clean_data/reviews/10_9_reviews_clean.csv...
[18:03:18.129388] Reading 12 of 64 ./clean_data/reviews/11_A_revie

In [9]:
# Summarise the filtered review frame (entry count, per-column non-null counts, dtypes, memory usage).
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917882 entries, 0 to 917881
Data columns (total 10 columns):
business_id      917882 non-null object
cool             917882 non-null object
date             917882 non-null object
funny            917882 non-null float64
review_id        917882 non-null object
stars            917882 non-null float64
text             917882 non-null object
useful           917881 non-null float64
user_id          917881 non-null object
review_length    917881 non-null float64
dtypes: float64(4), object(6)
memory usage: 70.0+ MB


In [10]:
# Preview the first three rows of the filtered review frame.
reviews.head(3)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id,review_length
0,-gTX7XKGOntF2FQ1EDxHpA,0,2016-07-21,0.0,8BJUlXtGvVpkNq663suBUw,1.0,went thru the drive thru and could hardly unde...,0.0,AKtCpeBdst-OFYNmsfJgiw,336.0
1,-gTX7XKGOntF2FQ1EDxHpA,1,2017-07-25,0.0,umGwYIfHbKjj5Tvi2o4Oog,1.0,everytime i come here something is wrong a few...,1.0,5vTqiawndUhSPAqDMhPr5Q,293.0
2,-gTX7XKGOntF2FQ1EDxHpA,1,2015-10-18,1.0,mjiQ2bNWFTbuzdQN0IVcJw,1.0,bad decision extra crispy was not rollsbiscui...,3.0,7-nbzoSmoMzwo2fbetoycQ,228.0


In [11]:
# Compare how many distinct businesses each frame covers; the two counts
# should agree if every selected restaurant has at least one review loaded.
n_restaurant_businesses = len(restaurants.business_id.unique())
n_review_businesses = len(reviews.business_id.unique())

print('Number of Businesses in Restaurants Data {:d}'.format(n_restaurant_businesses))
print('Number of Businesses in Reviews Data     {:d}'.format(n_review_businesses))

Number of Businesses in Restaurants Data 15500
Number of Businesses in Reviews Data     15500


# Drop Unwanted columns from `restaurants` dataframe

In [12]:
# Positional column groups of `restaurants`: the first 12 columns are the
# business-detail candidates, everything from position 107 onward is the
# cuisine-flag candidates.
# NOTE(review): these slices depend on the column order and count of
# `restaurants`, which the loading cell prunes of all-zero columns — confirm
# that 12 and 107 still line up if the input data changes.
cols_to_keep_business_details = restaurants.columns[:12]
cols_to_keep_cuisine = restaurants.columns[107:]

# trim down cuisine columns to only include main type
top_cuisines = [
    'chicken_wings', 'breakfast_&_brunch', 'american_(new)', 'italian',
    'chinese', 'burgers', 'american_(traditional)', 'mexican',
    'sandwiches', 'pizza', 'fast_food',
]

# business-detail columns that are dropped as well
extra_drops = ['address', 'latitude', 'longitude', 'neighborhood', 'postal_code']

# cuisine columns outside the top list
non_top_cuisine_cols = [col for col in cols_to_keep_cuisine if col not in top_cuisines]

# columns that belong to neither positional group
ungrouped_cols = [
    col for col in restaurants.columns
    if not (col in cols_to_keep_business_details or col in cols_to_keep_cuisine)
]

cols_to_drop = non_top_cuisine_cols + extra_drops + ungrouped_cols

# `drop` returns a new frame, so `restaurants` itself is left untouched.
restaurants_clean = restaurants.drop(cols_to_drop, axis=1)


# Merge Restaurant Data to Review Data
<p>The merged result will be our master data frame.</p>

In [13]:
# Preview the trimmed business frame (business details plus the top-cuisine flag columns).
restaurants_clean.head(3)

Unnamed: 0,business_id,city,is_open,name,review_count,stars,state,american_(new),american_(traditional),breakfast_&_brunch,burgers,chicken_wings,chinese,fast_food,italian,mexican,pizza,sandwiches
0,-2q4dnUw0gGJniGW2aPamQ,Champaign,0,Fiesta Ranchera,4,2.0,IL,0,0,0,0,0,0,0,0,1,0,0
1,-49WY_TEa9ZEcRk_GnuLog,Sheffield Village,1,Cracker Barrel Old Country Store,27,3.5,OH,0,1,1,0,0,0,0,0,0,0,0
2,-jKhfsXol4FxbRjK8aUsLA,Phoenix,1,Little Caesars Pizza,10,3.0,AZ,0,0,0,0,0,0,0,0,0,1,0


In [14]:
# Prefix the review-level fields with `review_` so they stay distinguishable
# from business-level columns such as `stars` once the two frames are merged.
# Renaming by name (not by position) stays correct if the CSV column order
# changes, and is a no-op when the cell is re-run.
reviews = reviews.rename(columns={
    'cool': 'review_cool',
    'date': 'review_date',
    'funny': 'review_funny',
    'stars': 'review_stars',
    'text': 'review_text',
    'useful': 'review_useful',
    'user_id': 'review_user_id',
})
reviews.head(3)

Unnamed: 0,business_id,review_cool,review_date,review_funny,review_id,review_stars,review_text,review_useful,review_user_id,review_length
0,-gTX7XKGOntF2FQ1EDxHpA,0,2016-07-21,0.0,8BJUlXtGvVpkNq663suBUw,1.0,went thru the drive thru and could hardly unde...,0.0,AKtCpeBdst-OFYNmsfJgiw,336.0
1,-gTX7XKGOntF2FQ1EDxHpA,1,2017-07-25,0.0,umGwYIfHbKjj5Tvi2o4Oog,1.0,everytime i come here something is wrong a few...,1.0,5vTqiawndUhSPAqDMhPr5Q,293.0
2,-gTX7XKGOntF2FQ1EDxHpA,1,2015-10-18,1.0,mjiQ2bNWFTbuzdQN0IVcJw,1.0,bad decision extra crispy was not rollsbiscui...,3.0,7-nbzoSmoMzwo2fbetoycQ,228.0


In [15]:
# Placeholder initialisation only; the following cell rebinds `reviews_clean` to the merged frame.
reviews_clean = pd.DataFrame()

In [16]:
# Attach each review's restaurant attributes; the left join keeps every review row.
reviews_clean = reviews.merge(restaurants_clean, how='left', on='business_id')

# A left join must not change the number of reviews: extra rows would mean
# duplicate business_id values in `restaurants_clean` fanned reviews out.
assert len(reviews_clean) == len(reviews), 'merge changed the number of review rows'

# drop=False keeps the previous row index as an explicit `index` column.
reviews_clean = reviews_clean.reset_index(drop=False)

In [17]:
# Preview the first three rows of the merged review + restaurant frame.
reviews_clean.head(3)

Unnamed: 0,index,business_id,review_cool,review_date,review_funny,review_id,review_stars,review_text,review_useful,review_user_id,...,american_(traditional),breakfast_&_brunch,burgers,chicken_wings,chinese,fast_food,italian,mexican,pizza,sandwiches
0,0,-gTX7XKGOntF2FQ1EDxHpA,0,2016-07-21,0.0,8BJUlXtGvVpkNq663suBUw,1.0,went thru the drive thru and could hardly unde...,0.0,AKtCpeBdst-OFYNmsfJgiw,...,0,0,0,1,0,1,0,0,0,0
1,1,-gTX7XKGOntF2FQ1EDxHpA,1,2017-07-25,0.0,umGwYIfHbKjj5Tvi2o4Oog,1.0,everytime i come here something is wrong a few...,1.0,5vTqiawndUhSPAqDMhPr5Q,...,0,0,0,1,0,1,0,0,0,0
2,2,-gTX7XKGOntF2FQ1EDxHpA,1,2015-10-18,1.0,mjiQ2bNWFTbuzdQN0IVcJw,1.0,bad decision extra crispy was not rollsbiscui...,3.0,7-nbzoSmoMzwo2fbetoycQ,...,0,0,0,1,0,1,0,0,0,0


In [18]:
# Summarise the merged frame (entry count, per-column non-null counts, dtypes).
reviews_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 917882 entries, 0 to 917881
Data columns (total 28 columns):
index                     917882 non-null int64
business_id               917882 non-null object
review_cool               917882 non-null object
review_date               917882 non-null object
review_funny              917882 non-null float64
review_id                 917882 non-null object
review_stars              917882 non-null float64
review_text               917882 non-null object
review_useful             917881 non-null float64
review_user_id            917881 non-null object
review_length             917881 non-null float64
city                      917882 non-null object
is_open                   917882 non-null int64
name                      917882 non-null object
review_count              917882 non-null int64
stars                     917882 non-null float64
state                     917882 non-null object
american_(new)            917882 non-null int64
americ