# Applying Subtopic Ratings
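<p>This notebook averages the per-review subtopic ratings into subtopic ratings for each restaurant, compares those ratings with each restaurant's overall star rating, and compares individual locations with the other restaurants in the same chain.</p>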


## Step 0: Import packages

In [None]:
# Widen the notebook container so wide tables and charts fit on screen.
# `display` is imported from IPython.display: importing it from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [None]:
from gensim.models.ldamulticore import LdaMulticore
import itertools
from collections import Counter

import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
from math import ceil

import seaborn as sns
sns.set()

# Font sizes shared by the charts in this notebook
SUP_TITLE_FONT_SIZE = 25
TITLE_FONT_SIZE = 25
LABEL_FONT_SIZE = 15
TICK_FONT_SIZE  = 15

# Figure size (width, height) passed to plt.subplots for the chain comparison chart
FIG_SIZE = (15,6)

# When True, the plotting functions also save each chart to disk with plt.savefig
DO_WRITE_CHARTS = True

def time_marker(text=''):
    ''' Print text prefixed with the current local time in square brackets '''
    current_time = datetime.datetime.now().time()
    print(f'[{current_time}] {text}')

## Step 1: Import Review Data with Subtopics

In [None]:
time_marker('Loading Restaurant Review data...')
reviews = pd.read_csv('../clean_data/az_restaurant_reviews_with_subtopics.csv', index_col=0, parse_dates=['date'], low_memory=False)

# drop reviews with a missing user_id, then renumber the rows from 0
reviews = reviews.dropna(subset=['user_id']).reset_index(drop=True)

# Build a chain key from the restaurant name: lower-case, punctuation removed,
# spaces replaced by underscores.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the punctuation in place.
# NOTE: 'chain_name' is appended as the LAST column; later cells slice the
# subtopic columns relative to the end of the frame.
reviews['chain_name'] = (
    reviews['name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(' ', '_', regex=False)
)

time_marker('done')

In [None]:
# print a summary of the loaded reviews DataFrame (columns, dtypes, non-null counts)
reviews.info()

## Step 2: Calculate Subtopic Ratings for Each Business
<p>Yelp Review Star Rounding is performed by rounding up to the nearest half star, the same way Yelp rounds off Business Star Ratings</p>

In [None]:
def yelp_star_rounding(stars):
    '''
        Round an average review score up to the nearest 0.5.

        Yelp review stars are calculated by rounding the average
        review score up to the nearest 0.5, so the same method is
        used here for subtopic review stars.

        Scores that are not greater than 0 (including NaN, for which
        the comparison is False) are returned as NaN.
    '''

    if stars > 0:
        # doubling before ceil() makes the rounding step 0.5 instead of 1
        return ceil(stars * 2) / 2

    return np.nan

In [None]:
# Average the subtopic ratings of all reviews for each business.
# The rating columns are taken as the nine columns just before the last one
# ('chain_name' was appended last when the reviews were loaded).
# Selecting them BEFORE aggregating keeps text columns out of mean(), which
# raises a TypeError on non-numeric data in pandas >= 2.0.
business_subtopic_ratings = reviews.groupby('business_id')[list(reviews.columns[-10:-1])].mean()

# apply yelp rounding to all ratings (currently disabled: the unrounded means are kept)
# business_subtopic_ratings = business_subtopic_ratings.applymap(yelp_star_rounding)

business_subtopic_ratings.head(10)

In [None]:
# summary statistics of the per-business subtopic ratings
business_subtopic_ratings.describe()

## Step 3: Import Restaurant Business Data
<p>Merge Subtopic Ratings to business information DataFrame</p>

In [None]:
time_marker('Loading Restaurant Business data...')
biz_data = pd.read_csv('../clean_data/az_restaurant_business_clean.csv', index_col=0)

# keep only the first 10 columns of the business data
biz_data = biz_data.iloc[:, :10].copy()

# Build the same chain key as used for the reviews: lower-case name,
# punctuation removed, spaces replaced by underscores.
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the punctuation in place.
biz_data['chain_name'] = (
    biz_data['name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(' ', '_', regex=False)
)


In [None]:
# Merge Subtopic Ratings
# business_subtopic_ratings is indexed by business_id, so it is matched against
# the business_id column of biz_data; the rating columns end up as the last
# columns of the merged frame. With merge's default inner join, businesses
# without subtopic ratings are dropped.
biz_subtopics = biz_data.merge(business_subtopic_ratings, left_on='business_id', right_index=True)

In [None]:
# preview the first three merged businesses, transposed so each column is one business
biz_subtopics.head(3).transpose()

## Step 4: Plotting Subtopic Review Stars against Restaurant Star Ratings
<p>Restaurants are given a single Star Rating; these ratings are compared to our calculated rating for each of the 9 identified subtopics. Using a set of box plots, we can see that four possible trends appear.</p>


In [None]:
def plot_suptopic_comparisons(df, palette='Reds_r', title_prefix=''):
    '''
        Draw a 3x3 grid of box plots, one per subtopic, showing the
        distribution of that subtopic rating for each restaurant star rating.

        df           : DataFrame with a "stars" column; its last 9 columns
                       are used as the subtopic ratings
        palette      : seaborn palette name used to colour the boxes
        title_prefix : text placed in front of the figure title; lower-cased
                       and with spaces replaced by underscores it also forms
                       part of the saved file name

        When DO_WRITE_CHARTS is set the figure is saved under ../charts/
        before it is shown.
    '''

    nrows = 3
    ncols = 3
    size = 6

    subtopic_labels = list(df.columns[-9:])

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*size, nrows*size))

    # axes.flat walks the grid row by row, matching the order of the labels
    for axis, subtopic_label in zip(axes.flat, subtopic_labels):
        sns.boxplot(x="stars", y=subtopic_label, data=df, palette=palette, ax=axis)

        # dashed reference line from the lower-left to the upper-right corner of the current axis limits
        axis.plot(axis.get_xlim(), axis.get_ylim(), ls="--", c=".3")

        axis.set_xlabel('Restaurant Star Rating', size=LABEL_FONT_SIZE)
        axis.set_ylabel('{} Rating'.format(subtopic_label).replace('_', ' ').title(), weight='bold', size=LABEL_FONT_SIZE)

        # quarter-star ticks from 1.0 to 5.0
        axis.set_yticks([x/4 for x in range(4, 21)])

    fig.suptitle('{} Subtopic Rating Review'.format(title_prefix.title()), size=SUP_TITLE_FONT_SIZE, weight='bold', y=0.90)

    if DO_WRITE_CHARTS:
        fig.savefig('../charts/subtopic_review_{}.png'.format(title_prefix.replace(' ', '_').lower()))
    plt.show()
    plt.close(fig)

In [None]:
# subtopic ratings vs. star rating for every restaurant
plot_suptopic_comparisons(biz_subtopics, palette='Reds_r', title_prefix='All Restaurants')

In [None]:
# same comparison restricted to rows where is_fast_food == 0
plot_suptopic_comparisons(biz_subtopics[biz_subtopics.is_fast_food == 0], palette='Greens_r', 
                          title_prefix='Non Fast Food Restaurants')

In [None]:
# same comparison restricted to rows where is_fast_food == 1
plot_suptopic_comparisons(biz_subtopics[biz_subtopics.is_fast_food == 1], palette='Blues_r',
                          title_prefix='Fast Food Restaurants')

## Step 5:  Compare Restaurant to Others in the same Chain
<p>Given a `business_id` this report shows the status of the given business in comparison to others bearing the same name.</p>

In [None]:
# preview the first three rows of the business data
biz_data.head(3)

### Collect Information on specific Business

In [None]:
def get_business_information(df, business_id):
    '''
        Get dict of business attributes from a given business_id

        df          : DataFrame with "business_id" and "categories" columns
        business_id : id of the business to look up; when several rows
                      match, the first one is used

        Returns a dict mapping each column name to that business's value,
        with "categories" converted to a list of strings.
        Raises IndexError when no row has the given business_id.
    '''
    matches = df[df['business_id'] == business_id]
    if matches.empty:
        raise IndexError('business_id {!r} was not found'.format(business_id))

    results = matches.iloc[0].to_dict()

    # convert categories from string to list: the names are the pieces
    # between single quotes, i.e. every second piece of the split
    raw_categories = results['categories']
    if isinstance(raw_categories, str):
        results['categories'] = raw_categories.split("'")[1::2]
    else:
        # a missing value (NaN) has no categories to extract
        results['categories'] = []

    return results

### Plot Business Rankings in Comparison to all Others in the Chain
<ul> Parameters
    <li><b>df</b>: the DataFrame of reviews to draw from</li>
    <li><b>bid</b>: the business_id of the business in question, this location is shown as a blue line in each Sub Rating</li>
    <li><b>min_reviews</b>: The minimum number of reviews for another business in the chain to be included</li>
    <li><b>min_biz</b>: The minimum number of other businesses in the chain.</li>
</ul>

In [None]:
def compare_to_chain(df, bid, min_reviews=10, min_biz=5):
    '''
        Chart how one location's subtopic ratings compare with the other
        locations of the same chain.

        The chain is every business in df sharing the name of business bid.
        Each other location is drawn as a red dot per subtopic; the location
        bid itself is drawn as a blue line per subtopic.

        df          : DataFrame of reviews with "name" and "business_id"
                      columns; the last 9 numeric columns are used as the
                      subtopic ratings
        bid         : business_id of the location in question (its name and
                      address are looked up in biz_subtopics)
        min_reviews : minimum number of reviews another location needs in
                      order to be included
        min_biz     : minimum number of other locations needed to generate
                      a report

        Returns the file path of the chart (the file is only written when
        DO_WRITE_CHARTS is set), or False when the chain has too few
        qualifying locations.
    '''

    business_information = get_business_information(biz_subtopics, bid)
    chain_name = business_information['name']
    business_address = '{} {} AZ'.format(business_information['address'], business_information['city'])

    # all reviews for locations with the same name, grouped per location
    by_location = df[df['name'] == chain_name].groupby('business_id')

    # mean rating per location; numeric_only keeps text columns out of the
    # mean, which would otherwise raise a TypeError in pandas >= 2.0
    chain_reviews = by_location.mean(numeric_only=True)
    subtopic_columns = list(chain_reviews.columns[-9:])

    # number of reviews per location
    location_review_counts = by_location.size()

    # ratings of the location in question, taken before any filtering so that
    # a sparsely reviewed location can still be compared with its chain
    if bid in chain_reviews.index:
        sample_location_ratings = chain_reviews.loc[bid, subtopic_columns].values
    else:
        sample_location_ratings = []

    other_locations = chain_reviews.drop(index=bid, errors='ignore')

    # only generate report if there are at least min_biz other locations
    if other_locations.shape[0] < min_biz:
        time_marker('Sorry, this set of reviews only includes {:d} other "{}" locations, '
                    'we need at least {:d} other locations to generate a meaningful report.  '
                    'You can specify the "min_biz" argument lower in this function call and try again.'
                    .format(other_locations.shape[0], chain_name, min_biz))
        return False

    # keep only the other locations with at least min_reviews reviews
    well_reviewed = location_review_counts[location_review_counts >= min_reviews].index
    other_locations = other_locations[other_locations.index.isin(well_reviewed)]

    # only generate report if enough well-reviewed locations remain
    if other_locations.shape[0] < min_biz:
        time_marker('Sorry, this set of reviews only includes {:d} other "{}" locations, with at least {:d} reviews, '
                    'we need at least {:d} other locations to generate a meaningful report.  '
                    'You can specify the "min_biz" argument lower in this function call and try again.'
                    .format(other_locations.shape[0], chain_name, min_reviews, min_biz))
        return False

    # Plotting!
    fig, ax = plt.subplots(figsize=FIG_SIZE)

    # melt data frame into (variable, value) rows for the seaborn swarm plot
    chain_melt = other_locations[subtopic_columns].melt()

    # swarm plot of subtopic ratings, one dot per other location
    sns.swarmplot(x='variable', y='value', data=chain_melt, color='r', alpha=0.75, ax=ax)

    # plot the specific bid with lines; xmin/xmax are fractions of the axis
    # width, so each subtopic gets an equal share of it
    n_subtopics = len(subtopic_columns)
    for ii, val in enumerate(sample_location_ratings):
        ax.axhline(val, xmin=ii/n_subtopics, xmax=(ii+1)/n_subtopics, color='b', linewidth=5)

    # formatting title and axis
    fig.suptitle('{} at {}'.format(chain_name, business_address), size=SUP_TITLE_FONT_SIZE, weight='bold')
    ax.set_title('Compared to all other {} Restaurants'.format(chain_name), size=LABEL_FONT_SIZE)

    # the sub categories run along the x axis, their ratings along the y axis
    ax.set_xlabel('Review Sub Categories', size=LABEL_FONT_SIZE)
    ax.set_ylabel('Sub Category Rating', size=LABEL_FONT_SIZE)

    ax.set_xticklabels([x.replace('_', ' ').title() for x in subtopic_columns])

    # half-star ticks from 1.0 to 5.0
    ax.set_yticks([(x/2)+1 for x in range(0, 9)])

    file_path = '../charts/reports/{}_{}_chain_comparison.png'.format(business_information['chain_name'].replace('_',''), bid)

    if DO_WRITE_CHARTS:
        fig.savefig(file_path)

    plt.show()
    plt.close(fig)

    return file_path

## Step 6: Generate Chain Review Comparison Charts for Top 5 Largest Restaurant Chains
<p>A restaurant chain is defined as a group of Restaurants all sharing the same name.</p>

In [None]:
def get_top_n_chains(n):
    ''' Get the business_id values of all reviewed restaurants in the n chains with the most locations '''

    # count the locations of each chain and keep the names of the n largest
    location_counts = Counter(biz_data.chain_name)
    largest_chains = [chain for chain, _ in location_counts.most_common(n)]

    # business_id values of every reviewed restaurant in those chains
    in_largest_chains = reviews.chain_name.isin(largest_chains)
    top_n_chain_bids = list(reviews[in_largest_chains].business_id.unique())

    print('Working with reviews for {:d} Restaurants across {:d} Chains'.format(len(top_n_chain_bids), n))

    return top_n_chain_bids
    
# business_id values of the restaurants in the 5 chains with the most locations
chain_bids = get_top_n_chains(5)

In [None]:
# generate one comparison chart per restaurant; the resulting list holds each
# call's return value (the chart's file path, or False when no report was generated)
[compare_to_chain(reviews, x) for x in chain_bids]