# Data Processing
##### Sean Wade

In [1]:
import ast
import json
import os

import numpy as np
import pandas as pd
from haversine import haversine

# Plotting stack: matplotlib's pyplot plus seaborn, drawn with matplotlib's
# 'fivethirtyeight' style sheet.
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

# Download a CSS file at run time and wrap it in IPython's HTML display object.
# NOTE(review): this needs network access and fetches over plain HTTP, and
# urllib2 is a Python 2-only module — confirm this is still wanted.
from IPython.core.display import HTML
import urllib2
HTML(urllib2.urlopen('http://seanwade.com/jupyter.css').read())

In [2]:
def getCategory(df, category):
    """Return the rows of ``df`` whose 'categories' text mentions ``category``.

    The match is a case-insensitive, plain-substring test against each row's
    'categories' string, so 'food' also matches 'Fast Food'.  Rows whose
    'categories' value is missing never match.

    Args:
        df: DataFrame with a string-valued 'categories' column.
        category: Text to look for; upper/lower case is ignored.

    Returns:
        The matching rows of ``df`` (possibly empty).
    """
    needle = category.lower()
    # regex=False keeps characters such as '(' or '&' literal; na=False turns
    # a missing 'categories' value into "no match" instead of an error.
    mask = df['categories'].str.lower().str.contains(needle, regex=False, na=False)
    return df[mask]

In [3]:
def getState(df, state):
    """Return the rows of ``df`` whose 'state' column equals ``state``."""
    in_state = df['state'].eq(state)
    return df.loc[in_state]

In [4]:
def getClose(df, center, max_dist):
    """Return the businesses lying within ``max_dist`` of ``center``.

    Each row's 'loc' value and ``center`` are handed to haversine(); rows whose
    distance is at most ``max_dist`` are kept.
    NOTE(review): ``max_dist`` is in whatever unit haversine() returns by
    default (presumably kilometres) — confirm.
    """
    distances = df['loc'].apply(lambda point: haversine(center, point))
    within_radius = distances <= max_dist
    return df[within_radius]

In [5]:
def plotStarDistribution(df):
    """Draw a bar chart of how many rows of ``df`` have each 'stars' value.

    Prints "No Matching Results..." and draws nothing when there are no rows
    to count.  Any other problem (for example a missing 'stars' column)
    propagates as a normal exception instead of being reported as an empty
    result.

    Args:
        df: DataFrame with a 'stars' column.
    """
    counts = df.groupby('stars').size()
    if counts.empty:
        # Checked explicitly so that only the "nothing to plot" case prints
        # this message; a bare except would hide unrelated failures too.
        print("No Matching Results...")
        return
    counts.plot(kind='bar')

## Load Data

In [6]:
# Directory holding the CSV exports of the Yelp academic dataset.  Set the
# YELP_CSV_DIR environment variable to point somewhere else; otherwise the
# relative default below is used.
RAW_CSV_DIR = os.environ.get('YELP_CSV_DIR', '../raw_data/csv')


def loadYelpTable(table):
    """Read ``yelp_academic_dataset_<table>.csv`` from RAW_CSV_DIR.

    Args:
        table: Table suffix of the file name, e.g. 'business'.

    Returns:
        The file contents as a DataFrame, as parsed by ``pd.read_csv``.
    """
    filename = 'yelp_academic_dataset_%s.csv' % table
    return pd.read_csv(os.path.join(RAW_CSV_DIR, filename))


business = loadYelpTable('business')
review = loadYelpTable('review')
user = loadYelpTable('user')

IOError: File ../raw_data/csv/yelp_academic_dataset_business.csv does not exist

In [None]:
# Add business name to dataframes
# Series.map performs the id -> name lookup in one vectorized pass.  A
# business_id that is absent from `business` yields NaN in 'biz_name' instead
# of aborting the whole cell with a KeyError.
business_dict = dict(zip(business['business_id'], business['name']))
review['biz_name'] = review['business_id'].map(business_dict)

In [None]:
# Store each business's coordinates as one (latitude, longitude) tuple in
# 'loc' so distance calculations can read a single column.
business['loc'] = list(zip(business['latitude'], business['longitude']))

## All Cities

In [None]:
# Collect every value of the 'state' column that has more than 100 businesses
# and print it with its business count.
# NOTE(review): the printed header says "City" although the values come from
# the 'state' column — confirm which label is intended.
state_set = set()
print("City, Num Bizz")
for x in business.state.unique():
    # Filter once and reuse the count for both the threshold test and the
    # printout; each getState call scans the whole frame.
    num_bizz = len(getState(business, x))
    if num_bizz > 100:
        state_set.add(x)
        print("%s %s" % (x, num_bizz))

## All Types of Businesses

In [None]:
# Gather every distinct category name (lower-cased) across all businesses.
# Each 'categories' cell holds the text of a Python list literal.  It is
# parsed with ast.literal_eval rather than eval so that text coming from the
# CSV file can never execute as code.  Missing values are skipped.
category_set = set()
for raw_categories in business['categories'].dropna():
    for category_name in ast.literal_eval(raw_categories):
        category_set.add(category_name.lower())

In [None]:
# Display the distinct lower-cased category names gathered in category_set.
category_set

## Get Specific Data

In [None]:
# Businesses within 0.3 distance units of a fixed (latitude, longitude) point.
# NOTE(review): the unit is whatever haversine() returns by default
# (presumably kilometres) — confirm.
getClose(df=business, center=(40.3541155, -80.0146597), max_dist=0.3)

In [None]:
# For every state kept in state_set, print the state followed by the number of
# its 'fashion' businesses at each star rating.
for state in state_set:
    fashion_bizz = getCategory(getState(business, state), 'fashion')
    print(state)
    print(fashion_bizz.groupby('stars').size())
    print('\n')

In [None]:
import plotly.plotly as py
from plotly.graph_objs import *

# Plotly credentials come from the environment so that no secret lives in the
# notebook.  Set PLOTLY_API_KEY (and optionally PLOTLY_USERNAME) before
# starting the kernel; a missing key fails here with a KeyError.
# NOTE(review): an API key was previously committed in this cell — treat it as
# leaked and regenerate it.
py.sign_in(os.environ.get('PLOTLY_USERNAME', 'smwade'), os.environ['PLOTLY_API_KEY'])

In [None]:
def plotCategory(df, category, state):
    """Plot the businesses of one category in one state on a Mapbox map.

    Rows are narrowed with getCategory() and then getState(); each remaining
    business becomes a red marker whose hover text is its name.  The map is
    centred on the first matching business.

    Args:
        df: Business DataFrame with 'categories', 'state', 'latitude',
            'longitude' and 'name' columns.
        category: Category text passed to getCategory().
        state: State code passed to getState().

    Returns:
        The result of ``py.iplot`` for the figure, or None (after printing
        "No Matching Results...") when no business matches.
    """
    matches = getState(getCategory(df, category), state)
    if len(matches) == 0:
        # Without this guard the .iloc[0] lookups that centre the map raise
        # IndexError on an empty selection.
        print("No Matching Results...")
        return None

    # The MAPBOX_ACCESS_TOKEN environment variable overrides the token below.
    # NOTE(review): the fallback looks like a Mapbox public ('pk.') token;
    # confirm it is meant to be shared before publishing the notebook.
    mapbox_access_token = os.environ.get(
        'MAPBOX_ACCESS_TOKEN',
        'pk.eyJ1Ijoic213YWRlIiwiYSI6ImNpdzU4MmR6ZDAwbGwyeHIzcWRkeHNmeWIifQ.dyJ-FEl459ry0ebLOy8f9g',
    )

    data = Data([
        Scattermapbox(
            name="%s businesses" % category,
            lat=matches['latitude'],
            lon=matches['longitude'],
            mode='markers',
            marker=Marker(
                size=5,
                color='red',
                opacity=0.5,
            ),
            text=matches['name'],
            hoverinfo='text',
        )
    ])

    layout = Layout(
        title="%s business" % category,
        autosize=True,
        hovermode='closest',
        mapbox=dict(
            accesstoken=mapbox_access_token,
            bearing=0,
            # Centre the view on the first matching business.
            center=dict(
                lat=matches['latitude'].iloc[0],
                lon=matches['longitude'].iloc[0]
            ),
            pitch=0,
            zoom=13
        ),
    )

    fig = dict(data=data, layout=layout)
    return py.iplot(fig, filename='temp', validate=False)

In [None]:
# Map every 'fast food' business whose state is 'QC'.
plotCategory(df=business, category='fast food', state='QC')

In [None]:
# Split the range [0, max useful votes] into nine equal-width bins and group
# the reviews by the bin their 'votes.useful' value falls in.
cut = np.linspace(0, review['votes.useful'].max(), 10)
# include_lowest=True closes the first bin on the left; pd.cut's default
# leaves it open, which silently drops every review with 0 useful votes.
group = review.groupby(pd.cut(review['votes.useful'], cut, include_lowest=True))
# Printing the GroupBy object itself only shows its repr, so show how many
# reviews landed in each bin instead.
print(group.size())

In [None]:
# Per useful-votes bin: count the reviews (via 'user_id') and average their
# 'stars', then chart the average stars as bars.
bin_summary = group.agg({'user_id': np.size, 'stars': np.mean})
bin_summary.plot(kind='bar', y='stars')