# Seafood Business Dataset

In [1]:
import json
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def read_json(path):
    with open(path, encoding='utf-8') as f:
        iter_f = iter(f)
        line = f.readline()
        text = []
        for line in iter_f: 
            d = json.loads(line)
            text.append(d)
        f.close()
    return(text)

In [3]:
def write_json(path, data):
    with open(path, 'w', encoding="utf-8") as f:
        return json.dump(data, f, indent=2)

## Step 1: Filter business

In [4]:
raw_business = read_json("business.json")

In [5]:
len(raw_business)

192608

### Filtering rules:
* "Seafood", "Restaurant" **must** in the category of the business
* "Steakhouse" **must not** in the category of the business
* the length of category **must** less than or equal to 50
* the amount of reviews **must** greater than or equal to 50

We got **433** seafood business in total.

In [6]:
seafood_business = []
index = []

for i in range(len(raw_business)):
    if raw_business[i]['categories'] is None:
        continue
    if "Seafood" in raw_business[i]['categories'] and "Restaurants" in raw_business[i]['categories'] \
    and "Steakhouse" not in raw_business[i]['categories'] and len(raw_business[i]['categories']) <= 50:
        if raw_business[i]['review_count'] >= 50:
            seafood_business.append(raw_business[i])
            index.append(i)

In [7]:
len(seafood_business)

433

### See the range of:
* the length of category: 20 ~ 50
* the amount of reviews: 50 ~ 2423

In [8]:
length_max = 0
length_min = 195
index_max = None
index_min = None

for i in range(len(seafood_business)):   
    if len(seafood_business[i]['categories']) >= length_max:
        length_max = len(seafood_business[i]['categories'])
        index_max = i
        
for i in range(len(seafood_business)):   
    if len(seafood_business[i]['categories']) <= length_min:
        length_min = len(seafood_business[i]['categories'])
        index_min = i

In [9]:
print(length_max, length_min)

50 20


In [10]:
count_max = 0
count_min = 100000
index_max = 0
index_min = 0

for i in range(len(seafood_business)):   
    if seafood_business[i]['review_count'] >= count_max:
        count_max = seafood_business[i]['review_count']
        index_max = i
        
for i in range(len(seafood_business)):   
    if seafood_business[i]['review_count'] <= count_min:
        count_min = seafood_business[i]['review_count']
        index_min = i

In [11]:
print(count_max,count_min)

2423 50


## Step 2: Combine reviews to business

In [12]:
raw_review = read_json("review.json")

In [13]:
business_id=[]
for i in range(len(seafood_business)):
    business_id.append(seafood_business[i]['business_id'])
    
seafood_review={}
for i in range(len(seafood_business)):
    seafood_review[seafood_business[i]['business_id']]=[]
    
for i in range(len(raw_review)):
    if raw_review[i]['business_id'] in business_id:
        seafood_review[raw_review[i]['business_id']].append(raw_review[i])
        
#put reviews together
all_review=[]
for i in range(len(seafood_business)):
    all_review.extend(seafood_review[seafood_business[i]['business_id']])

In [14]:
len(seafood_review)

433

### See the 1st seafood business "Salt Cellar":

In [15]:
seafood_business[0]

{'address': '550 N Hayden Rd',
 'attributes': {'Alcohol': "'full_bar'",
  'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': False}",
  'BikeParking': 'False',
  'BusinessAcceptsCreditCards': 'True',
  'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  'Caters': 'False',
  'GoodForKids': 'False',
  'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': True, 'brunch': False, 'breakfast': False}",
  'HasTV': 'True',
  'NoiseLevel': "u'average'",
  'OutdoorSeating': 'False',
  'RestaurantsAttire': "u'casual'",
  'RestaurantsDelivery': 'False',
  'RestaurantsGoodForGroups': 'True',
  'RestaurantsPriceRange2': '3',
  'RestaurantsReservations': 'True',
  'RestaurantsTakeOut': 'False',
  'WiFi': "u'no'"},
 'business_id': 'nsNONDHbV7Vudqh21uicqw',
 'categories': 'Seafood, Restaurants',
 'city': 'S

In [16]:
business_id[0]

'nsNONDHbV7Vudqh21uicqw'

In [17]:
seafood_review['nsNONDHbV7Vudqh21uicqw']

[{'business_id': 'nsNONDHbV7Vudqh21uicqw',
  'cool': 0,
  'date': '2017-05-26 02:22:41',
  'funny': 0,
  'review_id': 'eKgibXL3A2DeRkQjF5BQDA',
  'stars': 1.0,
  'text': "Was very excited for happy hour and heard great things. Food was less then average we were charged $10 for BREAD.  Most apps were greasy or fried and still spent $250 on happy hour menu for a group of 4. First review on yelp so cared enough to take the time to write this. Sorry won't come back.",
  'useful': 0,
  'user_id': 'iFnEh9lsL2CIFnddSaEHAw'},
 {'business_id': 'nsNONDHbV7Vudqh21uicqw',
  'cool': 1,
  'date': '2018-05-08 17:18:20',
  'funny': 1,
  'review_id': '8NBE5q5UWLQUqWyM7GpQ2g',
  'stars': 2.0,
  'text': 'Seems old and tired!   I ate here frequently about 20 years ago and returned for happy hour this year.  This used to be a good place many years ago, but too many great restaurants have opened in recent years to eat here.  Strange rules about what you can eat in the bar area.  Food just meh.',
  'useful':

## Step 3: Save to csv files

In [19]:
seafood_business_df = pd.DataFrame(seafood_business)
seafood_business_df.to_csv('seafood_business.csv')

In [20]:
all_review_df = pd.DataFrame(all_review)
all_review_df.to_csv('all_review.csv')