In [6]:
import csv
import pdb
import dill as pkl
from collections import Counter
from operator import itemgetter
import psycopg2
from psycopg2.extras import NamedTupleConnection
import requests
from requests_oauthlib import OAuth1
from pprint import pprint

In [7]:
import yelp_api_machinery as yam
import cleaners
from table_builders import RestaurantsTableBuilder, CategoriesTableBuilder, NeighborhoodsTableBuilder

In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## _This Notebook: Download Raw Data, Clean, Create "inspections" table, Pull Restaurant Urls, Build "restaurants", "categories", and "neighborhoods" Tables_

#--------------------------------------------------------------------------------------------

## Set Up Database

### Download Data

In [None]:
# Export endpoints on data.cityofnewyork.us for dataset xx67-kt59 (JSON and CSV renditions).
# Only csv_url is fetched by the visible cells; json_url is not referenced again in this notebook.
json_url = 'https://data.cityofnewyork.us/api/views/xx67-kt59/rows.json?accessType=DOWNLOAD'
csv_url = 'https://data.cityofnewyork.us/api/views/xx67-kt59/rows.csv?accessType=DOWNLOAD'

In [None]:
# Echo the endpoint that the next cell downloads, so the data source is visible in the output.
print csv_url

In [None]:
# Fetch the whole CSV export into memory.
r = requests.get(csv_url)
# Fail fast on a 4xx/5xx response; otherwise the error page body would be
# written to disk and parsed as inspection data by the following cells.
r.raise_for_status()
# Size of the downloaded payload in bytes (displayed as the cell result).
len(r.content)

In [None]:
# r.content is the raw response body as bytes, so the file must be opened in
# binary mode: text mode would translate newlines on Windows under Python 2
# and raises TypeError under Python 3.
with open('raw_inspection_data.csv', 'wb') as f:
    f.write(r.content)

### Clean CSV

To a list...

In [None]:
%%time 

with open('./raw_inspection_data.csv', 'rt') as f:
    # Guess the CSV dialect from the first 1024 characters, then rewind so the
    # reader starts again from the header line.
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.reader(f, dialect)
    # Builtin next() instead of the Python-2-only reader.next() method.
    header = next(reader)

    # One cleaned entry per data row; each row is first keyed by the header names.
    entries = []
    for row in reader:
        entry_dict = dict(zip(header, row))
        entries.append(cleaners.inspection_entry(entry_dict))


To disk...

In [None]:
%% time 

fieldnames = ['CAMIS',
            'DBA',
            'ADDRESS',
            'ZIPCODE',
            'PHONE',
            'INSPECTION_TYPE',
            'INSPECTION_DATE',
            'ACTION',
            'SCORE',
            'GRADE',
            'GRADE_DATE',
            'VIOLATION_CODE',
            'CRITIAL_FLAG',
            'RECORD_DATE']

csv.register_dialect('pipes', delimiter='|', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
with open('cleaned_inspection_data.csv', 'wt') as f:
    writer = csv.DictWriter(f, fieldnames, dialect='pipes')
    writer.writeheader()
    for entry in entries:
        writer.writerow(entry.values)
        

### Add "inspections" Table to Database

In [None]:
# Open a connection to the local "yelp" database and a cursor used by the
# table-creation and COPY cells below.
conn = psycopg2.connect("dbname=yelp")
c = conn.cursor()

Create table:

In [None]:
# Drop and recreate "inspections" so this cell can be re-run from scratch.
# Column order follows the `fieldnames` list used to write the cleaned CSV; the
# COPY below names no column list, so columns are presumably filled by position.
# NOTE(review): the CSV header says CRITIAL_FLAG while the column is CRITICAL;
# harmless only if COPY's HEADER option skips the header line -- confirm.
c.execute("DROP TABLE IF EXISTS inspections")
c.execute('''CREATE TABLE inspections (
CAMIS varchar(10),
DBA varchar(255),
ADDRESS  varchar(100),
ZIPCODE varchar(5),
PHONE varchar(12),
INSPECTION_TYPE varchar(64),
INSPECTION_DATE date,
ACTION varchar(150),
SCORE smallint,
GRADE varchar(1),
GRADE_DATE date,
VIOLATION_CODE varchar(3),
CRITICAL varchar(1),
RECORD_DATE date
)'''
)
conn.commit()

Populate with COPY:

In [None]:
# Bulk-load the pipe-delimited cleaned file; the literal text NULL marks
# missing values and the first line is declared as a header.
copy_command = '''
COPY inspections 
FROM STDIN
WITH (
FORMAT CSV,
DELIMITER '|',
NULL 'NULL',
HEADER TRUE,
QUOTE '"'
);
'''

# Close the connection even when COPY fails. Otherwise the aborted transaction
# stays open on the server and can block a later DROP TABLE when the notebook
# is re-run. Closing without commit discards the partial load.
try:
    with open('./cleaned_inspection_data.csv', 'rt') as f:
        c.copy_expert(copy_command, f)
    conn.commit()
finally:
    conn.close()

#--------------------------------------------------------------------------------------------

## Yelp API

In [23]:
# Coordinator for the first Yelp pass: only the phone lookup is enabled
# (by_phone=True, both address lookups off). With report_interval=25 the run
# below prints a progress line for every 25th restaurant.
trwlr = yam.YelpApiCoordinator(start_read = 0, by_phone = True, by_addr_0 = False, by_addr_1= False,
                                report_interval = 25)

In [None]:
# separate hits from misses
# record successes
# retry misses
# find best match over threshold
## match address, match name
# record second pass successes
# plan out how this happens


In [17]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio between sequences ``a`` and ``b``.

    The result is a float between 0.0 (nothing in common) and 1.0 (identical).
    No junk filter is supplied to the matcher.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [34]:
# Spot check of the similarity score on two different-looking restaurant names.
similar("KoJa Kitchen", "Koa Home-*Kitchen")

0.7586206896551724

In [24]:
# Pull one batch of restaurants through the coordinator.
# start_record and n are reused further down to name the pickle files for this batch.
start_record = 0
n = 100
trwlr.seek(start_record)
# Later cells index each item of `output` as (inspection_record, results_dict).
output = trwlr.read_next_n(n)

100 Restaurants to pull.
Pulling restaurant number 0...
Restaurant 0 completed.
Pulling restaurant number 25...
Restaurant 25 completed.
Pulling restaurant number 50...
Restaurant 50 completed.
Pulling restaurant number 75...
Restaurant 75 completed.
Completion: 100 restaurants pulled.


In [75]:
# Presumably releases the coordinator's connection; `output` stays available
# to the following cells because it is already held in memory.
trwlr.close_conn()

In [30]:
# Split the batch on whether the 'by_phone' result is present.
# successes: (camis, by_phone result) pairs; failures: the inspection records
# whose 'by_phone' result is None (retried by address further down).
successes = map(lambda x: (x[0].camis, x[1]['by_phone']), filter(lambda x: x[1]['by_phone'] is not None, output))
failures = map(itemgetter(0), filter(lambda x: x[1]['by_phone'] is None, output))

In [33]:
# Same `successes` computation as the previous cell, written in two steps,
# followed by a pretty-print of the (camis, by_phone result) pairs.
successes = filter(lambda x: x[1]['by_phone'] is not None, output)
successes = map(lambda x: (x[0].camis, x[1]['by_phone']), successes)
pprint(successes)

[('30075445',
  RestaurantExtract(yelp_name=u'Morris Pk Bake Shop', yelp_id=u'morris-pk-bake-shop-bronx', yelp_address=u'1007 Morris Park Avenue', yelp_city=u'Bronx', yelp_zipcode=u'10462', yelp_phone=u'17188924968', review_count=19, rating=4.5, categories=[u'Bakeries', u'Desserts'], neighborhoods=[u'Morris Park'])),
 ('30112340',
  RestaurantExtract(yelp_name=u"Wendy's", yelp_id=u'wendys-brooklyn-4', yelp_address=u'469 Flatbush Ave', yelp_city=u'Brooklyn', yelp_zipcode=u'11225', yelp_phone=u'17182875005', review_count=19, rating=2.0, categories=[u'Fast Food', u'Burgers'], neighborhoods=[u'Prospect Heights', u'Prospect Lefferts Gardens'])),
 ('30191841',
  RestaurantExtract(yelp_name=u'D.J. Reynolds', yelp_id=u'd-j-reynolds-new-york-3', yelp_address=u'351 W 57th St', yelp_city=u'New York', yelp_zipcode=u'10019', yelp_phone=u'12122452912', review_count=58, rating=3.5, categories=[u'Irish', u'Pubs'], neighborhoods=[u"Hell's Kitchen", u'Midtown West'])),
 ('40356018',
  RestaurantExtract(

In [None]:
def make_trwlr_output_pklable(output):
    """Convert coordinator output into plain dictionaries suitable for pickling.

    Each item of ``output`` is indexed as ``(record, results)``, where
    ``record`` provides ``_asdict()`` and ``results['by_phone']`` is either
    ``None`` or an object providing ``_asdict()``. Every item is mapped to
    ``(record._asdict(), by_phone._asdict() or None)``.
    """

    def _plain_pair(item):
        # Read the phone match first, then the record, as the dump expects.
        phone_match = item[1]['by_phone']
        match_dict = phone_match._asdict() if phone_match is not None else None
        return (item[0]._asdict(), match_dict)

    return map(_plain_pair, output)
    

In [None]:
# Persist this batch as plain dicts. The file name encodes the 1-based record
# range, e.g. './1-100output.pkl' for start_record=0 and n=100.
pklable_output = make_trwlr_output_pklable(output)
with open('./{0}-{1}output.pkl'.format(start_record+1, start_record+n), 'wb') as f:
    pkl.dump(pklable_output, f)

In [None]:
# Summary of the phone pass: batch size vs. number of items with a non-None
# 'by_phone' result (the same filter used to build `successes`).
print "Total restuarants processed: {0}".format(len(output))
print "Successfully paired: {0}".format(len(filter(lambda x: x[1]['by_phone'] is not None, output)))

### Retry Failed Phone Pull by Address Search

In [None]:
# How many records matched by phone, and how many are left for the address retry.
print len(successes)
print len(failures)

In [None]:
# Names (dba) of the first 50 restaurants that the phone lookup missed.
pprint(map(lambda x: (x.dba), failures)[:50])

In [None]:
# Second pass: phone lookup off, both address lookups on.
# NOTE(review): `yelp_api_interfacer` is not defined or imported anywhere in the
# visible notebook (only `yelp_api_machinery as yam` is imported), so this cell
# raises NameError on a fresh kernel -- confirm the intended constructor.
yelp_interfacer = yelp_api_interfacer(by_phone = False, by_addr_0 = True, by_addr_1= True)

In [None]:
# Retry only the records that the phone lookup missed. find_match below reads
# each result as (inspection_record, dict of candidate Yelp records).
output_v2 = yelp_interfacer.pull_restaurants(failures)

In [None]:
def find_match(result_tuple):
    """Pick the first Yelp candidate whose street address resembles the record's.

    Parameters
    ----------
    result_tuple : sequence
        ``result_tuple[0]`` is the inspection record (needs an ``address``
        attribute); ``result_tuple[1]`` is a dict whose values are candidate
        Yelp records (each ``None`` or an object with an ``address`` attribute).

    Returns
    -------
    tuple or None
        ``(record, yelp_record)`` for the first candidate whose first address
        token (the house number) equals the record's and whose second token
        starts with the same letter, compared case-insensitively. ``None`` when
        no candidate qualifies or when either address has fewer than two tokens.

    Notes
    -----
    Both addresses are tokenised with ``str.split()``. This collapses runs of
    whitespace, so a double space can no longer produce an empty token and an
    ``IndexError``. It also removes the need for the ``re`` module, which this
    notebook never imports. A missing (``None``) record address yields ``None``.

    NOTE(review): the ``RestaurantExtract`` tuples printed earlier in this
    notebook expose ``yelp_address`` rather than ``address`` -- confirm that
    the second-pass records really carry an ``address`` attribute.
    """
    record = result_tuple[0]
    yelp_records = result_tuple[1]

    record_tokens = (record.address or '').split()
    if len(record_tokens) < 2:
        return None
    house_number = record_tokens[0]
    street_initial = record_tokens[1].lower()[0]

    for yelp_record in yelp_records.values():
        if yelp_record is None or yelp_record.address is None:
            continue

        yelp_tokens = yelp_record.address.split()
        if len(yelp_tokens) < 2:
            continue

        if yelp_tokens[0] == house_number and yelp_tokens[1].lower()[0] == street_initial:
            return (record, yelp_record)

    return None

In [None]:
# Keep only the records for which find_match returned a (record, yelp_record)
# pair; filter(None, ...) drops the None results.
found = filter(None, map(find_match, output_v2))

In [None]:
# Persist the address-pass matches as (record dict, yelp dict) pairs. The file
# name uses the same 1-based range as the first-pass file, e.g.
# './1-100found_second_pass.pkl' for start_record=0 and n=100.
pklable_found_p1 = map(lambda x: (x[0]._asdict(), x[1]._asdict()), found)
with open('./{0}-{1}found_second_pass.pkl'.format(start_record+1, start_record + n), 'wb') as f:
    pkl.dump(pklable_found_p1, f)

In [None]:
# Most frequent restaurant names (dba) among the phone-lookup failures.
# NOTE: this rebinds `c`, which held the database cursor in the setup section.
c = Counter(map(lambda x: (x.dba), failures))
c.most_common(30)

In [None]:
# Most frequent names among the records recovered by the address retry.
c_retry = Counter(map(lambda x: x[0].dba, found))
c_retry.most_common(30)

In [None]:
# Remove the recovered names from the failure counts, leaving the still-unmatched ones.
# Counter.subtract mutates `c` in place, so re-running this cell subtracts the
# retry counts a second time; re-run the cell that builds `c` first.
c.subtract(c_retry)

In [None]:
# Names that remain most often in the failure counts after the subtraction.
c.most_common(30)

#--------------------------------------------------------------------------------------------

## Build Restaurants Tables

In [None]:
# TODO make more systematic
# Batches written by the first (phone) pass and the second (address) pass.
output_p1_paths = ['./1-10000output.pkl', './10001-20000output.pkl', './20001-30000output.pkl']
output_p2_paths = ['./1-10000found_second_pass.pkl', './10001-20000found_second_pass.pkl', './20001-30000found_second_pass.pkl']


def _load_pickled_batches(paths):
    """Return one list holding the items of every pickled batch in ``paths``, in order.

    Unpickling can execute arbitrary code, so only point this at files written
    by this notebook, never at untrusted pickles.
    """
    combined = []
    for path in paths:
        with open(path, 'rb') as f:
            combined.extend(pkl.load(f))
    return combined


# A single helper replaces the two copy-pasted load loops.
outputs_p1 = _load_pickled_batches(output_p1_paths)
outputs_p2 = _load_pickled_batches(output_p2_paths)
        

In [None]:
# Combine both passes into one list of (record dict, yelp dict) pairs.
# First-pass entries carry None as the second element when the phone lookup
# missed, so those are dropped; second-pass files hold matches only.
all_successes = []
all_successes.extend(filter(lambda x: x[1] is not None, outputs_p1))
all_successes.extend(outputs_p2)
print len(all_successes)

In [None]:
# Duplicate check: count how often each camis appears among the combined
# successes. If the highest count shown is 1, no restaurant was paired twice.
# NOTE: rebinds `c` again (previously the cursor, then the name counter).
c = Counter(map(lambda x: x[0].get('camis'), all_successes))
print c.most_common(5)
# No duplicates? :-)

In [None]:
# restaurants
## (yelp_id, camis, yelp_name, address, zipcode)
# neighborhoods
## (yelp_id, neighborhood)
# categories
## (yelp_id, category)

## Restaurants Table

In [None]:
# Build the "restaurants" table from the combined matches.
# Presumably build_table() creates the table and add_records() inserts one row
# per match -- see table_builders for the exact behaviour.
rtb = RestaurantsTableBuilder()
rtb.build_table()
rtb.add_records(all_successes)

## Categories Table

In [None]:
# Build the "categories" table from the same combined matches.
# Presumably one row per (yelp_id, category), per the schema notes in this
# notebook -- confirm in table_builders.
ctb = CategoriesTableBuilder()
ctb.build_table()
ctb.add_records(all_successes)

## Neighborhoods Table

In [None]:
# Build the "neighborhoods" table from the same combined matches.
# Presumably one row per (yelp_id, neighborhood), per the schema notes in this
# notebook -- confirm in table_builders.
ntb = NeighborhoodsTableBuilder()
ntb.build_table()
ntb.add_records(all_successes)