In [119]:
import psycopg2
import requests
from bs4 import BeautifulSoup
import csv
import re
import time

## Set Up Inpection Database

In [18]:
json_url = 'https://data.cityofnewyork.us/api/views/xx67-kt59/rows.json?accessType=DOWNLOAD'
csv_url = 'https://data.cityofnewyork.us/api/views/xx67-kt59/rows.csv?accessType=DOWNLOAD'

In [19]:
print csv_url

https://data.cityofnewyork.us/api/views/xx67-kt59/rows.csv?accessType=DOWNLOAD


In [5]:
r = requests.get(csv_url)
len(r.content)



188832637

In [6]:
with open('raw_inspection_data.csv', 'w') as f:
    f.write(r.content)

In [172]:
class inspection_entry():
    
    ## TODO data structure to build violation code table
    
    no_filter = ['CAMIS',
     'ZIPCODE',
     'INSPECTION DATE',
     'ACTION',
     'VIOLATION CODE',
     'GRADE',
     'SCORE',
     'GRADE DATE',
     'RECORD DATE',
     'INSPECTION TYPE']
    
    action_map = {'No violations were recorded at the time of this inspection.':'No violations cited.',
     'Violations were cited in the following area(s).':'Violations cited',
     'Establishment Closed by DOHMH.  Violations were cited in the following area(s) and those requiring immediate action were addressed.':'Establishment Closed by DOHMH'}
    
    def __init__(self, entry_dict, null_val = 'NULL'):
        self.null = null_val
        self.values = self._process(entry_dict)
    
    def _encode_clean(self, raw_str):
        return raw_str.replace('Â', '')
    
    def _action_process(self, action):
        
        if not action:
            return self.null
        else:
            return inspection_entry.action_map.get(action,action)
    
    def _crit_process(self, crit_val):
        
        if not crit_val:
            return self.null
        elif crit_val == 'Critical':
            return 1
        else:
            return 0
        
    def _addr_process(self, building, street):
        
        r = re.compile('\s+')
        addr = ' '.join((building, street))
        return r.sub(' ', addr)
        
    def _i_date_process(self, i_date):
    
        return i_date if i_date != '01/01/1900' else self.null
    
    def _process(self, entry_dict):
             
        na_fill = lambda v: v if v else self.null
        
        values = {}
        for field in inspection_entry.no_filter:
            values[field.replace(' ', '_')] = na_fill(entry_dict.get(field))
        
        values['DBA'] = self._encode_clean(entry_dict.get('DBA'))
        values['ADDRESS'] = self._addr_process(entry_dict.get('BUILDING'), entry_dict.get('STREET'))
        values['CRITIAL FLAG'.replace(' ', '_')] = self._crit_process(entry_dict.get('CRITICAL FLAG'))
        values['INSPECTION DATE'.replace(' ', '_')] = self._i_date_process(entry_dict.get('INSPECTION DATE'))
        values['ACTION'] = self._action_process(entry_dict.get('ACTION'))

        return values

In [173]:
st = time.clock()
with open('./raw_inspection_data.csv', 'rt') as f:
    dialect = csv.Sniffer().sniff(f.read(1024))
    f.seek(0)
    reader = csv.reader(f, dialect)
    header = reader.next()
    
    entries = []
    for row in reader:
        entry_dict =  dict(zip(header, row))
        entries.append(inspection_entry(entry_dict))
et = time.clock()
print et - st

16.880887


In [177]:
entries[0].values.keys()

['VIOLATION_CODE',
 'GRADE',
 'INSPECTION_TYPE',
 'CRITIAL_FLAG',
 'ZIPCODE',
 'DBA',
 'SCORE',
 'CAMIS',
 'GRADE_DATE',
 'ADDRESS',
 'ACTION',
 'INSPECTION_DATE',
 'RECORD_DATE']

In [188]:
st = time.clock()
fieldnames = ['CAMIS',
            'DBA',
            'ADDRESS',
            'ZIPCODE',
            'INSPECTION_TYPE',
            'INSPECTION_DATE',
            'ACTION',
            'SCORE',
            'GRADE',
            'GRADE_DATE',
            'VIOLATION_CODE',
            'CRITIAL_FLAG',
            'RECORD_DATE']

csv.register_dialect('pipes', delimiter='|', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
with open('cleaned_inspection_data.csv', 'wt') as f:
    writer = csv.DictWriter(f, fieldnames, dialect='pipes')
    writer.writeheader()
    for entry in entries:
        writer.writerow(entry.values)
        
ft = time.clock()
print "{0:f} seconds".format(ft-st)

5.632950 seconds


In [189]:
!head ./cleaned_inspection_data.csv

CAMIS|DBA|ADDRESS|ZIPCODE|INSPECTION_TYPE|INSPECTION_DATE|ACTION|SCORE|GRADE|GRADE_DATE|VIOLATION_CODE|CRITIAL_FLAG|RECORD_DATE
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Cycle Inspection / Initial Inspection|02/09/2015|Violations cited|6|A|02/09/2015|06C|1|06/24/2015
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Cycle Inspection / Initial Inspection|03/03/2014|Violations cited|2|A|03/03/2014|10F|0|06/24/2015
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Trans Fat / Second Compliance Inspection|10/10/2013|No violations cited.|NULL|NULL|NULL|NULL|0|06/24/2015
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Cycle Inspection / Re-inspection|09/11/2013|Violations cited|6|A|09/11/2013|04L|1|06/24/2015
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Cycle Inspection / Re-inspection|09/11/2013|Violations cited|6|A|09/11/2013|04N|1|06/24/2015
30075445|MORRIS PARK BAKE SHOP|1007 MORRIS PARK AVE|10462|Cycle Inspection / Initi

In [186]:
r = re.compile('"')
unusuals = set()
for e in entries:
    if r.match(e.values.get('DBA')):
        unusuals.add(e.values.get('DBA'))
print len(unusuals)
for u in unusuals:
    print u

0


In [179]:
r = re.compile('^[a-zA-Z&0-9#.:@() /\'$+!%?-]*$')
unusuals = set()
for e in entries:
    if not r.match(e.values.get('DBA')):
        unusuals.add((e.values.get('DBA'), e.values.get('ADDRESS')))
print len(unusuals)
for u in unusuals:
    print u[0]

130
DUNKIN' DONUTS, BASKIN ROBBINS
Long Wong Bakery II, Inc.
DUNKIN' DONUTS, BASKIN ROBBINS
BURGER KING, POPEYE'S CHICKEN & BISCUITS
DUNKIN' DONUTS, BASKIN ROBBINS
CHIC-FIL-A, QUIZNOS SUBS, JW'S, TOSSED
Tim Hortons, Soup Man, Tasti D. Lite
LI & LIN'S CHINA DRAGON, INC.
DUNKIN' DONUTS, BASKIN ROBBINS, SUBWAY
SOUPS,SALADS & BEYOND
SUBWAY, CARVEL ICE CREAM
NINE_D THAI
FLOYD, NY
BURGER KING, POPEYE'S
DUNKIN' DONUTS, BASKIN ROBBINS
1,001 NIGHTS
THE CANAL PARK PLAYHOUSE, INC
DUNKIN' DONUTS, BASKIN ROBBINS
DUNKIN' DONUTS, BASKIN ROBBINS
NORMA'S, BURGER JOINT
SUGAR HILL RESTAURANT, SUPPER CLUB AND DISCO
SKADDEN ARPS, SLATE
DUNKIN' DONUTS, BASKIN ROBBINS
CHINA MIA EXOTIC ASIAN FOOD, YOGURT & JUICE BAR
DUNKIN' DONUTS, BASKIN ROBBINS
HAMPTON CHUTNEY, CO.
DUNKIN' DONUTS, BASKIN ROBBINS
DUNKIN' DONUTS, BASKIN ROBBINS
DUNKIN' DONUTS, BASKIN ROBBINS
LENNY'S HANOVER, LLC
ROSE PIZZA, CHARLIE SUB GRILL, MOE'S SOUTHWEST GRILL
SUBWAY, CARVEL ICE CREAM
DADDY,S PIZZA & PASTA
Le Bernardin Privé
CALLAPIZZA US

In [170]:
for u in unusuals:
    print u
    print .replace('Â', '')
    print '************************'

Le Bernardin Privé
Le Bernardin Privé
************************
NIGHT°
NIGHT°
************************
99¢ FAMOUS PIZZA
99¢ FAMOUS PIZZA
************************
BUNK*R
BUNK*R
************************
CREDIT SUISSE B-2= CAFE
CREDIT SUISSE B-2= CAFE
************************
FDR 99¢ SLICE PIZZA
FDR 99¢ SLICE PIZZA
************************
99¢ HOT PIZZA
99¢ HOT PIZZA
************************
CAFÉ GUSTO
CAFÉ GUSTO
************************
NINE_D THAI
NINE_D THAI
************************
V {iv} Thai Restaurant & Bar
V {iv} Thai Restaurant & Bar
************************


In [144]:
first_cat = set()
second_cat = set()
for e in entries:
    itype = e.values['INSPECTION_TYPE']
    splits = itype.split('/')
    if len(splits) > 1:
        first_cat.add(splits[0].strip())
        second_cat.add(splits[1].strip())
print first_cat
print '****************************************'
print second_cat

set(['Cycle Inspection', 'Inter-Agency Task Force', 'Administrative Miscellaneous', 'Calorie Posting', 'Trans Fat', 'Pre-permit (Operational)', 'Smoke-Free Air Act', 'Pre-permit (Non-operational)'])
****************************************
set(['Compliance Inspection', 'Second Compliance Inspection', 'Reopening Inspection', 'Initial Inspection', 'Limited Inspection', 'Re-inspection'])


In [64]:
print header

['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE', 'PHONE', 'CUISINE DESCRIPTION', 'INSPECTION DATE', 'ACTION', 'VIOLATION CODE', 'VIOLATION DESCRIPTION', 'CRITICAL FLAG', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE']


In [21]:
conn = psycopg2.connect("dbname=yelp")

In [22]:
c = conn.cursor()

In [25]:
c.execute('''CREATE TABLE inspections (\
CAMIS varchar(10),
DBA varchar(255),
BORO varchar(15),
BUILDING varchar(10),
STREET varchar(100),
ZIPCODE varchar(5),
PHONE varchar(5),
C_DESC varchar(200),
INSPECTION_DATE date,
ACTION varchar(150),
VIOLATION_CODE varchar(3),
VIOLATION_DESC varchar(600),
CRITICAL varchar(15),
SCORE smallint,
GRADE varchar(1),
GRADE_DATE date,
RECORD_DATE date,
INSPECTION_TYPE varchar(64)
)'''
)

In [26]:
conn.commit()

In [40]:
c.execute('''
COPY inspections 
FROM './raw_inspection_data.csv'
WITH (
FORMAT CSV,
DELIMITER ',',
NULL '',
HEADER TRUE,
QUOTE '"'
)'''
)

ERROR: An unexpected error occurred while tokenizing input
The following traceback may be corrupted or invalid
The error message is: ('EOF in multi-line string', (1, 1))



ProgrammingError: must be superuser to COPY to or from a file
HINT:  Anyone can COPY to stdout or from stdin. psql's \copy command also works for anyone.


In [39]:
conn.rollback()

In [None]:
conn.commit()
conn.close()

In [51]:
dir(dialect)

['__doc__',
 '__init__',
 '__module__',
 '_name',
 '_valid',
 '_validate',
 'delimiter',
 'doublequote',
 'escapechar',
 'lineterminator',
 'quotechar',
 'quoting',
 'skipinitialspace']

In [57]:
dialect.

'\r\n'

['excel-tab', 'excel']

In [None]:
'''CREATE TABLE inspections (\
CAMIS varchar(10),
DBA varchar(255),
BORO varchar(15),
BUILDING varchar(10),
STREET varchar(100),
ZIPCODE varchar(5),
PHONE varchar(5),
C_DESC varchar(200),
INSPECTION_DATE date,
ACTION varchar(150),
VIOLATION_CODE varchar(3),
VIOLATION_DESC varchar(600),
CRITICAL varchar(15),
SCORE smallint,
GRADE varchar(1),
GRADE_DATE date,
RECORD_DATE date,
INSPECTION_TYPE varchar(64)
);

COPY inspections 
FROM './raw_inspection_data.csv'
WITH (
FORMAT CSV,
DELIMITER ',',
NULL '',
HEADER TRUE,
QUOTE '"'
);

'''

## Yelp API

In [None]:
cs

## Yelp Scraper