## Data analysis with python3 builtins

The goal of this notebook is to provide an overview of what can be achieved in terms of data analysis using only the core features of the Python language and its standard library. **NO** additional libraries are required.

In [1]:
import csv

# first way: use a context manager so the file is closed as soon as we are done.
# newline='' is what the csv module asks for: it lets the reader handle line
# endings itself, so quoted fields that contain line breaks are parsed correctly.
with open('Food_Inspections.csv', 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    food = list(reader)  # one mapping per CSV row, keyed by the header row

# alternative way (one-liner) -- shorter, but the file is never explicitly closed
# food = list(csv.DictReader(open('Food_Inspections.csv', newline='')))

In [2]:
# peek at the first record to see which columns each row carries
food[0]

OrderedDict([('Inspection ID', '2401393'),
             ('DBA Name', 'WILD HARE'),
             ('AKA Name', 'WILD HARE'),
             ('License #', '2133895'),
             ('Facility Type', 'Restaurant'),
             ('Risk', 'Risk 1 (High)'),
             ('Address', '2610 N HALSTED ST '),
             ('City', 'CHICAGO'),
             ('State', 'IL'),
             ('Zip', '60614'),
             ('Inspection Date', '09/18/2020'),
             ('Inspection Type', 'Canvass'),
             ('Results', 'No Entry'),
             ('Violations', ''),
             ('Latitude', '41.929462059691446'),
             ('Longitude', '-87.64904947805883'),
             ('Location', '(-87.64904947805883, 41.929462059691446)')])

In [3]:
# check which mapping type csv.DictReader produced for each row
type(food[0])

collections.OrderedDict

In [4]:
# Which outcomes can an inspection have?
# A set comprehension keeps a single copy of every distinct Results value.
{inspection['Results'] for inspection in food}

{'Business Not Located',
 'Fail',
 'No Entry',
 'Not Ready',
 'Out of Business',
 'Pass',
 'Pass w/ Conditions'}

In [5]:
# Keep only the inspections whose outcome was an outright failure,
# then show how many there are.
fail = [
    inspection
    for inspection in food
    if inspection['Results'] == 'Fail'
]
len(fail)

40848

In [6]:
from collections import Counter

# Tally the failed inspections per business name and list the five most frequent.
worst = Counter(inspection['DBA Name'] for inspection in fail)
worst.most_common(5)

[('SUBWAY', 365),
 ('DUNKIN DONUTS', 226),
 ("MCDONALD'S", 116),
 ('7-ELEVEN', 66),
 ('MCDONALDS', 60)]

In [10]:
# Let's fix this McDonald's spelling

# Rebuild `fail` with a list comprehension that creates a new dict for every row:
# `**inspection` unpacks all of the row's key/value pairs into the new dict literal,
# and the 'DBA Name' entry that follows overrides the original value with the
# cleaned-up name (apostrophes removed, upper-cased).

fail = [
    {**inspection, 'DBA Name': inspection['DBA Name'].replace("'", '').upper()}
    for inspection in fail
]

In [11]:
# Tally failed inspections per business name again and list the five most frequent.
worst = Counter(inspection['DBA Name'] for inspection in fail)
worst.most_common(5)

[('SUBWAY', 393),
 ('DUNKIN DONUTS', 245),
 ('MCDONALDS', 228),
 ('7-ELEVEN', 74),
 ('CHIPOTLE MEXICAN GRILL', 69)]

In [12]:
# Let's dive into the addresses: count failed inspections per street address
# and list the five most frequent.
bad = Counter(inspection['Address'] for inspection in fail)
bad.most_common(5)

[('11601 W TOUHY AVE ', 346),
 ('2300 S THROOP ST ', 107),
 ('324 N LEAVITT ST ', 88),
 ('500 W MADISON ST ', 82),
 ('5700 S CICERO AVE ', 64)]

In [13]:
# Looks interesting. Let's tabulate the data by year.
# by_year maps a year to a Counter of addresses: defaultdict creates an empty Counter
# the first time a year is seen, and every failed inspection bumps its address by one.
from collections import defaultdict

by_year = defaultdict(Counter)
for row in fail:
    year = row['Inspection Date'][-4:]  # dates look like MM/DD/YYYY
    by_year[year][row['Address']] += 1
by_year['2015'].most_common(5)

[('11601 W TOUHY AVE ', 39),
 ('500 W MADISON ST ', 13),
 ('307 S KEDZIE AVE ', 9),
 ('324 N LEAVITT ST ', 9),
 ('12 S MICHIGAN AVE ', 8)]

In [14]:
# concerning the string slicing: a string is a sequence of characters, so it can be
# sliced just like a list -- [-4:] keeps the last four characters
'09/18/2020'[-4:]

'2020'

In [15]:
# '11601 W TOUHY AVE ' is the Chicago airport :)
# Match on the address prefix, so entries with extra text after the street name
# are selected as well.
ohare = [
    inspection
    for inspection in fail
    if inspection['Address'].startswith('11601 W TOUHY')
]

In [16]:
# number of failed inspections matched by the address prefix
len(ohare)

347

In [17]:
# compare with the count for the exact address string (note the trailing space)
bad['11601 W TOUHY AVE ']

346

In [18]:
# what?? Why does `ohare` hold more rows than bad['11601 W TOUHY AVE ']?
# List the distinct address spellings matched by the prefix
# (left as they are -- I'm not fixing this).
{inspection['Address'] for inspection in ohare}

{'11601 W TOUHY AVE ', '11601 W TOUHY AVE T2 F12'}

In [19]:
# The location within the airport may be recorded in the restaurant's AKA
# ("also known as") name, so count how often each AKA name appears.
c = Counter(inspection['AKA Name'] for inspection in ohare)
c.most_common(5)

[('MACARONI GRILL (T3-K2)', 12),
 ('ARGO TEA  (T3 ROTUNDA)', 9),
 ("CHILI'S TOO (T2  F4)", 8),
 ('TOCCO (T5 M-07)', 7),
 ('ARGO TEA (T2/E5)', 7)]

In [20]:
# Let's find out how many times a single restaurant has failed:
# group the failed airport inspections by licence number.
inspections = defaultdict(list)
for row in ohare:
    license_no = row['License #']
    inspections[license_no].append(row)

In [21]:
# e.g. the dates of the failed inspections for the restaurant with License # = '34192'
[inspection['Inspection Date'] for inspection in inspections['34192']]

['03/20/2020',
 '09/14/2018',
 '05/31/2018',
 '02/16/2017',
 '04/07/2016',
 '09/04/2014',
 '09/20/2011',
 '01/26/2010']

In [24]:
# Let's focus on the Violations attribute: one text string per inspection,
# with the individual violations separated by '|'
ohare[1]['Violations']

'2. FACILITIES TO MAINTAIN PROPER TEMPERATURE - Comments: FACILITIES DO NOT MAINTAIN PROPER TEMPERATURE. OBSERVED (1) 4 DOOR PREP COOLER AT COOKLINE WITH AN AIR TEMPERATURE OF 49.0F. THE COOLER WAS USED TO STORE POTENTIALLY HAZARDOUS FOODS SUCH AS FRESH CUT FRENCH FRIES,LETTUCE,POTATOES AND FISH. UNIT TAGGED HELD FOR INSPECTION. MUST EMPTY THE UNIT AND HAVE IT REPAIRED SO THAT IT MAINTAINS AN AIR TEMPERATURE BELOW 40F. CRITICAL VIOLATION 7-38-005A. | 3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATURE REQUIREMENT DURING STORAGE, PREPARATION DISPLAY AND SERVICE - Comments: POTENTIALLY HAZARDOUS FOODS STORED AT IMPROPER TEMPERATURES. OBSERVED SMOKED SALMON AT 45.5F,DELI MEAT AT 49.0F,CHEESE AT 50.1F,POTATOES AT 44.7F,HAMBURGERS AT 45.1F,FISH AT 45.5F, GUACAMOLE AT 44.7F.INSTRUCTED TO MONITOR AND MAINTAIN ALL COLD FOODS AT 40F OR BELOW AND ALL HOT FOODS AT 140F OR ABOVE AT ALL TIMES.  CRITICAL VIOLATION 7-38-005(A).  | 8. SANITIZING RINSE FOR EQUIPMENT AND UTENSILS:  CLEAN, PROPER TEMPERATURE

In [23]:
# There is structure: the individual violations are separated by '|'.
# Split every inspection's text into its violations, so that we can rank the most
# frequent violations in the failed restaurants.
all_violations = [inspection['Violations'].split('|') for inspection in ohare]

In [25]:
# Count how often each violation title occurs across the failed airport inspections.
# An entry looks like "<number>. <TITLE> - Comments: <free text>"; only the part
# before the marker identifies the kind of violation.
c = Counter()
for violations in all_violations:
    for v in violations:
        # partition() returns the whole string when the marker is missing, whereas
        # v[:v.find(...)] would slice with -1 and silently drop the last character.
        title = v.partition('- Comments:')[0].strip()
        if title:  # an inspection without violations splits into [''] -- skip it
            c[title] += 1
c.most_common(5)

[('34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED',
  166),
 ('33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSILS CLEAN, FREE OF ABRASIVE DETERGENTS',
  158),
 ('35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS',
  139),
 ('18. NO EVIDENCE OF RODENT OR INSECT OUTER OPENINGS PROTECTED/RODENT PROOFED, A WRITTEN LOG SHALL BE MAINTAINED AVAILABLE TO THE INSPECTORS',
  118),
 ('32. FOOD AND NON-FOOD CONTACT SURFACES PROPERLY DESIGNED, CONSTRUCTED AND MAINTAINED',
  114)]