# Yelp Checkins Wrangling

In [1]:
%matplotlib inline

import pandas as pd
import datetime
import json
import numpy as np


DRY_RUN = False

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

def time_marker(text=''):
    print('[{}] {}'.format(datetime.datetime.now().time(), text.title()))
    
def unpack(df, column, fillna=None):
    ret = None
    if fillna is None:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems()))], axis=1)
        del ret[column]
    else:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems())).fillna(fillna)], axis=1)
        del ret[column]
    return ret

# Load Checkins Data

In [4]:
time_marker(text='Loading Check Ins Data...')

data = pd.DataFrame()
source_data_file = '../source_data/checkin.json'

checkins_list = []
for line in open(source_data_file, 'r'):
    checkins_list.append(json.loads(line))

time_marker(text='creating dataframe...')
checkins_df = pd.DataFrame(checkins_list)
checkins_df.head(3)

[19:08:56.276808] Loading Checkings Data...
[19:09:00.653863] Creating Dataframe...


Unnamed: 0,business_id,time
0,7KPBkxAOEtb3QeIL9PEErg,"{'Thursday': {'21:00': 4, '1:00': 1, '4:00': 1..."
1,kREVIrSBbtqBhIYkTccQUg,"{'Monday': {'13:00': 1}, 'Thursday': {'20:00':..."
2,tJRDll5yqpZwehenzE2cSg,"{'Monday': {'12:00': 1, '1:00': 1}, 'Saturday'..."


In [5]:
business_ids = checkins_df.business_id.to_frame()
business_ids.head(3)

Unnamed: 0,business_id
0,7KPBkxAOEtb3QeIL9PEErg
1,kREVIrSBbtqBhIYkTccQUg
2,tJRDll5yqpZwehenzE2cSg


# Unpack `time` column

In [6]:
time_marker(text='unpacking daily check in counts...')
checkins_df = unpack(checkins_df, 'time')
checkins_df.head(3)

[19:09:01.027549] Unpacking Daily Check In Counts...


Unnamed: 0,business_id,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,7KPBkxAOEtb3QeIL9PEErg,"{'16:00': 1, '14:00': 2, '10:00': 2, '23:00': ...","{'12:00': 1, '11:00': 1, '14:00': 1, '18:00': ...","{'21:00': 1, '23:00': 3, '18:00': 4, '10:00': ...","{'18:00': 1, '16:00': 1, '14:00': 1, '19:00': ...","{'21:00': 4, '1:00': 1, '4:00': 1, '2:00': 1, ...","{'18:00': 2, '12:00': 1, '13:00': 2, '16:00': ...","{'11:00': 2, '13:00': 2, '14:00': 1, '17:00': ..."
1,kREVIrSBbtqBhIYkTccQUg,,{'13:00': 1},"{'21:00': 1, '16:00': 1}",{'19:00': 1},"{'20:00': 1, '13:00': 1}",,{'17:00': 1}
2,tJRDll5yqpZwehenzE2cSg,,"{'12:00': 1, '1:00': 1}",{'16:00': 1},,{'0:00': 1},,


### If this is a dry run, only process the first 20 business records

In [7]:
if DRY_RUN:
    checkins_df = checkins_df[:20].copy()

# Unpack Hourly checkins from each Day Column

In [8]:
time_marker(text='splitting daily counts into hourly columns...')
chunks = []

for day in checkins_df.columns[1:]:
    data = checkins_df[day].copy().to_frame()

    # fill missing daily checkin dicts with empty dict
    data=data.applymap(lambda x: {} if pd.isnull(x) else x)

    # unpack dict to hourly
    chunk = data[day].apply(pd.Series)

    chunk.columns = [int(str(x).split(':')[0]) for x in chunk.columns]

    # fill in missing hours
    for n in range(0, 24, 1):
        if n not in chunk.columns:
            chunk[n] = np.nan
    chunk['day'] = day

    chunks.append(chunk)

checkins_df = pd.concat(chunks)
checkins_df.fillna(0, inplace=True)
checkins_df.head(3)

[19:09:01.688504] Splitting Daily Counts Into Hourly Columns...


Unnamed: 0,0,10,11,12,13,14,15,16,17,18,...,23,2,3,4,5,6,7,8,9,day
0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Friday
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Friday
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Friday


# Append `business_id` to each daily checkin row, left merge

In [9]:
time_marker(text='appending business_id columns to hourly records...')
checkins_clean = checkins_df.merge(business_ids, left_index=True, right_index=True, how='left')
checkins_clean.head(3)

[19:16:53.267939] Appending Business_Id Columns To Hourly Records...


Unnamed: 0,0,10,11,12,13,14,15,16,17,18,...,2,3,4,5,6,7,8,9,day,business_id
0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Friday,7KPBkxAOEtb3QeIL9PEErg
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Monday,7KPBkxAOEtb3QeIL9PEErg
0,1.0,1.0,0.0,1.0,3.0,1.0,1.0,2.0,3.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Saturday,7KPBkxAOEtb3QeIL9PEErg


# Cleanup and Reset Index

In [10]:
time_marker(text='cleaning up and reset index...')
checkins_clean.fillna(0, inplace=True)
checkins_clean.reset_index(inplace=True, drop=True)
checkins_clean.head(3)

[19:16:54.120451] Cleaning Up And Reset Index...


Unnamed: 0,0,10,11,12,13,14,15,16,17,18,...,2,3,4,5,6,7,8,9,day,business_id
0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Friday,7KPBkxAOEtb3QeIL9PEErg
1,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Monday,7KPBkxAOEtb3QeIL9PEErg
2,1.0,1.0,0.0,1.0,3.0,1.0,1.0,2.0,3.0,4.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Saturday,7KPBkxAOEtb3QeIL9PEErg


# Write to File
<p>Split by day of week</p>

In [11]:
time_marker(text='Writing to files...')
day_count = len(checkins_clean.day.unique())
for i, day in enumerate(checkins_clean.day.unique()):
    df = checkins_clean[checkins_clean.day == day].copy()
    df.reset_index(inplace=True, drop=True)

    file_name = '../clean_data/checkins/{}_{}_checkins_clean.csv'.format(str(i).zfill(2), day.lower())
    time_marker(text='Writing {} records file...'.format(day))
    if DRY_RUN:
        pass
    else:
        df.to_csv(file_name, encoding='utf-8')

[19:16:54.636819] Writing To Files...
[19:16:54.963056] Writing Friday Records File...
[19:16:59.203796] Writing Monday Records File...
[19:17:03.277101] Writing Saturday Records File...
[19:17:07.517902] Writing Sunday Records File...
[19:17:12.261991] Writing Thursday Records File...
[19:17:17.323008] Writing Tuesday Records File...
[19:17:21.466190] Writing Wednesday Records File...
