# Yelp Checkins Wrangling

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime

import json

import seaborn as sns
sns.set()
# sns.set_style('whitegrid')
# sns.set_context("poster")

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [3]:
day_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Load Checkins Data

In [4]:
print('[%s] Loading Checkins Data...' % datetime.datetime.now().time())

data = pd.DataFrame()
source_data_file = '../source_data/checkin.json'

checkins_list = []
for line in open(source_data_file, 'r'):
    checkins_list.append(json.loads(line))

print('[%s] creating dataframe...' % datetime.datetime.now().time())    
checkins_df = pd.DataFrame(checkins_list)

print('[%s] data type cleanup...' % datetime.datetime.now().time())    
# tips_df.date        = pd.to_datetime(tips_df.date)
# tips_df.likes       = tips_df.likes.astype('int')
    
print('[%s] Complete!' % datetime.datetime.now().time())

[02:37:37.347645] Loading Checkins Data...
[02:37:46.757365] creating dataframe...
[02:37:47.298917] data type cleanup...
[02:37:47.299149] Complete!


In [5]:
checkins_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135148 entries, 0 to 135147
Data columns (total 2 columns):
business_id    135148 non-null object
time           135148 non-null object
dtypes: object(2)
memory usage: 2.1+ MB


In [6]:
checkins_df.head()

Unnamed: 0,business_id,time
0,7KPBkxAOEtb3QeIL9PEErg,"{'Thursday': {'21:00': 4, '1:00': 1, '4:00': 1..."
1,kREVIrSBbtqBhIYkTccQUg,"{'Monday': {'13:00': 1}, 'Thursday': {'20:00':..."
2,tJRDll5yqpZwehenzE2cSg,"{'Monday': {'12:00': 1, '1:00': 1}, 'Saturday'..."
3,nhZ1HGWD8lMErdn3FuWuTQ,"{'Sunday': {'18:00': 1, '17:00': 2, '22:00': 1..."
4,vDoXZGE7p6xAkKQ0XQPvoA,"{'Thursday': {'15:00': 1}, 'Saturday': {'23:00..."


In [7]:
def unpack(df, column, fillna=None):
    ret = None
    if fillna is None:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems()))], axis=1)
        del ret[column]
    else:
        ret = pd.concat([df, pd.DataFrame((d for idx, d in df[column].iteritems())).fillna(fillna)], axis=1)
        del ret[column]
    return ret

checkins_daily_df = unpack(checkins_df, 'time')

In [None]:
checkins_daily_df.head()

Unnamed: 0,business_id,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,7KPBkxAOEtb3QeIL9PEErg,"{'16:00': 1, '14:00': 2, '10:00': 2, '23:00': ...","{'12:00': 1, '11:00': 1, '14:00': 1, '18:00': ...","{'21:00': 1, '23:00': 3, '18:00': 4, '10:00': ...","{'18:00': 1, '16:00': 1, '14:00': 1, '19:00': ...","{'21:00': 4, '1:00': 1, '4:00': 1, '2:00': 1, ...","{'18:00': 2, '12:00': 1, '13:00': 2, '16:00': ...","{'11:00': 2, '13:00': 2, '14:00': 1, '17:00': ..."
1,kREVIrSBbtqBhIYkTccQUg,,{'13:00': 1},"{'21:00': 1, '16:00': 1}",{'19:00': 1},"{'20:00': 1, '13:00': 1}",,{'17:00': 1}
2,tJRDll5yqpZwehenzE2cSg,,"{'12:00': 1, '1:00': 1}",{'16:00': 1},,{'0:00': 1},,
3,nhZ1HGWD8lMErdn3FuWuTQ,"{'18:00': 1, '22:00': 2, '1:00': 1, '0:00': 1}","{'17:00': 1, '18:00': 1, '19:00': 1, '2:00': 1...","{'20:00': 1, '23:00': 2, '22:00': 3, '17:00': ...","{'18:00': 1, '17:00': 2, '22:00': 1, '0:00': 1...","{'3:00': 1, '0:00': 1, '20:00': 1, '21:00': 1,...","{'17:00': 1, '1:00': 1, '23:00': 3, '22:00': 3}","{'3:00': 1, '0:00': 1, '22:00': 1}"
4,vDoXZGE7p6xAkKQ0XQPvoA,,,{'23:00': 1},,{'15:00': 1},,


In [None]:
chunks = []
for day in checkins_daily_df.columns[1:]:
    data = checkins_daily_df[day].copy().to_frame()
    chunk = data[day].apply(pd.Series)
    chunk.drop([0], axis=1, inplace=True)
    chunk.columns = [int(str(x).split(':')[0]) for x in chunk.columns]

#     fill in missing hours
    for n in range(0, 24, 1):
        if n not in chunk.columns:
            chunk[n] = np.nan
    chunk['day'] = day

    chunks.append(chunk)
    
    
checkins_hourly_df = pd.concat(chunks)
checkins_hourly_df

  result = result.union(other)
  result = result.union(other)
  union = _union_indexes(indexes)
  union = _union_indexes(indexes)


In [None]:
weekly_checkins = checkins_hourly_df.merge(checkins_daily_df, left_index=True, right_index=True, how='left')
weekly_checkins.drop(day_labels, axis=1, inplace=True)
weekly_checkins.fillna(0, inplace=True)
weekly_checkins.reset_index(inplace=True, drop=True)
weekly_checkins.head(3)

# Write to File
<p>Split by day of week</p>

In [None]:
day_count = len(weekly_checkins.day.unique())
for i, day in enumerate(weekly_checkins.day.unique()):
    biz_df = weekly_checkins[weekly_checkins.day == day].copy()
    biz_df.reset_index(inplace=True, drop=True)
    print('%s of %s' % (str(i+1), str(day_count)))
    biz_df.to_csv('../clean_data/02_checkins/%s_checkins_clean.csv' % (day))

In [None]:
weekly_checkins.shape[0]