In [1]:
#Importing libraries and the visitor log obtained from gym owners.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import dates as mdates
from matplotlib import patches as patches
from matplotlib.dates import DateFormatter
import seaborn as sns
from datetime import date, timedelta, datetime,time
import os
visitor_log = pd.read_csv('../input/visitor-log/visitor_log.csv')

In [2]:
#Checking what the visitor log looks like
visitor_log.head(10)

Unnamed: 0,Visit Date,Visit Time,Member ID,Membership Name,Entry Access
0,2016-12-01,19:08:41,5078.0,Regular Monthly Membership,Successful
1,2016-12-01,19:08:42,5078.0,Regular Monthly Membership,Successful
2,2016-12-02,8:57:31,5354.0,Personal Training,Successful
3,2016-12-04,9:02:23,5078.0,,Successful
4,2016-12-04,9:02:24,5078.0,Regular Monthly Membership,Successful
5,2016-12-05,16:57:19,,,Denied: New Card
6,2016-12-05,16:54:01,5146.0,*ZDNU* Unlimited Monthly Membership (In Person...,Successful
7,2016-12-05,16:54:03,5146.0,*ZDNU* Unlimited Monthly Membership (In Person...,Successful
8,2016-12-05,16:58:01,5282.0,,Successful
9,2016-12-05,16:58:20,5282.0,,Successful


In [3]:
#Making a Timestamp index by combining the Date/Time columns
visitor_log['Timestamp'] = pd.to_datetime(visitor_log['Visit Date'] + ' ' + visitor_log['Visit Time'])
visitor_log.set_index(['Timestamp'])

#Adding the day of the week, and taking out the 'Denied' entries
visitor_log['Weekday'] = visitor_log.Timestamp.dt.day_name()
visitor_log = visitor_log[visitor_log['Entry Access'] != 'Denied']

#Converting the Membership Name column to strings
visitor_log['Membership Name'] = visitor_log['Membership Name'].astype(str)
visitor_log['Weekday'] = visitor_log['Weekday'].astype(str)

In [4]:
#Checking on column types
visitor_log.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 102423 entries, 0 to 102422
Data columns (total 7 columns):
Visit Date         102423 non-null object
Visit Time         102423 non-null object
Member ID          99672 non-null float64
Membership Name    102423 non-null object
Entry Access       102423 non-null object
Timestamp          102423 non-null datetime64[ns]
Weekday            102423 non-null object
dtypes: datetime64[ns](1), float64(1), object(5)
memory usage: 6.3+ MB


In [5]:
#Checking what the visitor log looks like again with the changes
visitor_log.head(10)

Unnamed: 0,Visit Date,Visit Time,Member ID,Membership Name,Entry Access,Timestamp,Weekday
0,2016-12-01,19:08:41,5078.0,Regular Monthly Membership,Successful,2016-12-01 19:08:41,Thursday
1,2016-12-01,19:08:42,5078.0,Regular Monthly Membership,Successful,2016-12-01 19:08:42,Thursday
2,2016-12-02,8:57:31,5354.0,Personal Training,Successful,2016-12-02 08:57:31,Friday
3,2016-12-04,9:02:23,5078.0,,Successful,2016-12-04 09:02:23,Sunday
4,2016-12-04,9:02:24,5078.0,Regular Monthly Membership,Successful,2016-12-04 09:02:24,Sunday
5,2016-12-05,16:57:19,,,Denied: New Card,2016-12-05 16:57:19,Monday
6,2016-12-05,16:54:01,5146.0,*ZDNU* Unlimited Monthly Membership (In Person...,Successful,2016-12-05 16:54:01,Monday
7,2016-12-05,16:54:03,5146.0,*ZDNU* Unlimited Monthly Membership (In Person...,Successful,2016-12-05 16:54:03,Monday
8,2016-12-05,16:58:01,5282.0,,Successful,2016-12-05 16:58:01,Monday
9,2016-12-05,16:58:20,5282.0,,Successful,2016-12-05 16:58:20,Monday


In [6]:
#Setting up prarameters for occupants at a given hour
sign_ins = visitor_log['Timestamp']
now = sign_ins[0]

#A dataframe is going to be made that contains the occupants within the last 1.5hrs, a timestamp, a date column and hour column
#Min/Max times are used to determine if particular sign-in has occurred within 1.5hrs of the hour
occupants = []
realtime = []
occupant_time = []
occupant_date = []
day_delta = timedelta(days=1)
start_date = datetime(2017, 1, 1)
end_date = datetime(2019, 6, 11)
min_time = timedelta(hours=0)
max_time = timedelta(hours=1.5)

In [None]:
#Loop runs through each hour of the day, looking at the current hour(ref) it will subtract each sign-in time(i).
#If the subtraction is between 0-1.5hrs, add to the sum of occupants in that hour
for day in range((end_date - start_date).days):
    dt = (start_date + day*day_delta).isoformat()
    for hour in range(0, 24):
        ref_day = pd.to_datetime(dt)
        ref = datetime.combine(ref_day, time(hour, 0))
        occupants.append(sum(min_time <= ref-i <= max_time for i in sign_ins))
        realtime.append(datetime.combine(ref_day.date(), ref.time()))
        occupant_time.append(hour)
        occupant_date.append(ref_day.date())

In [None]:
#Creating a dataframe counting number of visitors in the last 1.5 hours, indexed per hour
#Added a weekday column
occupant_log = pd.DataFrame({"Occupants": occupants, "realtime": realtime, "Date": occupant_date, "Time": occupant_time})
occupant_log['Day'] = occupant_log['realtime'].dt.dayofyear
occupant_log['Weekday'] = occupant_log['realtime'].dt.weekday
occupant_log['Month'] = occupant_log['realtime'].dt.month
occupant_log['realtime'] =  pd.to_datetime(occupant_log['realtime'])
occupant_log.set_index('realtime')

In [None]:
#Exporting the occupants in each hour, in a CSV to be used with weather data.
#Occupants.to_csv('Occupants.csv')

from IPython.display import HTML
import base64

def create_download_link(occupant_log, title = "Download CSV file", filename = "occupant_log.csv"):  
    csv = occupant_log.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(occupant_log)