In [3]:
#Importing libraries and the visitor log obtained from gym owners.
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from matplotlib import dates as mdates
from matplotlib import patches as patches
from matplotlib.dates import DateFormatter
import seaborn as sns
from datetime import date, timedelta, datetime,time
import os
#Load the raw visitor log into a DataFrame; the path is relative to the notebook's working directory
#NOTE(review): '../input/...' looks like a Kaggle dataset mount - confirm the location before running elsewhere
visitor_log = pd.read_csv('../input/visitor-log/visitor_log.csv')

In [4]:
#Preview the first 10 rows of the raw visitor log to check its columns and typical values
visitor_log.head(10)

Unnamed: 0,Visit Date,Visit Time,Member ID,Membership Name,Entry Access
0,2017-01-01,10:52:53,5206.0,Regular Monthly Membership,Successful
1,2017-01-01,10:52:53,5206.0,Regular Monthly Membership,Successful
2,2017-01-02,6:51:34,5048.0,Regular Monthly Membership,Successful
3,2017-01-02,6:51:34,5048.0,Regular Monthly Membership,Successful
4,2017-01-02,7:02:27,5071.0,Regular Monthly Membership,Successful
5,2017-01-02,7:02:28,5071.0,,Successful
6,2017-01-03,17:57:42,,,Denied: New Card
7,2017-01-03,5:22:08,5206.0,Regular Monthly Membership,Successful
8,2017-01-03,5:22:08,5206.0,Regular Monthly Membership,Successful
9,2017-01-03,16:24:22,5073.0,Regular Monthly Membership,Successful


In [22]:
#Making a Timestamp column by combining the Date/Time string columns
visitor_log['Timestamp'] = pd.to_datetime(visitor_log['Visit Date'] + ' ' + visitor_log['Visit Time'])
#Timestamp is deliberately kept as a regular column: set_index returns a new frame rather than
#modifying visitor_log, and later cells read visitor_log['Timestamp'] directly.

#Taking out the 'Denied: New Card' entries and the rows with a missing (NaN) membership name
#NOTE(review): only the exact status 'Denied: New Card' is removed; any other 'Denied: ...' status
#present in the log would be kept - confirm that is intended.
is_denied = visitor_log['Entry Access'] == 'Denied: New Card'
has_membership = visitor_log['Membership Name'].notna()
visitor_log = visitor_log[~is_denied & has_membership]

In [23]:
#Preview the first 10 rows again to confirm the Timestamp column exists and the filtered rows are gone
visitor_log.head(10)

Unnamed: 0,Visit Date,Visit Time,Member ID,Membership Name,Entry Access,Timestamp
0,2017-01-01,10:52:53,5206.0,Regular Monthly Membership,Successful,2017-01-01 10:52:53
1,2017-01-01,10:52:53,5206.0,Regular Monthly Membership,Successful,2017-01-01 10:52:53
2,2017-01-02,6:51:34,5048.0,Regular Monthly Membership,Successful,2017-01-02 06:51:34
3,2017-01-02,6:51:34,5048.0,Regular Monthly Membership,Successful,2017-01-02 06:51:34
4,2017-01-02,7:02:27,5071.0,Regular Monthly Membership,Successful,2017-01-02 07:02:27
7,2017-01-03,5:22:08,5206.0,Regular Monthly Membership,Successful,2017-01-03 05:22:08
8,2017-01-03,5:22:08,5206.0,Regular Monthly Membership,Successful,2017-01-03 05:22:08
9,2017-01-03,16:24:22,5073.0,Regular Monthly Membership,Successful,2017-01-03 16:24:22
10,2017-01-03,16:24:22,5073.0,Regular Monthly Membership,Successful,2017-01-03 16:24:22
11,2017-01-03,16:52:05,5069.0,1 Month Unlimited Membership - Student,Successful,2017-01-03 16:52:05


In [24]:
#Setting up parameters for counting occupants at a given hour
sign_ins = visitor_log['Timestamp']
#First sign-in in the log. Taken by position (.iloc) because the rows were filtered, so the
#integer label 0 is not guaranteed to still exist in the index.
now = sign_ins.iloc[0]

#A dataframe is going to be made that contains the occupants within the last 1.5hrs, a timestamp, a date column and hour column
#These lists accumulate one entry per hour; the counting cell appends to them
occupants = []
realtime = []
occupant_time = []
occupant_date = []

#Date range to cover: every day from start_date up to (but not including) end_date
day_delta = timedelta(days=1)
start_date = datetime(2017, 1, 1)
end_date = datetime(2019, 6, 11)

#Min/Max times are used to determine if a particular sign-in has occurred within 1.5hrs of the hour
min_time = timedelta(hours=0)
max_time = timedelta(hours=1.5)

In [None]:
#For every hour of every day in [start_date, end_date), count the sign-ins whose age at that hour
#satisfies min_time <= (hour - sign_in) <= max_time, i.e. sign-ins in the 1.5hrs leading up to the hour.
#The sign-ins are sorted once and each window is resolved with two binary searches, rather than
#comparing every hour against every sign-in in a Python loop.

#Sorted sign-in instants; a missing timestamp (NaT) can never fall inside a window, so it is dropped
signin_times = np.sort(pd.to_datetime(sign_ins).dropna().to_numpy(dtype='datetime64[ns]'))

#One reference instant per hour, starting at midnight of start_date (a time-of-day on start_date is ignored)
n_hours = max((end_date - start_date).days, 0) * 24
hour_marks = pd.date_range(start=pd.Timestamp(start_date).normalize(), periods=n_hours, freq=timedelta(hours=1))
hour_values = hour_marks.to_numpy(dtype='datetime64[ns]')

#A sign-in s counts towards hour h when h - max_time <= s <= h - min_time (both ends inclusive)
window_open = hour_values - pd.Timedelta(max_time).to_timedelta64()
window_close = hour_values - pd.Timedelta(min_time).to_timedelta64()
hourly_counts = (np.searchsorted(signin_times, window_close, side='right')
                 - np.searchsorted(signin_times, window_open, side='left'))
#An inverted window (max_time < min_time) matches nothing, so it must count as 0 rather than go negative
hourly_counts = np.maximum(hourly_counts, 0)

#Append one entry per hour to the accumulator lists (plain Python ints / datetimes / dates)
occupants.extend(hourly_counts.tolist())
realtime.extend(hour_marks.to_pydatetime().tolist())
occupant_time.extend(hour_marks.hour.tolist())
occupant_date.extend(hour_marks.date.tolist())

#Note: an earlier pure-Python version of this step took a very long time, so it was split up into 6 month periods by changing the start_date and end_date
#Once each computation was completed, the occupant_log's were combined to be used in the exploration and prediction
#Because results are appended, re-running this cell without first resetting the lists will duplicate rows

In [None]:
#Creating a dataframe counting number of visitors in the last 1.5 hours, one row per hour
occupant_log = pd.DataFrame({"Occupants": occupants, "realtime": realtime, "Date": occupant_date, "Time": occupant_time})

#Making sure the 'realtime' column is datetime64 before the .dt accessors below rely on it
occupant_log['realtime'] = pd.to_datetime(occupant_log['realtime'])

#Also adding other parameters like day of the year, day of the week (Monday=0), and month
occupant_log['Day'] = occupant_log['realtime'].dt.dayofyear
occupant_log['Weekday'] = occupant_log['realtime'].dt.weekday
occupant_log['Month'] = occupant_log['realtime'].dt.month

#Display the log indexed by 'realtime'. set_index returns a new frame, so occupant_log itself
#keeps its default integer index and its 'realtime' column for the cells that follow
occupant_log.set_index('realtime')

In [None]:
#Exporting the occupants in each hour, in a CSV to be used with weather data.
from IPython.display import HTML
import base64

def create_download_link(occupant_log, title = "Download CSV file", filename = "occupant_log.csv"):
    """Build a clickable link that downloads a DataFrame as a CSV file.

    The frame is serialised with ``to_csv()`` (index included), base64-encoded and
    embedded in a ``data:text/csv`` URI, so no file is written on the server side.

    Parameters
    ----------
    occupant_log : object exposing ``to_csv()`` (e.g. a pandas DataFrame)
        The data to offer for download.
    title : str
        Visible link text.
    filename : str
        File name suggested to the browser through the ``download`` attribute.

    Returns
    -------
    IPython.display.HTML
        An ``<a>`` element wrapped for rich display as a cell output.
    """
    # Local import: the notebook's import cell does not bring html.escape into scope
    from html import escape

    csv_text = occupant_log.to_csv()
    payload = base64.b64encode(csv_text.encode()).decode()
    anchor = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    # Escape the caller-supplied text so quotes, '<' or '&' cannot break the attribute or the markup;
    # the base64 payload only contains URL/HTML-safe characters and needs no escaping
    anchor = anchor.format(
        payload=payload,
        title=escape(str(title)),
        filename=escape(str(filename)),
    )
    return HTML(anchor)

#Leave the link as the cell's last expression so it is shown as the cell output
create_download_link(occupant_log)