# Create a Foot Traffic Data Set by street 

Based off the XPlore logic, this time load in more more foot traffic data based on the full history downloaded and saved to /data_files_raw/foot_traffic_melb/ folder

This time, rather than aggregate all the street numbers to get a total melbourne number, cherry pick some of the streets. Convert it to a long format with the street name

In [563]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import re

from utilities import data_basic_utility as databasic
from utilities import regex_utility as reutil

# File Variables

Set the start and end date of the time frame of data to load in and combine into one unified file

This is the list of all Streets that exist in all files from Jan 2013 to July 2022
['Date', 'Hour', 'Bourke Street Mall (North)', 'Bourke Street Mall (South)', 'Melbourne Central', 'Town Hall (West)', 'Princes Bridge', 'Birrarung Marr', 'Webb Bridge', 'Southern Cross Station', 'Victoria Point', 'Waterfront City', 'New Quay', 'Flagstaff Station', 'Sandridge Bridge', 'State Library', 'Collins Place (South)', 'Collins Place (North)', 'Chinatown-Swanston St (North)', 'Flinders St-Elizabeth St (East)', 'Spencer St-Collins St (South)', 'Spencer St-Collins St (North)', 'QV Market-Peel St']

In [564]:
filesEndDate = datetime(2022, 7, 1)
#filesStartDate = datetime(2022, 4, 1)
filesStartDate = datetime(2013, 1, 1)

footTrafficFolder = "./data_files_raw/foot_traffic_melb/"
tempFolder = "./tmp/"

#Create a list of streets to use. Try to get a good cross section of the city, but not too many locations
streetsToUse = [ 
    "Bourke Street Mall (North)",
    "Melbourne Central",
    "Southern Cross Station",
    "Chinatown-Swanston St (North)",
    "Spencer St-Collins St (North)",
    "QV Market-Peel St",
    "Collins Place (North)"
]

Load all the weather data files like rain and temp

In [565]:
rain = pd.read_csv("IDCJAC0009_086338_1800_Data.csv")
max_temp = pd.read_csv("IDCJAC0010_086338_1800_Data.csv")
min_temp = pd.read_csv("IDCJAC0011_086338_1800_Data.csv")
solar_exp = pd.read_csv("IDCJAC0016_086338_1800_Data.csv")

In [566]:
# Helper functions for data files
def make_date_col(df):
    df["date"] = df['Day'].astype(str) + "/" + df['Month'].astype(str) + "/" + df['Year'].astype(str)
    df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y")
    return df['date']

    
def filter_weathers(df, startDate='2022-07-01', endDate='2022-07-31'):
    df = df.loc[(df['date'] >= startDate) & (df['date'] <= endDate)]
    return df    

## Rain clean

In [567]:
print(rain.info())
rain.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3523 entries, 0 to 3522
Data columns (total 8 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Product code                                    3523 non-null   object 
 1   Bureau of Meteorology station number            3523 non-null   int64  
 2   Year                                            3523 non-null   int64  
 3   Month                                           3523 non-null   int64  
 4   Day                                             3523 non-null   int64  
 5   Rainfall amount (millimetres)                   3366 non-null   float64
 6   Period over which rainfall was measured (days)  3365 non-null   float64
 7   Quality                                         3366 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 220.3+ KB
None


Unnamed: 0,Product code,Bureau of Meteorology station number,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality
0,IDCJAC0009,86338,2013,1,1,,,
1,IDCJAC0009,86338,2013,1,2,,,
2,IDCJAC0009,86338,2013,1,3,,,
3,IDCJAC0009,86338,2013,1,4,,,
4,IDCJAC0009,86338,2013,1,5,,,
5,IDCJAC0009,86338,2013,1,6,,,
6,IDCJAC0009,86338,2013,1,7,,,
7,IDCJAC0009,86338,2013,1,8,,,
8,IDCJAC0009,86338,2013,1,9,,,
9,IDCJAC0009,86338,2013,1,10,,,


In [568]:
rain = rain.drop(labels=['Product code', 'Bureau of Meteorology station number'], axis=1)

In [569]:
rain["date"] = make_date_col(rain)

In [570]:
rain.head(10)

Unnamed: 0,Year,Month,Day,Rainfall amount (millimetres),Period over which rainfall was measured (days),Quality,date
0,2013,1,1,,,,2013-01-01
1,2013,1,2,,,,2013-01-02
2,2013,1,3,,,,2013-01-03
3,2013,1,4,,,,2013-01-04
4,2013,1,5,,,,2013-01-05
5,2013,1,6,,,,2013-01-06
6,2013,1,7,,,,2013-01-07
7,2013,1,8,,,,2013-01-08
8,2013,1,9,,,,2013-01-09
9,2013,1,10,,,,2013-01-10


In [571]:
rain.rename(columns={'Rainfall amount (millimetres)':"total_rain"},inplace=True)

In [572]:
rain.head()

Unnamed: 0,Year,Month,Day,total_rain,Period over which rainfall was measured (days),Quality,date
0,2013,1,1,,,,2013-01-01
1,2013,1,2,,,,2013-01-02
2,2013,1,3,,,,2013-01-03
3,2013,1,4,,,,2013-01-04
4,2013,1,5,,,,2013-01-05


In [573]:
rain.sort_index(ascending=False).head()

Unnamed: 0,Year,Month,Day,total_rain,Period over which rainfall was measured (days),Quality,date
3522,2022,8,24,0.4,1.0,N,2022-08-24
3521,2022,8,23,8.8,1.0,N,2022-08-23
3520,2022,8,22,0.0,1.0,N,2022-08-22
3519,2022,8,21,0.0,1.0,N,2022-08-21
3518,2022,8,20,4.4,1.0,N,2022-08-20


## Foot traffic clean

In [574]:
# Have a look at one of the files
foot_traffic = pd.read_csv("July_2022.csv")

print(foot_traffic.info())
foot_traffic.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 744 entries, 0 to 743
Data columns (total 83 columns):
 #   Column                                            Non-Null Count  Dtype 
---  ------                                            --------------  ----- 
 0   Date                                              744 non-null    object
 1   Hour                                              744 non-null    int64 
 2   Bourke Street Mall (North)                        744 non-null    object
 3   Bourke Street Mall (South)                        744 non-null    object
 4   Melbourne Central                                 744 non-null    object
 5   Town Hall (West)                                  744 non-null    int64 
 6   Princes Bridge                                    744 non-null    int64 
 7   Flinders Street Station Underpass                 744 non-null    int64 
 8   Birrarung Marr                                    744 non-null    object
 9   Webb Bridge                     

Unnamed: 0,Date,Hour,Bourke Street Mall (North),Bourke Street Mall (South),Melbourne Central,Town Hall (West),Princes Bridge,Flinders Street Station Underpass,Birrarung Marr,Webb Bridge,...,Bourke St - Spencer St (South),Spring St- Flinders St (West),Macaulay Rd-Bellair St,Harbour Esplanade (West) - Pedestrian Path,Harbour Esplanade (West) - Bike Path,Flinders St (South),Elizabeth St - Flinders St (East) - New footpath,Macaulay Rd (North),Queensberry St - Errol St (South),Errol St (West)
0,1/07/2022,0,26,50,243,104,76,125,na,8,...,na,1,na,na,10,79,156,4,2,6
1,1/07/2022,1,17,55,157,56,58,75,na,10,...,na,na,na,na,16,39,105,na,2,2
2,1/07/2022,2,13,18,118,42,39,21,1,na,...,na,1,na,1,na,35,93,na,na,na
3,1/07/2022,3,8,23,111,23,37,23,na,3,...,na,na,2,1,1,19,62,na,1,1
4,1/07/2022,4,10,10,101,18,34,32,na,na,...,na,3,na,na,1,9,65,na,na,2
5,1/07/2022,5,7,20,70,33,74,110,na,14,...,na,19,na,na,na,66,59,3,5,6
6,1/07/2022,6,25,44,110,95,162,376,na,39,...,na,80,4,4,3,130,215,27,19,20
7,1/07/2022,7,116,99,178,277,314,631,na,140,...,na,54,34,78,17,221,524,62,37,68
8,1/07/2022,8,339,271,382,717,629,1484,1,246,...,na,68,87,213,44,544,1093,117,65,122
9,1/07/2022,9,619,494,518,952,602,942,1,185,...,na,72,112,178,58,485,1016,133,87,159


Example of the working to convert a file so that it has counted up all the numbers for a day, but only for the streets to use, and then also pivoted into a longer format

In [575]:
foot_traffic=foot_traffic.replace(to_replace=["na","undefined"],value=0)
columnsToUse = [ "Date", "Hour" ]
columnsToUse = columnsToUse + streetsToUse
#print(columnsToUse)

foot_traffic = foot_traffic[columnsToUse]
foot_traffic.head()

Unnamed: 0,Date,Hour,Bourke Street Mall (North),Melbourne Central,Southern Cross Station,Chinatown-Swanston St (North),Spencer St-Collins St (North),QV Market-Peel St,Collins Place (North)
0,1/07/2022,0,26,243,22,0,112,22,11
1,1/07/2022,1,17,157,14,0,90,15,10
2,1/07/2022,2,13,118,6,0,36,10,7
3,1/07/2022,3,8,111,9,0,48,9,4
4,1/07/2022,4,10,101,11,0,38,3,0


In [576]:
dfFtStreet = pd.melt(foot_traffic, id_vars=[ "Date", "Hour" ], var_name="Street")
dfFtStreet = dfFtStreet.rename(columns = { "value":"people" })
dfFtStreet["people"] = dfFtStreet.apply(lambda x: int(x["people"]), axis=1)
print(dfFtStreet.info())
dfFtStreet.head(20)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5208 entries, 0 to 5207
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Date    5208 non-null   object
 1   Hour    5208 non-null   int64 
 2   Street  5208 non-null   object
 3   people  5208 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 162.9+ KB
None


Unnamed: 0,Date,Hour,Street,people
0,1/07/2022,0,Bourke Street Mall (North),26
1,1/07/2022,1,Bourke Street Mall (North),17
2,1/07/2022,2,Bourke Street Mall (North),13
3,1/07/2022,3,Bourke Street Mall (North),8
4,1/07/2022,4,Bourke Street Mall (North),10
5,1/07/2022,5,Bourke Street Mall (North),7
6,1/07/2022,6,Bourke Street Mall (North),25
7,1/07/2022,7,Bourke Street Mall (North),116
8,1/07/2022,8,Bourke Street Mall (North),339
9,1/07/2022,9,Bourke Street Mall (North),619


In [577]:
dfFtStreet = dfFtStreet.groupby([ "Date", "Street" ])["people"].sum().reset_index()

dfFtStreet.head(20)

Unnamed: 0,Date,Street,people
0,1/07/2022,Bourke Street Mall (North),20550
1,1/07/2022,Chinatown-Swanston St (North),0
2,1/07/2022,Collins Place (North),5668
3,1/07/2022,Melbourne Central,28015
4,1/07/2022,QV Market-Peel St,2642
5,1/07/2022,Southern Cross Station,10829
6,1/07/2022,Spencer St-Collins St (North),23293
7,10/07/2022,Bourke Street Mall (North),19475
8,10/07/2022,Chinatown-Swanston St (North),13179
9,10/07/2022,Collins Place (North),1393


In [578]:
# This is basically Freddie's logic of loading in one month's file, cleaning it and converting it to a total people count
def loadAndCountFootTrafficFile(dataFile):
    foot_traffic = pd.read_csv(footTrafficFolder + dataFile)
    foot_traffic=foot_traffic.replace(to_replace=["na","undefined"],value=0)
    # foot_traffic["total_people"] = foot_traffic.iloc[:,2:].sum(axis=1)

    # get the first date to test the structure
    firstDate = foot_traffic["Date"][0]
    if reutil.re_is_match(reutil.regex_DateDdMmYyyy(), firstDate):
        foot_traffic["Date"] = pd.to_datetime(foot_traffic["Date"], format="%d/%m/%Y")
    elif reutil.re_is_match(reutil.regex_DateDdMmYy(), firstDate):
        foot_traffic["Date"] = pd.to_datetime(foot_traffic["Date"], format="%d/%m/%y")
    elif reutil.re_is_match(reutil.regex_DateDdMmmYy(), firstDate):
        foot_traffic["Date"] = pd.to_datetime(foot_traffic["Date"], format="%d-%b-%y")    

    # First, filter out the streets we don't want to use
    columnsToUse = [ "Date", "Hour" ]
    columnsToUse = columnsToUse + streetsToUse
    foot_traffic = foot_traffic[columnsToUse]

    # Then, unpivot the data by the date and hour, so we have records of Street name and total_people
    foot_traffic = pd.melt(foot_traffic, id_vars=[ "Date", "Hour" ], var_name="street")
    foot_traffic = foot_traffic.rename(columns = { "value":"total_people" })
    foot_traffic.loc[foot_traffic["total_people"].isna(), "total_people"] = 0
    foot_traffic["total_people"] = foot_traffic.apply(lambda x: int(x["total_people"]), axis=1)

    # Then aggregate all the hourly numbers so we have a count by street and day
    foot_traffic = foot_traffic.groupby([ "Date", "street" ])["total_people"].sum().reset_index()
    foot_traffic.rename(columns={'Date':'date'}, inplace=True)
    return foot_traffic



In [579]:
# Test the function
dfFT_202207 = loadAndCountFootTrafficFile("July_2022.csv")

print(dfFT_202207.info())
dfFT_202207.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          217 non-null    datetime64[ns]
 1   street        217 non-null    object        
 2   total_people  217 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.2+ KB
None


Unnamed: 0,date,street,total_people
0,2022-07-01,Bourke Street Mall (North),20550
1,2022-07-01,Chinatown-Swanston St (North),0
2,2022-07-01,Collins Place (North),5668
3,2022-07-01,Melbourne Central,28015
4,2022-07-01,QV Market-Peel St,2642
5,2022-07-01,Southern Cross Station,10829
6,2022-07-01,Spencer St-Collins St (North),23293
7,2022-07-02,Bourke Street Mall (North),23469
8,2022-07-02,Chinatown-Swanston St (North),0
9,2022-07-02,Collins Place (North),2594


In [580]:
# First, start with the end month, load the data into a dataframe
fileName = filesEndDate.strftime("%B") + "_" + filesEndDate.strftime("%Y") + ".csv"
print(fileName)

dfFootTraffic = loadAndCountFootTrafficFile(fileName)
print(dfFootTraffic.info())
dfFootTraffic.head()

July_2022.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217 entries, 0 to 216
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          217 non-null    datetime64[ns]
 1   street        217 non-null    object        
 2   total_people  217 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 5.2+ KB
None


Unnamed: 0,date,street,total_people
0,2022-07-01,Bourke Street Mall (North),20550
1,2022-07-01,Chinatown-Swanston St (North),0
2,2022-07-01,Collins Place (North),5668
3,2022-07-01,Melbourne Central,28015
4,2022-07-01,QV Market-Peel St,2642


In [581]:
date1 = '1-Nov-13'
date2 = "23/5/21"

pattern = reutil.regex_DateDdMmmYy()
if re.match(pattern, date1):
    print("date1 regex_DateDdMmmYy MATCH")
else:
    print("date1 regex_DateDdMmmYy NO MATCH")


pattern = reutil.regex_DateDdMmmYy()
if re.match(pattern, date2):
    print("date2 regex_DateDdMmmYy MATCH")
else:
    print("date2 regex_DateDdMmmYy NO MATCH")

pattern = reutil.regex_DateDdMmYy()
if re.match(pattern, date2):
    print("date2 regex_DateDdMmmYy MATCH")
else:
    print("date2 regex_DateDdMmmYy NO MATCH")    

date1 regex_DateDdMmmYy MATCH
date2 regex_DateDdMmmYy NO MATCH
date2 regex_DateDdMmmYy MATCH


Find all streets that exist in all files

In [582]:
filesEndDate = datetime(2022, 7, 1)
#filesStartDate = datetime(2022, 4, 1)
filesStartDate = datetime(2013, 1, 1)

In [583]:
stepperDate = filesEndDate
stepperDate = stepperDate - relativedelta(months=1)
stepCount = 0

columnList = []

while stepperDate >= filesStartDate:
    fileName = stepperDate.strftime("%B") + "_" + stepperDate.strftime("%Y") + ".csv"

    foot_traffic = pd.read_csv(footTrafficFolder + fileName)

    if len(columnList) == 0:
        columnList = foot_traffic.columns
    else:
        # filter the list to include only if exists in other list
        # columnList = [x for x in columnList if x[0] in foot_traffic.columns]
        columnList = list(filter(lambda x: x in foot_traffic.columns, columnList))

    # step back
    stepperDate = stepperDate - relativedelta(months=1)

    # Sanity check, break in case of an infinite loop
    stepCount += 1
    if stepCount > 1000:
        break

print(columnList)

['Date', 'Hour', 'Bourke Street Mall (North)', 'Bourke Street Mall (South)', 'Melbourne Central', 'Town Hall (West)', 'Princes Bridge', 'Birrarung Marr', 'Webb Bridge', 'Southern Cross Station', 'Victoria Point', 'Waterfront City', 'New Quay', 'Flagstaff Station', 'Sandridge Bridge', 'State Library', 'Collins Place (South)', 'Collins Place (North)', 'Chinatown-Swanston St (North)', 'Flinders St-Elizabeth St (East)', 'Spencer St-Collins St (South)', 'Spencer St-Collins St (North)', 'QV Market-Peel St']


In [584]:
# Now go back to the month before
stepperDate = filesEndDate
stepperDate = stepperDate - relativedelta(months=1)
stepCount = 0

# For each month, load the data, append it to the total dataframe then step back another month
# keep going until we get all the way to the start date
while stepperDate >= filesStartDate:
    fileName = stepperDate.strftime("%B") + "_" + stepperDate.strftime("%Y") + ".csv"
    # print(fileName)

    dfMonth = loadAndCountFootTrafficFile(fileName)
    dfFootTraffic = pd.concat([dfFootTraffic, dfMonth])

    # step back
    stepperDate = stepperDate - relativedelta(months=1)

    # Sanity check, break in case of an infinite loop
    stepCount += 1
    if stepCount > 1000:
        break


# Order by the date desc
dfFootTraffic = dfFootTraffic.sort_values(["date"], ascending=False)

print(dfFootTraffic.info())
dfFootTraffic.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 24493 entries, 210 to 0
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          24493 non-null  datetime64[ns]
 1   street        24493 non-null  object        
 2   total_people  24493 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 765.4+ KB
None


Unnamed: 0,date,street,total_people
210,2022-07-31,Bourke Street Mall (North),15434
216,2022-07-31,Spencer St-Collins St (North),12349
215,2022-07-31,Southern Cross Station,1661
214,2022-07-31,QV Market-Peel St,3203
213,2022-07-31,Melbourne Central,23363


In [585]:
dfFootTraffic.sort_values(["date"], ascending=True).head()

dfFootTraffic.to_csv(tempFolder + "foottrafficstreet.csv")

# Join and plot 

In [586]:
max_temp['date'] = make_date_col(max_temp)
min_temp['date'] = make_date_col(min_temp)
solar_exp['date'] = make_date_col(solar_exp)

In [587]:
# Get out the range of dates that have been loaded from the foot traffic and filter the weather data accordingly
ftMinDate = dfFootTraffic["date"].min()
ftMaxDate = dfFootTraffic["date"].max()

print(ftMinDate)
print(ftMaxDate)

max_temp=filter_weathers(max_temp, ftMinDate, ftMaxDate)
min_temp=filter_weathers(min_temp, ftMinDate, ftMaxDate)
solar_exp=filter_weathers(solar_exp, ftMinDate, ftMaxDate)

2013-01-01 00:00:00
2022-07-31 00:00:00


In [588]:
max_temp.rename(columns={"Maximum temperature (Degree C)":"max_temp"},inplace=True)
min_temp.rename(columns={"Minimum temperature (Degree C)":"min_temp"},inplace=True)
solar_exp.rename(columns={"Daily global solar exposure (MJ/m*m)":"solar_exp"},inplace=True)

In [589]:
print(max_temp.info())
max_temp.head()


# max_temp.to_csv(tempFolder + "maxtemp.csv")

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3499 entries, 0 to 3498
Data columns (total 9 columns):
 #   Column                                       Non-Null Count  Dtype         
---  ------                                       --------------  -----         
 0   Product code                                 3499 non-null   object        
 1   Bureau of Meteorology station number         3499 non-null   int64         
 2   Year                                         3499 non-null   int64         
 3   Month                                        3499 non-null   int64         
 4   Day                                          3499 non-null   int64         
 5   max_temp                                     3347 non-null   float64       
 6   Days of accumulation of maximum temperature  3347 non-null   float64       
 7   Quality                                      3346 non-null   object        
 8   date                                         3499 non-null   datetime64[ns]
dty

Unnamed: 0,Product code,Bureau of Meteorology station number,Year,Month,Day,max_temp,Days of accumulation of maximum temperature,Quality,date
0,IDCJAC0010,86338,2013,1,1,,,,2013-01-01
1,IDCJAC0010,86338,2013,1,2,,,,2013-01-02
2,IDCJAC0010,86338,2013,1,3,,,,2013-01-03
3,IDCJAC0010,86338,2013,1,4,,,,2013-01-04
4,IDCJAC0010,86338,2013,1,5,,,,2013-01-05


Write out a datafile with the date and the total foot traffic numbers in Melbourne, for future use

Also, create a datafile with the foot traffic and all the weather columns by day for Melbourne, we can use that later as a source datafile for basic modelling

In [590]:
# Foot Traffic by day Data
outputFootTrafficFileName = "FootTrafficMelbStreet_" + filesStartDate.strftime("%Y%m%d") + "_" + filesEndDate.strftime("%Y%m%d") + ".csv"
dfFootTraffic.to_csv("./data_files/" + outputFootTrafficFileName, index=False)

print(dfFootTraffic.info())
dfFootTraffic.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24493 entries, 210 to 0
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          24493 non-null  datetime64[ns]
 1   street        24493 non-null  object        
 2   total_people  24493 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 765.4+ KB
None


Unnamed: 0,date,street,total_people
210,2022-07-31,Bourke Street Mall (North),15434
216,2022-07-31,Spencer St-Collins St (North),12349
215,2022-07-31,Southern Cross Station,1661
214,2022-07-31,QV Market-Peel St,3203
213,2022-07-31,Melbourne Central,23363


In [591]:
rain.head()

Unnamed: 0,Year,Month,Day,total_rain,Period over which rainfall was measured (days),Quality,date
0,2013,1,1,,,,2013-01-01
1,2013,1,2,,,,2013-01-02
2,2013,1,3,,,,2013-01-03
3,2013,1,4,,,,2013-01-04
4,2013,1,5,,,,2013-01-05


The data we want is total rain, and also Quality, which is a Y/N and shows whether the rain measurement has passed full quality control. If N, then the measurement might be suspect

In [592]:
dfRainToMerge = rain[["total_rain", "Quality", "date"]]
dfRainToMerge.rename(columns={"Quality":"rain_quality"},inplace=True)

dfFootTrafficWeather = pd.merge(dfFootTraffic, dfRainToMerge, how="inner", on="date")
dfFootTrafficWeather.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,date,street,total_people,total_rain,rain_quality
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N
2,2022-07-31,Southern Cross Station,1661,0.0,N
3,2022-07-31,QV Market-Peel St,3203,0.0,N
4,2022-07-31,Melbourne Central,23363,0.0,N


In [593]:
# Merge the Max Temp
dfMaxTempToMerge = max_temp[["max_temp", "Quality", "date"]]
dfMaxTempToMerge.rename(columns={"Quality":"max_temp_quality"},inplace=True)
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfMaxTempToMerge, how="inner", on="date")

# Merge the Max Temp
dfMinTempToMerge = min_temp[["min_temp", "Quality", "date"]]
dfMinTempToMerge.rename(columns={"Quality":"min_temp_quality"},inplace=True)
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfMinTempToMerge, how="inner", on="date")

dfFootTrafficWeather.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y


In [594]:
solar_exp.head()

Unnamed: 0,Product code,Bureau of Meteorology station number,Year,Month,Day,solar_exp,date
8401,IDCJAC0016,86338,2013,1,1,31.1,2013-01-01
8402,IDCJAC0016,86338,2013,1,2,31.9,2013-01-02
8403,IDCJAC0016,86338,2013,1,3,32.8,2013-01-03
8404,IDCJAC0016,86338,2013,1,4,33.5,2013-01-04
8405,IDCJAC0016,86338,2013,1,5,30.5,2013-01-05


In [595]:
# Merge the Solar Exposure
dfSolarExpToMerge = solar_exp[["solar_exp", "date"]]
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfSolarExpToMerge, how="inner", on="date")

print(dfFootTrafficWeather.info())
dfFootTrafficWeather.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24493 entries, 0 to 24492
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              24493 non-null  datetime64[ns]
 1   street            24493 non-null  object        
 2   total_people      24493 non-null  int64         
 3   total_rain        23394 non-null  float64       
 4   rain_quality      23394 non-null  object        
 5   max_temp          23429 non-null  float64       
 6   max_temp_quality  23422 non-null  object        
 7   min_temp          23422 non-null  float64       
 8   min_temp_quality  23422 non-null  object        
 9   solar_exp         24486 non-null  float64       
dtypes: datetime64[ns](1), float64(4), int64(1), object(4)
memory usage: 2.1+ MB
None


Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y,4.8
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y,4.8
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y,4.8
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y,4.8
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y,4.8


Add the day of the week as a feature

In [596]:
dfFootTrafficWeather["WeekDay"] = dfFootTrafficWeather.apply(lambda x: x["date"].weekday(), axis=1)

Adding annual population and growth rating to the data

In [597]:
# First, create a temp dateyear column for joining
dfFootTrafficWeather["date_year"] = dfFootTrafficWeather.apply(lambda x: x["date"].year, axis=1)

In [598]:
dfFootTrafficWeather.head()

Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,WeekDay,date_year
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y,4.8,6,2022
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y,4.8,6,2022
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y,4.8,6,2022
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y,4.8,6,2022
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y,4.8,6,2022


In [599]:
# Load the population file
dfPop = pd.read_csv("./data_files/greatermelb_population_annual.csv")
print(dfPop.shape)
print(dfPop.info())
dfPop.head()

(10, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date            10 non-null     object 
 1    Population     10 non-null     int64  
 2    Annual Change  10 non-null     float64
dtypes: float64(1), int64(1), object(1)
memory usage: 368.0+ bytes
None


Unnamed: 0,date,Population,Annual Change
0,31/12/2013,4217000,2.5
1,31/12/2014,4322000,2.49
2,31/12/2015,4430000,2.5
3,31/12/2016,4541000,2.51
4,31/12/2017,4654000,2.49


In [600]:
# Convert to just a date year column
dfPop["date_year"] = dfPop.apply(lambda x: pd.to_datetime(x["date"]).year, axis=1)
dfPop = dfPop.rename(columns={ " Population" : "population_annual", " Annual Change" : "population_change_annual" })
del dfPop["date"]
dfPop.head()

Unnamed: 0,population_annual,population_change_annual,date_year
0,4217000,2.5,2013
1,4322000,2.49,2014
2,4430000,2.5,2015
3,4541000,2.51,2016
4,4654000,2.49,2017


In [601]:
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfPop, on="date_year")
del dfFootTrafficWeather["date_year"]


In [602]:
# Load and join the Holidays file
dfHol = pd.read_csv("./data_files/Holidays_20130101_20220701.csv")
dfHol["date"] = pd.to_datetime(dfHol["date"])
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfHol, on="date", how="left")

In [603]:
# Load and join the Lockdown file
dfLockdown = pd.read_csv("./data_files/Melb_Lockdown_Dates.csv")
dfLockdown["date"] = pd.to_datetime(dfLockdown["date"])
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfLockdown, on="date", how="left")

Create a Date Year-Month key for joining monthly data files like retain and all ords

In [604]:
dfFootTrafficWeather["date_ym"] = dfFootTrafficWeather.apply(lambda x: str(pd.to_datetime(x["date"]).year) + "-" + str(pd.to_datetime(x["date"]).month), axis=1)

In [605]:
# Read in the Retail data and join it to our foot traffic data
dfRetail = pd.read_csv("./data_files/OFFLINE_Retail_Turnover_VIC.csv")

dfRetail["date_ym"] = dfRetail.apply(lambda x: str(pd.to_datetime(x["date"]).year) + "-" + str(pd.to_datetime(x["date"]).month), axis=1)
dfRetail=dfRetail.rename(columns= {"Original_Turnover":"OfflineRetail_Original_Turnover", "Seasonally_Adjusted_Turnover":"OfflineRetail_Seasonally_Adjusted_Turnover","Trend_Turnover":"OfflineRetail_Trend_Turnover"})
del dfRetail["date"]
dfRetail.head()

Unnamed: 0,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover,date_ym
0,5274.0,5364.9,5362.4,2013-1
1,4748.3,5407.4,5374.0,2013-2
2,5283.1,5383.7,5383.0,2013-3
3,5106.1,5366.6,5390.2,2013-4
4,5280.5,5409.7,5398.2,2013-5


In [606]:
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfRetail, on="date_ym")

In [607]:
# Read in the All Ords data and join it to our foot traffic data
dfAllOrds = pd.read_csv("./data_files_raw/aus-all-ords.csv")

#dfAllOrds["date_ym"] = dfAllOrds["Year"] + "-" + dfAllOrds["Month Num"]
dfAllOrds["date_ym"] = dfAllOrds.apply(lambda x: str(x["Year"]) + "-" + str(x["Month Num"]), axis=1)
del dfAllOrds["Year"]
del dfAllOrds["Month Name"]
del dfAllOrds["Month Num"]
dfAllOrds.head()

Unnamed: 0,all_ords,sp_asx200,dom_equity_market_cap,date_ym
0,7226.1,6986.8,2467758.0,2022-8
1,7173.8,6945.21,2453645.0,2022-7
2,6746.5,6568.1,2308049.0,2022-6
3,7455.2,7211.2,2523641.0,2022-5
4,7724.8,7435.0,2648847.0,2022-4


In [608]:
dfFootTrafficWeather = pd.merge(dfFootTrafficWeather, dfAllOrds, on="date_ym")

In [609]:
del dfFootTrafficWeather["date_ym"] 
print(dfFootTrafficWeather.info())
dfFootTrafficWeather.head(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24633 entries, 0 to 24632
Data columns (total 21 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   date                                        24633 non-null  datetime64[ns]
 1   street                                      24633 non-null  object        
 2   total_people                                24633 non-null  int64         
 3   total_rain                                  23534 non-null  float64       
 4   rain_quality                                23534 non-null  object        
 5   max_temp                                    23569 non-null  float64       
 6   max_temp_quality                            23562 non-null  object        
 7   min_temp                                    23562 non-null  float64       
 8   min_temp_quality                            23562 non-null  object        
 9   solar_

Unnamed: 0,date,street,total_people,total_rain,rain_quality,max_temp,max_temp_quality,min_temp,min_temp_quality,solar_exp,...,population_annual,population_change_annual,is_holiday,is_lockdown,OfflineRetail_Original_Turnover,OfflineRetail_Seasonally_Adjusted_Turnover,OfflineRetail_Trend_Turnover,all_ords,sp_asx200,dom_equity_market_cap
0,2022-07-31,Bourke Street Mall (North),15434,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
1,2022-07-31,Spencer St-Collins St (North),12349,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
2,2022-07-31,Southern Cross Station,1661,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
3,2022-07-31,QV Market-Peel St,3203,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
4,2022-07-31,Melbourne Central,23363,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
5,2022-07-31,Collins Place (North),1410,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
6,2022-07-31,Chinatown-Swanston St (North),11123,0.0,N,14.7,Y,4.3,Y,4.8,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
7,2022-07-30,Spencer St-Collins St (North),15937,0.0,N,13.0,Y,2.1,Y,11.3,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
8,2022-07-30,Southern Cross Station,2540,0.0,N,13.0,Y,2.1,Y,11.3,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0
9,2022-07-30,QV Market-Peel St,4457,0.0,N,13.0,Y,2.1,Y,11.3,...,5151000,1.78,,,8562.7,8947.3,,7173.8,6945.21,2453645.0


Write out all the joined data to file

In [611]:
outputFootTrafficWeatherFileName = "FT_Street_Melb_" + filesStartDate.strftime("%Y%m%d") + "_" + filesEndDate.strftime("%Y%m%d") + ".csv"
dfFootTrafficWeather.to_csv("./data_files/" + outputFootTrafficWeatherFileName, index=False)