In [54]:
# DataFrames and Math
import pandas as pd
import numpy as np
import tqdm
pd.options.display.max_columns = 100

# API management
import json
import base64
import requests
from getpass import getpass

# Plotting
import matplotlib.pyplot as plt

# Paths
import pathlib
import os
home_path = pathlib.Path().resolve().parent.parent/"rn_home"
data_output_path = home_path / "data" / "advan_data"
footfall_output_path = home_path / "data" / "advan_data_footfall"
processed_output_path = home_path / "data" / "advan_data_processed"

# Define required functions

In [56]:
# Define a helper function to download advan data
def download_advan_data(url, partition, output_path):
    # Get Raja's access token
    access_token = getpass("Enter the access token: ")

    # Get the links to download individual files
    results = requests.get(url=url,
                       headers={
                        "X-API-KEY": access_token,
                        'accept': 'application/json'
                       })
    print(results.json())
    # Download the files we want
    # Download all files for one month
    for item in results.json()['download_links']:
        if item['partition_key'] == partition:
            filename = item['file_name']
            print(f"Downloading {filename}")
            link = item['link']
            data = requests.get(link)
            open(output_path / filename, 'wb').write(data.content)

In [61]:
def extract_ct_data(input_path, partition, output_path, output_name):
    # Create blank data frame
    ct_data = pd.DataFrame([])

    # Iterate through all the files in the directory
    for file in input_path.iterdir():
        if partition in file.name:
            print(f"Reading file {file.name}")
            # Read the file
            data = pd.read_csv(file)
            # Extract rows for CT
            ct_rows = data[data["REGION"]=="CT"].reset_index(drop=True)
            
            # Update the DataFrame
            ct_data = pd.concat([ct_data, ct_rows]).reset_index(drop=True)

    # Save the master data frame
    print(f"Saving to {output_path}")
    ct_data.to_csv(output_path / f"{output_name}.csv")

    return(ct_data)

# Download footfall data

Download the latest Advan data we have to check if the same errors exist

In [59]:
download_advan_data("https://app.deweydata.io/external-api/v3/products/5acc9f39-1ca6-4535-b3ff-38f6b9baf85e/files",
                    "2020-07-01",
                    footfall_output_path)

Enter the access token:  ········


{'download_links': [{'link': 'https://amplifydata-production-dewey.s3.amazonaws.com/Monthly%20Patterns%20-%20Foot%20Traffic--5acc9f39-1ca6-4535-b3ff-38f6b9baf85e/2019-01-01/data_01af8720-0604-c39a-0043-0b8700fc7dda_23_3_0.csv.gz?response-content-disposition=attachment%3B%20filename%3DMonthly_Patterns_Foot_Traffic-0-DATE_RANGE_START-2019-01-01.csv.gz&AWSAccessKeyId=AKIASC5E62QHCPMXWWSX&Signature=CFOZa%2B%2BaGsfcRPm4vYzeZsQ6LbQ%3D&Expires=1699472050', 'partition_key': '2019-01-01', 'file_name': 'Monthly_Patterns_Foot_Traffic-0-DATE_RANGE_START-2019-01-01.csv.gz', 'file_extension': '.csv.gz', 'file_size_bytes': 209732481}, {'link': 'https://amplifydata-production-dewey.s3.amazonaws.com/Monthly%20Patterns%20-%20Foot%20Traffic--5acc9f39-1ca6-4535-b3ff-38f6b9baf85e/2019-01-01/data_01af8720-0604-c39a-0043-0b8700fc7dda_23_3_10.csv.gz?response-content-disposition=attachment%3B%20filename%3DMonthly_Patterns_Foot_Traffic-1-DATE_RANGE_START-2019-01-01.csv.gz&AWSAccessKeyId=AKIASC5E62QHCPMXWWSX&Sig

## Extract CT Footfall Data

In [None]:
ct_ff_dat = extract_ct_data(footfall_output_path, '2020-07-01', processed_output_path, "ct_ff_2020-07-01_110723")

In [181]:
ct_data = extract_ct_data(data_output_path, processed_output_path, "ct_advan_data_2018-01-01_110623_2")

Reading file Neighborhood_Patterns_US-3-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-5-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-0-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-6-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-4-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-1-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-2-DATE_RANGE_START-2018-01-01.csv.gz
Reading file Neighborhood_Patterns_US-7-DATE_RANGE_START-2018-01-01.csv.gz
Saving to /gpfs/gibbs/project/gillingham/rrn22/data/advan_data_processed


### Observe data for CT

In [101]:
print(f"There are {len(ct_data)} rows for CT, corresponding to this many block groups")

There are 2585 rows for CT, corresponding to this many block groups


In [146]:
ct_data = ct_data.sort_values("AREA")
ct_bg_codes = ct_data["AREA"].astype(str).str.zfill(12).unique().tolist()

Now observe how many home destinations there are

In [102]:
home_areas_list = []

for index, row in ct_data.iterrows():
    home_areas = json.loads(row["DEVICE_HOME_AREAS"])

    # For each one
    for home_area in home_areas.keys():
        if not home_area in home_areas_list:
            home_areas_list.append(home_area)

In [104]:
home_areas_list_ct = [x for x in home_areas_list if str(x)[0:2]=="09"]
print(f"Thus there are {len(home_areas_list)} home destinations of which {len(home_areas_list_ct)} are from CT")

Thus there are 18674 home destinations of which 2583 are from CT
