In [1]:
# DataFrames and Math
import pandas as pd
import numpy as np
import tqdm
pd.options.display.max_columns = 100

# API management
import json
import base64
import requests
from getpass import getpass

# Plotting
import matplotlib.pyplot as plt

# Paths
import pathlib
import os
import platform

# Set the paths
if platform.platform()[0:5] == 'macOS':
    on_cluster = False
    cd = pathlib.Path().resolve().parent
    str_project = cd / "Documents" 
    str_data = str_project / "tobin_working_data"
    footfall_output_path = str_data / "advan_data_footfall"
    processed_output_path = str_data / "advan_data_footfall_processed"

if platform.platform()[0:5] == 'Linux':
    on_cluster = True
    home_path = pathlib.Path().resolve().parent.parent/"rn_home"
    data_output_path = home_path / "data" / "advan_data"
    footfall_output_path = home_path / "data" / "advan_data_footfall"
    processed_output_path = home_path / "data" / "advan_data_processed"

# Define required functions

In [2]:
# Define a helper function to download advan data
def download_advan_data(url, partition, output_path):
    # Get Raja's access token
    access_token = "DhLxzliD.DAdc1X2Wd9DtI2OuUjtZBekzoKxJMPiCBXlMKLKIKbWM4r2mGKrL6e1X"

    # Get the links to download individual files
    results = requests.get(url=url,
                       headers={
                        "X-API-KEY": access_token,
                        'accept': 'application/json'
                       })
    # print(results.json())
    # Download the files we want
    # Download all files for one month
    for item in results.json()['download_links']:
       if item['partition_key'] == partition:
           filename = item['file_name']
           print(f"Downloading {filename}")
           link = item['link']
           data = requests.get(link)
           open(output_path / filename, 'wb').write(data.content)

In [3]:
def extract_ct_data(input_path, partitions, output_path, output_name, save = False):
    # Create blank data frame
    ct_data = pd.DataFrame([])

    # Iterate through all the files in the directory
    for file in input_path.iterdir():
        for partition in partitions:
            if partition in file.name:
                print(f"Reading file {file.name}")
                # Read the file
                data = pd.read_csv(file)
                # Extract rows for CT
                ct_rows = data[data["REGION"]=="CT"].reset_index(drop=True)
                
                # Update the DataFrame
                ct_data = pd.concat([ct_data, ct_rows]).reset_index(drop=True)

    # Save the master data frame
    if save:
        print(f"Saving to {output_path}")
        ct_data.to_csv(output_path / f"{output_name}.csv")
    else:
        print("Save setting set to false, not saving")

    return(ct_data)

# Download monthly patterns data

In [22]:
months_to_download = [[x +"-"+y+"-01" for y in [str(z).zfill(2) for z in range(1,13)]] for x in ["2021", "2022"]]
months_to_download = [item for sublist in months_to_download for item in sublist]

In [None]:
for month in months_to_download:
    download_advan_data("https://app.deweydata.io/external-api/v3/products/2dfcb598-6e30-49f1-bdba-1deae113a951/files",
                        month,
                        data_output_path)

In [20]:
months_to_download = [[x +"-"+y+"-01" for y in [str(z).zfill(2) for z in range(1,13)]] for x in ["2018", "2019"]]
months_to_download = [item for sublist in months_to_download for item in sublist]

In [None]:
for month in months_to_download:
    download_advan_data("https://app.deweydata.io/external-api/v3/products/2dfcb598-6e30-49f1-bdba-1deae113a951/files",
                        month,
                        data_output_path)

# Download footfall data

Download the latest Advan data we have to check if the same errors exist

In [7]:
download_advan_data("https://app.deweydata.io/external-api/v3/products/5acc9f39-1ca6-4535-b3ff-38f6b9baf85e/files",
                    "2020-07-01",
                    footfall_output_path)

KeyError: 'download_links'

In [None]:
ct_ff_dat = extract_ct_data(footfall_output_path, '2020-07-01', processed_output_path, "ct_ff_2020-07-01_110723")

# Extract CT Data

In [None]:
ct_data = extract_ct_data(data_output_path, months_to_download, processed_output_path, "ct_advan_data_2021-22_121123", save = True)

Reading file Neighborhood_Patterns_US-303-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-304-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-305-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-306-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-307-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-308-DATE_RANGE_START-2021-01-01.csv.gz
Reading file Neighborhood_Patterns_US-309-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-310-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-311-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-312-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-313-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-314-DATE_RANGE_START-2021-02-01.csv.gz
Reading file Neighborhood_Patterns_US-315-DATE_RANGE_START-2021-03-01.csv.gz

### Observe data for CT

In [101]:
print(f"There are {len(ct_data)} rows for CT, corresponding to this many block groups")

There are 2585 rows for CT, corresponding to this many block groups


In [146]:
ct_data = ct_data.sort_values("AREA")
ct_bg_codes = ct_data["AREA"].astype(str).str.zfill(12).unique().tolist()

Now observe how many home destinations there are

In [102]:
home_areas_list = []

for index, row in ct_data.iterrows():
    home_areas = json.loads(row["DEVICE_HOME_AREAS"])

    # For each one
    for home_area in home_areas.keys():
        if not home_area in home_areas_list:
            home_areas_list.append(home_area)

In [104]:
home_areas_list_ct = [x for x in home_areas_list if str(x)[0:2]=="09"]
print(f"Thus there are {len(home_areas_list)} home destinations of which {len(home_areas_list_ct)} are from CT")

Thus there are 18674 home destinations of which 2583 are from CT
