# Gather speed data from SFMTA

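This notebook downloads raw vehicle speed data from the SFMTA server, cleans it, maps it to census zones, and aggregates it into a per-region time series.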

In [3]:
import pandas as pd
import sys
import os

sys.path.insert(0, '../utils/')
from api import get_sfmta_data
from traffic_processing import coordinate_mapper
from decorators import timer

In [None]:
# Rebind coordinate_mapper to its `timer`-wrapped version so every later call goes through `timer`.
# NOTE(review): presumably `timer` reports the run time of each call — confirm in utils/decorators.
# Re-running this cell wraps the already-wrapped function a second time.
coordinate_mapper = timer(coordinate_mapper)

## 1. Get 2016 speed data
This step can take a while depending on internet speed. 

In [None]:
# base_url = 'ftp://avl-data.sfmta.com/avl_data/avl_raw/'
# file_directory = '../../raw_data/sf_speed_data/'

# get_sfmta_data(base_url, file_directory)

## 2. Clean 2016 speed data

*Note: Check that file path directories are properly configured*

In [None]:
# input_dir = '../../raw_data/sf_speed_data/'
# output_dir = '../../temp_data/sfdata_clean/'
# Make sure .DS_Store is removed.

# for file in os.listdir(input_dir):
#     SFDATA_file_cleaner(input_dir, output_dir, file)

## 3. Load and format all data

In [None]:
# Filtered Census Zones (GIS data with GeoID)
# Cleaned speed files (the output location used by the cleaning step) are read from here
input_dir = '../../temp_data/sfdata_clean/'

# Files mapped to census zones are written here
output_dir = '../../temp_data/sfdata_mapped/'

# Shapefile holding the filtered census zones (GIS data with GeoID)
shp_file = '../../temp_data/sf_GEOID_GIS_data.shp'

In [None]:
# Load census data 
# Load census data
# NOTE(review): `gpd` is not imported in the visible import cell; this cell
# needs `import geopandas as gpd` to have been run first.
# `shp_file` starts out as the shapefile path and is rebound to the loaded
# GeoDataFrame. Only load while it is still a path, so that re-running this
# cell does not try to open a GeoDataFrame as if it were a file.
if isinstance(shp_file, str):
    shp_file = gpd.GeoDataFrame.from_file(shp_file)
print('Size of census zones df: {}'.format(shp_file.shape))
shp_file.head()

In [None]:
# All files by date
# All files by date
# os.listdir returns entries in arbitrary order; sort them so the listing is
# stable (and chronological if the file names are date-stamped — TODO confirm).
for i, fname in enumerate(sorted(os.listdir(input_dir))):
    print(i, fname)

### 3.a. Map to coordinate regions

In [None]:
# Check that all 29 files for the month of February are there
# Map each file to its corresponding census zones
for fname in sorted(os.listdir(input_dir)):
    # Hidden entries such as .DS_Store are not speed data files; skip them
    # instead of handing them to coordinate_mapper.
    if fname.startswith('.'):
        continue
    coordinate_mapper(shp_file, input_dir, output_dir, fname)

### 3.b. Aggregate by census region

In [None]:
# Per-region files are written here, one file per region ID
output_dir = '../../temp_data/region_data/'

# The census-mapped files from step 3.a are the input of the aggregation
input_dir = '../../temp_data/sfdata_mapped/'

In [None]:
def aggregate_by_region(input_dir, output_dir):
    """Partition the input files into one CSV file per region ID (geoid10).

    Every input file is read in turn and split by its ``geoid10`` column.
    The rows of each region are appended to
    ``<output_dir>/time_series_region_<region_id>.csv``; the file is created
    with a header row the first time a region is seen. After all input files
    are processed, each region file holds every row belonging to that region.

    Because rows are appended, running the function twice against the same
    ``output_dir`` duplicates the data; clear the directory before a re-run.

    :param str input_dir: directory containing input files
    :param str output_dir: directory to save output files (created if missing)
    :return: None; results are written to ``output_dir``
    """

    base_fname = 'time_series_region'
    # get_fname is a project helper that is not defined in this notebook;
    # presumably it returns the names of the files in input_dir whose name
    # contains '2016' — TODO confirm.
    file_names = get_fname(input_dir, contains='2016')

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for file_name in file_names:
        aggr_data = pd.read_csv(os.path.join(input_dir, file_name))

        # Loop through the data of each region; only one region file is
        # touched at a time to save memory.
        for region_id, group_df in aggr_data.groupby('geoid10'):
            output_fname = os.path.join(
                output_dir, '{}_{}.csv'.format(base_fname, region_id))

            # Let pandas do the writing: it opens and closes the file itself
            # and quotes cells that contain commas or quotes. The header is
            # only written when the region file does not exist yet.
            group_df.to_csv(
                output_fname,
                mode='a',
                header=not os.path.exists(output_fname),
                index=False,
            )

        print('finished partitioning {}'.format(file_name))

In [None]:
# Split the mapped files into one file per region.
# NOTE: existing region files are appended to, so clear output_dir before
# re-running this cell to avoid duplicated rows.
aggregate_by_region(input_dir, output_dir)

In [None]:
# Number of entries in the region output directory
len(os.listdir(output_dir))

### 3.c. Format data into a frequency-level time series

In [None]:
# The per-region files written in step 3.b are the input of this step
input_dir = '../../temp_data/region_data/'
# Sanity check: report how many entries the region directory contains
print('Number of region files: {}'.format(len(os.listdir(input_dir))))

In [None]:
def region_by_time_generator(path, columns=None, Y='SPEED', unit='H', usecols=None):
    """Aggregate all regional time series files of a directory into one table
    at the desired time frequency.

    The resulting DataFrame has one row per input file and one column per
    time bin:
    +-----------+----+----+-----+----+
    | region_ID | T1 | T2 | ... | TN |
    +-----------+----+----+-----+----+

    The row label is the file name with the substrings ``filtered_``,
    ``time_series_`` and ``.csv`` removed (``time_series_region_1.csv``
    becomes ``region_1``). Each element in the columns T1...TN is the mean of
    all ``Y`` values recorded in that file within the time bin; bins without
    any record are NaN.

    :param str path: input directory containing files of interest
    :param list columns: names of the columns to parse as datetimes; the first
        one is used as the time index. Defaults to ``['REPORT_TIME']``
    :param str Y: name of column to be treated as the Y
    :param unit: pandas offset alias giving the time granularity
    :param list usecols: specification of columns to read
    :return: formatted table (empty if the directory yields no files)
    :rtype: DataFrame
    """
    # Imported locally because `re` is not part of the notebook's import cell.
    import re

    if columns is None:
        columns = ['REPORT_TIME']

    print("Reading files from directory: {}".format(path))
    # get_fname is a project helper that is not defined in this notebook;
    # presumably contains='' selects every file in the directory — TODO confirm.
    file_names = get_fname(path, contains='')

    # Compiled once, outside the loop; raw string so `\.` is a regex escape.
    label_noise = re.compile(r"filtered_|time_series_|\.csv")
    region_rows = []

    for name in file_names:
        # infer_datetime_format is deprecated in pandas 2.x, where consistent
        # format inference is the default, so it is no longer passed.
        region_data = pd.read_csv(os.path.join(path, name), parse_dates=columns, usecols=usecols)
        region_data.index = region_data[columns[0]]

        # group second data into one time unit.
        unit_aggregate = region_data[Y].resample(unit).mean()

        # turn the series into a single row labelled with the region name.
        region_rows.append(
            unit_aggregate.to_frame(name=label_noise.sub("", name)).transpose()
        )

    # pd.concat raises on an empty list; keep returning an empty frame instead.
    if not region_rows:
        return pd.DataFrame()

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(region_rows)

In [None]:
# Build the mean-speed table at unit='H' frequency, one row per region file.
# usecols=[1,5] limits the read to two columns by position; the function
# accesses REPORT_TIME and SPEED, so those must be columns 1 and 5 of the region files.
sample_time_series = region_by_time_generator(input_dir, columns=['REPORT_TIME'], Y='SPEED',unit='H', usecols=[1,5])

In [None]:
# Save the region-by-time table; index=True keeps the row labels (one per region file) in the CSV
sample_time_series.to_csv('../../output/region_by_time_series.csv', index=True)