In [98]:
import geopandas as gpd
import pandas as pd
from glob import glob

In [135]:
class DataExtractor:
    """Extract origin -> destination trip counts from a folder of dated CSV matrices.

    The folder at ``data_path`` must contain ``TractID_List.csv`` (one tract
    GEOID per row, no header) and one header-less CSV per date, named
    ``<date>.csv``. In a date file, column ``j`` is read as the trips into the
    ``j``-th tract of the ID list and row ``i`` as the ``i``-th origin tract.

    Attributes set by the constructor:
        data:         matrix of the most recently loaded date file.
        df:           last time series stored by a ``get_timeseries_df*`` call.
        data_path:    folder the CSV files are read from.
        tract_ids:    GEOIDs (as strings) in the order of ``TractID_List.csv``.
        destination:  GEOID currently used by the single-destination methods.
        destinations: GEOIDs used by ``get_timeseries_df_multi``.
        date:         name of the most recently loaded date file ('' before any load).
        dates:        sorted date-file names (file names without extension).
    """

    # Column layout of every trips frame produced by this class.
    _COLUMNS = ['origin', 'destination', 'trips', 'date']

    def __init__(self, data_path: str) -> None:
        """Read the tract ID list and discover the date files under ``data_path``.

        Raises whatever ``pd.read_csv`` raises when ``TractID_List.csv`` is
        missing or empty.
        """
        # Local import: the notebook's import cell does not bring in ``os``.
        import os

        self.data = pd.DataFrame([])
        self.df = pd.DataFrame([])
        self.data_path = data_path
        self.destination = ''
        self.destinations = []
        self.date = ''

        # With header=None the only column is labelled with the integer 0, so a
        # dtype mapping keyed by the string '0' is silently ignored and GEOIDs
        # would be parsed as integers (dropping leading zeros). Force str for
        # the whole file instead.
        ids = pd.read_csv(f'{data_path}/TractID_List.csv', header=None, dtype=str)
        self.tract_ids = [str(value) for value in ids.iloc[:, 0]]

        # Every other CSV in the folder is treated as one date of trip data.
        dates = []
        for file in glob(os.path.join(data_path, '*.csv')):
            name = os.path.basename(file)
            if 'TractID_List' in name:
                continue
            dates.append(os.path.splitext(name)[0])
        dates.sort()
        self.dates = dates

    def set_destination(self, geoid: str) -> None:
        """Select the destination GEOID used by the single-destination methods."""
        self.destination = geoid

    def set_destinations(self, geoids: list) -> None:
        """Select the list of destination GEOIDs used by ``get_timeseries_df_multi``."""
        self.destinations = geoids

    def set_date(self, date: str) -> None:
        """Record the name of the date currently loaded."""
        self.date = date

    def set_data(self, data: pd.DataFrame) -> None:
        """Store the trip matrix of the date currently loaded."""
        self.data = data

    def load_data(self, date) -> None:
        """Load ``<data_path>/<date>.csv`` (no header row) as the current matrix."""
        self.set_date(date)
        self.set_data(pd.read_csv(f"{self.data_path}/{self.date}.csv", header=None))

    def get_trips_to_d(self) -> pd.Series:
        """Return the column of the loaded matrix that belongs to ``self.destination``.

        Raises:
            ValueError: if ``self.destination`` is not listed in ``TractID_List.csv``.
        """
        try:
            d_index = self.tract_ids.index(self.destination)
        except ValueError:
            raise ValueError(
                f"destination {self.destination!r} is not listed in TractID_List.csv"
            ) from None
        return self.data[d_index]

    def get_trips_df(self) -> pd.DataFrame:
        """Build the origin/destination/trips/date frame for the loaded date.

        Origins are paired positionally with the tract ID list; only rows with
        a positive trip count are kept, and they retain their positional index.
        """
        trips_data = self.get_trips_to_d()
        # Passing the column names up front keeps this valid for empty input too.
        df = pd.DataFrame(
            list(zip(self.tract_ids, trips_data)),
            columns=['origin', 'trips'],
        )
        df.insert(1, 'destination', self.destination)
        df['date'] = self.date
        return df[df.trips > 0]

    def _collect_trips(self, destinations) -> pd.DataFrame:
        """Concatenate ``get_trips_df`` over every date and every given destination."""
        destinations = list(destinations)
        frames = []
        for date in self.dates:
            # One read per date, shared by all destinations.
            self.load_data(date)
            for geoid in destinations:
                self.set_destination(geoid)
                frames.append(self.get_trips_df())
        if not frames:
            # No date files or no destinations: return an empty, well-formed frame.
            return pd.DataFrame(columns=self._COLUMNS)
        # A single concat avoids re-copying the accumulated frame on each step.
        return pd.concat(frames)

    def get_timeseries_df(self, single_d=True) -> pd.DataFrame:
        """Return the trips into ``self.destination`` for every date.

        Args:
            single_d: when true, the result is also stored on ``self.df`` so the
                export methods can write it.
        """
        df = self._collect_trips([self.destination])
        if single_d:
            self.df = df
        return df

    def get_timeseries_df_multi(self) -> pd.DataFrame:
        """Return the trips into each of ``self.destinations`` for every date.

        The result is stored on ``self.df``. ``self.destination`` is left at the
        last GEOID that was processed.
        """
        df = self._collect_trips(self.destinations)
        self.df = df
        return df

    def export_csv(self, path: str) -> None:
        """Write ``self.df`` to ``path`` as CSV without the index column."""
        self.df.to_csv(path, index=False)

    def export_parquet(self, path: str) -> None:
        """Write ``self.df`` to ``path`` as Parquet."""
        self.df.to_parquet(path)

In [136]:
# Single-destination run: GEOID of the destination tract to extract trips for.
d_geoid = '17031838800'

# Build the extractor on the Chicago data folder, select the destination and
# assemble its trips table across all dates. The call is the cell's last
# expression, so the returned frame is what the notebook renders below.
chicago = DataExtractor('./data/chicago')
chicago.set_destination(d_geoid)
chicago.get_timeseries_df()
# Optional export, left disabled. The variable defined in this cell is
# `d_geoid` (the earlier `destination_geoid` name does not exist here).
# chicago.export_parquet(f'./output/{d_geoid}2.parquet')


Unnamed: 0,origin,destination,trips,date
5,17031010501,17031838800,205,2019-12-30
23,17031020901,17031838800,99,2019-12-30
33,17031030601,17031838800,271,2019-12-30
209,17031170500,17031838800,164,2019-12-30
239,17031210400,17031838800,188,2019-12-30
...,...,...,...,...
2135,18089043401,17031838800,142,2022-04-25
2136,18089043403,17031838800,215,2022-04-25
2150,18127050407,17031838800,142,2022-04-25
2151,18127050501,17031838800,77,2022-04-25


In [138]:
# Multi-destination run: GEOIDs of the destination tracts to extract trips for.
d_geoids = ['17031838800','17031550100']
# Rebinds `chicago` to a fresh extractor, so state from the single-destination
# cell above is not carried over.
chicago = DataExtractor('./data/chicago')
chicago.set_destinations(d_geoids)
# Last expression of the cell: the combined frame is rendered below and is
# also stored on `chicago.df`.
chicago.get_timeseries_df_multi()
# Optional export, left disabled.
# chicago.export_parquet(f'./output/chicago_park_visits.parquet')


Unnamed: 0,origin,destination,trips,date
5,17031010501,17031838800,205,2019-12-30
23,17031020901,17031838800,99,2019-12-30
33,17031030601,17031838800,271,2019-12-30
209,17031170500,17031838800,164,2019-12-30
239,17031210400,17031838800,188,2019-12-30
...,...,...,...,...
2131,18089043102,17031550100,76,2022-04-25
2143,18127050101,17031550100,100,2022-04-25
2150,18127050407,17031550100,46,2022-04-25
2157,18127050509,17031550100,260,2022-04-25


In [139]:
# Sanity check: list the distinct destination GEOIDs present in the stored frame.
chicago.df.destination.unique()

array(['17031838800', '17031550100'], dtype=object)