# Clobber Together Source Data
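Combines the monthly Citi Bike trip CSVs into yearly and all-years parquet files, and builds a station table enriched with reverse-geocoded location data.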

In [None]:
import pandas as pd
import os
import logging
import requests, json
import numpy as np
import gc
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [None]:
# Directory layout of the raw monthly trip CSVs, split by state.
DATA_DIR = "data/"
CSV_DIR = DATA_DIR + "tripdata_csv/"
NY_DIR = CSV_DIR + "NY/"
NJ_DIR = CSV_DIR + "NJ/"

DB_FILE = "data/tripdata.db"

logging.basicConfig(level=logging.WARNING)

# List each source directory once; the listings are reused for the splits below.
JC_DATA = os.listdir(NJ_DIR)  # NOTE: this includes Hoboken and Jersey City
NYC_DATA = os.listdir(NY_DIR)

# NOTE: logged at INFO, so this is not shown at the WARNING level set above.
logging.info(
    "%d Jersey City files and %d New York City files", len(JC_DATA), len(NYC_DATA)
)

# Files are split by comparing their names against this string.
# Assumes file names start with "YYYY-MM" so string order is date order.
SCHEMA_CHANGE_DATE = "2021-02"

# CSV paths for NYC, JC (pre and post schema change)
nyc_old = sorted(NY_DIR + f for f in NYC_DATA if f < SCHEMA_CHANGE_DATE)
nyc_new = sorted(NY_DIR + f for f in NYC_DATA if f >= SCHEMA_CHANGE_DATE)

jc_old = sorted(NJ_DIR + f for f in JC_DATA if f < SCHEMA_CHANGE_DATE)
jc_new = sorted(NJ_DIR + f for f in JC_DATA if f >= SCHEMA_CHANGE_DATE)

# Create Annual Data Tables

In [None]:
def clobber_year(year=2019, state="NY") -> pd.DataFrame:
    """
    Creates a dataframe from source CSVs that is all monthly trip data for that `year`

    Column names are lower-cased with spaces removed, rows containing any
    missing value are dropped, and columns are cast to compact dtypes.

    :param year: the year for which to concatenate data files
    :param state: 'NY' or 'NJ'. default 'NY'
    :return: the merged dataframe
    :raises IndexError: if `state` is neither 'NY' nor 'NJ'
    :raises FileNotFoundError: if the state directory has no file for `year`
    """

    if state == "NY":
        source_dir = NY_DIR
    elif state == "NJ":
        source_dir = NJ_DIR
    else:
        raise IndexError(f"No data for state: {state}")

    # Match on the "YYYY-" prefix. The old "<= YYYY-13" upper bound was needed
    # only because "2019-12.csv" sorts after the plain string "2019-12".
    prefix = f"{year}-"
    files = sorted(
        source_dir + f for f in os.listdir(source_dir) if f.startswith(prefix)
    )
    logging.debug(
        "Will merge these files: %s, number of files: %d", files, len(files)
    )
    if not files:
        raise FileNotFoundError(f"No trip data files for {year} in {source_dir}")

    coltypes = {
        "tripduration": "int32",
        # pandas >= 2.0 rejects the unit-less "datetime64"; the unit is required
        "starttime": "datetime64[ns]",
        "stoptime": "datetime64[ns]",
        "startstationid": "category",
        "startstationname": "category",
        "startstationlatitude": "category",
        "startstationlongitude": "category",
        "endstationid": "category",
        "endstationname": "category",
        "endstationlatitude": "category",
        "endstationlongitude": "category",
        "bikeid": "category",
        "usertype": "category",
        "birthyear": "category",
        "gender": "category",
    }

    # Convert each month to the compact dtypes as it is read, then concatenate
    # once at the end. Concatenating inside the loop re-copies every
    # previously loaded month on each iteration.
    frames = []
    for file in files:
        print("loading..." + file)
        df = pd.read_csv(file)
        print("formatting columns..." + file)
        df.columns = [col.lower().replace(" ", "") for col in df.columns]
        # Rows with missing values cannot be cast to int32, so drop them first.
        df.dropna(axis=0, how="any", inplace=True)
        frames.append(df.astype(coltypes))
        print("unloaded..." + file)

    print("concating...", year)
    clobbered = pd.concat(frames, axis=0, ignore_index=True)
    del frames
    gc.collect()

    # update dtypes - category conversion is lost on concat when the monthly
    # frames do not share identical categories
    clobbered = clobbered.astype(coltypes)
    print(year, "...dtypes converted")

    return clobbered

In [None]:
def gen_data_files(years=(2019,), state="NY"):
    """
    Calls clobber_year and writes the yearly rides and stations tables to `data/`

    For every year two parquet files are written:
    `data/stations_<year>.parquet` (the distinct start stations) and
    `data/rides_<year>.parquet` (all rides). CSV export is currently disabled.

    :param years: iterable of years to generate data files for
    :param state: 'NY' or 'NJ'. default 'NY'
    :return: nothing
    """

    # Maps the start-station columns of the rides table to the stations table.
    col_rename = {
        "startstationid": "stationid",
        "startstationname": "stationname",
        "startstationlatitude": "latitude",
        "startstationlongitude": "longitude",
    }

    for year in years:
        # clobber dataframe
        print("clobbering...", year)
        rides = clobber_year(year, state)
        gc.collect()
        print(year, "...clobbered")

        # extract station data and save to file
        print("extracting stations...", year)
        # `rename` is required here: assigning the dict to `.columns` would
        # only re-assign its keys, i.e. the original names.
        stations = (
            rides[list(col_rename)]
            .rename(columns=col_rename)
            .drop_duplicates()  # one row per distinct station, not per ride
            .reset_index(drop=True)
        )
        # confirm dtypes were maintained - can delete this line after testing
        # (`info()` prints by itself and returns None)
        stations.info()
        exportpath = "data/stations_" + str(year) + ".parquet"
        stations.to_parquet(exportpath)
        del stations
        gc.collect()
        print(year, "...stations extracted & saved")

        # TODO: create rebalances dataframe

        # TODO: remove unneeded cols from rides

        # save rides to file
        exportpath = "data/rides_" + str(year) + ".parquet"
        rides.to_parquet(exportpath)
        print(year, "...saved to parquet")

        # unload rides dataframe
        del rides
        gc.collect()
        print(year, "...unloaded")

In [None]:
# Generate the per-year parquet files for New York.
state = "NY"
years = [
    2019
]  # other years (2018-2014) are disabled for now; 2013 was skipped due to concern about months not existing

gen_data_files(years, state)

# Known issue: converting the 2014 datatypes failed and no files were saved:
# ArrowInvalid: ('Could not convert 1899 with type str: tried to convert to double', 'Conversion failed for column birthyear with type category')
# Not investigated further.

clobbering... 2019
loading...data/tripdata_csv/NY/2019-01.csv
formatting columns...data/tripdata_csv/NY/2019-01.csv
concating...data/tripdata_csv/NY/2019-01.csv
unloaded...data/tripdata_csv/NY/2019-01.csv
loading...data/tripdata_csv/NY/2019-02.csv
formatting columns...data/tripdata_csv/NY/2019-02.csv
concating...data/tripdata_csv/NY/2019-02.csv
unloaded...data/tripdata_csv/NY/2019-02.csv
loading...data/tripdata_csv/NY/2019-03.csv
formatting columns...data/tripdata_csv/NY/2019-03.csv
concating...data/tripdata_csv/NY/2019-03.csv
unloaded...data/tripdata_csv/NY/2019-03.csv
loading...data/tripdata_csv/NY/2019-04.csv
formatting columns...data/tripdata_csv/NY/2019-04.csv
concating...data/tripdata_csv/NY/2019-04.csv
unloaded...data/tripdata_csv/NY/2019-04.csv
loading...data/tripdata_csv/NY/2019-05.csv
formatting columns...data/tripdata_csv/NY/2019-05.csv
concating...data/tripdata_csv/NY/2019-05.csv
unloaded...data/tripdata_csv/NY/2019-05.csv
loading...data/tripdata_csv/NY/2019-06.csv
formatti

UnboundLocalError: local variable 'stations' referenced before assignment

# Create Master Rides Table

In [None]:
# Load every yearly rides file and merge them with a single concat.
years = [2019, 2018, 2017, 2016, 2015]
paths = ["data/rides_" + str(y) + ".parquet" for y in years]

dfs = []

for path in paths:
    print("loading..." + path)
    # Appended directly: a `del` of a temporary name would free nothing,
    # because the list keeps its own reference to the frame.
    dfs.append(pd.read_parquet(path))

print("contcating dfs")
rides_all = pd.concat(dfs, axis=0, ignore_index=True)
del dfs
gc.collect()

print("exporting to partquet")
# Export the merged frame (this previously referenced the undefined `temp_df`).
rides_all.to_parquet("data/rides_all.parquet")

loading...data/rides_2019.parquet
loading...data/rides_2018.parquet
loading...data/rides_2017.parquet
loading...data/rides_2016.parquet
loading...data/rides_2015.parquet
contcating dfs


In [None]:
# Alternative merge: fold each yearly file into `rides_all` as soon as it is
# loaded, releasing the yearly frame before reading the next one.
years = [2019, 2018, 2017, 2016, 2015]
paths = [f"data/rides_{y}.parquet" for y in years]

rides_all = pd.DataFrame()

for path in paths:
    print(f"loading...{path}")
    year_rides = pd.read_parquet(path)
    print(f"concating...{path}")
    rides_all = pd.concat([rides_all, year_rides], axis=0, ignore_index=True)
    del year_rides
    gc.collect()

print("exporting to parquet")
rides_all.to_parquet("data/rides_all.parquet")

loading...data/rides_2019.parquet
concating...data/rides_2019.parquet
loading...data/rides_2018.parquet
concating...data/rides_2018.parquet
loading...data/rides_2017.parquet
concating...data/rides_2017.parquet
loading...data/rides_2016.parquet
concating...data/rides_2016.parquet


# Create Stations Data Table

In [None]:
exportpath = "data/NY_2019.parquet"
year_2019.to_parquet(exportpath)

In [None]:
exportpath = "data/NYC_2019"
locations.to_csv(exportpath, index=False)

In [None]:
year_2019.head()

Unnamed: 0,tripduration,starttime,stoptime,startstationid,startstationname,startstationlatitude,startstationlongitude,endstationid,endstationname,endstationlatitude,endstationlongitude,bikeid,usertype,birthyear,gender
0,320,2019-01-01 00:01:47.401,2019-01-01 00:07:07.581,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,316,2019-01-01 00:04:43.736,2019-01-01 00:10:00.608,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,591,2019-01-01 00:06:03.997,2019-01-01 00:15:55.438,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,2719,2019-01-01 00:07:03.545,2019-01-01 00:52:22.650,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.99643,21579,Subscriber,1990,1
4,303,2019-01-01 00:07:35.945,2019-01-01 00:12:39.502,229.0,Great Jones St,40.727434,-73.99379,503.0,E 20 St & Park Ave,40.738274,-73.98752,35379,Subscriber,1979,1


In [None]:
year_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20551697 entries, 0 to 20551696
Data columns (total 15 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   tripduration           int32         
 1   starttime              datetime64[ns]
 2   stoptime               datetime64[ns]
 3   startstationid         category      
 4   startstationname       category      
 5   startstationlatitude   category      
 6   startstationlongitude  category      
 7   endstationid           category      
 8   endstationname         category      
 9   endstationlatitude     category      
 10  endstationlongitude    category      
 11  bikeid                 category      
 12  usertype               category      
 13  birthyear              category      
 14  gender                 category      
dtypes: category(12), datetime64[ns](2), int32(1)
memory usage: 804.6 MB


### Load Missing (Bronx) data

In [None]:
year_2019.isnull().sum()

tripduration               0
starttime                  0
stoptime                   0
startstationid           180
startstationname         180
startstationlatitude       0
startstationlongitude      0
endstationid             180
endstationname           180
endstationlatitude         0
endstationlongitude        0
bikeid                     0
usertype                   0
birthyear                  0
gender                     0
dtype: int64

In [None]:
year_2019.loc[year_2019.startstationid.isna()]

Unnamed: 0,tripduration,starttime,stoptime,startstationid,startstationname,startstationlatitude,startstationlongitude,endstationid,endstationname,endstationlatitude,endstationlongitude,bikeid,usertype,birthyear,gender
248483,2358,2019-01-08 13:48:13.812,2019-01-08 14:27:32.016,,,40.854,-73.890,,,40.857,-73.881,34550,Subscriber,1992,1
249690,3358,2019-01-08 14:29:31.515,2019-01-08 15:25:29.713,,,40.857,-73.884,,,40.857,-73.884,34550,Subscriber,1992,1
336359,596,2019-01-10 14:35:39.212,2019-01-10 14:45:35.413,,,40.863,-73.890,,,40.866,-73.884,34550,Subscriber,1986,1
337008,816,2019-01-10 14:56:38.425,2019-01-10 15:10:14.812,,,40.857,-73.890,,,40.854,-73.902,34447,Subscriber,1986,1
396461,393,2019-01-12 12:36:01.144,2019-01-12 12:42:34.952,,,40.854,-73.890,,,40.857,-73.884,34536,Customer,1997,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13292288,419,2019-08-28 04:56:24.076,2019-08-28 05:03:23.578,,,40.860,-73.899,,,40.860,-73.887,34428,Customer,1998,1
13292321,856,2019-08-28 05:05:22.354,2019-08-28 05:19:39.288,,,40.860,-73.887,,,40.848,-73.908,34428,Customer,1998,1
13292453,949,2019-08-28 05:23:29.688,2019-08-28 05:39:19.687,,,40.848,-73.908,,,40.854,-73.902,34428,Customer,1998,1
13293165,739,2019-08-28 06:09:32.525,2019-08-28 06:21:52.251,,,40.854,-73.899,,,40.848,-73.902,34428,Customer,1998,1


In [None]:
year_2019.loc[year_2019.startstationid.isna()][
    ["startstationlatitude"]
].reset_index().startstationlatitude[0]

40.854

In [None]:
# Download the station information feed and load its station list.
response = requests.get(
    "https://gbfs.citibikenyc.com/gbfs/en/station_information.json",
    timeout=30,  # without a timeout a stalled connection blocks forever
)
response.raise_for_status()  # fail here rather than on a confusing JSON/KeyError
data = response.json()

station_details = pd.DataFrame.from_dict(data["data"]["stations"])
station_details

Unnamed: 0,eightd_has_key_dispenser,station_id,station_type,region_id,external_id,name,capacity,lat,electric_bike_surcharge_waiver,rental_uris,has_kiosk,lon,short_name,rental_methods,legacy_id,eightd_station_services
0,False,72,classic,71,66db237e-0aca-11e7-82f6-3863bb44ef7c,W 52 St & 11 Ave,55,40.767272,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-73.993929,6926.01,"[CREDITCARD, KEY]",72,[]
1,False,79,classic,71,66db269c-0aca-11e7-82f6-3863bb44ef7c,Franklin St & W Broadway,33,40.719116,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-74.006667,5430.08,"[CREDITCARD, KEY]",79,[]
2,False,82,classic,71,66db277a-0aca-11e7-82f6-3863bb44ef7c,St James Pl & Pearl St,27,40.711174,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-74.000165,5167.06,"[CREDITCARD, KEY]",82,[]
3,False,83,classic,71,66db281e-0aca-11e7-82f6-3863bb44ef7c,Atlantic Ave & Fort Greene Pl,62,40.683826,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-73.976323,4354.07,"[CREDITCARD, KEY]",83,[]
4,False,116,classic,71,66db28b5-0aca-11e7-82f6-3863bb44ef7c,W 17 St & 8 Ave,50,40.741776,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-74.001497,6148.02,"[CREDITCARD, KEY]",116,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616,False,4726,classic,71,7c7cb750-6613-48a4-bfae-9240968f1fad,E 26 St & 3 Ave,39,40.740693,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-73.981606,6089.11,"[CREDITCARD, KEY]",4726,[]
1617,False,4732,classic,71,7cb65297-c880-459a-b054-4e059576c02a,50 St & 7 Ave,22,40.642501,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-74.006055,3038.08,"[CREDITCARD, KEY]",4732,[]
1618,False,4738,classic,71,73f36547-e5a9-4039-bca7-a2ccdb78371c,E 106 St & 2 Ave,24,40.790586,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-73.941958,7482.15,"[CREDITCARD, KEY]",4738,[]
1619,False,4739,classic,71,2e5cde21-f0a2-4396-899e-6df275e99212,Valentine Ave & E 181 St,21,40.854124,False,"{'ios': 'https://bkn.lft.to/lastmile_qr_scan',...",True,-73.899322,8435.04,"[CREDITCARD, KEY]",4739,[]


In [None]:
station_details.eightd_has_key_dispenser.value_counts()

False    1621
Name: eightd_has_key_dispenser, dtype: int64

In [None]:
station_details.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1621 entries, 0 to 1620
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   legacy_id                       1621 non-null   object 
 1   rental_methods                  1621 non-null   object 
 2   name                            1621 non-null   object 
 3   eightd_has_key_dispenser        1621 non-null   bool   
 4   capacity                        1621 non-null   int64  
 5   lon                             1621 non-null   float64
 6   electric_bike_surcharge_waiver  1621 non-null   bool   
 7   short_name                      1621 non-null   object 
 8   has_kiosk                       1621 non-null   bool   
 9   rental_uris                     1621 non-null   object 
 10  eightd_station_services         1621 non-null   object 
 11  station_id                      1621 non-null   object 
 12  lat                             16

In [None]:
station_details = station_details.astype({"lon": "float", "lat": "float"})

In [None]:
type(station_details.lon[0])

numpy.float64

In [None]:
station_details.lon[0] == -73.99392888

True

In [None]:
min(station_details.lon)

-74.0867006778717

In [None]:
longitude = -74.0867006778717

station_details.loc[station_details.lon == longitude]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,eightd_station_services,station_id,lat,external_id,station_type,region_id
1529,4619,"[CREDITCARD, KEY]",Bergen Ave & Stegman St,False,19,-74.086701,False,JC108,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4619,40.706575,33a797e8-6be2-4868-b9f6-1be157eaa0a5,classic,70


In [None]:
# Text versions of the coordinates, cut to the first 7 (lon) and 6 (lat)
# characters. NOTE: this truncates the string form; it does not round.
station_details["lon_short"] = station_details["lon"].astype(str).str.slice(stop=7)
station_details["lat_short"] = station_details["lat"].astype(str).str.slice(stop=6)
station_details.head()

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,eightd_station_services,station_id,lat,external_id,station_type,region_id,lon_short,lat_short
0,72,"[CREDITCARD, KEY]",W 52 St & 11 Ave,False,55,-73.993929,False,6926.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],72,40.767272,66db237e-0aca-11e7-82f6-3863bb44ef7c,classic,71,-73.993,40.767
1,79,"[CREDITCARD, KEY]",Franklin St & W Broadway,False,33,-74.006667,False,5430.08,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],79,40.719116,66db269c-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.006,40.719
2,82,"[CREDITCARD, KEY]",St James Pl & Pearl St,False,27,-74.000165,False,5167.06,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],82,40.711174,66db277a-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.0,40.711
3,83,"[CREDITCARD, KEY]",Atlantic Ave & Fort Greene Pl,False,62,-73.976323,False,4354.07,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],83,40.683826,66db281e-0aca-11e7-82f6-3863bb44ef7c,classic,71,-73.976,40.683
4,116,"[CREDITCARD, KEY]",W 17 St & 8 Ave,False,50,-74.001497,False,6148.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],116,40.741776,66db28b5-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.001,40.741


In [None]:
station_details.loc[station_details.lon_short == "-73.890"]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,eightd_station_services,station_id,lat,external_id,station_type,region_id,lon_short,lat_short
1110,4146,"[CREDITCARD, KEY]",Garrison Ave & Manida St,False,30,-73.890247,False,7905.05,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4146,40.818743,81a34754-ae08-47a6-964d-4733849a8fab,classic,71,-73.89,40.818
1182,4227,"[CREDITCARD, KEY]",Crotona Park East & E 173 St,False,19,-73.890413,False,8182.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4227,40.837037,426862de-f8f2-4967-ba41-76680a4b460c,classic,71,-73.89,40.837
1242,4295,"[CREDITCARD, KEY]",Ditmars Blvd & 79 St,False,25,-73.890745,False,6994.06,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4295,40.768728,9157a3bc-43fa-4221-8bb8-17273fc09540,classic,71,-73.89,40.768
1479,4563,"[CREDITCARD, KEY]",Bainbridge Ave & E 196 St,False,20,-73.89042,False,8615.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4563,40.86648,f4f9a09c-010c-430c-a766-3390c44b4962,classic,71,-73.89,40.866


In [None]:
station_details.loc[station_details.lat_short == "40.854"]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,eightd_station_services,station_id,lat,external_id,station_type,region_id,lon_short,lat_short
1366,4434,"[CREDITCARD, KEY]",W 190 St & St. Nicholas Ave,False,28,-73.929513,False,8453.14,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4434,40.854788,c829b6af-002b-4d04-af56-6924d4ad0aa2,classic,71,-73.929,40.854
1438,4516,"[CREDITCARD, KEY]",Sedgwick Ave & W Burnside Ave,False,17,-73.916661,False,8465.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4516,40.854947,3e4a2c26-7527-469c-b4f4-1ef8d40fd20d,classic,71,-73.916,40.854
1525,4613,"[CREDITCARD, KEY]",Creston Ave & E 181 St,False,23,-73.90281,False,8456.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4613,40.8549,41f2113f-13f3-48b3-9919-179df29011b2,classic,71,-73.902,40.854
1619,4739,"[CREDITCARD, KEY]",Valentine Ave & E 181 St,False,21,-73.899322,False,8435.04,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,[],4739,40.854124,2e5cde21-f0a2-4396-899e-6df275e99212,classic,71,-73.899,40.854


In [None]:
station_details.loc[
    (station_details.lon_short == "-73.890") & (station_details.lat_short == "40.854")
]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,eightd_station_services,station_id,lat,external_id,station_type,region_id,lon_short,lat_short


Can we fix this by reverse geocoding the station coordinates?

In [None]:
# initialize geocode
geolocator = Nominatim(user_agent="bikegeocode")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1, max_retries=0)

# pull geolocation data for each station
locations_lst = []
for lat, lon in zip(station_details["lat"], station_details["lon"]):
    location = reverse((lat, lon))
    # `reverse` yields None when nothing is found, or when the rate limiter
    # swallows a failed request. Append an empty record in that case so the
    # rows stay aligned one-to-one with `station_details`.
    if location is None:
        locations_lst.append({})
    else:
        locations_lst.append(location.raw.get("address", {}))

json_locs = pd.DataFrame(locations_lst)
json_locs

Unnamed: 0,amenity,road,neighbourhood,suburb,county,state,postcode,country,country_code,city,...,shop,hamlet,town,office,emergency,commercial,place,industrial,landuse,craft
0,Citi Bike - W 52 St / 11 Ave,West 52nd Street,Theater District,Manhattan,New York County,New York,10019,United States,us,,...,,,,,,,,,,
1,Citi Bike - Franklin St & W Broadway,Franklin Street,Tribeca,Manhattan,,New York,10005,United States,us,New York,...,,,,,,,,,,
2,Citi Bike - Saint James Place & Pearl Street,Pearl Street,Two Bridges,Manhattan,,New York,10038,United States,us,New York,...,,,,,,,,,,
3,Citi Bike - Atlantic Avenue & Fort Greene Place,Atlantic Avenue,,Brooklyn,,New York,11208,United States,us,,...,,,,,,,,,,
4,Citi Bike - W 17 St & 8 Ave,West 17th Street,Chelsea,Manhattan,,New York,10011,United States,us,New York,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1616,,East 26th Street,Kips Bay,Manhattan,,New York,10016,United States,us,New York,...,,,,,,,,,,
1617,,50th Street,Sunset Park,,,New York,11220,United States,us,New York,...,,,,,,,,,,
1618,,East 106th Street,East Harlem,Manhattan,New York County,New York,10029,United States,us,,...,,,,,,,,,,
1619,,Valentine Avenue,,The Bronx,,New York,10457,United States,us,New York,...,,,,,,,,,,


In [None]:
json_locs.suburb.value_counts()

Manhattan        638
Brooklyn         303
The Bronx        258
Queens           146
Queens County     22
Name: suburb, dtype: int64

In [None]:
# Stations whose reverse-geocoded suburb is the Bronx. `station_details` has no
# `boro` column; the borough name is in `json_locs.suburb` as "The Bronx".
# Relies on `json_locs` having one row per `station_details` row, in order.
station_details[json_locs.suburb == "The Bronx"]

NameError: name 'station_details' is not defined

In [None]:
station_details[["neighborhood", "suburb", "postcode"]] = json_locs[
    ["neighbourhood", "suburb", "postcode"]
]

In [None]:
station_details.head()

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,...,station_id,lat,external_id,station_type,region_id,lon_short,lat_short,neighborhood,suburb,postcode
0,72,"[CREDITCARD, KEY]",W 52 St & 11 Ave,False,55,-73.993929,False,6926.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,72,40.767272,66db237e-0aca-11e7-82f6-3863bb44ef7c,classic,71,-73.993,40.767,Theater District,Manhattan,10019
1,79,"[CREDITCARD, KEY]",Franklin St & W Broadway,False,33,-74.006667,False,5430.08,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,79,40.719116,66db269c-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.006,40.719,Tribeca,Manhattan,10005
2,82,"[CREDITCARD, KEY]",St James Pl & Pearl St,False,27,-74.000165,False,5167.06,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,82,40.711174,66db277a-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.0,40.711,Two Bridges,Manhattan,10038
3,83,"[CREDITCARD, KEY]",Atlantic Ave & Fort Greene Pl,False,62,-73.976323,False,4354.07,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,83,40.683826,66db281e-0aca-11e7-82f6-3863bb44ef7c,classic,71,-73.976,40.683,,Brooklyn,11208
4,116,"[CREDITCARD, KEY]",W 17 St & 8 Ave,False,50,-74.001497,False,6148.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,116,40.741776,66db28b5-0aca-11e7-82f6-3863bb44ef7c,classic,71,-74.001,40.741,Chelsea,Manhattan,10011


In [None]:
station_details.loc[station_details.lon_short == "-73.890"]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,...,station_id,lat,external_id,station_type,region_id,lon_short,lat_short,neighborhood,suburb,postcode
1110,4146,"[CREDITCARD, KEY]",Garrison Ave & Manida St,False,30,-73.890247,False,7905.05,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4146,40.818743,81a34754-ae08-47a6-964d-4733849a8fab,classic,71,-73.89,40.818,,The Bronx,10474
1182,4227,"[CREDITCARD, KEY]",Crotona Park East & E 173 St,False,19,-73.890413,False,8182.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4227,40.837037,426862de-f8f2-4967-ba41-76680a4b460c,classic,71,-73.89,40.837,,The Bronx,10460
1242,4295,"[CREDITCARD, KEY]",Ditmars Blvd & 79 St,False,25,-73.890745,False,6994.06,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4295,40.768728,9157a3bc-43fa-4221-8bb8-17273fc09540,classic,71,-73.89,40.768,,Queens,11370
1479,4563,"[CREDITCARD, KEY]",Bainbridge Ave & E 196 St,False,20,-73.89042,False,8615.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4563,40.86648,f4f9a09c-010c-430c-a766-3390c44b4962,classic,71,-73.89,40.866,,The Bronx,10458


In [None]:
station_details.loc[station_details.lat_short == "40.854"]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,...,station_id,lat,external_id,station_type,region_id,lon_short,lat_short,neighborhood,suburb,postcode
1366,4434,"[CREDITCARD, KEY]",W 190 St & St. Nicholas Ave,False,28,-73.929513,False,8453.14,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4434,40.854788,c829b6af-002b-4d04-af56-6924d4ad0aa2,classic,71,-73.929,40.854,Fort George,Manhattan,10040
1438,4516,"[CREDITCARD, KEY]",Sedgwick Ave & W Burnside Ave,False,17,-73.916661,False,8465.01,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4516,40.854947,3e4a2c26-7527-469c-b4f4-1ef8d40fd20d,classic,71,-73.916,40.854,,The Bronx,10453
1525,4613,"[CREDITCARD, KEY]",Creston Ave & E 181 St,False,23,-73.90281,False,8456.02,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4613,40.8549,41f2113f-13f3-48b3-9919-179df29011b2,classic,71,-73.902,40.854,,The Bronx,10453
1619,4739,"[CREDITCARD, KEY]",Valentine Ave & E 181 St,False,21,-73.899322,False,8435.04,True,{'android': 'https://bkn.lft.to/lastmile_qr_sc...,...,4739,40.854124,2e5cde21-f0a2-4396-899e-6df275e99212,classic,71,-73.899,40.854,,The Bronx,10457


In [None]:
station_details.loc[
    (station_details.lon_short == "-73.890") & (station_details.lat_short == "40.854")
]

Unnamed: 0,legacy_id,rental_methods,name,eightd_has_key_dispenser,capacity,lon,electric_bike_surcharge_waiver,short_name,has_kiosk,rental_uris,...,station_id,lat,external_id,station_type,region_id,lon_short,lat_short,neighborhood,suburb,postcode


# Archive

In [None]:
bad code  # intentionally break here if "Run All Cells" is performed

# clobber all old nyc CSVs NOTE THIS CRASHES COMPUTER


# nyc_old_dfs = []
# for file in nyc_old:
#     print(f'file {NY_DIR + file}')
#     df = pd.read_csv(NY_DIR + file)
#     nyc_old_dfs.append(df)
#
# nyc_old_df = pd.concat(nyc_old_dfs, axis=0, ignore_index=True)

In [None]:
# import dask.dataframe as dd
# ddf = dd.read_csv(nyc_old,
#                   dtype={'birth year': 'object',
#                          'end station id': 'float64'})
#
# # columns are Sentence Cased for some CSVs and lower cased for others
# ddf = ddf.rename(columns=str.lower)

In [None]:
# ddf.describe().compute()

## Monthly Aggregation


In [None]:
def _first_mode(column):
    """Return the most frequent value of `column` (smallest on ties), or None if it has no values."""
    modes = column.mode()
    return None if modes.empty else modes.iloc[0]


# TODO only works for old schema at the moment
def summarise_months(outfilename: str, months: list):
    """
    Writes monthly summary given list of monthly trip data

    One row is written per input file, indexed by the file name without its
    `.csv` suffix. Each row holds the trip count, the mean trip duration, the
    most frequent start/end station attributes and, when the columns exist,
    the user type and gender counts as dicts.

    :param outfilename: where to write the summary csv
    :param months: list of CSVs for the monthly trip data
    :return: None
    """
    mode_columns = [
        "startstationid",
        "startstationname",
        "startstationlatitude",
        "startstationlongitude",
        "endstationid",
        "endstationname",
        "endstationlatitude",
        "endstationlongitude",
    ]
    summaries = []

    for file in months:
        df = pd.read_csv(file)
        df.columns = [col.lower().replace(" ", "") for col in df.columns]

        # file name without directory and extension, e.g. "2019-01"
        year_month = os.path.basename(file).removesuffix(".csv")

        summary = {
            "datetime": year_month,
            "counttrips": df.shape[0],
            "meanduration": df.tripduration.mean(),
        }
        # Store the mode as a scalar; `Series.mode()` itself returns a Series.
        for col in mode_columns:
            summary["mode" + col] = _first_mode(df[col])

        if "usertype" in df.columns:
            summary["usertypevalues"] = df.usertype.value_counts().to_dict()
        elif "member_casual" in df.columns:
            summary["usertypevalues"] = df.member_casual.value_counts().to_dict()

        if "gender" in df.columns:
            summary["gendervalues"] = df.gender.value_counts().to_dict()

        summaries.append(summary)

    # Build the frame in one step; `DataFrame.append` was removed in pandas 2.0.
    summary_df = pd.DataFrame(summaries)
    if not summary_df.empty:
        # `set_index` returns a new frame, so the result must be kept.
        summary_df = summary_df.set_index("datetime")
    summary_df.to_csv(outfilename)


# write summary data month by month for NYC and NJ
summarise_months(DATA_DIR + "summary_nyc_old_schema.csv", nyc_old)
summarise_months(DATA_DIR + "summary_jc_old_schema.csv", jc_old)

# read NYC summary
nyc_old_schema_summary = pd.read_csv(
    DATA_DIR + "summary_nyc_old_schema.csv", index_col=0
)
nyc_old_schema_summary

# read JC summary (this previously re-read the NYC file by mistake)
jc_old_schema_summary = pd.read_csv(
    DATA_DIR + "summary_jc_old_schema.csv", index_col=0
)
jc_old_schema_summary

## Original Clobber with logging

In [None]:
def clobber_year(year=2019, state="NY") -> pd.DataFrame:
    """
    Creates a dataframe from source CSVs that is all monthly trip data for that `year`

    Archived variant: all monthly files are read first, concatenated once,
    and only then cast to compact dtypes. Rows with missing values are kept.

    :param year: the year for which to concatenate data files
    :param state: 'NY' or 'NJ'. default 'NY'
    :return: the merged dataframe
    :raises IndexError: if `state` is neither 'NY' nor 'NJ'
    :raises FileNotFoundError: if the state directory has no file for `year`
    """

    if state == "NY":
        source_dir = NY_DIR
    elif state == "NJ":
        source_dir = NJ_DIR
    else:
        raise IndexError(f"No data for state: {state}")

    # Match on the "YYYY-" prefix. The old "<= YYYY-13" upper bound was needed
    # only because "2019-12.csv" sorts after the plain string "2019-12".
    prefix = f"{year}-"
    files = sorted(
        source_dir + f for f in os.listdir(source_dir) if f.startswith(prefix)
    )
    logging.debug(
        "Will merge these files: %s, number of files: %d", files, len(files)
    )
    if not files:
        raise FileNotFoundError(f"No trip data files for {year} in {source_dir}")

    # Concatenate all monthly data in range
    dfs = []
    for file in files:
        df = pd.read_csv(file)
        df.columns = [col.lower().replace(" ", "") for col in df.columns]
        logging.debug("Appending df file: %s...", file)
        dfs.append(df)
    logging.debug("Merging dataframes...")
    clobbered = pd.concat(dfs, axis=0, ignore_index=True)

    # unload temp variables
    del df, dfs
    gc.collect()

    # update dtypes after the merge, so the category columns cover all months
    coltypes = {
        "tripduration": "int32",
        # pandas >= 2.0 rejects the unit-less "datetime64"; the unit is required
        "starttime": "datetime64[ns]",
        "stoptime": "datetime64[ns]",
        "startstationid": "category",
        "startstationname": "category",
        "startstationlatitude": "category",
        "startstationlongitude": "category",
        "endstationid": "category",
        "endstationname": "category",
        "endstationlatitude": "category",
        "endstationlongitude": "category",
        "bikeid": "category",
        "usertype": "category",
        "birthyear": "category",
        "gender": "category",
    }
    clobbered = clobbered.astype(coltypes)
    print(year, "...dtypes converted")

    return clobbered


def gen_data_files(years=(2019,), state="NY") -> None:
    """
    Calls clobber_year and writes output to both csv and parquet to `data/`

    Archived variant: writes `data/rides_<year>.parquet` and
    `data/rides_<year>.csv` for every requested year.

    :param years: iterable of years to generate data files for
    :param state: 'NY' or 'NJ'. default 'NY'
    :return: nothing (the `-> None` annotation documents exactly that)
    """

    gc.collect()
    for year in years:
        # clobber dataframe
        print("clobbering...", year)
        temp_df = clobber_year(year, state)
        print(year, "...clobbered")

        # TODO: extract station data [only if uberparquet fails]
        #   perform reverse geocode ---> later
        #   create yearly stations dataframe
        #   save to file
        #   unload related dfs

        # TODO: clean dataframe
        #   drop NAs? - not implemented atm
        #   drop station cols

        # save to files
        base_path = "data/rides_" + str(year)
        temp_df.to_parquet(base_path + ".parquet")
        print(year, "...saved to parquet")

        temp_df.to_csv(base_path + ".csv")
        print(year, "...saved to csv")

        # unload dataframe
        del temp_df
        gc.collect()
        print(year, "...unloaded")