## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

## Import Data

### --- Citi Bike 2022 Data ---

In [11]:
# Build the list of trip files to load from the data folder.
# Only *.csv entries are kept so stray files in the folder (.DS_Store,
# .ipynb_checkpoints, notes, ...) are never handed to pd.read_csv, and the
# list is sorted because os.listdir returns entries in arbitrary order --
# sorting keeps the row order of the combined frame reproducible across runs.
folderpath = r"Data"
filepath = sorted(
    os.path.join(folderpath, name)
    for name in os.listdir(folderpath)
    if name.lower().endswith(".csv")
)

In [14]:
# read each file listed in `filepath` and stack the results into a single
# dataframe (36 files according to the original note); ignore_index=True
# discards the per-file row labels and numbers the combined rows 0..n-1
df = pd.concat(
    map(lambda csv_path: pd.read_csv(csv_path, low_memory=False), filepath),
    ignore_index=True,
)

In [15]:
# preview the first three rows of the combined ride data
df.head(3)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member


In [16]:
# preview the last three rows of the combined ride data
df.tail(3)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
29838803,11C8C5E0DB947B07,classic_bike,2022-12-01 05:56:14.903,2022-12-01 06:06:10.357,Avenue D & E 3 St,5436.09,Bleecker St & Crosby St,5679.08,40.720828,-73.977932,40.726156,-73.995102,member
29838804,5B9B083C534A5964,classic_bike,2022-12-02 11:54:15.871,2022-12-02 12:01:00.747,Montague St & Clinton St,4677.06,Sands St & Jay St,4821.03,40.694271,-73.992327,40.700119,-73.9862,member
29838805,91C286C462F89A50,classic_bike,2022-12-18 13:35:22.574,2022-12-18 13:37:27.193,Montague St & Clinton St,4677.06,Cadman Plaza E & Tillary St,4677.01,40.694271,-73.992327,40.695977,-73.990149,member


### --- API Weather Data ---

**1. Define Token**

In [10]:
# Read the NOAA API token from the environment instead of hard-coding it:
# a token written into the notebook is exposed to everyone who can see the
# file or its version history. The token that was previously written in this
# cell should be treated as compromised and replaced with a new one.
# Set it before starting Jupyter, e.g.:  export NOAA_TOKEN=<your token>
Token = os.environ.get('NOAA_TOKEN')
if not Token:
    raise RuntimeError(
        "NOAA_TOKEN environment variable is not set; "
        "export your NOAA API token before running this cell."
    )

**2. Compile URL**

In [23]:
# Request the 2022 daily TAVG readings of the GHCND dataset for station
# GHCND:USW00014732 from the NOAA CDO v2 API.
# - the query is passed through `params` so requests builds and encodes the
#   URL instead of it being assembled by hand in one long string
# - `timeout` keeps the cell from hanging indefinitely if the service stalls
# - raise_for_status() surfaces HTTP errors (bad token, rate limit, outage)
#   here rather than as a confusing failure when the body is parsed later
r = requests.get(
    'https://www.ncdc.noaa.gov/cdo-web/api/v2/data',
    params={
        'datasetid': 'GHCND',
        'datatypeid': 'TAVG',
        'limit': 1000,
        'stationid': 'GHCND:USW00014732',
        'startdate': '2022-01-01',
        'enddate': '2022-12-31',
    },
    headers={'token': Token},
    timeout=60,
)
r.raise_for_status()

**3. Parse the JSON Response**

In [30]:
# parse the JSON body of the API response into a Python dict
d = r.json()

# the wrangling step indexes d['results'] unconditionally; stop here with a
# readable message if the API answered without that key
if 'results' not in d:
    raise ValueError(f"NOAA response has no 'results' key: {d}")

**4. Wrangle Data Results**

In [31]:
# keep only the readings in the response whose datatype is TAVG
avg_temps = list(
    filter(lambda reading: reading['datatype'] == 'TAVG', d['results'])
)

# split those readings into two parallel lists: the date of each reading ...
dates_temp = list(map(lambda reading: reading['date'], avg_temps))

# ... and its reported value
temps = list(map(lambda reading: reading['value'], avg_temps))

In [32]:
# start from an empty dataframe; the date and temperature columns are
# assigned to it in the following cell
df_temp = pd.DataFrame()

In [33]:
# parse the timestamp strings returned by the API ("YYYY-MM-DDTHH:MM:SS")
# into pandas datetimes in one vectorised call. The time component is parsed,
# not stripped; the result is a datetime column that the ride dates can be
# matched against in the merge.
df_temp['date'] = pd.to_datetime(dates_temp, format="%Y-%m-%dT%H:%M:%S")

# convert temperature from tenths of a degree Celsius to degrees Celsius
df_temp['avgTemp'] = [float(value) / 10.0 for value in temps]

In [35]:
# check the last three rows of the weather table (end of the requested range)
df_temp.tail(3)

Unnamed: 0,date,avgTemp
362,2022-12-29,6.4
363,2022-12-30,9.3
364,2022-12-31,8.2


In [36]:
# check the first three rows of the weather table (start of the requested range)
df_temp.head(3)

Unnamed: 0,date,avgTemp
0,2022-01-01,11.6
1,2022-01-02,11.4
2,2022-01-03,1.4


In [39]:
# parse the ride start timestamps (text with fractional seconds) into
# datetime values, replacing the original column
df['started_at'] = df['started_at'].pipe(
    pd.to_datetime,
    format="%Y-%m-%d %H:%M:%S.%f",
)

In [40]:
# derive the calendar day of each ride as a datetime set to midnight.
# .dt.normalize() keeps the column in datetime dtype, so it avoids creating
# one Python date object per ride (tens of millions of them) through
# .dt.date, and it drops the `format` argument, which is not needed for a
# column that has already been parsed to datetime.
df['date'] = pd.to_datetime(df['started_at']).dt.normalize()

In [41]:
# make sure the merge key is a pandas datetime column so it can be matched
# against df_temp['date'] in the merge below
df['date'] = pd.to_datetime(df['date'])

In [42]:
%%time
# merge the bike data with the weather data
df_merged = df.merge(df_temp, how = 'left', on = 'date', indicator = True)

CPU times: total: 58.8 s
Wall time: 1min 49s


In [43]:
# preview the merged table: ride columns plus date, avgTemp and _merge
df_merged.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp,_merge
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,7650.05,Mt Morris Park W & W 120 St,7685.14,40.802117,-73.968181,40.804038,-73.945925,member,2022-01-21,-6.0,both
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,4028.04,Boerum Pl\t& Pacific St,4488.09,40.673746,-73.985649,40.688489,-73.99116,member,2022-01-10,1.6,both
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,6753.08,5 Ave & E 29 St,6248.06,40.761227,-73.96094,40.745168,-73.986831,member,2022-01-26,-2.3,both
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,7338.02,5 Ave & E 29 St,6248.06,40.783964,-73.947167,40.745168,-73.986831,member,2022-01-03,1.4,both
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,6364.1,5 Ave & E 29 St,6248.06,40.74964,-73.98805,40.745168,-73.986831,member,2022-01-22,-5.9,both


In [44]:
# count how many rides found a matching weather row ('both') and how many
# did not ('left_only'); dropna=False would also list missing indicator values
df_merged['_merge'].value_counts(dropna = False)

_merge
both          29838166
left_only          640
right_only           0
Name: count, dtype: int64

In [45]:
# save the merged rides + weather table as ny_data.csv in the working directory
# NOTE(review): with the default index=True the row index is written as an
# unnamed first column, and the `_merge` indicator column is exported as
# well -- confirm whether downstream readers expect either of them
df_merged.to_csv('ny_data.csv')

In [46]:
# (rows, columns) of the ride dataframe, which by now includes the added `date` column
# NOTE(review): this inspects `df`, not `df_merged` -- if the intent is to
# verify that the merge kept the row count, compare with df_merged.shape
df.shape

(29838806, 14)