In [35]:
# Install necessary packages
!pip install geopy



In [36]:
# Import packages
import glob
from datetime import datetime

import pandas as pd
from geopy import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# Mount Google Drive at /content/drive/ so the shared-drive paths used by the
# later cells (raw datasets in, processed data out) can be read and written.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [37]:
# Load the taxi-zone lookup table (taxi-zone-lookup.csv) straight from GitHub.
# Later cells rely on its LocationID, Borough and Zone columns.
url = 'https://raw.githubusercontent.com/fivethirtyeight/uber-tlc-foil-response/master/uber-trip-data/taxi-zone-lookup.csv'
location_codes = pd.read_csv(filepath_or_buffer=url)

In [38]:
# Geocode every taxi zone with Nominatim, querying "<Borough>, <Zone>".
locator = Nominatim(user_agent='uber')

# Nominatim's usage policy allows at most one request per second. RateLimiter
# spaces the calls out and retries transient failures; swallow_exceptions=False
# keeps persistent service errors visible instead of silently returning None.
geocode = RateLimiter(locator.geocode, min_delay_seconds=1, swallow_exceptions=False)

# Composite zone names ("A/B") keep only the part before the first slash.
# The .str accessor propagates a missing zone name as NaN instead of raising.
location_codes['query'] = location_codes['Borough'] + ", " + location_codes['Zone'].str.split('/').str[0]

# Rows without a usable query are stored as None, exactly like zones the
# geocoder cannot resolve, so the downstream missing-value handling sees both.
location_codes['location'] = location_codes['query'].apply(
    lambda q: geocode(query=q) if pd.notna(q) else None
)

In [39]:
# Copy the coordinates of each geocoder result into 'Lat' / 'Lon' columns.
# Zones with no geocoder result (location is None) get None in both columns.
for column, attribute in (('Lat', 'latitude'), ('Lon', 'longitude')):
  location_codes[column] = location_codes['location'].map(
      lambda place, attribute=attribute: None if place is None else getattr(place, attribute)
  )

# Reverse-lookup tables: "<Lat>, <Lon>" string -> borough / zone name.
memoize_borough = {}
memoize_neighborhood = {}
def memoize(row):
  """Record the row's Borough and Zone under the key '<Lat>, <Lon>'.

  Zones that share identical coordinates share a key, so the last one
  processed wins.
  """
  coords = '{}, {}'.format(row['Lat'], row['Lon'])
  memoize_borough[coords] = row['Borough']
  memoize_neighborhood[coords] = row['Zone']

# Fill in missing values
# Hand-entered coordinates for the zones the geocoder could not resolve, applied
# positionally in row order. New York City lies in the western hemisphere, so
# the longitudes must be negative to agree with the geocoded rows.
# NOTE(review): the list assumes exactly these 8 zones fail, in this order;
# re-check it whenever the geocoder results change.
fallback_lat = [40.5564, 40.7932, 40.7931, 40.8837, 40.7082, 40.8972, 40.8837, 40.7082]
fallback_lon = [-74.1735, -73.9213, -73.8860, -73.8931, -73.9567, -73.8861, -73.8931, -73.9571]

missing = location_codes['location'].isna()
if missing.sum() != len(fallback_lat):
  raise ValueError(
      'Expected {} zones without a geocoder result but found {}; '
      'update the fallback coordinates.'.format(len(fallback_lat), missing.sum())
  )
location_codes.loc[missing, 'Lat'] = fallback_lat
location_codes.loc[missing, 'Lon'] = fallback_lon

# Build the lookup tables only after the gaps are filled; otherwise every
# un-geocoded zone would be stored under the single key 'nan, nan'.
location_codes.apply(memoize, axis=1)

# Keep only the ID and coordinate columns, and rename the ID column to
# 'locationID', the key used when merging with the 2015 trip data.
location_codes = (
    location_codes
    .drop(columns=['Borough', 'Zone', 'query', 'location'])
    .rename(columns={'LocationID': 'locationID'})
)



In [40]:
# Read uber-raw-data-janjune-15.csv, attach each pickup's latitude / longitude
# through its locationID, then keep only the pickup timestamp (renamed 'Date')
# and the coordinates.
data = (
    pd.read_csv('/content/drive/Shared drives/Team Mean Green Learning Machine/raw datasets/uber-raw-data-janjune-15.csv')
    .merge(location_codes, on='locationID')
    .drop(columns=['locationID', 'Dispatching_base_num', 'Affiliated_base_num'])
    .rename(columns={'Pickup_date': 'Date'})
)

In [41]:
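# Drop rows with unknown pickup location
# NOTE: disabled. `data` has no 'Borough' or 'Neighborhood' column at this
# point ('Borough' and 'Zone' are dropped from location_codes before the merge),
# so these filters would raise a KeyError if re-enabled as written.
# data = data[data['Borough'] != 'Unknown']
# data = data[data['Neighborhood'] != 'Unknown']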

In [42]:
# Add every 2014 monthly file to the 2015 trips already held in `data`.
# Sorting makes the row order of the result independent of glob's ordering.
files = sorted(glob.glob("/content/drive/Shared drives/Team Mean Green Learning Machine/raw datasets/uber-raw-data-*14.csv"))

# Collect the pieces and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copied the whole accumulated frame on every iteration.
frames = [data]
for file in files:
  add = pd.read_csv(file)
  add = add.drop(columns='Base').rename(columns={'Date/Time': 'Date'})
  # Vectorized parse with the same format the files use (e.g. month/day/year h:m:s)
  add['Date'] = pd.to_datetime(add['Date'], format="%m/%d/%Y %H:%M:%S")
  frames.append(add)

# Add data to final dataframe
data = pd.concat(frames, ignore_index=True)
del frames  # release the references to the per-file frames

In [43]:
# Normalise 'Date' to datetimes, then expose its day, month and year as
# separate integer columns.
data['Date'] = pd.to_datetime(data['Date'])
for column, attribute in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
  data[column] = getattr(data['Date'].dt, attribute)

In [44]:
# Write the combined, processed trips to the shared drive as CSV
# (called with pandas' default options).
data.to_csv(path_or_buf='/content/drive/Shared drives/Team Mean Green Learning Machine/processed data/data.csv')