In [1]:
#!/usr/bin/env python

# make sure to install these packages before running:
#!pip install pandas
#!pip install sodapy
from api_keys import key
import pandas as pd
from sodapy import Socrata
import numpy as np
import requests


In [2]:
select = [ 'dropoff_census_tract', 
       'dropoff_community_area', 
       'pickup_census_tract', 
       'pickup_community_area', 
       'trip_end_timestamp', 'trip_id', 'trip_miles', 'trip_seconds',
       'trip_start_timestamp', 'trip_total']
select_string = ""
for cat in select:
    select_string += cat
    if select[-1]  != cat:
        select_string += ', '
select_string

'dropoff_census_tract, dropoff_community_area, pickup_census_tract, pickup_community_area, trip_end_timestamp, trip_id, trip_miles, trip_seconds, trip_start_timestamp, trip_total'

In [58]:
#https://data.cityofchicago.org/resource/wrvz-psew.json?trip_start_timestamp=2017-01-01T16:45:00.000&
            
  
# Example authenticated client (needed for non-public datasets):
client = Socrata("data.cityofchicago.org",
                 key)

In [53]:
# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("wrvz-psew", where="trip_start_timestamp between '2016-09-01' and '2017-09-01' and dropoff_community_area<>76 and pickup_community_area <> 76 ",
                    select=select_string ,limit=1000)


# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)   
results_df.head()

Unnamed: 0,dropoff_census_tract,dropoff_community_area,pickup_census_tract,pickup_community_area,trip_end_timestamp,trip_id,trip_miles,trip_seconds,trip_start_timestamp,trip_total
0,,22,,7,2016-10-01T00:45:00.000,6d165b1e87bfa63419c654b057bd985250edb49c,3.6,540,2016-10-01T00:45:00.000,11.5
1,,32,,8,2017-03-31T07:15:00.000,6d165b983fd7bed2733e3d826063bd54dfd76c9a,2.1,480,2017-03-31T07:00:00.000,11.5
2,,28,,7,2016-12-09T15:00:00.000,6d165e585a0644804e96fd7c76e33e9e24a23a7d,1.4,600,2016-12-09T15:00:00.000,8.0
3,17031081401.0,8,17031833000.0,28,2016-11-17T19:00:00.000,6d165f6dd500ce740cd3a4d71afe4fac477d2ca9,0.0,1080,2016-11-17T18:45:00.000,13.2
4,17031081402.0,8,17031320100.0,32,2016-11-10T19:45:00.000,6d1664116818f0c3b40bc890e6b232d6ac9db0ec,1.1,480,2016-11-10T19:45:00.000,10.0


In [34]:
help(client.get)

Help on method get in module sodapy:

get(dataset_identifier, content_type='json', **kwargs) method of sodapy.Socrata instance
    Read data from the requested resource. Options for content_type are json,
    csv, and xml. Optionally, specify a keyword arg to filter results:
    
        select : the set of columns to be returned, defaults to *
        where : filters the rows to be returned, defaults to limit
        order : specifies the order of results
        group : column to group results on
        limit : max number of results to return, defaults to 1000
        offset : offset, used for paging. Defaults to 0
        q : performs a full text search for a value
        query : full SoQL query string, all as one parameter
        exclude_system_fields : defaults to true. If set to false, the
            response will include system fields (:id, :created_at, and
            :updated_at)
    
    More information about the SoQL parameters can be found at the official
    docs:
   

In [50]:
results_df.columns

Index(['dropoff_census_tract', 'dropoff_community_area', 'pickup_census_tract',
       'pickup_community_area', 'trip_end_timestamp', 'trip_id', 'trip_miles',
       'trip_seconds', 'trip_start_timestamp', 'trip_total'],
      dtype='object')

In [53]:
#drop NA
results=results_df.dropna(axis=0)
results.head()
len(results)

695369

In [32]:
#drop unnessasry columns
#remove_columns_results_df=results.drop(columns=['company', 'extras', 'fare', 'payment_type','taxi_id', 'tips', 'tolls'])
#len(remove_columns_results_df)

1000

In [33]:
#drop ohair routs
remove_dropoff76_results_df=remove_columns_results_df[remove_columns_results_df.dropoff_community_area !="76"]
#remove_pickup76_results_df=remove_dropoff76_results_df[remove_dropoff76_results_df.pickup_community_area !="76"]
#remove_pickup76_results_df.head()
len( remove_dropoff76_results_df)

1000

In [None]:
#remove_pickup76_results_df.dropoff_centroid_latitude = remove_pickup76_results_df.dropoff_centroid_latitude.astype(float)
#remove_pickup76_results_df.dropoff_centroid_longitude = remove_pickup76_results_df.dropoff_centroid_longitude.astype(float)
#remove_pickup76_results_df.pickup_centroid_latitude = remove_pickup76_results_df.pickup_centroid_latitude.astype(float)
#remove_pickup76_results_df.pickup_centroid_longitude = remove_pickup76_results_df.pickup_centroid_longitude.astype(float)
#remove_pickup76_results_df.dtypes

In [None]:
#to calculate distances
# attempt one
!pip install pyproj
from pyproj import Geod

wgs84_geod = Geod(EPSG=3435) #Distance will be measured on this ellipsoid - more accurate than a spherical method

#Get distance between pairs of lat-lon points
def Distance(lat1,lon1,lat2,lon2):
  az12,az21,dist = wgs84_geod.inv(lon1,lat1,lon2,lat2) #Yes, this order is correct
  return dist

#Create test data
lat1 = np.random.uniform(-90,90,100)
lon1 = np.random.uniform(-180,180,100)
lat2 = np.random.uniform(-90,90,100)
lon2 = np.random.uniform(-180,180,100)

#Package as a dataframe
df = pd.DataFrame({'lat1':lat1,'lon1':lon1,'lat2':lat2,'lon2':lon2})

#Add/update a column to the data frame with the distances (in metres)
df['dist'] = Distance(df['lat1'].tolist(),df['lon1'].tolist(),df['lat2'].tolist(),df['lon2'].tolist())

In [56]:
#!pip install censusgeocode
import censusgeocode as cg

In [107]:
help(client.get)

Help on method get in module sodapy:

get(dataset_identifier, content_type='json', **kwargs) method of sodapy.Socrata instance
    Read data from the requested resource. Options for content_type are json,
    csv, and xml. Optionally, specify a keyword arg to filter results:
    
        select : the set of columns to be returned, defaults to *
        where : filters the rows to be returned, defaults to limit
        order : specifies the order of results
        group : column to group results on
        limit : max number of results to return, defaults to 1000
        offset : offset, used for paging. Defaults to 0
        q : performs a full text search for a value
        query : full SoQL query string, all as one parameter
        exclude_system_fields : defaults to true. If set to false, the
            response will include system fields (:id, :created_at, and
            :updated_at)
    
    More information about the SoQL parameters can be found at the official
    docs:
   

In [17]:
results_census_tract = client.get("wrvz-psew", where="trip_start_timestamp between '2016-09-01' and '2017-09-01' and dropoff_community_area",
                     limit=1000)


In [33]:
census_track_df.columns

Index(['dropoff_census_tract', 'dropoff_centroid_latitude',
       'dropoff_centroid_longitude', 'dropoff_community_area'],
      dtype='object')

In [98]:
ct_sel = 'dropoff_census_tract, dropoff_centroid_latitude, dropoff_centroid_longitude, dropoff_community_area'

In [105]:
results_census_tract = client.get("wrvz-psew",where="dropoff_census_tract is not  NULL",
                     select=ct_sel, limit =2000, group = 'dropoff_census_tract'
                                 )

HTTPError: 400 Client Error: Bad Request.
	Invalid SoQL query

In [103]:
census_track_df = pd.DataFrame.from_records(results_census_tract)

In [104]:
census_track_df

Unnamed: 0,dropoff_census_tract,dropoff_centroid_latitude,dropoff_centroid_longitude,dropoff_community_area
0,17031062100,41.942691844,-87.651770507,6
1,17031063302,41.934762456,-87.639853859,6
2,17031832600,41.914747305,-87.654007029,7
3,17031080300,41.90749193,-87.63576009,8
4,17031081500,41.892507781,-87.626214906,8
5,17031081700,41.892042136,-87.63186395,8
6,17031320100,41.884987192,-87.620992913,32
7,17031320100,41.884987192,-87.620992913,32
8,17031320600,41.870607372,-87.622172937,32
9,17031071100,41.921778188,-87.651061884,7


In [40]:
census_track_df.sort_values('dropoff_census_tract')

KeyError: 'dropoff_census_tract'