# SQL Assignment - uc288

Found a Python 2 and 3 compatibility cheat sheet online (http://python-future.org/compatible_idioms.html), which I used to update the function by **@fedhere** so that it is compatible with both Python 2 and 3.

In [1]:
try:
    # Python 3 packages
    from urllib.parse import urlparse, urlencode
    from urllib.request import urlopen, Request
    from urllib.error import HTTPError
except ImportError:
    # Python 2 packages
    from urlparse import urlparse
    from urllib import urlencode
    from urllib2 import urlopen, Request, HTTPError
    
# works for both python 2 and 3
import ast
import json
from io import StringIO

import pandas as pd

In [2]:
# Carto SQL API (v2) endpoint hosted under uc288.carto.com; queryCartoDB uses
# it as the default value of its `source` parameter.
# NOTE(review): the URL ends with an empty `?q=` while queryCartoDB sends the
# actual query as `q` in the request body -- presumably the body value is the
# one the server uses; confirm.
SQL_SOURCE = 'https://uc288.carto.com/api/v2/sql?q='

In [3]:
def queryCartoDB(query, format='CSV', source=SQL_SOURCE):
    """Run a SQL query against a Carto SQL API endpoint and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL statement to execute.
    format : str, optional
        Value sent as the ``format`` form field (default ``'CSV'``). The
        response body is always parsed with ``pandas.read_csv``, so a
        CSV-compatible format is expected.
    source : str, optional
        URL the request is sent to (default ``SQL_SOURCE``).

    Returns
    -------
    pandas.DataFrame
        The response body parsed as CSV.

    Raises
    ------
    ValueError
        If the server answers with an HTTP error. The message is built from
        the ``error`` entries of the JSON response body when it can be parsed,
        otherwise from the raw body text.
    """
    # urlopen() requires a bytes body on Python 3; supplying a body also turns
    # the request into a POST.
    data = urlencode({'format': format, 'q': query}).encode('utf-8')
    try:
        response = urlopen(source, data)
    except HTTPError as e:
        # The error body is bytes on Python 3, so decode it before parsing;
        # otherwise the server's message would be lost behind a parse error.
        body = e.read().decode('utf-8', 'replace')
        try:
            messages = json.loads(body)['error']
        except (ValueError, KeyError, TypeError):
            # Not the expected {"error": [...]} JSON (e.g. an HTML error page):
            # fall back to reporting the raw body.
            messages = [body]
        if not isinstance(messages, (list, tuple)):
            messages = [messages]
        # '%s' formatting (rather than str()) keeps non-ASCII text working on
        # Python 2 as well.
        raise ValueError('\n'.join('%s' % m for m in messages))
    try:
        return pd.read_csv(response)
    finally:
        # read_csv consumes the whole body, so the connection can be released.
        response.close()

## Task 1: Familiarize with SQL Clauses
* Sort data by start_station_id, tripduration (only checking trips with duration <= 3 hours)
* Only show the top/last 10 records
* List all unique start_station_id values
* Aggregate functions:
    * Count the number of trips
    * Find the average / min / max trip duration

This query combines all the requested information: it aggregates the trip statistics per station and sorts the result by station ID.

In [4]:
# Per-station trip statistics (count, average, min and max tripduration) for
# trips of at most 3 hours (10800 s), limited to the 10 lowest station IDs.
# GROUP BY already yields exactly one row per start_station_id, so DISTINCT is
# unnecessary and a secondary sort key could never break a tie.
sql_1 = '''SELECT start_station_id,
COUNT(start_station_id) AS trip_counts,
AVG(tripduration) AS avg_tripduration,
MIN(tripduration) AS min_tripduration,
MAX(tripduration) AS max_tripduration
FROM citibike
WHERE tripduration <= 10800
GROUP BY start_station_id
ORDER BY start_station_id ASC
LIMIT 10
'''

This shows the first 10 stations

In [5]:
# Execute sql_1; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_1)

Unnamed: 0,start_station_id,trip_counts,avg_tripduration,min_tripduration,max_tripduration
0,72,114,742.701754,107,2099
1,79,71,659.140845,149,1916
2,82,49,445.346939,138,1581
3,83,35,783.685714,73,2647
4,116,288,543.965278,66,2081
5,119,19,1044.789474,278,2167
6,120,17,979.882353,270,2131
7,127,209,710.84689,83,5776
8,128,247,664.206478,62,2875
9,137,96,697.364583,85,2316


In [6]:
# Per-station trip statistics (count, average, min and max tripduration) for
# trips of at most 3 hours (10800 s), limited to the 10 highest station IDs.
# GROUP BY already yields exactly one row per start_station_id, so DISTINCT is
# unnecessary and a secondary sort key could never break a tie.
sql_1_2 = '''SELECT start_station_id,
COUNT(start_station_id) AS trip_counts,
AVG(tripduration) AS avg_tripduration,
MIN(tripduration) AS min_tripduration,
MAX(tripduration) AS max_tripduration
FROM citibike
WHERE tripduration <= 10800
GROUP BY start_station_id
ORDER BY start_station_id DESC
LIMIT 10
'''

This shows the last 10 stations.

In [7]:
# Execute sql_1_2; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_1_2)

Unnamed: 0,start_station_id,trip_counts,avg_tripduration,min_tripduration,max_tripduration
0,3002,184,714.646739,79,7225
1,2023,91,771.978022,108,2454
2,2022,96,979.0625,107,2495
3,2021,141,738.815603,90,6592
4,2017,86,763.383721,109,2476
5,2012,256,714.519531,79,2026
6,2010,76,763.184211,167,2610
7,2009,107,777.794393,60,2765
8,2008,86,832.22093,94,3476
9,2006,79,1130.531646,90,3459


In [8]:
# The 10 start stations with the highest average tripduration, considering
# only trips of at most 3 hours (10800 s).
# GROUP BY already yields exactly one row per start_station_id, so no DISTINCT
# is needed.
sql_1_3 = '''SELECT start_station_id,
COUNT(start_station_id) AS trip_counts,
AVG(tripduration) AS avg_tripduration,
MIN(tripduration) AS min_tripduration,
MAX(tripduration) AS max_tripduration
FROM citibike
WHERE tripduration <= 10800
GROUP BY start_station_id
ORDER BY avg_tripduration DESC
LIMIT 10
'''

This shows the top 10 stations with the longest average trip duration.

In [9]:
# Execute sql_1_3; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_1_3)

Unnamed: 0,start_station_id,trip_counts,avg_tripduration,min_tripduration,max_tripduration
0,471,32,1218.28125,60,2251
1,481,13,1143.923077,165,2270
2,2006,79,1130.531646,90,3459
3,2005,7,1130.285714,545,2513
4,2002,45,1116.711111,77,2426
5,431,28,1095.535714,184,2423
6,372,9,1083.333333,233,2160
7,532,50,1076.6,198,2167
8,332,36,1067.111111,201,5930
9,119,19,1044.789474,278,2167


## Task 2 - Working with date/time
* Selecting trips started on Feb 2, 2015 only
* Selecting trips started on the weekends
* What is the average trip duration during weekends?
* Can we do the same for weekdays?

### Trips started on Feb. 2, 2015 only

In [10]:
# Trips whose starttime falls on 2015-02-02, ordered by starttime. The filter
# is a half-open range: from midnight of Feb 2 (inclusive) up to midnight of
# Feb 3 (exclusive).
sql_2_1 = '''SELECT starttime, stoptime, start_station_id, 
end_station_id, tripduration, usertype, gender, birth_year
FROM citibike
WHERE starttime >= '2015-02-02 00:00:00'
AND starttime < '2015-02-03 00:00:00'
ORDER BY starttime
'''

In [11]:
# Execute sql_2_1; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_2_1)

Unnamed: 0,starttime,stoptime,start_station_id,end_station_id,tripduration,usertype,gender,birth_year
0,2015-02-02 00:02:00+00,2015-02-02 00:05:00+00,442,489,199,Subscriber,1,1992.0
1,2015-02-02 00:03:00+00,2015-02-02 00:10:00+00,326,349,418,Subscriber,2,1964.0
2,2015-02-02 00:04:00+00,2015-02-02 00:09:00+00,279,415,276,Subscriber,1,1974.0
3,2015-02-02 00:06:00+00,2015-02-02 00:13:00+00,496,237,420,Subscriber,2,1992.0
4,2015-02-02 00:07:00+00,2015-02-02 00:12:00+00,477,450,304,Subscriber,1,1968.0
5,2015-02-02 00:08:00+00,2015-02-02 00:18:00+00,467,391,622,Subscriber,1,1979.0
6,2015-02-02 00:09:00+00,2015-02-02 00:15:00+00,434,482,371,Subscriber,1,1963.0
7,2015-02-02 00:11:00+00,2015-02-02 00:19:00+00,330,2008,477,Subscriber,1,1982.0
8,2015-02-02 00:12:00+00,2015-02-02 00:25:00+00,521,423,743,Subscriber,2,1978.0
9,2015-02-02 00:12:00+00,2015-02-02 00:22:00+00,504,453,599,Subscriber,1,1956.0


### Trips that started on the weekends with the average duration

In [12]:
# Average tripduration for each weekend day, one row per day of week.
# PostgreSQL's date_part('dow', ...) numbers days from 0 (Sunday) to
# 6 (Saturday), so IN (0, 6) keeps Sunday and Saturday.
sql_2_2 = '''SELECT date_part('dow', starttime) as day_of_week,
AVG(tripduration) as avg_tripduration
FROM citibike
WHERE date_part('dow', starttime) IN (0, 6)
GROUP BY date_part('dow', starttime)'''

In [13]:
# Execute sql_2_2; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_2_2)

Unnamed: 0,day_of_week,avg_tripduration
0,0,638.1492
1,6,686.460825


In [14]:
# Single overall average tripduration across all weekend trips
# (day of week 0 = Sunday or 6 = Saturday); no GROUP BY, so one row results.
sql_2_21 = '''SELECT AVG(tripduration) as avg_tripduration
FROM citibike
WHERE date_part('dow', starttime) IN (0, 6)'''

The average trip duration for the weekend.

In [15]:
# Execute sql_2_21; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_2_21)

Unnamed: 0,avg_tripduration,Unnamed: 1
0,662.942181,


### Trips that started on the weekdays with the average duration

In [16]:
# Average tripduration for each weekday, one row per day of week, ordered by
# day. The filter keeps day-of-week values strictly between 0 (Sunday) and
# 6 (Saturday), i.e. 1 (Monday) through 5 (Friday).
sql_2_3 = '''SELECT date_part('dow', starttime) as day_of_week,
AVG(tripduration) as avg_tripduration
FROM citibike
WHERE date_part('dow', starttime) > 0
AND  date_part('dow', starttime) < 6
GROUP BY date_part('dow', starttime)
ORDER BY day_of_week'''

In [17]:
# Execute sql_2_3; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_2_3)

Unnamed: 0,day_of_week,avg_tripduration
0,1,920.862234
1,2,767.224443
2,3,697.556559
3,4,623.39672
4,5,637.116968


In [18]:
# Single overall average tripduration across all weekday trips
# (day of week 1 = Monday through 5 = Friday); no GROUP BY, so one row results.
sql_2_31 = '''SELECT AVG(tripduration) as avg_tripduration
FROM citibike
WHERE date_part('dow', starttime) > 0
AND  date_part('dow', starttime) < 6'''

The average trip duration for the weekdays.

In [19]:
# Execute sql_2_31; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_2_31)

Unnamed: 0,avg_tripduration,Unnamed: 1
0,681.052292,


## Task 3 - Working with Space

* Showing the list of start station locations
    * Using GROUP BY
* Showing the number of trips started per station
    * … but only for stations within 500m of Times Square!
    * The coordinates of Times Square are (40.7577,-73.9857)

In [20]:
# Number of trips per start station, restricted to stations within 500 m of
# the point (lat 40.7577, lon -73.9857), in ascending order of trip count.
# - The ::geography casts make ST_DWithin interpret the distance in meters.
# - the_geom_webmercator is the station's point transformed to Web Mercator.
# - MIN(cartodb_id) keeps one row id per station group.
# - Unquoted identifiers are folded to lower case by PostgreSQL, so
#   ORDER BY numtrips refers to the numTrips alias.
sql_3 = '''SELECT start_station_latitude, start_station_longitude, start_station_name,
    CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)) AS the_geom_webmercator,
    MIN(cartodb_id) AS cartodb_id,
    COUNT(tripduration) AS numTrips
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography,
                500)
GROUP BY start_station_latitude, start_station_longitude, start_station_name
ORDER BY numtrips'''

In [21]:
# Execute sql_3; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_3)

Unnamed: 0,start_station_latitude,start_station_longitude,start_station_name,the_geom_webmercator,cartodb_id,numtrips
0,40.755273,-73.983169,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,1115,112
1,40.7568,-73.982912,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,19,141
2,40.759291,-73.988597,W 45 St & 8 Ave,0101000020110F00001607D538556B5FC119A6CA6F41FC...,124,141
3,40.760647,-73.984427,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,42,213
4,40.75757,-73.990985,W 42 St & 8 Ave,0101000020110F00003F9300B2976B5FC1B8B88F3102FC...,783,221
5,40.755136,-73.98658,Broadway & W 41 St,0101000020110F000004AF971C1D6B5FC17A48F3C1A8FB...,54,251
6,40.756405,-73.990026,W 41 St & 8 Ave,0101000020110F0000F55695027D6B5FC186E5BB69D7FB...,33,507


## Task 4 - Putting it all together

### Find the station that had the longest average trip duration during weekends and within 500m of Times Square!

In [22]:
# Average tripduration per start station for weekend trips (day of week
# 0 = Sunday or 6 = Saturday) starting within 500 m of the point
# (lat 40.7577, lon -73.9857). Rows are sorted by descending average, so the
# station with the longest average duration comes first.
sql_4_1 = '''SELECT start_station_latitude, start_station_longitude, start_station_name,
    CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)) AS the_geom_webmercator,
    MIN(cartodb_id) AS cartodb_id,
    AVG(tripduration) AS avg_duration
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography,
                500)
AND date_part('dow', starttime) IN (0, 6)
GROUP BY start_station_latitude, start_station_longitude, start_station_name
ORDER BY avg_duration DESC'''

In [23]:
# Execute sql_4_1; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_4_1)

Unnamed: 0,start_station_latitude,start_station_longitude,start_station_name,the_geom_webmercator,cartodb_id,avg_duration
0,40.760647,-73.984427,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,42,1010.104167
1,40.759291,-73.988597,W 45 St & 8 Ave,0101000020110F00001607D538556B5FC119A6CA6F41FC...,124,762.931818
2,40.755136,-73.98658,Broadway & W 41 St,0101000020110F000004AF971C1D6B5FC17A48F3C1A8FB...,54,683.121212
3,40.7568,-73.982912,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,19,675.4
4,40.756405,-73.990026,W 41 St & 8 Ave,0101000020110F0000F55695027D6B5FC186E5BB69D7FB...,33,643.260274
5,40.755273,-73.983169,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,1115,629.7
6,40.75757,-73.990985,W 42 St & 8 Ave,0101000020110F00003F9300B2976B5FC1B8B88F3102FC...,783,579.142857


In [24]:
# Average tripduration per (start station, weekend day) for trips starting
# within 500 m of the point (lat 40.7577, lon -73.9857). Grouping additionally
# by day of week reports Sunday (0) and Saturday (6) separately; rows are
# ordered by day, then by descending average duration within each day.
sql_4_11 = '''SELECT start_station_latitude, start_station_longitude, start_station_name,
    CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)) AS the_geom_webmercator,
    date_part('dow', starttime) AS day_of_week, 
    MIN(cartodb_id) AS cartodb_id,
    AVG(tripduration) AS avg_duration
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography,
                500)
AND date_part('dow', starttime) IN (0, 6)
GROUP BY start_station_latitude, start_station_longitude, start_station_name, date_part('dow', starttime) 
ORDER BY day_of_week, avg_duration DESC'''

In [25]:
# Execute sql_4_11; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_4_11)

Unnamed: 0,start_station_latitude,start_station_longitude,start_station_name,the_geom_webmercator,day_of_week,cartodb_id,avg_duration
0,40.759291,-73.988597,W 45 St & 8 Ave,0101000020110F00001607D538556B5FC119A6CA6F41FC...,0,124,799.722222
1,40.755136,-73.98658,Broadway & W 41 St,0101000020110F000004AF971C1D6B5FC17A48F3C1A8FB...,0,54,671.818182
2,40.756405,-73.990026,W 41 St & 8 Ave,0101000020110F0000F55695027D6B5FC186E5BB69D7FB...,0,33,647.483871
3,40.760647,-73.984427,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,0,42,594.954545
4,40.7568,-73.982912,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,0,19,555.0
5,40.75757,-73.990985,W 42 St & 8 Ave,0101000020110F00003F9300B2976B5FC1B8B88F3102FC...,0,783,476.8
6,40.755273,-73.983169,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,0,1115,410.5
7,40.760647,-73.984427,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,6,39625,1361.384615
8,40.7568,-73.982912,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,6,41260,813.0
9,40.755273,-73.983169,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,6,40490,775.833333


### Extra: 
**Create lines** for trips that started from stations within 500m of Times Square and **lasted less than 2 hours**. The number of trips for each pair of stations is output as an attribute of these lines.

In [26]:
# One row per (start station, end station) pair for trips shorter than
# 7200 s (2 hours) that start within 500 m of the point
# (lat 40.7577, lon -73.9857).
# - ST_Makeline builds a line between the Web Mercator points of the start
#   and end stations; it is returned as the_geom_webmercator.
# - numTrips counts the trips for the pair; MIN(cartodb_id) keeps one row id.
# - There is no ORDER BY, so the row order is whatever the server returns.
sql_4_2 = '''SELECT start_station_latitude, start_station_longitude, start_station_name,
    end_station_latitude, end_station_longitude, end_station_name,
    ST_Makeline(CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)),
                   CDB_TransformToWebmercator(CDB_LatLng(end_station_latitude, end_station_longitude))) AS the_geom_webmercator,
    MIN(cartodb_id) AS cartodb_id,
    COUNT(tripduration) AS numTrips
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography,
                500)
    AND tripduration < 7200
GROUP BY start_station_latitude, start_station_longitude, start_station_name, end_station_latitude, end_station_longitude, end_station_name'''

In [27]:
# Execute sql_4_2; the DataFrame returned by queryCartoDB is the cell's output.
queryCartoDB(sql_4_2)

Unnamed: 0,start_station_latitude,start_station_longitude,start_station_name,end_station_latitude,end_station_longitude,end_station_name,the_geom_webmercator,cartodb_id,numtrips
0,40.756405,-73.990026,W 41 St & 8 Ave,40.704718,-74.009260,Pearl St & Hanover Square,0102000020110F000002000000F55695027D6B5FC186E5...,13694,3
1,40.757570,-73.990985,W 42 St & 8 Ave,40.760958,-73.967245,E 58 St & 3 Ave,0102000020110F0000020000003F9300B2976B5FC1B8B8...,12816,10
2,40.755136,-73.986580,Broadway & W 41 St,40.739713,-73.994564,W 18 St & 6 Ave,0102000020110F00000200000004AF971C1D6B5FC17A48...,21914,1
3,40.755136,-73.986580,Broadway & W 41 St,40.735243,-73.987586,E 16 St & Irving Pl,0102000020110F00000200000004AF971C1D6B5FC17A48...,22653,1
4,40.755273,-73.983169,W 43 St & 6 Ave,40.769155,-73.981918,Broadway & W 60 St,0102000020110F000002000000F22A632FBE6A5FC1363A...,20088,6
5,40.755136,-73.986580,Broadway & W 41 St,40.720196,-73.989978,Allen St & Rivington St,0102000020110F00000200000004AF971C1D6B5FC17A48...,3976,1
6,40.756405,-73.990026,W 41 St & 8 Ave,40.758281,-73.970694,E 53 St & Lexington Ave,0102000020110F000002000000F55695027D6B5FC186E5...,15127,4
7,40.755273,-73.983169,W 43 St & 6 Ave,40.751551,-73.993934,8 Ave & W 33 St,0102000020110F000002000000F22A632FBE6A5FC1363A...,36993,1
8,40.756405,-73.990026,W 41 St & 8 Ave,40.760660,-73.980420,W 51 St & 6 Ave,0102000020110F000002000000F55695027D6B5FC186E5...,14514,8
9,40.757570,-73.990985,W 42 St & 8 Ave,40.750977,-73.987654,Broadway & W 36 St,0102000020110F0000020000003F9300B2976B5FC1B8B8...,15100,1
