In [4]:
# Short aliases for the Spark SQL function and data-type modules used by the
# cells below.
# NOTE(review): relies on `pyspark` already being bound in the kernel
# namespace; no import is visible in this notebook — confirm.
funcs, types = pyspark.sql.functions, pyspark.sql.types

In [2]:
# Load the raw Citi Bike trips from parquet.
bike = spark.read.format('parquet').load('/bigdata/citibike.parquet')

In [3]:
# Peek at the first rows of the raw data (times are still numeric here).
bike.show(n=5)

+-------------+----------------+----------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+-----------------+---------------+-----------+---------+
|trip_duration|      start_time|       stop_time|start_station_id|  start_station_name|start_station_latitude|start_station_longitude|end_station_id|    end_station_name|end_station_latitude|end_station_longitude|bike_id|   user_type|birth_year|gender|start_taxizone_id|end_taxizone_id|start_ct_id|end_ct_id|
+-------------+----------------+----------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+-----------------+---------------+-----------+---------+
|          801|1481563027000000|1481563829000000|            3002|"South 

In [4]:
# Convert the trip start/stop times from epoch microseconds to
# 'yyyy-MM-dd HH:mm:ss' strings (the format `from_unixtime` produces).
#
# The value is divided by 1,000,000 to get seconds and truncated to a whole
# number with a 64-bit LongType cast: a 32-bit IntegerType cannot represent
# epoch seconds past January 2038, whereas `from_unixtime` works on longs.
#
# NOTE(review): the resulting columns are strings rendered in the Spark
# session time zone, not TimestampType (the taxi cell does cast to a
# timestamp) — confirm that downstream readers expect strings.
# NOTE(review): this cell overwrites `bike`, so it must run exactly once per
# load; re-running it would divide the already-formatted strings.
bike = (
    bike
    .withColumn(
        'start_time',
        funcs.from_unixtime((bike.start_time / 1000000).cast(types.LongType())))
    .withColumn(
        'stop_time',
        funcs.from_unixtime((bike.stop_time / 1000000).cast(types.LongType())))
)

In [5]:
# Check the converted start/stop time columns.
bike.show(n=5)

+-------------+-------------------+-------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+-----------------+---------------+-----------+---------+
|trip_duration|         start_time|          stop_time|start_station_id|  start_station_name|start_station_latitude|start_station_longitude|end_station_id|    end_station_name|end_station_latitude|end_station_longitude|bike_id|   user_type|birth_year|gender|start_taxizone_id|end_taxizone_id|start_ct_id|end_ct_id|
+-------------+-------------------+-------------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+-----------------+---------------+-----------+---------+
|          801|2016-12-12 12:17:07|2016-12-12 12:30:29|

In [6]:
# Hash-partition the trips into 32 partitions by start station, order each
# partition by station and then start time, and overwrite the
# snappy-compressed parquet copy.
(
    bike
    .repartition(32, 'start_station_id')
    .sortWithinPartitions('start_station_id', 'start_time')
    .write
    .mode('overwrite')
    .option('compression', 'snappy')
    .parquet('/data/citibike_repartitioned.parquet')
)

In [7]:
# Load the raw subway turnstile readings from parquet.
subway = spark.read.format('parquet').load('/bigdata/subway.parquet')

In [8]:
# Peek at the first rows of the raw data (`endtime` is still numeric here).
subway.show(n=5)

+------+------+----------+-------+--------+--------+----------------+-----------+-------------+-----------+
|    ca|  unit|       scp|station|linename|division|         endtime|description|cumul_entries|cumul_exits|
+------+------+----------+-------+--------+--------+----------------+-----------+-------------+-----------+
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|1403913600000000|  "REGULAR"|      4679542|    1591173|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|1403928000000000|  "REGULAR"|      4679583|    1591180|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|1403942400000000|  "REGULAR"|      4679603|    1591196|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|1403956800000000|  "REGULAR"|      4679707|    1591296|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|1403971200000000|  "REGULAR"|      4680003|    1591342|
+------+------+----------+-------+--------+--------+----------------+-----------+-------------+-----------+
only showing top 5 rows



In [9]:
# Convert `endtime` from epoch microseconds to a 'yyyy-MM-dd HH:mm:ss' string
# (the format `from_unixtime` produces).
#
# The value is divided by 1,000,000 to get seconds and truncated with a 64-bit
# LongType cast: a 32-bit IntegerType cannot represent epoch seconds past
# January 2038, whereas `from_unixtime` works on longs.
#
# NOTE(review): the result is a string rendered in the Spark session time
# zone, not a TimestampType — confirm that downstream readers expect strings.
# NOTE(review): this cell overwrites `subway`, so it must run exactly once per
# load.
subway = subway.withColumn(
    'endtime',
    funcs.from_unixtime((subway.endtime / 1000000).cast(types.LongType())),
)

In [10]:
# Check the converted `endtime` column.
subway.show(n=5)

+------+------+----------+-------+--------+--------+-------------------+-----------+-------------+-----------+
|    ca|  unit|       scp|station|linename|division|            endtime|description|cumul_entries|cumul_exits|
+------+------+----------+-------+--------+--------+-------------------+-----------+-------------+-----------+
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|2014-06-27 20:00:00|  "REGULAR"|      4679542|    1591173|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|2014-06-28 00:00:00|  "REGULAR"|      4679583|    1591180|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|2014-06-28 04:00:00|  "REGULAR"|      4679603|    1591196|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|2014-06-28 08:00:00|  "REGULAR"|      4679707|    1591296|
|"A002"|"R051"|"02-00-00"| "NULL"|  "NULL"|  "NULL"|2014-06-28 12:00:00|  "REGULAR"|      4680003|    1591342|
+------+------+----------+-------+--------+--------+-------------------+-----------+-------------+-----------+
only showing top 5 rows

In [11]:
# Hash-partition the readings into 16 partitions by turnstile key
# (ca, unit, scp) and order each partition by that key and then by time.
subway = (
    subway
    .repartition(16, 'ca', 'unit', 'scp')
    .sortWithinPartitions('ca', 'unit', 'scp', 'endtime')
)

# Overwrite the snappy-compressed parquet copy with the reorganised data.
(
    subway.write
    .mode('overwrite')
    .option('compression', 'snappy')
    .parquet('/data/subway_repartitioned.parquet')
)

In [64]:
# Load the raw taxi trips from parquet.
taxi = spark.read.format('parquet').load('/bigdata/all_trips.parquet')

In [65]:
# Peek at the first rows of the raw data before any type normalisation.
taxi.show(n=5)

+-------------+----------------+----------------+------------------+-------------------+---------+-----+-----------+---------------------+-------+---------------+------------+------------+----------------+---------------+------------------+------------------+------------+------------------+----------+------------+------------+-------------+---------+---------+
|dropoff_ct_id|dropoff_datetime|dropoff_latitude| dropoff_longitude|dropoff_taxizone_id|ehail_fee|extra|fare_amount|improvement_surcharge|mta_tax|passenger_count|payment_type|pickup_ct_id| pickup_datetime|pickup_latitude|  pickup_longitude|pickup_taxizone_id|rate_code_id|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|trip_type|vendor_id|
+-------------+----------------+----------------+------------------+-------------------+---------+-----+-----------+---------------------+-------+---------------+------------+------------+----------------+---------------+------------------+------------------+------------+--

In [66]:
# Normalise the column types of the raw taxi trips and add a surrogate key.
#
# Timestamps: the pickup/dropoff values are epoch microseconds. They are
# divided by 1,000,000, truncated to whole seconds with a 64-bit LongType cast
# (a 32-bit IntegerType cannot hold epoch seconds past January 2038) and then
# cast straight to TimestampType. Going through `from_unixtime` first would
# format the instant as a local-time string and parse it back, which is
# ambiguous for the repeated hour when daylight-saving time ends.
# NOTE(review): relies on Spark's numeric -> timestamp cast interpreting the
# value as seconds since the epoch (the default, non-ANSI behaviour) — confirm
# for this cluster's Spark version/configuration.
#
# Ids and counts are cast to IntegerType; coordinates, fees and distances to
# FloatType.
# NOTE(review): FloatType is single precision — confirm that is sufficient for
# the latitude/longitude and money columns.
#
# NOTE(review): this cell overwrites `taxi`, so it must run exactly once per
# load.
taxi = (
    taxi
    # --- timestamps -------------------------------------------------------
    .withColumn('dropoff_datetime',
                (taxi.dropoff_datetime / 1000000)
                .cast(types.LongType()).cast(types.TimestampType()))
    .withColumn('pickup_datetime',
                (taxi.pickup_datetime / 1000000)
                .cast(types.LongType()).cast(types.TimestampType()))
    # --- zone / census-tract ids ------------------------------------------
    .withColumn('dropoff_ct_id', taxi.dropoff_ct_id.cast(types.IntegerType()))
    .withColumn('pickup_ct_id', taxi.pickup_ct_id.cast(types.IntegerType()))
    .withColumn('dropoff_taxizone_id', taxi.dropoff_taxizone_id.cast(types.IntegerType()))
    .withColumn('pickup_taxizone_id', taxi.pickup_taxizone_id.cast(types.IntegerType()))
    # --- coordinates, fees and distances ----------------------------------
    .withColumn('dropoff_latitude', taxi.dropoff_latitude.cast(types.FloatType()))
    .withColumn('dropoff_longitude', taxi.dropoff_longitude.cast(types.FloatType()))
    .withColumn('ehail_fee', taxi.ehail_fee.cast(types.FloatType()))
    .withColumn('extra', taxi.extra.cast(types.FloatType()))
    .withColumn('fare_amount', taxi.fare_amount.cast(types.FloatType()))
    .withColumn('improvement_surcharge', taxi.improvement_surcharge.cast(types.FloatType()))
    .withColumn('mta_tax', taxi.mta_tax.cast(types.FloatType()))
    .withColumn('pickup_latitude', taxi.pickup_latitude.cast(types.FloatType()))
    .withColumn('pickup_longitude', taxi.pickup_longitude.cast(types.FloatType()))
    .withColumn('tip_amount', taxi.tip_amount.cast(types.FloatType()))
    .withColumn('tolls_amount', taxi.tolls_amount.cast(types.FloatType()))
    .withColumn('total_amount', taxi.total_amount.cast(types.FloatType()))
    .withColumn('trip_distance', taxi.trip_distance.cast(types.FloatType()))
    # --- counts / codes ---------------------------------------------------
    .withColumn('passenger_count', taxi.passenger_count.cast(types.IntegerType()))
    .withColumn('rate_code_id', taxi.rate_code_id.cast(types.IntegerType()))
    # --- surrogate key: unique per row, but not consecutive ----------------
    .withColumn('trip_id', funcs.monotonically_increasing_id())
)

In [67]:
# Check the normalised columns and the new `trip_id`.
taxi.show(n=5)

+-------------+--------------------+----------------+-----------------+-------------------+---------+-----+-----------+---------------------+-------+---------------+------------+------------+--------------------+---------------+----------------+------------------+------------+------------------+----------+------------+------------+-------------+---------+---------+-------+
|dropoff_ct_id|    dropoff_datetime|dropoff_latitude|dropoff_longitude|dropoff_taxizone_id|ehail_fee|extra|fare_amount|improvement_surcharge|mta_tax|passenger_count|payment_type|pickup_ct_id|     pickup_datetime|pickup_latitude|pickup_longitude|pickup_taxizone_id|rate_code_id|store_and_fwd_flag|tip_amount|tolls_amount|total_amount|trip_distance|trip_type|vendor_id|trip_id|
+-------------+--------------------+----------------+-----------------+-------------------+---------+-----+-----------+---------------------+-------+---------------+------------+------------+--------------------+---------------+----------------+---

In [68]:
# Write the trips as snappy-compressed parquet, one directory per pickup taxi
# zone, overwriting any previous copy.
#
# The in-partition sort leads with the partition column and then the pickup
# time. A write partitioned by `pickup_taxizone_id` needs rows ordered by that
# column; sorting by `pickup_datetime` alone does not provide it, so the
# writer may re-sort by zone and lose the time ordering. Sorting by
# (zone, time) satisfies the writer and keeps each zone's rows in time order,
# matching how the bike and subway cells sort by their partition key first.
# NOTE(review): `repartition` is called without a partition count here, so the
# number of shuffle partitions comes from the session configuration — confirm
# that is intended.
(
    taxi
    .repartition('pickup_taxizone_id')
    .sortWithinPartitions('pickup_taxizone_id', 'pickup_datetime')
    .write
    .partitionBy('pickup_taxizone_id')
    .parquet('/data/all_trips_repartitioned.parquet', compression='snappy', mode='overwrite')
)