In [1]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.plotting import figure
import bokeh
# Direct Bokeh output (bokeh.io.output_notebook) to this notebook so that
# later Bokeh figures can be shown inline.
output_notebook()

In [15]:
import mpld3
# Turn on mpld3's notebook integration; the plotting cells below end with
# mpld3.display() to render the current matplotlib figure.
mpld3.enable_notebook()

In [2]:
# Load the raw trip data. No format is passed, so Spark's default data source
# is used (presumably Parquet, given the path -- confirm the session config).
# NOTE(review): `spark` is not created anywhere in this notebook; it is assumed
# to be the SparkSession injected by the PySpark kernel.
df = spark.read.load('/data4/parquet/citibike.parquet')

In [4]:
# Print the first 10 rows as a text table to sanity-check the load.
df.show(10)

+-------------+----------------+----------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+
|trip_duration|      start_time|       stop_time|start_station_id|  start_station_name|start_station_latitude|start_station_longitude|end_station_id|    end_station_name|end_station_latitude|end_station_longitude|bike_id|   user_type|birth_year|gender|
+-------------+----------------+----------------+----------------+--------------------+----------------------+-----------------------+--------------+--------------------+--------------------+---------------------+-------+------------+----------+------+
|         4253|1475863020000000|1475867274000000|            3163|"Central Park Wes...|             40.773407|              -73.97783|          3163|"Central Park Wes...|           40.773407|            -73.97783|  26005|  "Customer"|       

In [5]:
# List (column name, Spark type) pairs. In the recorded output start_time and
# stop_time are still 'bigint', i.e. not yet converted to timestamps.
df.dtypes

[('trip_duration', 'int'),
 ('start_time', 'bigint'),
 ('stop_time', 'bigint'),
 ('start_station_id', 'int'),
 ('start_station_name', 'string'),
 ('start_station_latitude', 'float'),
 ('start_station_longitude', 'float'),
 ('end_station_id', 'int'),
 ('end_station_name', 'string'),
 ('end_station_latitude', 'float'),
 ('end_station_longitude', 'float'),
 ('bike_id', 'int'),
 ('user_type', 'string'),
 ('birth_year', 'float'),
 ('gender', 'int')]

In [6]:
from pyspark.sql.types import IntegerType, TimestampType, DateType, DateConverter, DatetimeConverter
import pyspark.sql.functions as func

## Fastparquet writes timestamps as microseconds since the Unix epoch by default. The next cell converts them to Spark timestamps.

In [7]:
# Convert the microsecond-epoch integers in start_time / stop_time into Spark
# timestamps.
#
# Casting a numeric column to "timestamp" interprets the value as seconds since
# the Unix epoch, so dividing by 1e6 first yields the correct instant. This
# replaces from_unixtime(...) + to_utc_timestamp(..., "UTC"): that route renders
# the instant as a session-local string and parses it back, which is ambiguous
# during a DST fall-back hour and drops sub-second precision, while the "UTC"
# conversion itself adds no shift.
MICROSECONDS_PER_SECOND = 1000000
for time_col in ("start_time", "stop_time"):
    # Only convert columns that are still raw integers, so re-running this cell
    # on an already-converted DataFrame is a no-op instead of corrupting it.
    if dict(df.dtypes)[time_col] == "bigint":
        df = df.withColumn(
            time_col,
            (func.col(time_col) / MICROSECONDS_PER_SECOND).cast("timestamp"),
        )

In [8]:
# Co-locate all trips of a start station in the same partition, then order the
# rows inside each partition by start time (no global sort is performed).
df = (
    df
    .repartition("start_station_id")
    .sortWithinPartitions("start_time")
)

## Partitioned write

In [9]:
# Write the DataFrame as snappy-compressed Parquet, partitioned on disk by
# start_station_id (one `start_station_id=<id>` directory per station), and
# replace any output already present at the target path.
partitioned_writer = (
    df.write
    .mode('overwrite')
    .partitionBy('start_station_id')
    .option('compression', "snappy")
)
partitioned_writer.parquet('/data4/parquet_spark/citibike.parquet')

In [56]:
# Trip start times expressed as fractional hours since midnight, for stations
# 482 and 3163, read from the partitioned dataset root with a filter on the
# partition column.
#
# Fix: the fractional-minute term is second(start_time)/60.0. The original
# expression added hour(start_time)/60.0 there, which skews every value by up
# to 23/3600 of an hour depending on the hour of day and ignores the seconds.
HOUR_OF_DAY_BY_STATION_SQL = (
    'SELECT (hour(start_time)*60+minute(start_time)+second(start_time)/60.0)/60.0 as start_time '
    'from parquet.`/data4/parquet_spark/citibike.parquet` '
    'where start_station_id={station_id}'
)
df2 = spark.sql(HOUR_OF_DAY_BY_STATION_SQL.format(station_id=482))
df3 = spark.sql(HOUR_OF_DAY_BY_STATION_SQL.format(station_id=3163))

## Partitioned read

In [10]:
# Same fractional-hour-of-day query as a direct read of one partition
# directory per station (start_station_id=482 and start_station_id=3163),
# instead of filtering the dataset root.
#
# Fix: the fractional-minute term is second(start_time)/60.0; the original
# expression mistakenly used hour(start_time)/60.0 in that position.
df4 = spark.sql('''SELECT (hour(start_time)*60+minute(start_time)+second(start_time)/60.0)/60.0 as start_time 
                    from parquet.`/data4/parquet_spark/citibike.parquet/start_station_id=482`''')
df5 = spark.sql('''SELECT (hour(start_time)*60+minute(start_time)+second(start_time)/60.0)/60.0 as start_time 
                    from parquet.`/data4/parquet_spark/citibike.parquet/start_station_id=3163`''')

In [13]:
import seaborn
import numpy as np
from matplotlib import pyplot as plt
from bokeh import mpl

In [14]:
import sys

In [57]:
# Overlay the start-time distributions of the two filtered queries (df2, df3)
# on one figure. 97 evenly spaced edges over [0, 24] give quarter-hour bins.
quarter_hour_edges = np.linspace(0, 24, 97)
for station_trips in (df2, df3):
    # Collect to pandas and cast the hour-of-day column to float32 for plotting.
    start_hours = station_trips.toPandas()['start_time'].astype(np.float32)
    seaborn.distplot(start_hours, bins=quarter_hour_edges)
# Render the current matplotlib figure through mpld3.
mpld3.display()

  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j


In [40]:
# Same overlay as the filtered-read plot, but using the frames read directly
# from the per-station partition directories (df4, df5).
bin_edges = np.linspace(0, 24, 97)  # quarter-hour bins over a 24-hour day

partition_482_hours = df4.toPandas()['start_time'].astype(np.float32)
seaborn.distplot(partition_482_hours, bins=bin_edges)

partition_3163_hours = df5.toPandas()['start_time'].astype(np.float32)
seaborn.distplot(partition_3163_hours, bins=bin_edges)

# Render the current matplotlib figure through mpld3.
mpld3.display()

  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j


## Reading the Parquet files written by Spark with Dask/fastparquet does not work because of datetime incompatibilities

In [47]:
from glob import glob

In [50]:
import dask.dataframe as dd
import fastparquet

In [53]:
# Open the first Parquet part file that Spark wrote for station 482's
# partition directory with fastparquet.
station_482_part_files = glob('/data4/parquet_spark/citibike.parquet/start_station_id=482/*parquet')
m = fastparquet.ParquetFile(station_482_part_files[0])

In [54]:
# Show the ParquetFile summary (columns, file name, row count). In the recorded
# output the column list has no 'start_station_id' entry: that value appears
# only in the partition directory name of the file's path.
m

<Parquet File: {'columns': ['trip_duration', 'start_time', 'stop_time', 'start_station_name', 'start_station_latitude', 'start_station_longitude', 'end_station_id', 'end_station_name', 'end_station_latitude', 'end_station_longitude', 'bike_id', 'user_type', 'birth_year', 'gender'], 'categories': [], 'name': '/data4/parquet_spark/citibike.parquet/start_station_id=482/part-r-00140-f8a97310-1d33-4890-9550-c192fb0d03c5.snappy.parquet', 'rows': 153755}>

In [60]:
# Try to load a single column into pandas. In the recorded run this raised
# KeyError: 'trip_duration' even though the column is listed in the file's
# summary; this is the failure the section heading refers to.
# NOTE(review): fastparquet documents `columns` as a list of names -- confirm
# whether columns=['trip_duration'] behaves differently before concluding the
# failure is caused by the Spark-written file itself.
m.to_pandas(columns='trip_duration')

KeyError: 'trip_duration'