In [8]:
// Spark SQL imports, pulled in up front because later cells need them
import org.apache.spark.sql._
import org.apache.spark.sql.functions._

// Prints the number of records held by each partition of `df`,
// one "* <count>" line per partition, preceded by a header line.
def printRecordsPerPartition(df:org.apache.spark.sql.Dataset[Row]):Unit = {
  println("Per-Partition Counts:")

  // Reduce every partition of the underlying RDD to a single count, then
  // collect those counts back to the driver.
  val partitionSizes = df.rdd
    .mapPartitions(rows => Iterator.single(rows.size), preservesPartitioning = true)
    .collect()

  for (size <- partitionSizes) println(s"* $size")
}

printRecordsPerPartition: (df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row])Unit


In [9]:
// Location of the pageviews-by-second Parquet data.
val parquetDir = "file:///Users/navaro/zeppelin-0.7.3-bin-all/data/pageviews-by-second"

// SparkSession (entry point) -> DataFrameReader -> DataFrame over the Parquet data.
val pagecountsEnAllDF = spark.read.parquet(parquetDir)


parquetDir = file:///Users/navaro/zeppelin-0.7.3-bin-all/data/pageviews-by-second
pagecountsEnAllDF = [timestamp: timestamp, site: string ... 1 more field]


[timestamp: timestamp, site: string ... 1 more field]

In [10]:
// cache() only marks the DataFrame as cached; the count() action is what
// actually materializes the cache.
pagecountsEnAllDF.cache().count()


7200000

In [11]:
// Print each column's name, type and nullability as a tree.
pagecountsEnAllDF.printSchema()

root
 |-- timestamp: timestamp (nullable = true)
 |-- site: string (nullable = true)
 |-- requests: integer (nullable = true)



In [12]:
// Location of the pagecounts Parquet data.
val parquetFile = "file:///Users/navaro/zeppelin-0.7.3-bin-all/data/pagecounts-parquet/"

// Read the Parquet data without supplying a schema and print the schema
// the DataFrameReader comes up with.
spark.read.parquet(parquetFile).printSchema()

root
 |-- project: string (nullable = true)
 |-- article: string (nullable = true)
 |-- requests: integer (nullable = true)
 |-- bytes_served: long (nullable = true)



parquetFile = file:///Users/navaro/zeppelin-0.7.3-bin-all/data/pagecounts-parquet/


file:///Users/navaro/zeppelin-0.7.3-bin-all/data/pagecounts-parquet/

In [14]:
import org.apache.spark.sql.types._

// Explicit schema for the pagecounts data; every column is declared non-nullable.
val parquetSchema = StructType(Seq(
  StructField("project", StringType, nullable = false),
  StructField("article", StringType, nullable = false),
  StructField("requests", IntegerType, nullable = false),
  StructField("bytes_served", LongType, nullable = false)
))

// Read the same Parquet data, this time handing the reader our schema.
// NOTE: the schema printed by this cell still reports nullable = true for
// every column, despite the non-nullable declarations above.
spark.read.schema(parquetSchema).parquet(parquetFile).printSchema()

root
 |-- project: string (nullable = true)
 |-- article: string (nullable = true)
 |-- requests: integer (nullable = true)
 |-- bytes_served: long (nullable = true)



parquetSchema = StructType(StructField(project,StringType,false), StructField(article,StringType,false), StructField(requests,IntegerType,false), StructField(bytes_served,LongType,false))


StructType(StructField(project,StringType,false), StructField(article,StringType,false), StructField(requests,IntegerType,false), StructField(bytes_served,LongType,false))

In [15]:
// DataFrame over the pagecounts Parquet data, read with the explicit schema.
val parquetDF = spark
  .read
  .schema(parquetSchema)
  .parquet(parquetFile)

// Report how many partitions back the DataFrame, then the record count of each.
val partitionCount = parquetDF.rdd.getNumPartitions
printf("Partitions: %,d%n", partitionCount)
printRecordsPerPartition(parquetDF)
println("-" * 80)

Partitions: 8
Per-Partition Counts:
* 1161100                                                                       
* 1111411
* 999869
* 724384
* 725313
* 625841
* 536227
* 386797
--------------------------------------------------------------------------------


parquetDF = [project: string, article: string ... 2 more fields]


[project: string, article: string ... 2 more fields]