
## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which lets you store data for querying inside of Databricks. This notebook assumes that the file you would like to read already exists in DBFS.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# Where the uploaded file lives in DBFS, and which reader format to use
file_location = "/FileStore/tables/2013_hard_drive.csv"
file_type = "csv"

# CSV parsing settings (handed to the Spark reader as strings)
infer_schema = "false"  # with inference off, every column is read as a string
first_row_is_header = "true"  # column names are taken from the first row
delimiter = ","

# These reader options apply to CSV files; for other file types they are ignored.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

# Preview the first ten rows, then print the column names and types.
df.limit(10).display()
df.printSchema()


_c0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,smart_9_raw,smart_10_raw,smart_11_raw,smart_12_raw,smart_13_raw,smart_15_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,smart_189_raw,smart_190_raw,smart_191_raw,smart_192_raw,smart_193_raw,smart_194_raw,smart_195_raw,smart_196_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_200_raw,smart_201_raw,smart_223_raw,smart_225_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
0,2013-04-10,MJ0351YNG9Z0XA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,4031,,,,,,,,,,,,,,,26,,,0,,,,,,,,,,,,,,
1,2013-04-10,MJ0351YNG9WJSA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,2,,,4099,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,
2,2013-04-10,MJ0351YNG9Z7LA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,3593,,,,,,,,,,,,,,,26,,,0,,,,,,,,,,,,,,
3,2013-04-10,MJ0351YNGAD37A,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,2339,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,
4,2013-04-10,MJ0351YNGABYAA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,2741,,,,,,,,,,,,,,,25,,,0,,,,,,,,,,,,,,
5,2013-04-10,MJ1311YNG7ESHA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,8723,,,,,,,,,,,,,,,20,,,0,,,,,,,,,,,,,,
6,2013-04-10,S2F0BE6T,ST1500DM003,1500301910016,0,23972424,,,,0,,,4378,,,,,,,,,,,,,,,25,,,0,,,,,,,,,,,,,,
7,2013-04-10,W1F0LRXG,ST3000DM001,3000592982016,0,5538736,,,,0,,,6297,,,,,,,,,,,,,,,25,,,8,,,,,,,,,,,,,,
8,2013-04-10,6XW099YJ,ST31500541AS,1500301910016,0,17458817,,,,0,,,24737,,,,,,,,,,,,,,,24,,,0,,,,,,,,,,,,,,
9,2013-04-10,W1F09LPX,ST3000DM001,3000592982016,0,49517448,,,,0,,,7484,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,


root
 |-- _c0: string (nullable = true)
 |-- date: string (nullable = true)
 |-- serial_number: string (nullable = true)
 |-- model: string (nullable = true)
 |-- capacity_bytes: string (nullable = true)
 |-- failure: string (nullable = true)
 |-- smart_1_raw: string (nullable = true)
 |-- smart_2_raw: string (nullable = true)
 |-- smart_3_raw: string (nullable = true)
 |-- smart_4_raw: string (nullable = true)
 |-- smart_5_raw: string (nullable = true)
 |-- smart_7_raw: string (nullable = true)
 |-- smart_8_raw: string (nullable = true)
 |-- smart_9_raw: string (nullable = true)
 |-- smart_10_raw: string (nullable = true)
 |-- smart_11_raw: string (nullable = true)
 |-- smart_12_raw: string (nullable = true)
 |-- smart_13_raw: string (nullable = true)
 |-- smart_15_raw: string (nullable = true)
 |-- smart_183_raw: string (nullable = true)
 |-- smart_184_raw: string (nullable = true)
 |-- smart_187_raw: string (nullable = true)
 |-- smart_188_raw: string (nullable = true)
 |-- smart_18

In [0]:
# Register the DataFrame under a name so that SQL cells can query it.
# The backticked name in the SQL cell of this notebook must match this value.
temp_table_name = "2013_hard_drive_csv_temp"

# Creates the temp view, replacing any existing view with the same name.
df.createOrReplaceTempView(name=temp_table_name)

In [0]:
%sql

/* Read the first ten rows of the temp view from a SQL cell.
   The view name starts with a digit, so it is quoted with backticks. */

SELECT *
FROM `2013_hard_drive_csv_temp`
LIMIT 10

_c0,date,serial_number,model,capacity_bytes,failure,smart_1_raw,smart_2_raw,smart_3_raw,smart_4_raw,smart_5_raw,smart_7_raw,smart_8_raw,smart_9_raw,smart_10_raw,smart_11_raw,smart_12_raw,smart_13_raw,smart_15_raw,smart_183_raw,smart_184_raw,smart_187_raw,smart_188_raw,smart_189_raw,smart_190_raw,smart_191_raw,smart_192_raw,smart_193_raw,smart_194_raw,smart_195_raw,smart_196_raw,smart_197_raw,smart_198_raw,smart_199_raw,smart_200_raw,smart_201_raw,smart_223_raw,smart_225_raw,smart_240_raw,smart_241_raw,smart_242_raw,smart_250_raw,smart_251_raw,smart_252_raw,smart_254_raw,smart_255_raw
0,2013-04-10,MJ0351YNG9Z0XA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,4031,,,,,,,,,,,,,,,26,,,0,,,,,,,,,,,,,,
1,2013-04-10,MJ0351YNG9WJSA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,2,,,4099,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,
2,2013-04-10,MJ0351YNG9Z7LA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,3593,,,,,,,,,,,,,,,26,,,0,,,,,,,,,,,,,,
3,2013-04-10,MJ0351YNGAD37A,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,2339,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,
4,2013-04-10,MJ0351YNGABYAA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,2741,,,,,,,,,,,,,,,25,,,0,,,,,,,,,,,,,,
5,2013-04-10,MJ1311YNG7ESHA,Hitachi HDS5C3030ALA630,3000592982016,0,0,,,,0,,,8723,,,,,,,,,,,,,,,20,,,0,,,,,,,,,,,,,,
6,2013-04-10,S2F0BE6T,ST1500DM003,1500301910016,0,23972424,,,,0,,,4378,,,,,,,,,,,,,,,25,,,0,,,,,,,,,,,,,,
7,2013-04-10,W1F0LRXG,ST3000DM001,3000592982016,0,5538736,,,,0,,,6297,,,,,,,,,,,,,,,25,,,8,,,,,,,,,,,,,,
8,2013-04-10,6XW099YJ,ST31500541AS,1500301910016,0,17458817,,,,0,,,24737,,,,,,,,,,,,,,,24,,,0,,,,,,,,,,,,,,
9,2013-04-10,W1F09LPX,ST3000DM001,3000592982016,0,49517448,,,,0,,,7484,,,,,,,,,,,,,,,29,,,0,,,,,,,,,,,,,,


In [0]:
# Make sure the BRONZE database exists before a table is saved into it.
# IF NOT EXISTS makes this cell safe to re-run.
# To start over from an empty database, first run (destructive):
#   spark.sql("DROP DATABASE BRONZE CASCADE")
spark.sql(sqlQuery="CREATE DATABASE IF NOT EXISTS BRONZE")


DataFrame[]

In [0]:
# A temp view is only available to this particular notebook. To let other users
# query the data, the DataFrame is also saved as a table: once saved, it persists
# across cluster restarts and can be queried by various users from different notebooks.
# Choose the table name below; the final statement performs the save.

# NOTE(review): legacy flag that permits creating a managed table over a
# non-empty location — confirm it is still honoured by the cluster's Spark version.
spark.conf.set(
    "spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation",
    "true",
)

permanent_table_name = "BRONZE.2013_hard_drive_csv_parquet"

# "overwrite" replaces any existing table of the same name on each run.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable(permanent_table_name)
)