In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

In [2]:
csvFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv"

In [3]:
flightSchema = StructType([StructField("date", StringType(), True),
                                   StructField("delay", IntegerType(), True),
                                   StructField("distance", IntegerType(),True),
                                   StructField("origin", StringType(), True),
                                   StructField("destination", StringType(), True)])

In [4]:
df =(spark.read.format("csv")
.schema(flightSchema)
.option("header","true")
.load(csvFile))

In [5]:
df.createOrReplaceTempView("us_delay_flights_tbl")

In [6]:
spark.sql("""SELECT *
            FROM us_delay_flights_tbl""").show(10,False)

+--------+-----+--------+------+-----------+
|date    |delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|01011245|6    |602     |ABE   |ATL        |
|01020600|-8   |369     |ABE   |DTW        |
|01021245|-2   |602     |ABE   |ATL        |
|01020605|-4   |602     |ABE   |ATL        |
|01031245|-4   |602     |ABE   |ATL        |
|01030605|0    |602     |ABE   |ATL        |
|01041243|10   |602     |ABE   |ATL        |
|01040605|28   |602     |ABE   |ATL        |
|01051245|88   |602     |ABE   |ATL        |
|01050605|9    |602     |ABE   |ATL        |
+--------+-----+--------+------+-----------+
only showing top 10 rows



In [7]:
spark.sql("""SELECT distance,origin,destination 
            FROM us_delay_flights_tbl
            WHERE distance >1000
            ORDER BY distance DESC""").show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [34]:
(df.select("distance", "origin", "destination")
.filter(col("distance")>1000)
.orderBy(desc("distance"))
.show(10))

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



In [24]:
spark.sql("""SELECT delay,date,origin,destination 
            FROM us_delay_flights_tbl
            WHERE origin='SFO' and destination='ORD' and delay > 120 
            ORDER BY delay DESC""").show(10)

+-----+--------+------+-----------+
|delay|    date|origin|destination|
+-----+--------+------+-----------+
| 1638|02190925|   SFO|        ORD|
|  396|01031755|   SFO|        ORD|
|  326|01022330|   SFO|        ORD|
|  320|01051205|   SFO|        ORD|
|  297|01190925|   SFO|        ORD|
|  296|02171115|   SFO|        ORD|
|  279|01071040|   SFO|        ORD|
|  274|01051550|   SFO|        ORD|
|  266|03120730|   SFO|        ORD|
|  258|01261104|   SFO|        ORD|
+-----+--------+------+-----------+
only showing top 10 rows



In [35]:
(df
.select("delay", "date", "origin", "destination")
.where((col("origin")=="SFO") & (col("destination")=="ORD") & (col("delay")>120))
.orderBy("delay")
.show(10))

+-----+--------+------+-----------+
|delay|    date|origin|destination|
+-----+--------+------+-----------+
|  122|01011237|   SFO|        ORD|
|  123|02131320|   SFO|        ORD|
|  123|03311600|   SFO|        ORD|
|  124|01011410|   SFO|        ORD|
|  125|03051115|   SFO|        ORD|
|  126|01041205|   SFO|        ORD|
|  131|01031550|   SFO|        ORD|
|  134|02021115|   SFO|        ORD|
|  137|02081104|   SFO|        ORD|
|  139|03311810|   SFO|        ORD|
+-----+--------+------+-----------+
only showing top 10 rows



In [38]:
from pyspark.sql.functions import *
new_df=df.withColumn("DateTime", to_timestamp(col("date"), "MMddHHmm")).drop("date")

In [39]:
new_df.createOrReplaceTempView("us_delay_flights_tbl")

In [40]:
spark.sql("""SELECT day(DateTime) Day,month(DateTime) Month, count(*) Count
            FROM us_delay_flights_tbl
            WHERE origin='SFO' and destination='ORD' and delay > 120 
            Group by month(DateTime), day(DateTime)
            ORDER BY count(*) DESC""").show(10)

+---+-----+-----+
|Day|Month|Count|
+---+-----+-----+
| 31|    3|    4|
|  2|    1|    4|
|  3|    1|    4|
|  9|    2|    3|
| 12|    3|    3|
| 26|    3|    2|
| 17|    3|    2|
| 27|    2|    2|
|  1|    1|    2|
|  5|    1|    2|
+---+-----+-----+
only showing top 10 rows



In [50]:
(new_df
.where((col("origin")=="SFO") & (col("destination")=="ORD") & (col("delay")>120))
.groupBy(month(col("DateTime")), dayofmonth(col("DateTime")))
.count()
.orderBy(desc("count")).show(10))

+---------------+--------------------+-----+
|month(DateTime)|dayofmonth(DateTime)|count|
+---------------+--------------------+-----+
|              1|                   3|    4|
|              1|                   2|    4|
|              3|                  31|    4|
|              3|                  12|    3|
|              2|                   9|    3|
|              3|                  26|    2|
|              3|                  17|    2|
|              1|                   5|    2|
|              1|                  30|    2|
|              1|                   1|    2|
+---------------+--------------------+-----+
only showing top 10 rows



In [38]:
spark.sql("""SELECT delay, origin, destination,
             CASE
                 WHEN delay >= 360 THEN 'Very Long Delays'
                 WHEN delay >= 120 AND delay < 360 THEN 'Long Delays'
                 WHEN delay >= 60 AND delay < 120 THEN 'Short Delays'
                 WHEN delay >= 0 AND delay < 60 THEN 'Tolerable Delays'
                 WHEN delay = 0 THEN 'No Delays'
                 ELSE 'Early'
             END AS Flight_Delays
             FROM us_delay_flights_tbl
             ORDER BY origin, delay DESC""").show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



In [66]:
(df
.select(col("delay"), col("origin"), col("destination"),
                 when(col("delay") >= 360,"Very Long Delays")
                 .when((col("delay") >= 120) & (col("delay") < 360),"Long Delays")
                 .when((col("delay") >= 60) & (col("delay") < 120),"Short Delays")
                 .when((col("delay")>= 0) & (col("delay") < 60),"Tolerable Delays")
                 .when(col("delay") == 0, "No Delays")
                 .otherwise("Early")
                 .alias("Flight_Delays"))
.orderBy(asc("origin"),desc("delay"))
.show(10))

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



In [1]:
spark.sql("CREATE DATABASE learn_spark_db")
spark.sql("USE learn_spark_db")

DataFrame[]

In [2]:
spark.sql("CREATE TABLE managed_us_delay_flights_tbl (date STRING, delay INT, distance INT, origin STRING, destination STRING)")

DataFrame[]

In [8]:
spark.sql("""CREATE TABLE us_delay_flights_tbl(date STRING, delay INT, distance INT, origin STRING, destination STRING)
             USING csv OPTIONS (PATH 'C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/departuredelays.csv')""")

DataFrame[]

In [10]:
spark.catalog.listDatabases()

[Database(name='default', description='Default Hive database', locationUri='file:/C:/Users/alvaro.romero/Big_Data/spark-warehouse'),
 Database(name='learn_spark_db', description='', locationUri='file:/C:/Users/alvaro.romero/Big_Data/spark-warehouse/learn_spark_db.db')]

In [11]:
spark.catalog.listTables()

[Table(name='managed_us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='us_delay_flights_tbl', database='learn_spark_db', description=None, tableType='EXTERNAL', isTemporary=False)]

In [12]:
spark.catalog.listColumns("us_delay_flights_tbl")

[Column(name='date', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='delay', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='distance', description=None, dataType='int', nullable=True, isPartition=False, isBucket=False),
 Column(name='origin', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False),
 Column(name='destination', description=None, dataType='string', nullable=True, isPartition=False, isBucket=False)]

In [13]:
df=spark.read.format("parquet").load("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/Parquet/*")
df.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- SupervisorDistrict: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Location: string (nullable =

In [14]:
df2=spark.read.format("json").load("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/JSON/*")
df2.printSchema()

root
 |-- ALSUnit: boolean (nullable = true)
 |-- Address: string (nullable = true)
 |-- AvailableDtTS: string (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- CallNumber: long (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- City: string (nullable = true)
 |-- FinalPriority: long (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- IncidentDate: string (nullable = true)
 |-- IncidentNumber: long (nullable = true)
 |-- Location: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- NumAlarms: long (nullable = true)
 |-- OnWatchDate: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- ResponseDelayedMins: double (nullable = true)
 |-- RowID: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Supervi

In [35]:
df3=(spark.read.format("csv")
.option("inferSchema", "true")
.option("mode","PERMISSIVE").load("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/CSV/*"))
df3.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: string (nullable = true)
 |-- _c9: string (nullable = true)
 |-- _c10: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: boolean (nullable = true)
 |-- _c15: string (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: string (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: string (nullable = true)
 |-- _c20: string (nullable = true)
 |-- _c21: string (nullable = true)
 |-- _c22: string (nullable = true)
 |-- _c23: string (nullable = true)
 |-- _c24: double (nullable = true)
 |-- _c25: timestamp (nullable = true)
 |-- _c26: timestamp (nullable = true)
 |-- _c27: timestamp

In [16]:
df4=spark.read.format("avro").load("C:/Users/alvaro.romero/Big_Data/Ejercicios_Spark/Guardar_Tablas2/AVRO/*")
df4.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 |-- SupervisorDistrict: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Location: string (nullable =

In [51]:
parquetFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet"
jsonFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/json/*"
csvFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*"
orcFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
avroFile="C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
schema= "DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count INT"

In [57]:
df = (spark
.read
.format("parquet")
.option("path", parquetFile)
.load())

In [59]:
df2 = spark.read.parquet(parquetFile)

In [61]:
df.show(10)
df2.show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equatorial Guinea|      United States|    1|
|    United States|          Singapore|   25|
|    United States|            Grenada|   54|
|       Costa Rica|      United States|  477|
|          Senegal|      United States|   29|
|    United States|   Marshall Islands|   44|
+-----------------+-------------------+-----+
only showing top 10 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
|    United States|              India|   69|
|            Egypt|      United States|   24|
|Equator

In [62]:
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
            USING parquet
            OPTIONS ( path "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/parquet/2010-summary.parquet")""")

DataFrame[]

In [64]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [65]:
 df = (spark
.read
.format("json")
.option("path", jsonFile)
.load())

In [67]:
df.show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
|United States    |Singapore          |1    |
|United States    |Grenada            |62   |
|Costa Rica       |United States      |588  |
|Senegal          |United States      |40   |
|Moldova          |United States      |1    |
+-----------------+-------------------+-----+
only showing top 10 rows



In [68]:
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
            USING json
            OPTIONS ( path "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/json/*")""")

DataFrame[]

In [69]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India              |62   |
|United States    |Singapore          |1    |
|United States    |Grenada            |62   |
|Costa Rica       |United States      |588  |
|Senegal          |United States      |40   |
|Moldova          |United States      |1    |
+-----------------+-------------------+-----+
only showing top 10 rows



In [70]:
df = (spark
.read
.format("csv")
.option("header","true")
.schema(schema)
.option("mode","FAILFAST")
.option("nullValue","")
.option("path",csvFile)
.load())

In [71]:
df.show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [73]:
df2 = (spark
  .read
  .option("header", "true")
  .option("mode", "FAILFAST") 
  .option("nullValue", "")
  .schema(schema)
  .csv(csvFile))

In [74]:
df2.show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [75]:
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
    USING csv
    OPTIONS (
      path "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/csv/*",
      header "true",
      inferSchema "true",
      mode "FAILFAST"
    )""")

DataFrame[]

In [76]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [77]:
df = (spark
.read
.format("orc")
.option("path", orcFile)
.load())

In [78]:
df.show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [79]:
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
    USING orc
    OPTIONS (
      path "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/orc/*"
    )""")

DataFrame[]

In [80]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [81]:
df = (spark.read
  .format("avro")
  .option("path", avroFile)
  .load())

In [82]:
df.show(10,False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [83]:
spark.sql("""CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl
    USING avro
    OPTIONS (
      path "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/flights/summary-data/avro/*"
    )""")

DataFrame[]

In [84]:
spark.sql("SELECT * FROM us_delay_flights_tbl").show(10, False)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |1    |
|United States    |Ireland            |264  |
|United States    |India              |69   |
|Egypt            |United States      |24   |
|Equatorial Guinea|United States      |1    |
|United States    |Singapore          |25   |
|United States    |Grenada            |54   |
|Costa Rica       |United States      |477  |
|Senegal          |United States      |29   |
|United States    |Marshall Islands   |44   |
+-----------------+-------------------+-----+
only showing top 10 rows



In [85]:
from pyspark.ml import image

imageDir = "C:/Users/alvaro.romero/Big_Data/LearningSparkV2-master/databricks-datasets/learning-spark-v2/cctvVideos/train_images/"
imagesDF = spark.read.format("image").load(imageDir)

imagesDF.printSchema
imagesDF.select("image.height", "image.width", "image.nChannels", "image.mode", "label").show(5,False)

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |1    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
|288   |384  |3        |16  |0    |
+------+-----+---------+----+-----+
only showing top 5 rows



In [87]:
binaryFilesDF = (spark.read.format("binaryFile")
.option("pathGlobFilter", "*.jpg")
.load(imageDir))

binaryFilesDF.show(5)

+--------------------+-------------------+------+--------------------+-----+
|                path|   modificationTime|length|             content|label|
+--------------------+-------------------+------+--------------------+-----+
|file:/C:/Users/al...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|    1|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|    0|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|    0|
+--------------------+-------------------+------+--------------------+-----+
only showing top 5 rows



In [88]:
binaryFilesDF = (spark.read.format("binaryFile")
  .option("pathGlobFilter", "*.jpg")
  .option("recursiveFileLookup", "true")
  .load(imageDir))
binaryFilesDF.show(5)

+--------------------+-------------------+------+--------------------+
|                path|   modificationTime|length|             content|
+--------------------+-------------------+------+--------------------+
|file:/C:/Users/al...|2021-04-15 02:34:17| 55037|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54634|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54624|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54505|[FF D8 FF E0 00 1...|
|file:/C:/Users/al...|2021-04-15 02:34:17| 54475|[FF D8 FF E0 00 1...|
+--------------------+-------------------+------+--------------------+
only showing top 5 rows

