# Data discovery: Load the data and run simple queries
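Download the dataset from [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page). This notebook expects the file `yellow_tripdata_2023-01.parquet` to be saved in the `../datasets/` directory.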


In [1]:
# Import library PySpark and use SparkSession
from pyspark.sql import SparkSession

## Create SparkSession 
- What is a **SparkSession**? It is the entry point to a Spark application: the working instance used to create and manage data processing (reading data, building DataFrames, running SQL).
- What is **master("local[1]")**? It sets the master URL. `local[1]` runs Spark in local mode on this machine with a single execution thread.
- What is **appName("spark")**? It sets the name of the Spark application.
- What is **getOrCreate()**? It returns the existing SparkSession if one is already running; otherwise it creates a new one from the builder's settings.

In [2]:
# Build the SparkSession (or reuse one that already exists):
# local mode, one thread, application name "spark"
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("spark")
    .getOrCreate()
)

In [None]:
# Location of the yellow-taxi trip file for 2023-01 (Parquet format)
local_file = '../datasets/yellow_tripdata_2023-01.parquet'

# Read the Parquet file and print a preview of its rows
(
    spark.read
    .parquet(local_file)
    .show()
)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2023-01-01 07:32:10|  2023-01-01 07:40:36|            1.0|         0.97|       1.0|                 N|         161|         141|           2|        9.3|  1.0|    0.5|       0.

In [None]:
# Read the taxi trip data from the Parquet file into a DataFrame named df,
# which the following cells query.
local_file = '../datasets/yellow_tripdata_2023-01.parquet'
df = spark.read.parquet(local_file)

In [None]:
# A DataFrame is like a relational table with named, typed columns.
# Print its schema to see each column's name, type and nullability.
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [None]:
# SQL equivalent of the Spark query below:
#   select VendorID, total_amount from df
#   where total_amount > 1;
# ---------------------------------------
# The resulting DataFrame is stored in df2.

df2 = (
    df
    .select('VendorID', 'total_amount')
    .where('total_amount > 1')
)

In [None]:
# SQL equivalent of the Spark query below:
#   select VendorID, total_amount from df
#   where total_amount > 1
#   limit 5;

(
    df
    .select('VendorID', 'total_amount')
    .where('total_amount > 1')
    .show(n=5)
)

+--------+------------+
|VendorID|total_amount|
+--------+------------+
|       2|        14.3|
|       2|        16.9|
|       2|        34.9|
|       1|       20.85|
|       2|       19.68|
+--------+------------+
only showing top 5 rows



In [None]:
# Register df as a temporary view named 'yellow_taxis' so that it can be
# referenced by name in SQL statements passed to spark.sql().

df.createOrReplaceTempView('yellow_taxis')

In [None]:
# Run a plain SQL statement against the 'yellow_taxis' view and
# display the first 5 matching rows.

(
    spark
    .sql('select VendorID, tpep_pickup_datetime, passenger_count from yellow_taxis where total_amount > 1 and passenger_count > 2')
    .show(n=5)
)

+--------+--------------------+---------------+
|VendorID|tpep_pickup_datetime|passenger_count|
+--------+--------------------+---------------+
|       1| 2023-01-01 07:43:37|            4.0|
|       1| 2023-01-01 07:03:36|            3.0|
|       1| 2023-01-01 07:21:49|            4.0|
|       2| 2023-01-01 07:27:16|            4.0|
|       2| 2023-01-01 07:15:13|            5.0|
+--------+--------------------+---------------+
only showing top 5 rows



In [None]:
# Important: stop the SparkSession once all work in the notebook is done.

spark.stop()