In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, TimestampType, DoubleType
from pyspark.sql.types import StructField, StructType, StringType

spark = (
    
            SparkSession
            .builder
            .appName('SparkCourse') 
            .master('local[*]') 
            .config('spark.dynamicAllocation.enabled', 'false')
            .config('spark.sql.adaptive.enabled', 'false')
            .enableHiveSupport()
            .getOrCreate()
    )

sc = spark.sparkContext

spark

In [3]:
data_path = '../data/'

In [4]:
taxi_schema = StructType([
    StructField('VendorID', IntegerType(), True),
    StructField('tpep_pickup_datetime', TimestampType(), True),
    StructField('tpep_dropoff_datetime', TimestampType(), True),
    StructField('passenger_count', DoubleType(), True),
    StructField('trip_distance', DoubleType(), True),
    StructField('RatecodeID', DoubleType(), True),
    StructField('store_and_fwd_flag', StringType(), True),
    StructField('PULocationID', IntegerType(), True),
    StructField('DOLocationID', IntegerType(), True),
    StructField('payment_type', IntegerType(), True),
    StructField('fare_amount', DoubleType(), True),
    StructField('extra', DoubleType(), True),
    StructField('mta_tax', DoubleType(), True),
    StructField('tip_amount', DoubleType(), True),
    StructField('tolls_amount', DoubleType(), True),
    StructField('improvement_surcharge', DoubleType(), True),
    StructField('total_amount', DoubleType(), True),
    StructField('congestion_surcharge', DoubleType(), True),
    StructField('airport_fee', DoubleType(), True),
])

In [5]:
yellow_taxi_df = (
    spark
        .read
        .option('header', 'true')
        .schema(taxi_schema)
        .csv(data_path + 'YellowTaxis_202210.csv')
)

In [6]:
green_taxi_df = (spark
                    .read
                    .option('header', 'true')
                    .option('delimiter', '\t')
                    .csv(data_path + 'GreenTaxis_202210.csv')
    )

green_taxi_df.createOrReplaceTempView('GreenTaxis')

In [7]:
green_taxi_df.printSchema()

root
 |-- VendorId: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [8]:
yellow_taxi_df.createOrReplaceTempView('YellowTaxis')

In [9]:
output_df = spark.sql(
    'SELECT * FROM YellowTaxis WHERE PULocationID=171'
)

In [10]:
output_df.show()

[Stage 2:>                                                          (0 + 3) / 3]

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2022-10-01 11:17:23|  2022-10-01 12:08:50|            1.0|          9.4|      99.0|                 N|         171|         263|           1|       35.2|  0.0|    0.5|       0.

                                                                                

In [11]:
df_sql = spark.sql("""
    SELECT 'Yellow' AS TaxiType
          ,tpep_pickup_datetime AS PickupTime
          ,tpep_dropoff_datetime AS DropTime
          ,PULocationID AS PickupLoacationId
          ,DOLocationID AS DropLocationId

    FROM YellowTaxis
    
    UNION ALL
          
    SELECT 'Green' AS TaxiType
          ,lpep_pickup_datetime AS PickupTime
          ,lpep_dropoff_datetime AS DropTime
          ,PULocationID AS PickupLocationId
          ,DOLocationID AS DropLocationId
      FROM GreenTaxis
          """
)

In [12]:
df_sql.show(10, truncate=False)

+--------+-------------------+-------------------+-----------------+--------------+
|TaxiType|PickupTime         |DropTime           |PickupLoacationId|DropLocationId|
+--------+-------------------+-------------------+-----------------+--------------+
|Yellow  |2022-10-01 03:03:41|2022-10-01 03:18:39|249              |107           |
|Yellow  |2022-10-01 03:14:30|2022-10-01 03:19:48|151              |238           |
|Yellow  |2022-10-01 03:27:13|2022-10-01 03:37:41|238              |166           |
|Yellow  |2022-10-01 03:32:53|2022-10-01 03:38:55|142              |239           |
|Yellow  |2022-10-01 03:44:55|2022-10-01 03:50:21|238              |166           |
|Yellow  |2022-10-01 03:22:52|2022-10-01 03:52:14|186              |41            |
|Yellow  |2022-10-01 03:33:19|2022-10-01 03:44:51|162              |145           |
|Yellow  |2022-10-01 03:02:42|2022-10-01 03:50:01|100              |22            |
|Yellow  |2022-10-01 03:06:35|2022-10-01 03:24:38|138              |112     

In [13]:
taxi_zones_schema = 'Location INT, Borough STRING, Zone STRING, ServiceZone STRING'

taxi_zones_df = (
    spark
        .read
        .schema(taxi_zones_schema)
        .csv(data_path + 'TaxiZones.csv')
)

taxi_zones_df.createOrReplaceGlobalTempView('TaxiZones')

taxi_zones_df.show()

23/08/19 21:10:32 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/08/19 21:10:32 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/08/19 21:10:40 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/08/19 21:10:40 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore raddy@127.0.1.1
23/08/19 21:10:40 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
23/08/19 21:10:41 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+--------+-------------+--------------------+-----------+
|Location|      Borough|                Zone|ServiceZone|
+--------+-------------+--------------------+-----------+
|       1|          EWR|      Newark Airport|        EWR|
|       2|       Queens|         Jamaica Bay|  Boro Zone|
|       3|        Bronx|Allerton/Pelham G...|  Boro Zone|
|       4|    Manhattan|       Alphabet City|Yellow Zone|
|       5|Staten Island|       Arden Heights|  Boro Zone|
|       6|Staten Island|Arrochar/Fort Wad...|  Boro Zone|
|       7|       Queens|             Astoria|  Boro Zone|
|       8|       Queens|        Astoria Park|  Boro Zone|
|       9|       Queens|          Auburndale|  Boro Zone|
|      10|       Queens|        Baisley Park|  Boro Zone|
|      11|     Brooklyn|          Bath Beach|  Boro Zone|
|      12|    Manhattan|        Battery Park|Yellow Zone|
|      13|    Manhattan|   Battery Park City|Yellow Zone|
|      14|     Brooklyn|           Bay Ridge|  Boro Zone|
|      15|    

In [14]:
spark.sql(
    """
    SELECT Borough, TaxiType, COUNT(*) AS TotalTrips
    FROM global_temp.TaxiZones
    LEFT JOIN
    (
        SELECT 'Yellow' AS TaxiType, PULocationID FROM YellowTaxis
        UNION ALL
        SELECT 'Green' AS TaxiType, PULocationID FROM YellowTaxis
    ) AllTaxis
    ON AllTaxis.PULocationID = TaxiZones.Location
    GROUP BY Borough, TaxiType
    ORDER BY Borough, TaxiType

    """
).show()



+-------------+--------+----------+
|      Borough|TaxiType|TotalTrips|
+-------------+--------+----------+
|        Bronx|   Green|      4511|
|        Bronx|  Yellow|      4511|
|     Brooklyn|   Green|     28089|
|     Brooklyn|  Yellow|     28089|
|          EWR|   Green|      1157|
|          EWR|  Yellow|      1157|
|    Manhattan|    null|         2|
|    Manhattan|   Green|   3250695|
|    Manhattan|  Yellow|   3250695|
|       Queens|    null|         1|
|       Queens|   Green|    333922|
|       Queens|  Yellow|    333922|
|Staten Island|    null|         2|
|Staten Island|   Green|       303|
|Staten Island|  Yellow|       303|
|      Unknown|   Green|     56735|
|      Unknown|  Yellow|     56735|
+-------------+--------+----------+



                                                                                

In [16]:
spark.sql("""

    show databases
""").show()

+---------+
|namespace|
+---------+
|  default|
+---------+




CREATE SCHEMA:

prod - manually  
test - infer(dynamic) 

    (StructType([\
        StructField('name', Type(), Nullable),\
        ....\
    ]))



In [18]:
yellow_taxi_df.printSchema()


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [19]:
taxi_zones_df.printSchema()

root
 |-- Location: integer (nullable = true)
 |-- Borough: string (nullable = true)
 |-- Zone: string (nullable = true)
 |-- ServiceZone: string (nullable = true)



In [21]:
joined = (
    yellow_taxi_df.join(taxi_zones_df, 
        yellow_taxi_df.PULocationID == taxi_zones_df.Location,
        'inner')
)

In [24]:
16.99/30

0.5663333333333332

In [26]:
35.97/48

0.749375

In [23]:
joined.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+--------+---------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|Location|  Borough|                Zone|ServiceZone|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+--------+---------+--------------------+-----------+
|       1| 2022-10-01 03:03:41|

In [28]:
data = [
    ['Sales', 'Anna' ,8000],
    ['Marketing', 'Steve', 9500],
    ['Tech', 'Christina', 9000],
    ['Marketing', 'Neha', 10500],
    ['Tech', 'Kari', 10000],
    ['Sales', 'Ivan', 10000],
    ['Tech', 'Mohit', 8000],
]
    

#RDD
employees_rdd = sc.parallelize(data)

In [29]:
#DF
employees_df =  employees_rdd.toDF(['department', 'employee', 'salary'])

                                                                                

In [30]:
employees_df.show()

+----------+---------+------+
|department| employee|salary|
+----------+---------+------+
|     Sales|     Anna|  8000|
| Marketing|    Steve|  9500|
|      Tech|Christina|  9000|
| Marketing|     Neha| 10500|
|      Tech|     Kari| 10000|
|     Sales|     Ivan| 10000|
|      Tech|    Mohit|  8000|
+----------+---------+------+

