In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, IntegerType, DateType,StructType,StructField
from pyspark.sql.functions import col
from datetime import datetime

# Build (or reuse, via getOrCreate) a Spark session on the "local[2]" master
# under the application name "app".
spark = (
    SparkSession.builder
    .master("local[2]")
    .appName("app")
    .getOrCreate()
)

In [0]:
# SalesPerson table: one row per salesperson.
# Each spec is (column name, Spark type, nullable); salary and commission_rate
# are the only nullable columns.
schema = StructType([
    StructField(*field_spec)
    for field_spec in (
        ("sales_id", IntegerType(), False),
        ("name", StringType(), False),
        ("salary", IntegerType(), True),
        ("commission_rate", IntegerType(), True),
        ("hire_date", DateType(), False),
    )
])

# Rows follow the schema's column order:
# (sales_id, name, salary, commission_rate, hire_date)
data = [
    (1, "John", 100000, 6, datetime(2006, 4, 1)),
    (2, "Amy", 12000, 5, datetime(2010, 5, 1)),
    (3, "Mark", 65000, 12, datetime(2008, 12, 25)),
    (4, "Pam", 25000, 25, datetime(2005, 1, 1)),
    (5, "Alex", 5000, 10, datetime(2007, 2, 3)),
]

salesPerson = spark.createDataFrame(data, schema)
salesPerson.show()


+--------+----+------+---------------+----------+
|sales_id|name|salary|commission_rate| hire_date|
+--------+----+------+---------------+----------+
|       1|John|100000|              6|2006-04-01|
|       2| Amy| 12000|              5|2010-05-01|
|       3|Mark| 65000|             12|2008-12-25|
|       4| Pam| 25000|             25|2005-01-01|
|       5|Alex|  5000|             10|2007-02-03|
+--------+----+------+---------------+----------+



In [0]:
# Company table: one row per company.
# Each spec is (column name, Spark type, nullable); only city is nullable.
schema = StructType([
    StructField(*field_spec)
    for field_spec in (
        ("com_id", IntegerType(), False),
        ("name", StringType(), False),
        ("city", StringType(), True),
    )
])

# Rows follow the schema's column order: (com_id, name, city)
data = [
    (1, "RED", "Boston"),
    (2, "ORANGE", "New York"),
    (3, "YELLOW", "Boston"),
    (4, "GREEN", "Austin"),
]

company = spark.createDataFrame(data, schema)
company.show()

+------+------+--------+
|com_id|  name|    city|
+------+------+--------+
|     1|   RED|  Boston|
|     2|ORANGE|New York|
|     3|YELLOW|  Boston|
|     4| GREEN|  Austin|
+------+------+--------+



In [0]:
# Orders table: each row carries a company id (com_id) and a salesperson id
# (sales_id).
# Each spec is (column name, Spark type, nullable); sales_id and amount are
# declared nullable.
schema = StructType([
    StructField(*field_spec)
    for field_spec in (
        ("order_id", IntegerType(), False),
        ("order_date", DateType(), False),
        ("com_id", IntegerType(), False),
        ("sales_id", IntegerType(), True),
        ("amount", IntegerType(), True),
    )
])

# Rows follow the schema's column order:
# (order_id, order_date, com_id, sales_id, amount)
data = [
    (1, datetime(2014, 1, 1), 3, 4, 10000),
    (2, datetime(2014, 2, 1), 4, 5, 5000),
    (3, datetime(2014, 3, 1), 1, 1, 50000),
    (4, datetime(2014, 4, 1), 1, 4, 25000),
]

orders = spark.createDataFrame(data, schema)
orders.show()

+--------+----------+------+--------+------+
|order_id|order_date|com_id|sales_id|amount|
+--------+----------+------+--------+------+
|       1|2014-01-01|     3|       4| 10000|
|       2|2014-02-01|     4|       5|  5000|
|       3|2014-03-01|     1|       1| 50000|
|       4|2014-04-01|     1|       4| 25000|
+--------+----------+------+--------+------+



In [0]:
# Task: find the names of all salespersons who did not have any orders related
# to the company named "RED". The result may be returned in any order.

# Step 1: sales ids that appear on at least one order placed with "RED".
# (An id shows up once per matching order, so duplicates are possible.)
sales_ids_to_skip = (
    orders.alias("o")
    .join(company.alias("c"), col("o.com_id") == col("c.com_id"), "inner")
    .where(col("c.name") == "RED")
    .select("o.sales_id")
)
sales_ids_to_skip.show()

# Step 2: a left-anti join keeps exactly the salespersons that have NO match
# in sales_ids_to_skip. This replaces the manual "left join + filter on NULL"
# pattern, never multiplies left-side rows, and is unaffected by duplicate or
# NULL ids on the right side (NULL never satisfies the equality condition).
(
    salesPerson.alias("sp")
    .join(
        sales_ids_to_skip.alias("skip"),
        col("sp.sales_id") == col("skip.sales_id"),
        "left_anti",
    )
    .select("sp.name")
    .show()
)

+--------+
|sales_id|
+--------+
|       1|
|       4|
+--------+

+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+



In [0]:
# Alternative approach: bring the (small) set of ids to the driver and filter
# with isin().
#
# orders.sales_id is nullable, so a "RED" order without a salesperson would put
# None into the collected list. `~col.isin([..., None])` then evaluates to NULL
# (not True) for every non-matching row and the filter would return nothing.
# Dropping None avoids that; the set removes duplicate ids, and sorting keeps
# the printed list deterministic.
ids_to_skip = sorted(
    {
        row.sales_id
        for row in sales_ids_to_skip.collect()
        if row.sales_id is not None
    }
)
print("ids to skip: ",ids_to_skip)

salesPerson.filter(~col("sales_id").isin(ids_to_skip)).select('name').show()

ids to skip:  [1, 4]
+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+



In [0]:
# Spark SQL version of the same query: expose the three DataFrames as temp
# views named o (orders), sp (salesPerson) and c (company).
orders.createOrReplaceTempView("o")
salesPerson.createOrReplaceTempView("sp")
company.createOrReplaceTempView("c")

# NOT EXISTS is used instead of NOT IN: o.sales_id is nullable, and a single
# NULL in a NOT IN subquery makes the predicate non-true for every row, which
# would silently return an empty result. The correlated NOT EXISTS only asks
# "is there a RED order for this salesperson?" and is not affected by NULLs.
spark.sql(
    """
    select sp.name
    from sp
    where not exists (
        select 1
        from c
        join o using (com_id)
        where c.name = 'RED'
          and o.sales_id = sp.sales_id
    )
    """
).show()


+----+
|name|
+----+
| Amy|
|Mark|
|Alex|
+----+

