In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType
from pyspark.sql.functions import col, count

# Start (or reuse) a Spark session named "app" that runs locally on three threads.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[3]")
    .getOrCreate()
)

In [0]:
# Visits table: both columns are non-nullable integers.
schema = StructType([
    StructField(field_name, IntegerType(), nullable=False)
    for field_name in ("visit_id", "customer_id")
])

# One (visit_id, customer_id) tuple per visit.
data = [
    (1, 23),
    (2, 9),
    (4, 30),
    (5, 54),
    (6, 96),
    (7, 54),
    (8, 54),
]

visits = spark.createDataFrame(data, schema=schema)
visits.show()

+--------+-----------+
|visit_id|customer_id|
+--------+-----------+
|       1|         23|
|       2|          9|
|       4|         30|
|       5|         54|
|       6|         96|
|       7|         54|
|       8|         54|
+--------+-----------+



In [0]:
# Transactions table: all three columns are non-nullable integers.
schema = StructType([
    StructField(field_name, IntegerType(), nullable=False)
    for field_name in ("transaction_id", "visit_id", "amount")
])

# One (transaction_id, visit_id, amount) tuple per transaction.
data = [
    (2, 5, 310),
    (3, 5, 300),
    (9, 5, 200),
    (12, 1, 910),
    (13, 2, 970),
]

transactions = spark.createDataFrame(data, schema=schema)
transactions.show()

+--------------+--------+------+
|transaction_id|visit_id|amount|
+--------------+--------+------+
|             2|       5|   310|
|             3|       5|   300|
|             9|       5|   200|
|            12|       1|   910|
|            13|       2|   970|
+--------------+--------+------+



In [0]:
# Problem: find the IDs of the customers who visited without making any
# transactions, and the number of such visits for each of them.
# The result may be returned in any order.
#
# A left anti join keeps exactly the visits that have no matching row in
# `transactions`. Unlike a left join followed by an `amount IS NULL` filter,
# it does not rely on a payload column being null only for unmatched rows,
# and it never builds one joined row per transaction before filtering.
(
    visits
    .join(transactions, on="visit_id", how="left_anti")
    .groupBy("customer_id")
    .agg(count("customer_id").alias("count_no_trans"))
    .show()
)

+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         30|             1|
|         96|             1|
|         54|             2|
+-----------+--------------+



In [0]:
# Same question as the DataFrame version, answered with Spark SQL.
# Register both DataFrames as temporary views so the query can refer to them.
visits.createOrReplaceTempView("v")
transactions.createOrReplaceTempView("t")

# LEFT ANTI JOIN returns only the visits with no matching transaction, so the
# query no longer depends on `t.amount` being null for unmatched rows.
# Grouping by the column name instead of the ordinal `1` keeps the query valid
# if the select list is ever reordered.
spark.sql(
    """
    select v.customer_id, count(*) as count_no_trans
    from v
    left anti join t on v.visit_id = t.visit_id
    group by v.customer_id
    """
).show()

+-----------+--------------+
|customer_id|count_no_trans|
+-----------+--------------+
|         30|             1|
|         96|             1|
|         54|             2|
+-----------+--------------+



In [0]:
# Stop the Spark session created at the top of the notebook now that all results have been shown.
spark.stop()