In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType,StringType
from pyspark.sql.functions import col,avg,round,when,sum

# Notebook-wide Spark session: local mode with two worker threads, application name "app".
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[2]")
    .getOrCreate()
)

In [0]:
# Schema of the sample table: two string columns and two integer columns,
# every one of them declared non-nullable.
schema = (
    StructType()
    .add("query_name", StringType(), False)
    .add("result", StringType(), False)
    .add("position", IntegerType(), False)
    .add("rating", IntegerType(), False)
)

# Sample rows, in column order: (query_name, result, position, rating).
data = [
    ("Dog", "Golden Retriever", 1, 5),
    ("Dog", "German Shepherd", 2, 5),
    ("Dog", "Mule", 200, 1),
    ("Cat", "Shirazi", 5, 2),
    ("Cat", "Siamese", 3, 3),
    ("Cat", "Sphynx", 7, 4),
]

# Materialise the rows as a DataFrame and print it.
query = spark.createDataFrame(data=data, schema=schema)
query.show()

+----------+----------------+--------+------+
|query_name|          result|position|rating|
+----------+----------------+--------+------+
|       Dog|Golden Retriever|       1|     5|
|       Dog| German Shepherd|       2|     5|
|       Dog|            Mule|     200|     1|
|       Cat|         Shirazi|       5|     2|
|       Cat|         Siamese|       3|     3|
|       Cat|          Sphynx|       7|     4|
+----------+----------------+--------+------+



In [0]:
# Problem statement
# -----------------
# Query quality is defined as:
#     * the average of the ratio between a query's rating and its position.
# Poor query percentage is defined as:
#     * the percentage of all queries with a rating less than 3.
# Find each query_name together with its quality and poor_query_percentage,
# both rounded to 2 decimal places. The result may be returned in any order.
#
# NOTE(review): the statement names the first metric "quality", but the
# aggregate below is emitted under the alias "ratio".

# `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), not the Python builtin, so it operates on Column expressions.
(
    query
    # Per-row ratio of rating to position.
    .withColumn("quality", col("rating") / col("position"))
    # 1 marks a row with rating below 3, 0 otherwise; its mean is the poor fraction.
    .withColumn("poor", when(col("rating") < 3, 1).otherwise(0))
    .groupBy("query_name")
    .agg(
        round(avg("quality"), 2).alias("ratio"),
        round(avg("poor") * 100, 2).alias("poor_query_percentage"),
    )
    .show()
)


+----------+-----+---------------------+
|query_name|ratio|poor_query_percentage|
+----------+-----+---------------------+
|       Dog|  2.5|                33.33|
|       Cat| 0.66|                33.33|
+----------+-----+---------------------+



In [0]:
# Same computation expressed in Spark SQL: expose the DataFrame as the temp
# view "query", then aggregate per query_name (`group by 1` refers to the
# first select column).
query.createOrReplaceTempView("query")
spark.sql(
    # Adjacent literals are concatenated into a single SQL statement.
    "select query_name, "
    "round(avg(rating/position),2) as ratio, "
    "round(avg(100*case when rating<3 then 1 else 0 end),2) as poor_query_percentage "
    "from query group by 1"
).show()

+----------+-----+---------------------+
|query_name|ratio|poor_query_percentage|
+----------+-----+---------------------+
|       Dog|  2.5|                33.33|
|       Cat| 0.66|                33.33|
+----------+-----+---------------------+



In [0]:
# Shut down the Spark session created in the first cell now that both solutions have been shown.
spark.stop()