%md
Pyspark Coding Interview Qusetion:
==========================
#### Run this file in Databricks Azure cloud

In [0]:
# Stop SparkSession
# It will clear the history of Compute -> Cluster -> Spark UI -> Jobs and Stages
spark.stop()

#### getNumPartitions() is neither an action nor a transformation in PySpark or Apache Spark. Instead, it is a method used to retrieve the number of partitions of an RDD.

In [0]:
print("Hello")

Hello


Pyspark Coding Interview Qusetion:
==========================
1. Create df in pyspark.
2. find avg stock value on daily basis for each stock.
3. find max avg stock value of each stock.

#### 1. Create df in pyspark.
#### =============================

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
data = [("2023-01-01","APPL",150.00),\
 ("2023-01-02","AAPL",155.00),\
 ("2023-01-01","GOOG",2500.00),\
 ("2023-01-02","GOOG",2550.00),\
 ("2023-01-01","MFST",300.00),\
 ("2023-01-02","MFST",310.0)]


In [0]:
schema = ["date","stock","value"]
df = spark.createDataFrame(data=data,schema=schema)
display(df)
df.printSchema()

date,stock,value
2023-01-01,APPL,150.0
2023-01-02,AAPL,155.0
2023-01-01,GOOG,2500.0
2023-01-02,GOOG,2550.0
2023-01-01,MFST,300.0
2023-01-02,MFST,310.0


root
 |-- date: string (nullable = true)
 |-- stock: string (nullable = true)
 |-- value: double (nullable = true)



#### 2. find avg stock value on daily basis for each stock.:
====================================
#### i. Convert the "date" column from string to date type:

In [0]:
df1 = df.withColumn("date",to_date(col("date")))
display(df1)
df1.printSchema()

date,stock,value
2023-01-01,APPL,150.0
2023-01-02,AAPL,155.0
2023-01-01,GOOG,2500.0
2023-01-02,GOOG,2550.0
2023-01-01,MFST,300.0
2023-01-02,MFST,310.0


root
 |-- date: date (nullable = true)
 |-- stock: string (nullable = true)
 |-- value: double (nullable = true)



In [0]:
from pyspark.sql.functions import *
df_avg = df1.groupBy(col("date"),col("stock"))\
 .agg(avg(col("value"))\
 .alias("avg_stock_value"))\
 .orderBy(col("avg_stock_value").desc())
display(df_avg)

date,stock,avg_stock_value
2023-01-02,GOOG,2550.0
2023-01-01,GOOG,2500.0
2023-01-02,MFST,310.0
2023-01-01,MFST,300.0
2023-01-02,AAPL,155.0
2023-01-01,APPL,150.0


In [0]:
df1.createOrReplaceTempView("avg_stock_value_table")
spark.sql("select date,stock,avg(value) as avg_stock_value from avg_stock_value_table group by date,stock order by avg_stock_value desc").show(truncate=False)

+----------+-----+---------------+
|date      |stock|avg_stock_value|
+----------+-----+---------------+
|2023-01-02|GOOG |2550.0         |
|2023-01-01|GOOG |2500.0         |
|2023-01-02|MFST |310.0          |
|2023-01-01|MFST |300.0          |
|2023-01-02|AAPL |155.0          |
|2023-01-01|APPL |150.0          |
+----------+-----+---------------+



#### 3. find max avg stock value of each stock.:
#### =============================

In [0]:
df_avg_max_stock = df_avg.groupBy(col("stock"))\
 .agg(max(col("avg_stock_value"))\
 .alias("max_avg_stock"))
display(df_avg_max_stock)

stock,max_avg_stock
APPL,150.0
GOOG,2550.0
AAPL,155.0
MFST,310.0


In [0]:
df_avg.createOrReplaceTempView("avg_stock")
spark.sql("select stock,avg(avg_stock_value) as max_avg_stock from avg_stock group by stock").show(truncate=False)

+-----+-------------+
|stock|max_avg_stock|
+-----+-------------+
|APPL |150.0        |
|GOOG |2525.0       |
|AAPL |155.0        |
|MFST |305.0        |
+-----+-------------+

