In [21]:
from pyspark.sql import SparkSession

import getpass
# Name of the OS user running this kernel; used below for the warehouse
# directory and the application name.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session with master set to 'yarn'.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .appName(f'{username} | Python - Data Processing - Overview')
    .master('yarn')
    .getOrCreate()
)

In [22]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import Window

In [23]:
# Sample rows; each tuple is (id, city, age).
data = [
    (101, 'xyz', 10),
    (102, 'xyz', 10),
    (103, 'xyz', 40),
    (104, 'pqr', 40),
    (105, 'pqr', 60),
]

In [24]:
# Turn the sample tuples into a DataFrame with named columns.
df1 = spark.createDataFrame(data, schema=['id', 'city', 'age'])

In [25]:
# Print the contents of df1 as a text table.
df1.show()

+---+----+---+
| id|city|age|
+---+----+---+
|101| xyz| 10|
|102| xyz| 10|
|103| xyz| 40|
|104| pqr| 40|
|105| pqr| 60|
+---+----+---+



___Find the ids of people whose age is greater than the average age of their city___

In [26]:
# Register df1 as the temporary view "tab1" so it can be queried with spark.sql.
df1.createOrReplaceTempView("tab1")

In [27]:
# SQL version: the inner query attaches the per-city average age to every row,
# the outer query keeps the ids whose age is above that average.
df2 = spark.sql(
    "SELECT id FROM "
    "(SELECT id, age, AVG(age) OVER(PARTITION BY city) AS avg_age FROM tab1) "
    "WHERE age > avg_age"
)

In [28]:
# Print the ids returned by the SQL query.
df2.show()

+---+
| id|
+---+
|105|
|103|
+---+



In [29]:
# DataFrame API version of the same question.
# The window groups together all rows that share the same city.
windowSpec = Window.partitionBy("city")

# Add the per-city average age next to every row.
df3 = df1.withColumn(
    "avg_age",
    avg("age").over(windowSpec),
)

df3.show()

+---+----+---+-------+
| id|city|age|avg_age|
+---+----+---+-------+
|104| pqr| 40|   50.0|
|105| pqr| 60|   50.0|
|103| xyz| 40|   20.0|
|101| xyz| 10|   20.0|
|102| xyz| 10|   20.0|
+---+----+---+-------+



In [30]:
# Keep only the rows whose age is above the city average, then project the id.
df4 = (
    df3
    .where("age > avg_age")
    .select("id")
)

df4.show()

+---+
| id|
+---+
|105|
|103|
+---+



In [31]:
# Sample rows; each tuple is (year, stock_purchse, profit).
data2 = [
    (2020, 10, 2000),
    (2021, 10, 1000),
    (2022, 20, 2000),
]

In [32]:
# Turn the yearly sample tuples into a DataFrame with named columns.
df5 = spark.createDataFrame(data2, schema=['year', 'stock_purchse', 'profit'])

In [33]:
# Print the contents of df5 as a text table.
df5.show()

+----+-------------+------+
|year|stock_purchse|profit|
+----+-------------+------+
|2020|           10|  2000|
|2021|           10|  1000|
|2022|           20|  2000|
+----+-------------+------+



___Find the years in which the previous year's profit is greater than the current year's profit___

In [34]:
# Register df5 as the temporary view "t1" so it can be queried with spark.sql.
df5.createOrReplaceTempView("t1")

# With rows ordered by ascending year, LAG(profit, 1) is the profit of the row
# before the current one; keep the years where that value exceeds the profit.
df6 = spark.sql(
    "SELECT year FROM (SELECT year, "
    "profit, LAG(profit, 1) OVER(ORDER BY YEAR) AS prev_year_prof FROM t1) "
    "WHERE prev_year_prof > profit"
)

In [35]:
# Print the years returned by the SQL query.
df6.show()

+----+
|year|
+----+
|2021|
+----+



In [36]:
# Same question answered with a descending window.
# With ORDER BY year DESC the row that FOLLOWS the current one is the previous
# calendar year, so LEAD (not LAG) is what returns the previous year's profit.
# LAG over a descending window would return the next year's profit instead and
# only matches the expected answer by coincidence on this sample data.
df7 = spark.sql(
    "SELECT year FROM (SELECT year, "
    "profit, LEAD(profit, 1) OVER(ORDER BY YEAR DESC) AS prev_year_prof FROM t1) "
    "WHERE prev_year_prof > profit"
)

df7.show()

+----+
|year|
+----+
|2021|
+----+



In [37]:
# DataFrame API version: a window ordered by ascending year.
# NOTE(review): there is no partitionBy, so the window spans the whole DataFrame.
win_spec = Window.orderBy("year")

# lag("profit", 1) picks the profit of the row one position earlier in the window.
df8 = df5.withColumn(
    "prev_year_prof",
    lag("profit", 1).over(win_spec),
)

df8.show()

+----+-------------+------+--------------+
|year|stock_purchse|profit|prev_year_prof|
+----+-------------+------+--------------+
|2020|           10|  2000|          null|
|2021|           10|  1000|          2000|
|2022|           20|  2000|          1000|
+----+-------------+------+--------------+



In [38]:
# Keep the rows where the previous year's profit exceeds the current one, then
# drop the helper column again.
df9 = (
    df8
    .where("prev_year_prof > profit")
    .drop("prev_year_prof")
)

df9.show()

+----+-------------+------+
|year|stock_purchse|profit|
+----+-------------+------+
|2021|           10|  1000|
+----+-------------+------+



In [39]:
# Sample rows; each tuple is (year, month, profit).
data3 = [
    (2020, 1, 100),
    (2020, 2, 110),
    (2020, 5, 150),
    (2020, 3, 90),
    (2020, 8, 130),
    (2021, 2, 150),
    (2021, 5, 90),
    (2021, 7, 130),
]

In [40]:
# Turn the monthly sample tuples into a DataFrame with named columns and print it.
df10 = spark.createDataFrame(data3, schema=['year', 'month', 'profit'])

df10.show()

+----+-----+------+
|year|month|profit|
+----+-----+------+
|2020|    1|   100|
|2020|    2|   110|
|2020|    5|   150|
|2020|    3|    90|
|2020|    8|   130|
|2021|    2|   150|
|2021|    5|    90|
|2021|    7|   130|
+----+-----+------+

