In [1]:

import os


import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [2]:
spark = SparkSession.builder.master("local[*]") \
    .appName("testing") \
    .config("spark.driver.extraClassPath", "C:\\my_sql_jar\\mysql-connector-java-8.0.26.jar") \
    .getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x00000214F8D5B670>


In [3]:
product_data = [
(1,"iphone","01-01-2023",1500000),
(2,"samsung","01-01-2023",1100000),
(3,"oneplus","01-01-2023",1100000),
(1,"iphone","01-02-2023",1300000),
(2,"samsung","01-02-2023",1120000),
(3,"oneplus","01-02-2023",1120000),
(1,"iphone","01-03-2023",1600000),
(2,"samsung","01-03-2023",1080000),
(3,"oneplus","01-03-2023",1160000),
(1,"iphone","01-04-2023",1700000),
(2,"samsung","01-04-2023",1800000),
(3,"oneplus","01-04-2023",1170000),
(1,"iphone","01-05-2023",1200000),
(2,"samsung","01-05-2023",980000),
(3,"oneplus","01-05-2023",1175000),
(1,"iphone","01-06-2023",1100000),
(2,"samsung","01-06-2023",1100000),
(3,"oneplus","01-06-2023",1200000)
]

prod_schema = ['product_id', 'product_name', 'sales_date', 'sales']

product_df = spark.createDataFrame(data = product_data, schema = prod_schema)

In [4]:
product_df.show()

+----------+------------+----------+-------+
|product_id|product_name|sales_date|  sales|
+----------+------------+----------+-------+
|         1|      iphone|01-01-2023|1500000|
|         2|     samsung|01-01-2023|1100000|
|         3|     oneplus|01-01-2023|1100000|
|         1|      iphone|01-02-2023|1300000|
|         2|     samsung|01-02-2023|1120000|
|         3|     oneplus|01-02-2023|1120000|
|         1|      iphone|01-03-2023|1600000|
|         2|     samsung|01-03-2023|1080000|
|         3|     oneplus|01-03-2023|1160000|
|         1|      iphone|01-04-2023|1700000|
|         2|     samsung|01-04-2023|1800000|
|         3|     oneplus|01-04-2023|1170000|
|         1|      iphone|01-05-2023|1200000|
|         2|     samsung|01-05-2023| 980000|
|         3|     oneplus|01-05-2023|1175000|
|         1|      iphone|01-06-2023|1100000|
|         2|     samsung|01-06-2023|1100000|
|         3|     oneplus|01-06-2023|1200000|
+----------+------------+----------+-------+



In [5]:
window = Window.partitionBy("product_id").orderBy("sales_date")

product_df.withColumn("previous_sales", lag(col("sales")).over(window)).show()



+----------+------------+----------+-------+--------------+
|product_id|product_name|sales_date|  sales|previous_sales|
+----------+------------+----------+-------+--------------+
|         1|      iphone|01-01-2023|1500000|          null|
|         1|      iphone|01-02-2023|1300000|       1500000|
|         1|      iphone|01-03-2023|1600000|       1300000|
|         1|      iphone|01-04-2023|1700000|       1600000|
|         1|      iphone|01-05-2023|1200000|       1700000|
|         1|      iphone|01-06-2023|1100000|       1200000|
|         2|     samsung|01-01-2023|1100000|          null|
|         2|     samsung|01-02-2023|1120000|       1100000|
|         2|     samsung|01-03-2023|1080000|       1120000|
|         2|     samsung|01-04-2023|1800000|       1080000|
|         2|     samsung|01-05-2023| 980000|       1800000|
|         2|     samsung|01-06-2023|1100000|        980000|
|         3|     oneplus|01-01-2023|1100000|          null|
|         3|     oneplus|01-02-2023|1120

In [6]:
window = Window.partitionBy("product_id").orderBy("sales_date")

product_df.withColumn("next_sales", lead(col("sales")).over(window)).show()



+----------+------------+----------+-------+----------+
|product_id|product_name|sales_date|  sales|next_sales|
+----------+------------+----------+-------+----------+
|         1|      iphone|01-01-2023|1500000|   1300000|
|         1|      iphone|01-02-2023|1300000|   1600000|
|         1|      iphone|01-03-2023|1600000|   1700000|
|         1|      iphone|01-04-2023|1700000|   1200000|
|         1|      iphone|01-05-2023|1200000|   1100000|
|         1|      iphone|01-06-2023|1100000|      null|
|         2|     samsung|01-01-2023|1100000|   1120000|
|         2|     samsung|01-02-2023|1120000|   1080000|
|         2|     samsung|01-03-2023|1080000|   1800000|
|         2|     samsung|01-04-2023|1800000|    980000|
|         2|     samsung|01-05-2023| 980000|   1100000|
|         2|     samsung|01-06-2023|1100000|      null|
|         3|     oneplus|01-01-2023|1100000|   1120000|
|         3|     oneplus|01-02-2023|1120000|   1160000|
|         3|     oneplus|01-03-2023|1160000|   1

In [9]:
window = Window.partitionBy("product_id").orderBy("sales_date")

product_df.withColumn("previous_sales", col("sales") - lag(col("sales"), 1, 0).over(window)).show()



+----------+------------+----------+-------+--------------+
|product_id|product_name|sales_date|  sales|previous_sales|
+----------+------------+----------+-------+--------------+
|         1|      iphone|01-01-2023|1500000|       1500000|
|         1|      iphone|01-02-2023|1300000|       -200000|
|         1|      iphone|01-03-2023|1600000|        300000|
|         1|      iphone|01-04-2023|1700000|        100000|
|         1|      iphone|01-05-2023|1200000|       -500000|
|         1|      iphone|01-06-2023|1100000|       -100000|
|         2|     samsung|01-01-2023|1100000|       1100000|
|         2|     samsung|01-02-2023|1120000|         20000|
|         2|     samsung|01-03-2023|1080000|        -40000|
|         2|     samsung|01-04-2023|1800000|        720000|
|         2|     samsung|01-05-2023| 980000|       -820000|
|         2|     samsung|01-06-2023|1100000|        120000|
|         3|     oneplus|01-01-2023|1100000|       1100000|
|         3|     oneplus|01-02-2023|1120