In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used to build a per-user Hive warehouse path.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled Spark session running on YARN.
spark = (
    SparkSession.builder
    # NOTE(review): port '0' presumably lets Spark pick any free UI port — confirm.
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [9]:
# Shut down the active Spark session.
# NOTE(review): this cell's execution count (In [9]) is out of sequence with
# its position; run top to bottom, the cells below that use `spark` would
# execute after the session has been stopped — confirm the intended placement.
spark.stop()

In [2]:
# Load the orders CSV: first row is the header, column types are inferred.
orders = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/public/trendytech/datasets/order_data.csv")
)

In [3]:
# Preview the first five order rows.
orders.show(n=5)

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|   536378|     null|PACK OF 60 DINOSA...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|     null|PACK OF 60 PINK P...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|    84991|60 TEATIME FAIRY ...|      24|01-12-2010 9.37|     0.55|     14688|United Kingdom|
|   536378|   84519A|TOMATO CHARLIE+LO...|       6|01-12-2010 9.37|     2.95|     14688|United Kingdom|
|   536378|   85183B|CHARLIE & LOLA WA...|      48|01-12-2010 9.37|     1.25|     14688|United Kingdom|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
only showing top 5 rows



In [10]:
from pyspark.sql.functions import *
# Whole-table aggregates via column expressions: row count and mean unit price.
orders.select(
    count("*").alias("totalRecord"),
    avg("UnitPrice").alias("avgPrice"),
).show()

+-----------+----------------+
|totalRecord|        avgPrice|
+-----------+----------------+
|     541782|4.61156532331637|
+-----------+----------------+



In [14]:
# Same aggregates as the column-expression version, written as SQL expression strings.
orders.selectExpr(
    "count(*) as totalRecord",
    "avg(UnitPrice) as avgPrice",
).show()

+-----------+----------------+
|totalRecord|        avgPrice|
+-----------+----------------+
|     541782|4.61156532331637|
+-----------+----------------+



In [26]:
# Per (country, invoice): total quantity and invoice value, largest invoices first.
(
    orders
    .groupBy("Country", "InvoiceNo")
    .agg(
        sum("Quantity").alias("totalQty"),
        sum(expr("Quantity * UnitPrice")).alias("invoiceVal"),
    )
    .sort("invoiceVal", ascending=False)
    .show(n=10, truncate=False)
)

+--------------+---------+--------+------------------+
|Country       |InvoiceNo|totalQty|invoiceVal        |
+--------------+---------+--------+------------------+
|United Kingdom|581483   |80995   |168469.6          |
|United Kingdom|541431   |74215   |77183.6           |
|United Kingdom|574941   |14149   |52940.93999999999 |
|United Kingdom|576365   |13956   |50653.909999999996|
|United Kingdom|556444   |60      |38970.0           |
|United Kingdom|567423   |12572   |31698.159999999996|
|Australia     |556917   |15049   |22775.930000000008|
|United Kingdom|572209   |1920    |22206.0           |
|United Kingdom|567381   |6760    |22104.8           |
|Australia     |563614   |12196   |21880.439999999995|
+--------------+---------+--------+------------------+
only showing top 10 rows



In [2]:
# One record per student: (student_id, name, courses).
# Each course is a (subject, score, grade) tuple; the last student has no courses.
data = [
    (
        1,
        "John",
        [("Math", 85, "A"), ("Science", 92, "A"), ("English", 78, "B")],
    ),
    (
        2,
        "Emma",
        [("Math", 91, "A"), ("Science", 87, "B"), ("History", 94, "A")],
    ),
    (
        3,
        "Alex",
        [("Physics", 76, "C"), ("Chemistry", 82, "B")],
    ),
    (
        4,
        "Lisa",
        [],
    ),
]

In [3]:
# Column names for the student records, in tuple order.
schema = [
    "student_id",
    "name",
    "courses",
]

In [4]:
# Build the student DataFrame from the in-memory records and column names.
df = spark.createDataFrame(data=data, schema=schema)

In [7]:
# Display every column at full width.
df.show(truncate=False)

+----------+----+---------------------------------------------------+
|student_id|name|courses                                            |
+----------+----+---------------------------------------------------+
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|
|3         |Alex|[{Physics, 76, C}, {Chemistry, 82, B}]             |
|4         |Lisa|[]                                                 |
+----------+----+---------------------------------------------------+



In [10]:
from pyspark.sql.functions import *
# Add a `course_data` column holding one element of `courses` per output row.
df_exploded = df.withColumn(
    "course_data",
    explode(df["courses"]),
)

In [11]:
# Display the exploded rows at full width.
df_exploded.show(truncate=False)

+----------+----+---------------------------------------------------+------------------+
|student_id|name|courses                                            |course_data       |
+----------+----+---------------------------------------------------+------------------+
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{Math, 85, A}     |
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{Science, 92, A}  |
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{English, 78, B}  |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{Math, 91, A}     |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{Science, 87, B}  |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{History, 94, A}  |
|3         |Alex|[{Physics, 76, C}, {Chemistry, 82, B}]             |{Physics, 76, C}  |
|3         |Alex|[{Physics, 76, C}, {Chemistry, 82, B}]             |{Chemistry, 82, B}|
+----------+----+----

In [12]:
# Flatten the course struct: its positional fields _1/_2/_3 become
# the named columns subject/score/grade.
df_final = df_exploded.select(
    col("student_id"),
    col("name"),
    col("course_data._1").alias("subject"),
    col("course_data._2").alias("score"),
    col("course_data._3").alias("grade"),
)

In [13]:
# Display the flattened per-course rows at full width.
df_final.show(truncate=False)

+----------+----+---------+-----+-----+
|student_id|name|subject  |score|grade|
+----------+----+---------+-----+-----+
|1         |John|Math     |85   |A    |
|1         |John|Science  |92   |A    |
|1         |John|English  |78   |B    |
|2         |Emma|Math     |91   |A    |
|2         |Emma|Science  |87   |B    |
|2         |Emma|History  |94   |A    |
|3         |Alex|Physics  |76   |C    |
|3         |Alex|Chemistry|82   |B    |
+----------+----+---------+-----+-----+



In [16]:
# explode_outer variant: a student whose `courses` array is empty still
# appears in the result.
(
    df
    .withColumn("course_data", explode_outer(df["courses"]))
    .show(truncate=False)
)

+----------+----+---------------------------------------------------+------------------+
|student_id|name|courses                                            |course_data       |
+----------+----+---------------------------------------------------+------------------+
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{Math, 85, A}     |
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{Science, 92, A}  |
|1         |John|[{Math, 85, A}, {Science, 92, A}, {English, 78, B}]|{English, 78, B}  |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{Math, 91, A}     |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{Science, 87, B}  |
|2         |Emma|[{Math, 91, A}, {Science, 87, B}, {History, 94, A}]|{History, 94, A}  |
|3         |Alex|[{Physics, 76, C}, {Chemistry, 82, B}]             |{Physics, 76, C}  |
|3         |Alex|[{Physics, 76, C}, {Chemistry, 82, B}]             |{Chemistry, 82, B}|
|4         |Lisa|[]  

In [20]:
# posexplode yields two columns per array element: its index (`pos`)
# and the element itself (`course_data`).
df.select(
    "student_id",
    "name",
    posexplode("courses").alias("pos", "course_data"),
).show(truncate=False)

+----------+----+---+------------------+
|student_id|name|pos|course_data       |
+----------+----+---+------------------+
|1         |John|0  |{Math, 85, A}     |
|1         |John|1  |{Science, 92, A}  |
|1         |John|2  |{English, 78, B}  |
|2         |Emma|0  |{Math, 91, A}     |
|2         |Emma|1  |{Science, 87, B}  |
|2         |Emma|2  |{History, 94, A}  |
|3         |Alex|0  |{Physics, 76, C}  |
|3         |Alex|1  |{Chemistry, 82, B}|
+----------+----+---+------------------+



In [21]:
# One record per product: (product_id, name, values, price).
data = [
    (1, "Product A", [1, 2, 3, 5], 10.5),
    (2, "Product B", [7, 8, 9], 15.2),
    (3, "Product C", [4, 2], 8.7),
    # Edge case: empty array.
    (4, "Product D", [], 12.3),
    # Edge case: single-element array.
    (5, "Product E", [10], 9.0),
]

In [22]:
# Column names for the product records, in tuple order.
schema = [
    "product_id",
    "name",
    "values",
    "price",
]

# Build the product DataFrame from the in-memory records.
df = spark.createDataFrame(data=data, schema=schema)

In [23]:
# Display the product rows at full width.
df.show(truncate=False)

+----------+---------+------------+-----+
|product_id|name     |values      |price|
+----------+---------+------------+-----+
|1         |Product A|[1, 2, 3, 5]|10.5 |
|2         |Product B|[7, 8, 9]   |15.2 |
|3         |Product C|[4, 2]      |8.7  |
|4         |Product D|[]          |12.3 |
|5         |Product E|[10]        |9.0  |
+----------+---------+------------+-----+



In [24]:
# Add a `value` column holding one element of the `values` array per output row.
df_exploded = df.withColumn(
    "value",
    explode(df["values"]),
)

In [25]:
# Display the exploded product rows at full width.
df_exploded.show(truncate=False)

+----------+---------+------------+-----+-----+
|product_id|name     |values      |price|value|
+----------+---------+------------+-----+-----+
|1         |Product A|[1, 2, 3, 5]|10.5 |1    |
|1         |Product A|[1, 2, 3, 5]|10.5 |2    |
|1         |Product A|[1, 2, 3, 5]|10.5 |3    |
|1         |Product A|[1, 2, 3, 5]|10.5 |5    |
|2         |Product B|[7, 8, 9]   |15.2 |7    |
|2         |Product B|[7, 8, 9]   |15.2 |8    |
|2         |Product B|[7, 8, 9]   |15.2 |9    |
|3         |Product C|[4, 2]      |8.7  |4    |
|3         |Product C|[4, 2]      |8.7  |2    |
|5         |Product E|[10]        |9.0  |10   |
+----------+---------+------------+-----+-----+



In [22]:
# Load the weekly per-country invoice CSV: header row present, types inferred.
df = (
    spark.read
    .format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("/public/trendytech/datasets/windowdatamodified.csv")
)

In [14]:
# Preview the first ten rows at full width.
df.show(n=10, truncate=False)

+---------+-------+-----------+-------------+------------+
|country  |weeknum|numinvoices|totalquantity|invoicevalue|
+---------+-------+-----------+-------------+------------+
|Spain    |49     |1          |67           |174.72      |
|Germany  |48     |11         |1795         |1600.0      |
|Lithuania|48     |3          |622          |1598.06     |
|Germany  |49     |12         |1852         |1800.0      |
|Bahrain  |51     |1          |54           |205.74      |
|Iceland  |49     |1          |319          |711.79      |
|India    |51     |5          |95           |300.0       |
|Australia|50     |2          |133          |387.95      |
|Italy    |49     |1          |-2           |-17.0       |
|India    |49     |5          |1280         |3284.1      |
+---------+-------+-----------+-------------+------------+
only showing top 10 rows



In [15]:
# Running total of invoice value per country, accumulated in week order.

from pyspark.sql import *
from pyspark.sql.functions import *

# Frame: from the first row of the country's partition up to and
# including the current row.
window = (
    Window.partitionBy("country")
    .orderBy("weeknum")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Sum invoicevalue (cast to double) over that growing frame.
df_window = df.withColumn(
    "runningTotal",
    sum(col("invoicevalue").cast("double")).over(window),
)

df_window.show(n=10, truncate=False)

+-------+-------+-----------+-------------+------------+------------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|runningTotal      |
+-------+-------+-----------+-------------+------------+------------------+
|Sweden |50     |3          |3714         |2646.3      |2646.3            |
|Germany|48     |11         |1795         |1600.0      |1600.0            |
|Germany|49     |12         |1852         |1800.0      |3400.0            |
|Germany|50     |15         |1973         |1800.0      |5200.0            |
|Germany|51     |5          |1103         |1600.0      |6800.0            |
|France |48     |4          |1299         |500.0       |500.0             |
|France |49     |9          |2303         |500.0       |1000.0            |
|France |50     |6          |529          |537.32      |1537.3200000000002|
|France |51     |5          |847          |500.0       |2037.3200000000002|
|Belgium|48     |1          |528          |800.0       |800.0             |
+-------+---

In [18]:
# rank, dense_rank, row_number — compared side by side on the same window.

from pyspark.sql import *
from pyspark.sql.functions import *

# Within each country, order rows by invoice value (ascending).
window = (
    Window.partitionBy("country")
    .orderBy("invoicevalue")
)

# rank leaves gaps after ties, dense_rank does not, and row_number
# numbers the rows consecutively regardless of ties.
df_rank = (
    df
    .withColumn("rank", rank().over(window))
    .withColumn("dense_rank", dense_rank().over(window))
    .withColumn("row_number", row_number().over(window))
)

df_rank.show(n=10, truncate=False)

+-------+-------+-----------+-------------+------------+----+----------+----------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|rank|dense_rank|row_number|
+-------+-------+-----------+-------------+------------+----+----------+----------+
|Sweden |50     |3          |3714         |2646.3      |1   |1         |1         |
|Germany|48     |11         |1795         |1600.0      |1   |1         |1         |
|Germany|51     |5          |1103         |1600.0      |1   |1         |2         |
|Germany|49     |12         |1852         |1800.0      |3   |2         |3         |
|Germany|50     |15         |1973         |1800.0      |3   |2         |4         |
|France |51     |5          |847          |500.0       |1   |1         |1         |
|France |49     |9          |2303         |500.0       |1   |1         |2         |
|France |48     |4          |1299         |500.0       |1   |1         |3         |
|France |50     |6          |529          |537.32      |4   |2         |4   

In [31]:
# lead and lag: compare each week's invoice value with its neighbours.

from pyspark.sql import *
from pyspark.sql.functions import *

# Within each country, order rows by week number.
window = (
    Window.partitionBy("country")
    .orderBy("weeknum")
)

# previous_week / next_week hold the neighbouring rows' invoice values
# (null at the partition edges); invoice_diff is the change from the
# previous week, rounded to two decimals.
df_lead = (
    df
    .withColumn("previous_week", lag("invoicevalue").over(window))
    .withColumn("next_week", lead("invoicevalue").over(window))
    .withColumn("invoice_diff", round(expr("invoicevalue - previous_week"), 2))
)

df_lead.show(n=10, truncate=False)


+-------+-------+-----------+-------------+------------+-------------+---------+------------+
|country|weeknum|numinvoices|totalquantity|invoicevalue|previous_week|next_week|invoice_diff|
+-------+-------+-----------+-------------+------------+-------------+---------+------------+
|Sweden |50     |3          |3714         |2646.3      |null         |null     |null        |
|Germany|48     |11         |1795         |1600.0      |null         |1800.0   |null        |
|Germany|49     |12         |1852         |1800.0      |1600.0       |1800.0   |200.0       |
|Germany|50     |15         |1973         |1800.0      |1800.0       |1600.0   |0.0         |
|Germany|51     |5          |1103         |1600.0      |1800.0       |null     |-200.0      |
|France |48     |4          |1299         |500.0       |null         |500.0    |null        |
|France |49     |9          |2303         |500.0       |500.0        |537.32   |0.0         |
|France |50     |6          |529          |537.32      |500.

In [4]:
from pyspark.sql.functions import split
# Single-row DataFrame holding one comma-separated string.
df = spark.createDataFrame(
    data=[("Alice,Bob,Charlie",)],
    schema=["names"],
)
df.show(truncate=False)

# Split the string on commas into an array column.
expanded_df = df.select(
    split(df.names, ",").alias("split_names"),
)
expanded_df.show(truncate=False)

+-----------------+
|names            |
+-----------------+
|Alice,Bob,Charlie|
+-----------------+

+---------------------+
|split_names          |
+---------------------+
|[Alice, Bob, Charlie]|
+---------------------+



In [6]:
 # Sample text for the word-count cells: five passages. Adjacent string
 # literals inside each list element are implicitly concatenated, so `lines`
 # ends up holding five long strings.
 # NOTE(review): the first line of this cell is indented by one space; plain
 # Python would reject that as an unexpected indent — presumably the notebook
 # front-end tolerates it. Confirm before exporting this cell to a .py script.
 lines = [
            "In my younger and more vulnerable years my father gave me some advice that I've been "
            "turning over in my mind ever since. \"Whenever you feel like criticising any one,\""
            " he told me, \"just remember that all the people in this world haven't had the advantages"
            " that you've had.\"",
            "Most of the big shore places were closed now and there were hardly any lights except the "
            "shadowy, moving glow of a ferryboat across the Sound. And as the moon rose higher the "
            "inessential houses began to melt away until gradually I became aware of the old island "
            "here that flowered once for Dutch sailors' eyes--a fresh, green breast of the new world. "
            "Its vanished trees, the trees that had made way for Gatsby's house, had once pandered in "
            "whispers to the last and greatest of all human dreams; for a transitory enchanted moment "
            "man must have held his breath in the presence of this continent, compelled into an "
            "aesthetic contemplation he neither understood nor desired, face to face for the last time "
            "in history with something commensurate to his capacity for wonder.",
            "And as I sat there, brooding on the old unknown world, I thought of Gatsby's wonder when "
            "he first picked out the green light at the end of Daisy's dock. He had come a long way to "
            "this blue lawn and his dream must have seemed so close that he could hardly fail to grasp "
            "it. He did not know that it was already behind him, somewhere back in that vast obscurity "
            "beyond the city, where the dark fields of the republic rolled on under the night.",
            "Gatsby believed in the green light, the orgastic future that year by year recedes before "
            "us. It eluded us then, but that's no matter--tomorrow we will run faster, stretch out our "
            "arms farther.... And one fine morning----",
            "So we beat on, boats against the current, borne back ceaselessly into the past.      "
        ]
    

# Join all passages into one string, separated by single spaces.
full_text = " ".join(lines)

In [None]:
# One DataFrame row per passage in `lines`.
df = spark.createDataFrame([(line,) for line in lines], ["line"])
df.show(5, truncate=False)

# Drop down to the underlying RDD of Row objects for the classic word count.
rdd = df.rdd

# Word count: tokenize each passage, emit (word, 1), then sum counts per word.
# str.split() with no argument splits on runs of any whitespace and ignores
# leading/trailing whitespace. The previous split(" ") produced empty-string
# tokens for consecutive or trailing spaces (the last passage ends in several
# spaces), and those were counted as a "word".
words = (
    rdd.flatMap(lambda row: row.line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda x, y: x + y)
)

words.collect()

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|line                                                                                                                                                                                                                                

In [14]:
# Single-row DataFrame whose only column holds the whole joined text.
df = spark.createDataFrame(
    data=[(full_text,)],
    schema=["full_text"],
)
df.show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [15]:
# Expose the DataFrame's contents as an RDD of Row objects.
rdd = df.rdd

In [16]:
# Pull every Row back to the driver and display the resulting list.
rdd.collect()

[Row(full_text='In my younger and more vulnerable years my father gave me some advice that I\'ve been turning over in my mind ever since. "Whenever you feel like criticising any one," he told me, "just remember that all the people in this world haven\'t had the advantages that you\'ve had." Most of the big shore places were closed now and there were hardly any lights except the shadowy, moving glow of a ferryboat across the Sound. And as the moon rose higher the inessential houses began to melt away until gradually I became aware of the old island here that flowered once for Dutch sailors\' eyes--a fresh, green breast of the new world. Its vanished trees, the trees that had made way for Gatsby\'s house, had once pandered in whispers to the last and greatest of all human dreams; for a transitory enchanted moment man must have held his breath in the presence of this continent, compelled into an aesthetic contemplation he neither understood nor desired, face to face for the last time in h

In [None]:
# Word count over the single joined text: tokenize, emit (word, 1), then sum
# counts per word.
# str.split() with no argument splits on runs of any whitespace and ignores
# leading/trailing whitespace. The previous split(" ") produced empty-string
# tokens for consecutive or trailing spaces (the text ends in several spaces),
# and those were counted as a "word".
words = (
    rdd.flatMap(lambda row: row.full_text.split())
    .map(lambda x: (x, 1))
    .reduceByKey(lambda x, y: x + y)
)
words.collect()