In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

In [2]:
spark = SparkSession.builder.master("local").getOrCreate()

21/10/17 10:12:36 WARN Utils: Your hostname, shivam-Vostro-3559 resolves to a loopback address: 127.0.1.1; using 192.168.0.104 instead (on interface wlp2s0)
21/10/17 10:12:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/10/17 10:12:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [8]:
schema = StructType([StructField("id",StringType()),
                    StructField("eventdate",StringType())])

In [9]:
rows = [Row("1","04/05/2020"),Row("2","02/03/2020"),Row("3","06/04/2021")]

In [10]:
rdd = spark.sparkContext.parallelize(rows,2)

In [11]:
rdd.collect()

[Stage 0:>                                                          (0 + 2) / 2]                                                                                

[<Row('1', '04/05/2020')>, <Row('2', '02/03/2020')>, <Row('3', '06/04/2021')>]

In [12]:
df = spark.createDataFrame(rdd,schema)

In [13]:
df.show()

+---+----------+
| id| eventdate|
+---+----------+
|  1|04/05/2020|
|  2|02/03/2020|
|  3|06/04/2021|
+---+----------+



[Stage 1:>                                                          (0 + 1) / 1]                                                                                

In [18]:
new_df = df.withColumn("eventdate",to_date("eventdate","m/d/y"))
new_df.show()

+---+----------+
| id| eventdate|
+---+----------+
|  1|2020-01-05|
|  2|2020-01-03|
|  3|2021-01-04|
+---+----------+



In [27]:
df = spark.read \
    .format('csv') \
    .option("path",r"../project/data/sample.csv") \
    .option("header","true") \
    .load()
df = df.withColumn("sequence",monotonically_increasing_id())
df = df.select("Country","state","sequence","Age")
df.show()

+--------------+-----+--------+---+
|       Country|state|sequence|Age|
+--------------+-----+--------+---+
| United States|   IL|       0| 37|
| United States|   IN|       1| 44|
|        Canada|   NA|       2| 32|
|United Kingdom|   NA|       3| 31|
| United States|   TX|       4| 31|
| United States|   TN|       5| 33|
| United States|   MI|       6| 35|
|        Canada|   NA|       7| 39|
| United States|   IL|       8| 42|
+--------------+-----+--------+---+



In [32]:
running_total_window = Window.partitionBy("Country") \
                    .orderBy("sequence") \
                    .rowsBetween(Window.unboundedPreceding,Window.currentRow)

In [33]:
df.withColumn("running total",sum("Age").over(running_total_window)).show()

                                                                                

+--------------+-----+--------+---+-------------+
|       Country|state|sequence|Age|running total|
+--------------+-----+--------+---+-------------+
| United States|   IL|       0| 37|         37.0|
| United States|   IN|       1| 44|         81.0|
| United States|   TX|       4| 31|        112.0|
| United States|   TN|       5| 33|        145.0|
| United States|   MI|       6| 35|        180.0|
| United States|   IL|       8| 42|        222.0|
|        Canada|   NA|       2| 32|         32.0|
|        Canada|   NA|       7| 39|         71.0|
|United Kingdom|   NA|       3| 31|         31.0|
+--------------+-----+--------+---+-------------+





In [34]:
window_spec = Window.partitionBy("Country").orderBy("Age")

In [35]:
df.withColumn("row_no", row_number().over(window_spec)).show()

                                                                                

+--------------+-----+--------+---+------+
|       Country|state|sequence|Age|row_no|
+--------------+-----+--------+---+------+
| United States|   TX|       4| 31|     1|
| United States|   TN|       5| 33|     2|
| United States|   MI|       6| 35|     3|
| United States|   IL|       0| 37|     4|
| United States|   IL|       8| 42|     5|
| United States|   IN|       1| 44|     6|
|        Canada|   NA|       2| 32|     1|
|        Canada|   NA|       7| 39|     2|
|United Kingdom|   NA|       3| 31|     1|
+--------------+-----+--------+---+------+



In [36]:
df.withColumn("rank",rank().over(window_spec)).show()



+--------------+-----+--------+---+----+
|       Country|state|sequence|Age|rank|
+--------------+-----+--------+---+----+
| United States|   TX|       4| 31|   1|
| United States|   TN|       5| 33|   2|
| United States|   MI|       6| 35|   3|
| United States|   IL|       0| 37|   4|
| United States|   IL|       8| 42|   5|
| United States|   IN|       1| 44|   6|
|        Canada|   NA|       2| 32|   1|
|        Canada|   NA|       7| 39|   2|
|United Kingdom|   NA|       3| 31|   1|
+--------------+-----+--------+---+----+





In [37]:
df.withColumn("dense rank",dense_rank().over(window_spec)).show()



+--------------+-----+--------+---+----------+
|       Country|state|sequence|Age|dense rank|
+--------------+-----+--------+---+----------+
| United States|   TX|       4| 31|         1|
| United States|   TN|       5| 33|         2|
| United States|   MI|       6| 35|         3|
| United States|   IL|       0| 37|         4|
| United States|   IL|       8| 42|         5|
| United States|   IN|       1| 44|         6|
|        Canada|   NA|       2| 32|         1|
|        Canada|   NA|       7| 39|         2|
|United Kingdom|   NA|       3| 31|         1|
+--------------+-----+--------+---+----------+





In [39]:
window_spec = Window.partitionBy("Country")