### Window functions

In [1]:
from pyspark import SparkContext

In [2]:
# Reuse the already-running SparkContext when there is one. A bare
# SparkContext() raises if a context is active, which makes this cell fail
# when it is re-run or when the kernel has pre-created a context.
sc = SparkContext.getOrCreate()
sc

In [3]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook (getOrCreate reuses an existing
# session instead of building a second one).
spark = (
    SparkSession.builder
    .appName("Window functions")
    .getOrCreate()
)

### Load the dataset

The dataset is taken from https://www.kaggle.com/arminehn/disasteraccident-sources. <br />
It contains a list of Twitter users who regularly report on natural and man-made disasters, violence, or crime. The accounts may belong to journalists, news media, local fire or police departments, other local authorities, or disaster monitors.

In [4]:
# Read the accounts CSV; the first line of the file supplies the column names.
accounts = spark.read.csv(
    '../datasets/disaster_accident_crime_accounts.csv',
    header=True,
)

In [5]:
accounts.show(5)

+--------------+--------------------+--------------+--------------------+---------+---------+----------------+--------------------+--------------------+
|      category|    on twitter since|twitter handle|         profile url|followers|following|profile location|     profile lat/lon| profile description|
+--------------+--------------------+--------------+--------------------+---------+---------+----------------+--------------------+--------------------+
|          news|1:45 PM - 11 Sep ...|  abc13houston|http://www.abc13.com|   568709|     1113|     Houston, TX|29.75893,-95.3676899|ABC13 is Houston'...|
|police/traffic|3:41 AM - 19 Aug ...|   metpoliceuk|https://beta.met....|   670529|      367|          London| 51.50732,-0.1276399|London's Metropol...|
|            wx|7:00 AM - 17 Feb ...|         wsbtv|http://2wsb.tv/Ne...|   934824|     2134|     Atlanta, GA|33.74909,-84.3901799|Metro Atlanta and...|
|          news|9:22 PM - 19 Sep ...|          ABC7|     http://abc7.com|   904324

In [6]:
# Columns that the window-function examples below never use.
columns_to_drop = [
    'on twitter since',
    'profile url',
    'following',
    'profile location',
    'profile lat/lon',
    'profile description',
]

accounts = accounts.drop(*columns_to_drop)

In [7]:
from pyspark.sql.types import IntegerType

In [8]:
# The CSV is loaded without schema inference, so convert the follower count to
# an integer column before ordering and aggregating on it.
accounts = accounts.withColumn("followers", accounts.followers.cast("int"))

In [9]:
accounts.show(5)

+--------------+--------------+---------+
|      category|twitter handle|followers|
+--------------+--------------+---------+
|          news|  abc13houston|   568709|
|police/traffic|   metpoliceuk|   670529|
|            wx|         wsbtv|   934824|
|          news|          ABC7|   904324|
|            wx|         FOX59|   204408|
+--------------+--------------+---------+
only showing top 5 rows



#### Window rank function

In [10]:
import sys
from pyspark.sql.window import Window
import pyspark.sql.functions as func

In [11]:
# One partition per category; inside a partition the accounts are ordered from
# most to fewest followers.
windowSpec1 = (
    Window
    .partitionBy('category')
    .orderBy(func.desc('followers'))
)

In [12]:
# Column expression only (nothing is computed yet): the rank of each account
# within its category by follower count. Displaying it shows the generated
# window expression.
followers_rank = func.rank().over(windowSpec1)

followers_rank

Column<b'RANK() OVER (PARTITION BY category ORDER BY followers DESC NULLS LAST unspecifiedframe$())'>

In [13]:
# Attach the per-category rank to every account.
followers_rank = accounts.select(
    'twitter handle',
    'category',
    'followers',
    func.rank().over(windowSpec1).alias('rank'),
)

followers_rank.show()

+---------------+--------+---------+----+
| twitter handle|category|followers|rank|
+---------------+--------+---------+----+
|IndiaTodayFLASH|    news|   978677|   1|
|          Salon|    news|   976952|   2|
| CapitalFMKenya|    news|   967429|   3|
|CapitalOfficial|    news|   954127|   4|
|      DunyaNews|    news|   950673|   5|
|       NewsHour|    news|   948347|   6|
| chicagotribune|    news|   926799|   7|
|          STcom|    news|   907667|   8|
|           ABC7|    news|   904324|   9|
|GuardianNigeria|    news|   902789|  10|
|      60Minutes|    news|   898224|  11|
|    DailyMirror|    news|   898158|  12|
|  foxandfriends|    news|   896748|  13|
|   BuzzFeedNews|    news|   891313|  14|
|            NME|    news|   885933|  15|
|          NewsX|    news|   869350|  16|
| TheNationalUAE|    news|   854185|  17|
| NBCNightlyNews|    news|   849738|  18|
|            ajc|    news|   843790|  19|
|        SAMAATV|    news|   843254|  20|
+---------------+--------+--------

In [14]:
# Keep only the ranked accounts of the police/traffic category.
traffic = followers_rank.where(followers_rank.category == 'police/traffic')

traffic.show()

+---------------+--------------+---------+----+
| twitter handle|      category|followers|rank|
+---------------+--------------+---------+----+
|          CPBlr|police/traffic|   822032|   1|
|       Ma3Route|police/traffic|   734626|   2|
|    metpoliceuk|police/traffic|   670529|   3|
|MassStatePolice|police/traffic|   416173|   4|
|  wsdot_traffic|police/traffic|   362617|   5|
|  trafficbutter|police/traffic|   262205|   6|
|BaltimorePolice|police/traffic|   255021|   7|
|           NJSP|police/traffic|   254180|   8|
|      TrafficSA|police/traffic|   253441|   9|
|  KenyanTraffic|police/traffic|   241647|  10|
|     EWNTraffic|police/traffic|   229280|  11|
|      jeffcanoy|police/traffic|   229002|  12|
|    FastCoIdeas|police/traffic|   208623|  13|
|trafficscotland|police/traffic|   190064|  14|
|        2xTessy|police/traffic|   188600|  15|
|   DenverPolice|police/traffic|   184100|  16|
|      Venice311|police/traffic|   165389|  17|
|      Kent_999s|police/traffic|   16484

#### Window max function between rows

In [15]:
# Same partitioning and ordering as the rank example, but the frame is limited
# to two rows: the previous row (-1) and the current row.
windowSpec2 = (
    Window
    .partitionBy('category')
    .orderBy(func.desc('followers'))
    .rowsBetween(-1, Window.currentRow)
)

In [16]:
# Largest follower count inside the two-row frame defined by windowSpec2.
followers_max = func.max('followers').over(windowSpec2)

In [17]:
# Show each account next to the frame maximum computed above.
accounts.select(
    'twitter handle',
    'category',
    'followers',
    followers_max.alias("followers_max"),
).show()

+---------------+--------+---------+-------------+
| twitter handle|category|followers|followers_max|
+---------------+--------+---------+-------------+
|IndiaTodayFLASH|    news|   978677|       978677|
|          Salon|    news|   976952|       978677|
| CapitalFMKenya|    news|   967429|       976952|
|CapitalOfficial|    news|   954127|       967429|
|      DunyaNews|    news|   950673|       954127|
|       NewsHour|    news|   948347|       950673|
| chicagotribune|    news|   926799|       948347|
|          STcom|    news|   907667|       926799|
|           ABC7|    news|   904324|       907667|
|GuardianNigeria|    news|   902789|       904324|
|      60Minutes|    news|   898224|       902789|
|    DailyMirror|    news|   898158|       898224|
|  foxandfriends|    news|   896748|       898158|
|   BuzzFeedNews|    news|   891313|       896748|
|            NME|    news|   885933|       891313|
|          NewsX|    news|   869350|       885933|
| TheNationalUAE|    news|   85

#### Window function over a range: follower difference from the category leader

In [18]:
# Window for the "difference from the category leader" example.
# The frame is stated explicitly as a range covering the whole partition, so
# max(followers) over this window is the category-wide maximum by construction
# instead of relying on the implicit default frame combined with the
# descending sort order.
windowSpec3 = (
    Window
    .partitionBy(accounts['category'])
    .orderBy(accounts['followers'].desc())
    .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

In [19]:
# Gap between the maximum follower count seen through windowSpec3 and the
# account's own follower count.
followers_difference = (
    func.max('followers').over(windowSpec3) - func.col('followers')
)

In [20]:
# Show each account together with its follower gap.
accounts.select(
    'twitter handle',
    'category',
    'followers',
    followers_difference.alias("followers_difference"),
).show()

+---------------+--------+---------+--------------------+
| twitter handle|category|followers|followers_difference|
+---------------+--------+---------+--------------------+
|IndiaTodayFLASH|    news|   978677|                   0|
|          Salon|    news|   976952|                1725|
| CapitalFMKenya|    news|   967429|               11248|
|CapitalOfficial|    news|   954127|               24550|
|      DunyaNews|    news|   950673|               28004|
|       NewsHour|    news|   948347|               30330|
| chicagotribune|    news|   926799|               51878|
|          STcom|    news|   907667|               71010|
|           ABC7|    news|   904324|               74353|
|GuardianNigeria|    news|   902789|               75888|
|      60Minutes|    news|   898224|               80453|
|    DailyMirror|    news|   898158|               80519|
|  foxandfriends|    news|   896748|               81929|
|   BuzzFeedNews|    news|   891313|               87364|
|            N