In [19]:
# Required libraries
import sys
import datetime
import time 

from pyspark import SparkContext
from pyspark.conf import SparkConf

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as sFuncs
from pyspark.sql.window import Window

# from pyspark.mllib.clustering import KMeans, KMeansModel
# from pyspark.mllib.linalg import Vectors

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

### Note: 
In case you are using a single-node cluster, executing this cell is essential, as otherwise, SparkContext put the sc.master on 'yarn' which you don't have. The result would be you'll never see a collect() to converge.

In [84]:
# sc._conf.getAll()
sc.stop()
conf = SparkConf().setMaster('local[4]')
sc = SparkContext(conf=conf)

In [85]:
rdd = sc.parallelize([('a',7),('a',2),('b',2)])
rdd.collect()

[('a', 7), ('a', 2), ('b', 2)]

## PA4 Question 1

The last 5 presidents in our speeches collection were `reagan, bush, clinton, gwbush` and `obama` (that's how the tar.gz files of their speeches are named in Canvas).

Pairwise comparisons of the similarities in their speech collections (10 pairs) will give us a half-matrix like shown below (The symmetry in the problem formulation makes it unnecessary to compute the blank spots in the matrix):

|  | r | b | c | g | o |
| --- | --- | --- | --- | --- | --- |
| reagan |   | * | * | * | * |
| bush |   |   | * | * | * |
| clinton |   |   |   | * | * |
| gwbush |   |   |   |   | * |

Compute the similarity denoted by each asterisk and answer:

1. Using n-gram character shingles, assuming n=4, which two presidents' speeches were the most similar and which were the least similar?
2. Using n-gram word shingles, assuming n=3, which two presidents' speeches were the most similar and which were the least similar?

In [None]:
# to unpack the dataset into the current directory and then move it to gs_bucket
# NOTE that this cell needs to run once
%%bash
cd /home/BigData/PA4
tar -xcvf ./dataset.tar.gz 
gsutil -q cp -rq ./dataset/* gs://saberbf0098/presidential_speech_corpus/

gsutil ls -al gs://saberbf0098/presidential_speech_corpus/

In [22]:
# to load the documents for each president into a separate rdd
presidents = ['reagan', 'bush', 'clinton', 'gwbush', 'obama']
speech = {p:sc.textFile('gs://saberbf0098/presidential_speech_corpus/{0}/{0}_speeches_*.txt'\
                     .format(p)) for p in presidents}

In [23]:
# to create n-gram character shingles
def ngramsC(phrase, N):
    grams = [phrase.lower()[i:i+N] for i in range(len(phrase)-N+1)]
    return grams#set(grams)

def cmp_matrix(set1, set2):
    return float(len(set1&set2))/float(len(set1|set2))

In [26]:
# To test to see if the algorithm works fine, check the similarity between 'test' 
# and 'obama' documents. This should give us a near zero result. Also, a test between 'obama'
# and 'obama' should give 1.0
set1 = sc.textFile('gs://saberbf0098/test.txt').flatMap(lambda x: ngramsC(x,N)).collect()
set2 = speech['obama'].flatMap(lambda x: ngramsC(x,N)).collect()
print ("Similarity ({},{}) = {:.4f}".format('test','obama',cmp_matrix(set(set2), set(set1))))

Similarity (reagan,obama) = 0.0060


### 4-gram character shingles

In [6]:
# working on 4-gram characters
presidents = ['reagan', 'bush', 'clinton', 'gwbush', 'obama']
presidents = list(reversed(presidents))
cmp_4gram=dict()
N=4
while presidents:
    p = presidents.pop()
    set1 = speech[p].flatMap(lambda x: ngramsC(x,N)).collect()
    for q in presidents:
        set2 = speech[q].flatMap(lambda x: ngramsC(x,N)).collect()
        cmp_4gram[(p,q)] = cmp_matrix(set(set1), set(set2))
        print ("Similarity ({},{}) = {:.4f}".format(p,q,cmp_4gram[(p,q)]))
        

Similarity (reagan,obama) = 0.5601
Similarity (reagan,gwbush) = 0.5314
Similarity (reagan,clinton) = 0.5590
Similarity (reagan,bush) = 0.5461
Similarity (bush,obama) = 0.5580
Similarity (bush,gwbush) = 0.5684
Similarity (bush,clinton) = 0.5774
Similarity (clinton,obama) = 0.5788
Similarity (clinton,gwbush) = 0.5755
Similarity (gwbush,obama) = 0.5777


In [122]:
print('Least similarity btween {} speeches.'.format(min(cmp_4gram, key=cmp_4gram.get)))
print('Most similarity btween {} speeches.'.format(max(cmp_4gram, key=cmp_4gram.get)))

Least similarity btween ('reagan', 'gwbush') speeches.
Most similarity btween ('clinton', 'obama') speeches.


### 10-gram character shingles

In [27]:
# working on 10-gram characters
presidents = ['reagan', 'bush', 'clinton', 'gwbush', 'obama']
presidents = list(reversed(presidents))
cmp_10gram=dict()
N=10
while presidents:
    p = presidents.pop()
    set1 = speech[p].flatMap(lambda x: ngramsC(x,N)).collect()
    for q in presidents:
        set2 = speech[q].flatMap(lambda x: ngramsC(x,N)).collect()
        cmp_10gram[(p,q)] = cmp_matrix(set(set1), set(set2))
        print ("Similarity ({},{}) = {:.4f}".format(p,q,cmp_10gram[(p,q)]))
        

Similarity (reagan,obama) = 0.1311
Similarity (reagan,gwbush) = 0.1141
Similarity (reagan,clinton) = 0.1295
Similarity (reagan,bush) = 0.1175
Similarity (bush,obama) = 0.1141
Similarity (bush,gwbush) = 0.1133
Similarity (bush,clinton) = 0.1240
Similarity (clinton,obama) = 0.1386
Similarity (clinton,gwbush) = 0.1222
Similarity (gwbush,obama) = 0.1251


In [121]:
print('Least similarity btween {} speeches.'.format(min(cmp_10gram, key=cmp_10gram.get)))
print('Most similarity btween {} speeches.'.format(max(cmp_10gram, key=cmp_10gram.get)))

Least similarity btween ('bush', 'gwbush') speeches.
Most similarity btween ('clinton', 'obama') speeches.


### 3-gram word shignles

In [40]:
# to create n-gram word shingles
def ngramsW(phrase, N):
    grams = [tuple(phrase[i:i+N]) for i in range(len(phrase)-N+1)]
    return grams

In [41]:
# To test to see if the algorithm works fine, check the similarity between 'test' 
# and 'obama' documents. This should give us a near zero result. Also, a test between 'obama'
# and 'obama' should give 1.0
set1 = sc.textFile('gs://saberbf0098/test.txt').map(lambda x:x.split(' '))\
         .flatMap(lambda x: ngramsW(x,N)).collect()
set2 = speech['obama'].map(lambda x:x.split(' '))\
                      .flatMap(lambda x: ngramsW(x,N)).collect()
print ("Similarity ({},{}) = {:.4f}".format('test','obama',cmp_matrix(set(set1), set(set2))))

Similarity (test,obama) = 0.0000


In [42]:
presidents = ['reagan', 'bush', 'clinton', 'gwbush', 'obama']
presidents = list(reversed(presidents))
cmp_3gram=dict()
N = 3
while presidents:
    p = presidents.pop()
    set1 = speech[p].map(lambda x:x.split(' ')).flatMap(lambda x: ngramsW(x,N)).collect()
    for q in presidents:
        set2 = speech[q].map(lambda x:x.split(' ')).flatMap(lambda x: ngramsW(x,N)).collect()
        cmp_3gram[(p,q)] = cmp_matrix(set(set1), set(set2))
        print ("Similarity ({},{}) = {:.4f}".format(p,q,cmp_3gram[(p,q)]))

Similarity (reagan,obama) = 0.0405
Similarity (reagan,gwbush) = 0.0322
Similarity (reagan,clinton) = 0.0417
Similarity (reagan,bush) = 0.0375
Similarity (bush,obama) = 0.0356
Similarity (bush,gwbush) = 0.0334
Similarity (bush,clinton) = 0.0411
Similarity (clinton,obama) = 0.0464
Similarity (clinton,gwbush) = 0.0376
Similarity (gwbush,obama) = 0.0385


In [119]:
print('Least similarity btween {} spechees.'.format(min(cmp_3gram, key=cmp_3gram.get)))
print('Most similarity btween {} spechees.'.format(max(cmp_3gram, key=cmp_3gram.get)))

Least similarity btween ('reagan', 'gwbush') spechees.
Most similarity btween ('clinton', 'obama') spechees.


### Notice - this section was used to test if everything can be done without rdds

In [None]:
# without using rdds
test = sc.textFile('gs://saberbf0098/test.txt').reduce(lambda x,y:x+' '+y)
ngramsC(test,4)

In [None]:
presidents = ['reagan', 'bush', 'clinton', 'gwbush', 'obama']
presidents = list(reversed(presidents))
cmp_4gram=dict()
N=4
while presidents:
    p = presidents.pop()
    set1 = ngramsC(speech[p],N)
    for q in presidents:
        set2 = ngramsC(speech[q],N)
        cmp_4gram[(p,q)] = cmp_matrix(set(set1), set(set2))
        print ("Similarity ({},{}) = {:.4f}".format(p,q,cmp_4gram[(p,q)]))
        

In [97]:
spark = SparkSession.builder.master('local[4]').getOrCreate()
lst = [(k[1],v) for (k,v) in list(cmp_4gram.items())]
rdd = sc.parallelize(lst)#.reduce(lambda x:x)
rdd.collect()
# df = spark.createDataFrame(lst)
# df.show()

[('obama', 0.5600765337014701),
 ('gwbush', 0.5314372804298408),
 ('clinton', 0.5590224135747752),
 ('bush', 0.5460797438118487),
 ('obama', 0.55795140650544),
 ('gwbush', 0.5683723257850619),
 ('clinton', 0.5773999398134216),
 ('obama', 0.5787752432344044),
 ('gwbush', 0.5755378509832005),
 ('obama', 0.5776688020862861)]

## PA4 Question 3

This question builds on the [UCI Online Retail II dataset](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II) analysis you performed in Quiz 3. This time, however, the R, F, M values should be calculated as follows:

- Recency should be the number of days relative to year-end 2011 (Dec 31). 
- Frequency should simply be the number of transactions in the total period.
- Monetary value should be the log10 of the total dollars spent. Why log10? We use logs to flatten the range — so high-spenders don't skew the analysis.

After calculating RFM values as specified above, run K-means clustering to divide the customers into 6 clusters. How do the number of customers in these 6 clusters compare with the clusters you in Question 2 of Quiz 3?

In [123]:
# to load the data into a DataFrame
df = spark.read.load('gs://datathinks-home/online_retail_II.csv', 
                          format='csv', inferSchema=True, header=True)
df.show(10)

+-------+---------+--------------------+--------+--------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|   InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+--------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|12/1/2009 7:45| 6.95|      13085|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|12/1/2009 7:45| 6.75|      13085|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|12/1/2009 7:45| 6.75|      13085|United Kingdom|
| 489434|    22041|"RECORD FRAME 7""...|      48|12/1/2009 7:45|  2.1|      13085|United Kingdom|
| 489434|    21232|STRAWBERRY CERAMI...|      24|12/1/2009 7:45| 1.25|      13085|United Kingdom|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|12/1/2009 7:45| 1.65|      13085|United Kingdom|
| 489434|    21871| SAVE THE PLANET MUG|      24|12/1/2009 7:45| 1.25|      13085|United Kingdom|
| 489434|    21523|F

In [124]:
df.filter(df['Quantity'] <=0).count()

22950

### Data preprocessing

In [125]:
# select\rename columns
n_records_before_cleaning = df.count()
print('Number of records befor preprocessing: ', n_records_before_cleaning)
# Missing Value Imputation
# df = df.na.drop()
df = df.dropna()
print('Number of records after missing value imputation: ', df.count())

Number of records befor preprocessing:  1067371
Number of records after missing value imputation:  824364


### 2. Calcuate Monetary Value

In [126]:
def totalSpend(quantity, price):
    return int(quantity) * float(price)
udfTotalSpend = sFuncs.udf(totalSpend)

In [127]:
monDF = df.withColumn('totalSpend', udfTotalSpend('Quantity', 'Price'))\
          .groupBy('customer ID').agg(sFuncs.log10(sFuncs.sum('totalSpend')).alias('Monetary'))\
          .dropna()\
          .sort(sFuncs.desc('Monetary'))
monDF.show(10)

+-----------+-----------------+
|customer ID|         Monetary|
+-----------+-----------------+
|      18102|5.776857458307348|
|      14646|5.718785647857314|
|      14156|5.472119441193257|
|      14911|5.431763340314062|
|      17450|5.368434519929445|
|      13694|5.280636454508753|
|      17511|5.235240454598928|
|      12415|5.156153108660705|
|      16684| 5.15076334554805|
|      15061|5.134787241987954|
+-----------+-----------------+
only showing top 10 rows



### 3. Calcuate Frequency

In [128]:
frqDF = df.groupBy('customer ID')\
          .agg(sFuncs.count('invoice').alias('Frequency'))\
          .sort(sFuncs.desc('Frequency'))

frqDF.show(10)

+-----------+---------+
|customer ID|Frequency|
+-----------+---------+
|      17841|    13097|
|      14911|    11613|
|      12748|     7307|
|      14606|     6709|
|      14096|     5128|
|      15311|     4717|
|      14156|     4130|
|      14646|     3890|
|      13089|     3438|
|      16549|     3255|
+-----------+---------+
only showing top 10 rows



### 4. Calcuate Recency

In [129]:
present = datetime.date(2011, 12, 31)

def daysToPresent(date):
    return (present - date).days

udfDaysToPresent = sFuncs.udf(daysToPresent)

# below, we need to use cast() function to make sure Recency column is integer. 
# Otherwise, it will be string and sorting works based on UTF value.
recDF = df.withColumn('InvoiceDate', sFuncs.to_date(df.InvoiceDate,"MM/dd/yyyy"))\
          .groupBy('Customer ID')\
          .agg(sFuncs.max('InvoiceDate').alias('maxInvoiceDate'))\
          .withColumn('Recency', udfDaysToPresent('maxInvoiceDate').cast('int'))\
          .sort(sFuncs.desc('Recency'))
recDF.show(10)

+-----------+--------------+-------+
|Customer ID|maxInvoiceDate|Recency|
+-----------+--------------+-------+
|      17641|    2009-12-01|    760|
|      14654|    2009-12-01|    760|
|      17056|    2009-12-01|    760|
|      12636|    2009-12-01|    760|
|      17592|    2009-12-01|    760|
|      17485|    2009-12-01|    760|
|      13526|    2009-12-01|    760|
|      14980|    2009-12-02|    759|
|      15833|    2009-12-02|    759|
|      17087|    2009-12-02|    759|
+-----------+--------------+-------+
only showing top 10 rows



### 5. Number of customers in each category


In [130]:
rfm = recDF.join(frqDF, on=['Customer ID'], how='outer')\
           .join(monDF, on=['Customer ID'], how='outer')\
           .drop('maxInvoiceDate')\
           .dropna()
rfm.show(10)

+-----------+-------+---------+------------------+
|Customer ID|Recency|Frequency|          Monetary|
+-----------+-------+---------+------------------+
|      12799|    551|       15| 2.341137638740964|
|      12940|     68|      103|  2.94264785566263|
|      13285|     45|      231| 3.526932149812256|
|      13289|    745|       16| 2.488480208426273|
|      13623|     52|      303|3.3885203670539803|
|      13832|     39|       58|2.7799137998502625|
|      13840|    438|       40|2.8138477542288545|
|      14450|    202|       76|3.0524784722647733|
|      14570|    302|       68| 2.787991505131025|
|      15447|    352|       30| 2.685679052400608|
+-----------+-------+---------+------------------+
only showing top 10 rows



In [131]:
# step1 - to assemble features in a vector
vecAssembler = VectorAssembler(inputCols=["Recency", "Frequency", "Monetary"],\
                               outputCol="features",\
                               handleInvalid="skip")
vecRFM = vecAssembler.transform(rfm)
vecRFM.show(10,truncate=False)

+-----------+-------+---------+------------------+--------------------------------+
|Customer ID|Recency|Frequency|Monetary          |features                        |
+-----------+-------+---------+------------------+--------------------------------+
|12799      |551    |15       |2.341137638740964 |[551.0,15.0,2.341137638740964]  |
|12940      |68     |103      |2.94264785566263  |[68.0,103.0,2.94264785566263]   |
|13285      |45     |231      |3.526932149812256 |[45.0,231.0,3.526932149812256]  |
|13289      |745    |16       |2.488480208426273 |[745.0,16.0,2.488480208426273]  |
|13623      |52     |303      |3.3885203670539803|[52.0,303.0,3.3885203670539803] |
|13832      |39     |58       |2.7799137998502625|[39.0,58.0,2.7799137998502625]  |
|13840      |438    |40       |2.8138477542288545|[438.0,40.0,2.8138477542288545] |
|14450      |202    |76       |3.0524784722647733|[202.0,76.0,3.0524784722647733] |
|14570      |302    |68       |2.787991505131025 |[302.0,68.0,2.787991505131

In [132]:
# step2 - to fit K-means model
kmeans = KMeans(k=6, seed=1)
model = kmeans.fit(vecRFM.select('features'))

In [133]:
# step3 - do the prediction
transformed = model.transform(vecRFM)
transformed.show(10) 

+-----------+-------+---------+------------------+--------------------+----------+
|Customer ID|Recency|Frequency|          Monetary|            features|prediction|
+-----------+-------+---------+------------------+--------------------+----------+
|      12799|    551|       15| 2.341137638740964|[551.0,15.0,2.341...|         0|
|      12940|     68|      103|  2.94264785566263|[68.0,103.0,2.942...|         4|
|      13285|     45|      231| 3.526932149812256|[45.0,231.0,3.526...|         5|
|      13289|    745|       16| 2.488480208426273|[745.0,16.0,2.488...|         0|
|      13623|     52|      303|3.3885203670539803|[52.0,303.0,3.388...|         5|
|      13832|     39|       58|2.7799137998502625|[39.0,58.0,2.7799...|         4|
|      13840|    438|       40|2.8138477542288545|[438.0,40.0,2.813...|         0|
|      14450|    202|       76|3.0524784722647733|[202.0,76.0,3.052...|         4|
|      14570|    302|       68| 2.787991505131025|[302.0,68.0,2.787...|         0|
|   

In [134]:
BestCustomer = transformed.where('prediction == 0').count()

LoyalCustomer = transformed.where('prediction == 1').count()

BigSpender = transformed.where('prediction == 2').count()

AlmostLost = transformed.where('prediction == 3').count()

LostCustomers = transformed.where('prediction == 4').count()

LostCheapCustomers = transformed.where('prediction == 5').count()

In [135]:
print("Number of Best Customers: ", BestCustomer)
print("Number of Loyal Customers: ", LoyalCustomer)
print("Number of Big Spenders: ", BigSpender)
print("Number of Almost Lost: ", AlmostLost)
print("Number of Lost Customers: ", LostCustomers)
print("Number of Lost Cheap Customers: ", LostCheapCustomers)

Number of Best Customers:  1838
Number of Loyal Customers:  3
Number of Big Spenders:  13
Number of Almost Lost:  151
Number of Lost Customers:  3076
Number of Lost Cheap Customers:  762


## PA4 Question 4

A freshly-loaded copy of the NYT covid dataset is available as [`gs://datathinks-home/covid2.json`](gs://datathinks-home/covid2.json).

Please don’t upload your data! Instead, starting on March 1, 2020, for the first day of each month, which county had the worst numbers of confirmed cases, and deaths? They might not be the same county. In other words, develop a table that looks like this:

| Query | 4/1 | 5/1 | ...etc... |
| --- | --- | --- | --- |
| Confirmed Cases | nnn, County, State | nnn, County, State |   |
| Deaths | nnn, County, State | nnn, County, State |   |

This analysis should be done on Spark.

In [136]:
# to read the .json file
# covidDF = spark.read.json('gs://datathinks-home/covid2.json')
covidDF = spark.read.load('gs://datathinks-home/covid2.json', 
                          format='json', inferSchema='true', header='true')

In [137]:
covidDF.show(10, truncate=False)

+---------------+-----------+----------------+----------+------+----------+
|confirmed_cases|county     |county_fips_code|date      |deaths|state_name|
+---------------+-----------+----------------+----------+------+----------+
|12             |Kansas City|null            |2020-03-20|0     |Missouri  |
|13             |Kansas City|null            |2020-03-21|0     |Missouri  |
|13             |Kansas City|null            |2020-03-22|0     |Missouri  |
|18             |Kansas City|null            |2020-03-23|0     |Missouri  |
|30             |Kansas City|null            |2020-03-24|0     |Missouri  |
|51             |Kansas City|null            |2020-03-25|0     |Missouri  |
|64             |Kansas City|null            |2020-03-26|0     |Missouri  |
|78             |Kansas City|null            |2020-03-27|0     |Missouri  |
|94             |Kansas City|null            |2020-03-28|0     |Missouri  |
|102            |Kansas City|null            |2020-03-29|0     |Missouri  |
+-----------

In [138]:
covidDF.printSchema()

root
 |-- confirmed_cases: string (nullable = true)
 |-- county: string (nullable = true)
 |-- county_fips_code: string (nullable = true)
 |-- date: string (nullable = true)
 |-- deaths: string (nullable = true)
 |-- state_name: string (nullable = true)



In [139]:
# there is no missing value in columns other than county_fips_code column which we can drop
for col in covidDF.columns:
    condition = '{} is null'.format(col)
    print('number of nulls in {:s}: {:>10}'.format(col, covidDF.filter(condition).count()))

number of nulls in confirmed_cases:          0
number of nulls in county:          0
number of nulls in county_fips_code:       6691
number of nulls in date:          0
number of nulls in deaths:          0
number of nulls in state_name:          0


In [140]:
# to conver columns to proper datatype 
covidDF = covidDF.withColumn('confirmed_cases', covidDF.confirmed_cases.cast('int'))\
                 .withColumn('deaths', covidDF.deaths.cast('int'))\
                 .withColumn('date', sFuncs.to_date(covidDF.date))

# create a new dataset holding records for the first day of each months
dateDF = covidDF.filter(sFuncs.dayofmonth(covidDF.date) == 1)

In [141]:
def findStat(num, county, state):
    return str(num) + ', ' + str(county) + ', ' + str(state)

# def findStat(*row):
#     return ','.join(row)

func = sFuncs.udf(findStat)

start = time.time()
max_cases = dateDF.sort(sFuncs.desc('confirmed_cases'))\
                  .groupBy('date')\
                  .agg(sFuncs.max('confirmed_cases').alias('cases'), \
                       sFuncs.first('county').alias('county'), \
                       sFuncs.first('state_name').alias('state'))\
                  .select(sFuncs.date_format('date', 'MM/dd').alias('date'), \
                          func('cases', 'county', 'state').alias('max_cases'))

max_deaths = dateDF.sort(sFuncs.desc('deaths'))\
                   .groupBy('date')\
                   .agg(sFuncs.max('deaths').alias('deaths'), \
                        sFuncs.first('county').alias('county'), \
                        sFuncs.first('state_name').alias('state'))\
                   .select(sFuncs.date_format('date', 'MM/dd').alias('date'), \
                           func('deaths', 'county', 'state').alias('max_deaths'))

max_cases.join(max_deaths, ['date']).sort('date').show(truncate=False)

print('elapsed time: {:.0f} seconds'.format(time.time() - start))

+-----+-------------------------------+------------------------------+
|date |max_cases                      |max_deaths                    |
+-----+-------------------------------+------------------------------+
|02/01|2, Cook, Illinois              |0, Maricopa, Arizona          |
|03/01|13, Douglas, Nebraska          |3, King, Washington           |
|04/01|47914, New York City, New York |1848, New York City, New York |
|05/01|174931, New York City, New York|17931, New York City, New York|
|06/01|208550, New York City, New York|21090, New York City, New York|
|07/01|220143, New York City, New York|22574, New York City, New York|
|08/01|230147, New York City, New York|23007, New York City, New York|
|09/01|242521, Los Angeles, California|23703, New York City, New York|
|10/01|271371, Los Angeles, California|23829, New York City, New York|
|11/01|309190, Los Angeles, California|24013, New York City, New York|
+-----+-------------------------------+------------------------------+

elaps

In [142]:
# Alternatively, this part can be done using window partitioning which is faster than aggregation

start = time.time()

byCase = Window.partitionBy('date').orderBy(sFuncs.desc('confirmed_cases'))

w_max_cases = dateDF.withColumn('rank', sFuncs.dense_rank().over(byCase))\
                     .filter('rank == 1')\
                     .withColumn('rn', sFuncs.row_number().over(byCase)).where('rn==1').drop('rn')\
                     .select(sFuncs.date_format('date', 'MM/dd').alias('date'), \
                             func('confirmed_cases', 'county', 'state_name').alias('max_cases'))\

byDeath = Window.partitionBy('date').orderBy(sFuncs.desc('deaths'))

w_max_deaths = dateDF.withColumn('rank', sFuncs.dense_rank().over(byDeath))\
                     .filter('rank == 1')\
                     .withColumn('rn', sFuncs.row_number().over(byDeath)).where('rn==1').drop('rn')\
                     .select(sFuncs.date_format('date', 'MM/dd').alias('date'), \
                             func('deaths', 'county', 'state_name').alias('max_deaths'))\



w_max_cases.join(w_max_deaths, ['date']).sort('date').show(truncate=False)

print('elapsed time: {:.0f} seconds'.format(time.time() - start))

+-----+-------------------------------+------------------------------+
|date |max_cases                      |max_deaths                    |
+-----+-------------------------------+------------------------------+
|02/01|2, Cook, Illinois              |0, Maricopa, Arizona          |
|03/01|13, Douglas, Nebraska          |3, King, Washington           |
|04/01|47914, New York City, New York |1848, New York City, New York |
|05/01|174931, New York City, New York|17931, New York City, New York|
|06/01|208550, New York City, New York|21090, New York City, New York|
|07/01|220143, New York City, New York|22574, New York City, New York|
|08/01|230147, New York City, New York|23007, New York City, New York|
|09/01|242521, Los Angeles, California|23703, New York City, New York|
|10/01|271371, Los Angeles, California|23829, New York City, New York|
|11/01|309190, Los Angeles, California|24013, New York City, New York|
+-----+-------------------------------+------------------------------+

elaps