In [2]:
# Required libraries
import sys
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

### Note: 
In case you are using a single-node cluster, executing this cell is essential, as otherwise, SparkContext put the sc.master on 'yarn' which you don't have. The result would be you'll never see a collect() to converge.

In [3]:
spark = SparkSession.builder \
    .master("local[4]") \
    .appName("QA4") \
    .getOrCreate()

## HW3 Part1

In [None]:
!hadoop fs -put /home/saberbf/transition-matrix.txt  /user/root/

In [None]:
states = ['S0', 'S1', 'S2', 'S3', 'S4', 'Sf']
links = sc.parallelize(map(lambda s: (s, states), states)) \
# links.collect()

ranks = sc.parallelize(states).map(lambda s: (s, 1.0))
# ranks.collect()

wordProbs = sc.parallelize([('S0',{}), \
                            ('S1', {'Tom':1/5, 'John':1/5, 'Mary':1/5, 'Alice':1/5, 'Jerry':1/5}), \
                            ('S2',{'a':3/8, 'the':4/8, 'that':1/8}),\
                            ('S3',{'bit':1/6, 'saw':1/6, 'ate':1/6, 'played':1/6, 'hit':1/6, 'gave':1/6}), \
                            ('S4', {'cat':1/6, 'dog':1/6, 'car':1/6, 'bed':1/6, 'apple':1/6, 'pen':1/6}), \
                            ('Sf', {})])
# wordProbs.collect()

In [None]:
lines = sc.textFile('hdfs:///user/root/transition-matrix.txt')
header = lines.first().split('\t')
def czip(x):
    return tuple(zip(states, x[1:]))
    
trans = lines.map(lambda line: line.split('\t')) \
             .filter(lambda line: line != header)\
             .map(czip).flatMap(lambda x:x) \
             

numlinks = trans.filter(lambda x: x[1] !='0').map(lambda x: (x[0], 1)).reduceByKey(lambda x, y: x+y)
trans = trans.mapValues(float).reduceByKey(lambda x, y: x+y) 

numlinks.collect()

ranks.join(numlinks.join(trans)).collect()

In [None]:
iters = 0
pre_ranks = ranks
diff = 1
def compute_contribs(pair):
    [url, [rank, [numlinks,tran]]] = pair  # split key-value pair
    return [(url, rank * tran / numlinks)]

def rdd_diff(pair):
    (url, (pre, cur)) = pair
    return [abs(cur-pre)]
while diff > 0.01:
    contribs = ranks.join(numlinks.join(trans)).flatMap(compute_contribs)
    ranks = contribs.reduceByKey(lambda x, y: x + y) \
                    .mapValues(lambda x: 0.15 + 0.85 * x)
    diff = pre_ranks.join(ranks).flatMap(rdd_diff).mean()
    pre_ranks = ranks
    iters +=1

ranks = ranks.sortBy(lambda x: x[0])

In [None]:
!gsutil rm -r gs://saberbf0098/sparkResults/HW2/
ranks.saveAsTextFile('gs://saberbf0098/sparkResults/HW2/')

In [None]:
print('\nNumber of iterations: ',iters)
print('Ranks: ')
ranks.collect()


In [None]:
def calc_word_prob(pair):
    [jar, [wordProbs, rank]] = pair
    return [(jar, [(word,prob * rank)]) for (word,prob) in wordProbs.items()]

print('List of Word Probabilities in each Bucket:') 
wordProbs.join(ranks).flatMap(calc_word_prob).reduceByKey(lambda x,y: x+y)\
                     .sortBy(lambda x: x[0]).collect()


## HW3 Part2

In [3]:
df = spark.read.format("csv").option('header','true').load('gs://datathinks-home/online_retail_II.csv')
df.show(10)

+-------+---------+--------------------+--------+--------------+-----+-----------+--------------+
|Invoice|StockCode|         Description|Quantity|   InvoiceDate|Price|Customer ID|       Country|
+-------+---------+--------------------+--------+--------------+-----+-----------+--------------+
| 489434|    85048|15CM CHRISTMAS GL...|      12|12/1/2009 7:45| 6.95|      13085|United Kingdom|
| 489434|   79323P|  PINK CHERRY LIGHTS|      12|12/1/2009 7:45| 6.75|      13085|United Kingdom|
| 489434|   79323W| WHITE CHERRY LIGHTS|      12|12/1/2009 7:45| 6.75|      13085|United Kingdom|
| 489434|    22041|"RECORD FRAME 7""...|      48|12/1/2009 7:45|  2.1|      13085|United Kingdom|
| 489434|    21232|STRAWBERRY CERAMI...|      24|12/1/2009 7:45| 1.25|      13085|United Kingdom|
| 489434|    22064|PINK DOUGHNUT TRI...|      24|12/1/2009 7:45| 1.65|      13085|United Kingdom|
| 489434|    21871| SAVE THE PLANET MUG|      24|12/1/2009 7:45| 1.25|      13085|United Kingdom|
| 489434|    21523|F

### Data preprocessing

In [4]:
# select\rename columns
df = df.withColumn("CustomerID", df["Customer ID"]).drop("Customer ID", "Country", "Description")
n_records_before_cleaning = df.count()
# remove duplicate records
df = df.distinct()
n_records_after_removing_duplicates = df.count()
print('Number of records befor duplicate removal: ', n_records_before_cleaning)
print('Number of records after duplicate removal: ', n_records_after_removing_duplicates)
print('Number of duplicate records removed: ', n_records_before_cleaning-n_records_after_removing_duplicates)
# Missing Value Imputation
n_null_records = df.filter('CustomerID is null').count()
print('Number of Null records: ', n_null_records)
print('Number of records befor missing value imputation: ', df.count())
df = df.na.drop()
print('Number of records after missing value imputation: ', df.count())

df.show(5)

Number of records befor duplicate removal:  1067371
Number of records after duplicate removal:  1033034
Number of duplicate records removed:  34337
Number of Null records:  235151
Number of records befor missing value imputation:  1033034
Number of records after missing value imputation:  797883
+-------+---------+--------+---------------+-----+----------+
|Invoice|StockCode|Quantity|    InvoiceDate|Price|CustomerID|
+-------+---------+--------+---------------+-----+----------+
| 489520|    22080|      10|12/1/2009 11:41| 1.65|     14911|
| 489522|    22300|       3|12/1/2009 11:45| 2.55|     15998|
| 489532|    16235|      60|12/1/2009 11:58| 0.21|     13394|
| 489537|   72801G|       3|12/1/2009 12:14| 1.25|     14040|
| 489539|    20692|      96|12/1/2009 12:18| 3.75|     15061|
+-------+---------+--------+---------------+-----+----------+
only showing top 5 rows



### 2. Calcuate Monetary Value

In [100]:
df = df.withColumn("Price", df.Price.cast('float'))
df.printSchema()

root
 |-- Invoice: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- Price: float (nullable = true)
 |-- CustomerID: string (nullable = true)



In [101]:
monDF = df.groupBy('customerID').agg(sum('Price').alias('Monetary'))
monDF = monDF.orderBy(desc('Monetary'))
tmp= monDF.withColumn("new_column",lit("ABC"))
w = Window().partitionBy('new_column').orderBy(lit('A'))
monDF = tmp.withColumn("id", row_number().over(w)).drop("new_column")

monDF.show(10)

+----------+------------------+---+
|customerID|          Monetary| id|
+----------+------------------+---+
|     14911|  70473.0999276936|  1|
|     14096| 41376.32996287942|  2|
|     15098| 40278.89999961853|  3|
|     14063| 39920.94912147522|  4|
|     14156|36397.009877726436|  5|
|     17841| 34582.22974572331|  6|
|     15760|  33628.5498046875|  7|
|     12918|           32860.5|  8|
|     12744| 25481.40001243353|  9|
|     17399|    25111.08984375| 10|
+----------+------------------+---+
only showing top 10 rows



In [102]:
def func(pair):
    (key, val) = pair
    result = 0
    if val < 0.15*cnt: 
        result = 1
    if 0.15*cnt < val < 0.3*cnt: 
        result = 2
    if 0.3*cnt < val < 0.6*cnt: 
        result = 3
    if 0.6*cnt < val : 
        result = 4
    return (key, result)

cnt = monDF.count()
monRDD = monDF.rdd
monRDD = monRDD.map(lambda x:(x[0],x[2])).map(func)
# monRDD.collect()

In [103]:
# RDD code to compute aggregate average
monDF = monRDD.toDF(["customerID", "Monetary"])
monDF.show(10)

+----------+--------+
|customerID|Monetary|
+----------+--------+
|     14911|       1|
|     14096|       1|
|     15098|       1|
|     14063|       1|
|     14156|       1|
|     17841|       1|
|     15760|       1|
|     12918|       1|
|     12744|       1|
|     17399|       1|
+----------+--------+
only showing top 10 rows



### 3. Calcuate Frequency

In [123]:
frqDF = df.groupBy('customerID').agg(count('invoice').alias('Frequency'))
frqDF = frqDF.orderBy(desc('Frequency'))
tmp= frqDF.withColumn("new_column",lit("ABC"))
w = Window().partitionBy('new_column').orderBy(lit('A'))
frqDF = tmp.withColumn("id", row_number().over(w)).drop("new_column")

frqDF.show(10)

+----------+---------+---+
|customerID|Frequency| id|
+----------+---------+---+
|     17841|    12638|  1|
|     14911|    11444|  2|
|     12748|     6662|  3|
|     14606|     6500|  4|
|     14096|     5128|  5|
|     15311|     4579|  6|
|     14156|     4118|  7|
|     14646|     3890|  8|
|     13089|     3391|  9|
|     16549|     3098| 10|
+----------+---------+---+
only showing top 10 rows



In [124]:
cnt = frqDF.count()
frqRDD = frqDF.rdd
frqRDD = frqRDD.map(lambda x:(x[0],x[2])).map(func)
# frqRDD.collect()

In [125]:
# RDD code to compute aggregate average
frqDF = frqRDD.toDF(["customerID", "Frequency"])
frqDF.show(10)

+----------+---------+
|customerID|Frequency|
+----------+---------+
|     17841|        1|
|     14911|        1|
|     12748|        1|
|     14606|        1|
|     14096|        1|
|     15311|        1|
|     14156|        1|
|     14646|        1|
|     13089|        1|
|     16549|        1|
+----------+---------+
only showing top 10 rows



### 4. Calcuate Recency

In [118]:
from datetime import datetime as dt

In [119]:
# Setting an user define function:
# This function converts the string cell into a date:
dateFormat =  udf (lambda x: dt.strptime(x, '%m/%d/%Y %H:%M'), DateType())

recDF = df.groupBy('CustomerID').agg(max('InvoiceDate').alias('Recency'))
recDF = recDF.select('CustomerID','Recency') \
        .withColumn('Recency', dateFormat(col('Recency')))
recDF = recDF.orderBy(desc('Recency'))
recDF.show(10)

+----------+----------+
|CustomerID|   Recency|
+----------+----------+
|     12713|2011-12-09|
|     12985|2011-12-09|
|     13298|2011-12-08|
|     14251|2011-12-08|
|     14138|2011-12-08|
|     15877|2011-12-08|
|     15520|2011-12-08|
|     16322|2011-12-08|
|     15156|2011-12-08|
|     13521|2011-12-08|
+----------+----------+
only showing top 10 rows



In [120]:
def compDate(pair):
    (key, val) = pair
    result = 0
    if dt.date(dt(2011, 11, 15)) <= val: 
        result = 1
    if dt.date(dt(2011, 9, 5)) <= val <= dt.date(dt(2011, 11, 14)): 
        result = 2
    if dt.date(dt(2011, 1, 5)) <= val <= dt.date(dt(2011, 9, 4)): 
        result = 3
    if val <= dt.date(dt(2011, 1, 4)): 
        result = 4
    return (key, result)


# func =  udf (compDate, DateType())
# recDF.withColumn('Recency', func(col('Recency'))).show(10)

recRDD = recDF.rdd
recRDD = recRDD.map(lambda x:(x[0],x[1])).map(compDate)
# recRDD.collect()

In [121]:
# RDD code to compute aggregate average
recDF = recRDD.toDF(["customerID", "Recency"])
recDF.show(10)

+----------+-------+
|customerID|Recency|
+----------+-------+
|     12713|      1|
|     12985|      1|
|     16322|      1|
|     15156|      1|
|     13471|      1|
|     13521|      1|
|     15877|      1|
|     15520|      1|
|     14251|      1|
|     14138|      1|
+----------+-------+
only showing top 10 rows



### 5. Number of customers in each category


In [140]:
uDF = recDF.join(frqDF, on=['CustomerID'], how='outer')
uDF = uDF.join(monDF, on=['CustomerID'], how='outer')
uDF.show(10)

+----------+-------+---------+--------+
|customerID|Recency|Frequency|Monetary|
+----------+-------+---------+--------+
|     12394|      3|        4|       4|
|     12529|      4|        4|       4|
|     12847|      1|        2|       3|
|     13192|      2|        2|       2|
|     13282|      2|        3|       3|
|     13442|      4|        3|       3|
|     13610|      4|        1|       1|
|     13772|      3|        1|       1|
|     13865|      3|        4|       4|
|     14157|      4|        3|       3|
+----------+-------+---------+--------+
only showing top 10 rows



In [138]:
BestCustomer = uDF.where((col('Recency')==lit('1')) \
            & (col('Frequency')==lit('1')) \
            & (col('Monetary')==lit('1'))).count()

LoyalCustomer = uDF.where((col('Frequency')==lit('1'))).count()

BigSpender = uDF.where((col('Monetary')==lit('1'))).count()

AlmostLost = uDF.where((col('Recency')==lit('3')) \
            & (col('Frequency')==lit('1')) \
            & (col('Monetary')==lit('1'))).count()

LostCustomers = uDF.where((col('Recency')==lit('4')) \
            & (col('Frequency')==lit('1')) \
            & (col('Monetary')==lit('1'))).count()

LostCheapCustomers = uDF.where((col('Recency')==lit('4')) \
            & (col('Frequency')==lit('4')) \
            & (col('Monetary')==lit('4'))).count()

In [139]:
print("Number of Best Customers: ", BestCustomer)
print("Number of Loyal Customers: ", LoyalCustomer)
print("Number of Big Spenders: ", BigSpender)
print("Number of Almost Lost: ", AlmostLost)
print("Number of Lost Customers: ", LostCustomers)
print("Number of Lost Cheap Customers: ", LostCheapCustomers)

Number of Best Customers:  6
Number of Loyal Customers:  891
Number of Big Spenders:  891
Number of Almost Lost:  103
Number of Lost Customers:  371
Number of Lost Cheap Customers:  1150


## PA4 Question 4

A freshly-loaded copy of the NYT covid dataset is available as [gs://datathinks-home/covid2.json](gs://datathinks-home/covid2.json).

Please don’t upload your data! Instead, starting on March 1, 2020, for the first day of each month, which county had the worst numbers of confirmed cases, and deaths? They might not be the same county. In other words, develop a table that looks like this:

| Query | 4/1 | 5/1 | ...etc... |
| --- | --- | --- | --- |
| Confirmed Cases | nnn, County, State | nnn, County, State |   |
| Deaths | nnn, County, State | nnn, County, State |   |

This analysis should be done on Spark.

## PA4 Question 1

The last 5 presidents in our speeches collection were `reagan, bush, clinton, gwbush` and `obama` (that's how the tar.gz files of their speeches are named in Canvas).

Pairwise comparisons of the similarities in their speech collections (10 pairs) will give us a half-matrix like shown below (The symmetry in the problem formulation makes it unnecessary to compute the blank spots in the matrix):

|  | r | b | c | g | o |
| --- | --- | --- | --- | --- | --- |
| reagan |   | * | * | * | * |
| bush |   |   | * | * | * |
| clinton |   |   |   | * | * |
| gwbush |   |   |   |   | * |

Compute the similarity denoted by each asterisk and answer:

1. Using n-gram character shingles, assuming n=4, which two presidents' speeches were the most similar and which were the least similar?
2. Using n-gram word shingles, assuming n=3, which two presidents' speeches were the most similar and which were the least similar?

## PA4 Question 3

This question builds on the [UCI Online Retail II dataset](https://archive.ics.uci.edu/ml/datasets/Online+Retail+II) analysis you performed in Quiz 3. This time, however, the R, F, M values should be calculated as follows:

Recency should be the number of days relative to year-end 2011 (Dec 31). 
Frequency should simply be the number of transactions in the total period.
Monetary value should be the log10 of the total dollars spent. Why log10? We use logs to flatten the range — so high-spenders don't skew the analysis.
After calculating RFM values as specified above, run K-means clustering to divide the customers into 6 clusters. How do the number of customers in these 6 clusters compare with the clusters you in Question 2 of Quiz 3?

## PA4 Question 4

A freshly-loaded copy of the NYT covid dataset is available as [`gs://datathinks-home/covid2.json`](gs://datathinks-home/covid2.json).

Please don’t upload your data! Instead, starting on March 1, 2020, for the first day of each month, which county had the worst numbers of confirmed cases, and deaths? They might not be the same county. In other words, develop a table that looks like this:

| Query | 4/1 | 5/1 | ...etc... |
| --- | --- | --- | --- |
| Confirmed Cases | nnn, County, State | nnn, County, State |   |
| Deaths | nnn, County, State | nnn, County, State |   |

This analysis should be done on Spark.

In [2]:
# to read the .json file
covidDF = spark.read.json('gs://datathinks-home/covid2.json')
# covidDF = spark.read.load('gs://datathinks-home/covid2.json', 
#                           format='json', inferSchema='true', header='true')

KeyboardInterrupt: 

In [4]:
co