# Chapter 6: Working with Key/Value Data

In [23]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import Row
import numpy as np

In [24]:
# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("key-value-data")
    .master("local[*]")
    .getOrCreate()
)

In [25]:
# SparkContext behind the session; used below to parallelize the sample rows into an RDD.
sc = spark.sparkContext

## The Goldilocks Example

In [26]:
# Sample rows; each tuple is (Happiness, Niceness, Softness, Sweetness).
list_data = [
    (15.0, 0.25, 2467.0, 0.0),
    (2.0, 1000.0, 35.4, 0.0),
    (10.0, 2.0, 50.0, 0.0),
    (3.0, 8.5, 0.2, 98.0),
]

# Turn every tuple into a named Row so the DataFrame gets column names.
rdd = sc.parallelize(list_data).map(
    lambda x: Row(Happiness=x[0], Niceness=x[1], Softness=x[2], Sweetness=x[3])
)

df = spark.createDataFrame(rdd)

In [27]:
# Print the sample DataFrame as a text table.
df.show()

+---------+--------+--------+---------+
|Happiness|Niceness|Softness|Sweetness|
+---------+--------+--------+---------+
|     15.0|    0.25|  2467.0|      0.0|
|      2.0|  1000.0|    35.4|      0.0|
|     10.0|     2.0|    50.0|      0.0|
|      3.0|     8.5|     0.2|     98.0|
+---------+--------+--------+---------+



## Goldilocks Version 0: Iterative Solution

In [28]:
# 1-based positions (in ascending sort order) to extract from every column.
ranks = [2, 4]

In [29]:
def get_solution_v0(df, ranks):
    """Collect the values found at the given ranks of each column, one column at a time.

    For every column position the rows are projected to that single cell,
    sorted ascending, numbered with ``zipWithIndex`` and filtered down to the
    requested positions, so a separate sort is run per column.

    Args:
        df: DataFrame-like object exposing ``columns`` and ``rdd``.
        ranks: container of 1-based positions in ascending sort order.

    Returns:
        dict mapping the 1-based column number to the list of values found at
        the requested ranks, in ascending order.
    """
    values_by_column = {}

    for position in range(len(df.columns)):
        # ``position`` is read by the lambda when the job runs, which happens
        # within this same iteration because ``collect`` is called below.
        single_column = df.rdd.map(lambda record: record[position])
        numbered = single_column.sortBy(lambda cell: cell).zipWithIndex()
        # zipWithIndex is 0-based while ranks are 1-based, hence the +1.
        wanted = numbered.filter(lambda pair: pair[1] + 1 in ranks)
        cells = wanted.map(lambda pair: pair[0]).collect()
        values_by_column[position + 1] = list(cells)

    return values_by_column

In [30]:
# Run the iterative (one sort per column) solution on the sample DataFrame.
solution_v0 = get_solution_v0(df, ranks)

In [31]:
# Display the result: keys are 1-based column numbers.
solution_v0

{1: [3.0, 15.0], 2: [2.0, 1000.0], 3: [35.4, 2467.0], 4: [0.0, 98.0]}

## Goldilocks Version 1: GroupByKey Solution

In [32]:
def get_solution_v1(df, ranks):
    """Find the values at the given ranks of each column using ``groupByKey``.

    Every row is exploded into ``(column number, cell value)`` pairs, the
    pairs are grouped by column, and each group is sorted in Python so the
    requested positions can be picked out.

    Args:
        df: DataFrame-like object exposing ``columns`` and ``rdd``.
        ranks: container of 1-based positions in ascending sort order.

    Returns:
        dict mapping the 1-based column number to the list of values found at
        the requested ranks, in ascending order.
    """
    n_cols = len(df.columns)

    def explode_row(row):
        # One (1-based column number, cell value) pair per cell of the row.
        return [(pos + 1, row[pos]) for pos in range(n_cols)]

    def pick_ranked(item):
        # All cells of one column arrive together; sort them and keep the
        # ones whose 1-based position is requested.
        col_number, cells = item
        ordered = sorted(cells)
        picked = [cell for place, cell in enumerate(ordered, start=1) if place in ranks]
        return (col_number, picked)

    grouped = df.rdd.flatMap(explode_row).groupByKey()
    return grouped.map(pick_ranked).collectAsMap()

In [33]:
# Run the groupByKey-based solution on the sample DataFrame.
solution_v1 = get_solution_v1(df, ranks)

In [34]:
# Display the result: keys are 1-based column numbers.
solution_v1

{1: [3.0, 15.0], 2: [2.0, 1000.0], 3: [35.4, 2467.0], 4: [0.0, 98.0]}

## Goldilocks Version 2: Secondary Sort

In [35]:
def get_solution_v2(df, ranks):
    """Find the values at the given ranks of each column with a secondary sort.

    Each cell becomes a ``((column index, value), 1)`` record. The records are
    repartitioned so that every column gets its own partition and are sorted
    by key inside it, which means the position of a record within its
    partition is the rank of its value within the column.

    Args:
        df: DataFrame-like object exposing ``columns`` and ``rdd``.
        ranks: container of 1-based positions in ascending sort order.

    Returns:
        dict mapping the 0-based column index to the list of values found at
        the requested ranks, in ascending order.
    """
    n_cols = len(df.columns)

    def explode_row(row):
        # The (column, value) tuple is the key; the 1 is only a placeholder value.
        return [((col, row[col]), 1) for col in range(n_cols)]

    def keep_ranked(partition_iter):
        # Yield only the records whose 1-based position in the partition is requested.
        position = 0
        for record in partition_iter:
            position += 1
            if position in ranks:
                yield record

    keyed = df.rdd.flatMap(explode_row)

    # One partition per column: route each key by its column index.
    partitioned = keyed.repartitionAndSortWithinPartitions(
        n_cols, lambda key: key[0] % n_cols
    )

    ranked_keys = partitioned.mapPartitions(keep_ranked)\
        .map(lambda record: record[0])\
        .collect()

    results = {}
    for col, val in ranked_keys:
        results.setdefault(col, []).append(val)

    return results

In [36]:
# Run the secondary-sort solution on the sample DataFrame.
solution_v2 = get_solution_v2(df, ranks)

In [37]:
# Display the result: keys here are 0-based column indices.
solution_v2

{0: [3.0, 15.0], 1: [2.0, 1000.0], 2: [35.4, 2467.0], 3: [0.0, 98.0]}

## Goldilocks Version 3: Sort on Cell Values

In [38]:
def get_solution_v3(df, ranks):
    """Find the values at the given ranks of each column by sorting on cell values.

    Every cell becomes a ``(value, column index)`` pair and all pairs are
    sorted together by value. A first pass counts how many cells of each
    column ended up in each partition; from the running totals the driver
    works out which partition holds each requested rank and at which local
    position. A second pass over the same sorted RDD extracts those cells.

    Because cells of all columns are sorted together, their values must be
    mutually comparable.

    Args:
        df: DataFrame-like object exposing ``columns`` and ``rdd``.
        ranks: iterable of 1-based positions in ascending sort order.

    Returns:
        dict mapping the 0-based column index to the list of values found at
        the requested ranks, in ascending order. Columns for which no
        requested rank exists are absent from the dict.
    """
    len_columns = len(df.columns)
    # Sorted and de-duplicated so locations are produced in rank order.
    target_ranks = sorted(set(ranks))

    def count_columns_in_partition(idx_part, x_iter):
        # Count how many cells of each column landed in this partition.
        columns_freq = [0] * len_columns
        for _, col_idx in x_iter:
            columns_freq[col_idx] += 1
        yield (idx_part, columns_freq)

    # NOTE(review): this RDD is evaluated by two separate jobs below; for
    # large inputs consider persisting it so the sort is not repeated.
    sorted_column_pairs = df.rdd\
        .flatMap(lambda x: [(x[idx], idx) for idx in range(len_columns)])\
        .sortByKey()

    columns_fre_partition = sorted_column_pairs\
        .mapPartitionsWithIndex(count_columns_in_partition)

    # partition index -> [(column index, 1-based position among that column's
    # cells inside the partition), ...]
    ranks_locations = {}
    running_totals = [0] * len_columns

    # collect() returns the per-partition counts in partition order, which is
    # what the running totals rely on.
    for partition, columns_freq in columns_fre_partition.collect():
        new_totals = [seen + here
                      for seen, here in zip(running_totals, columns_freq)]

        curr_result = []
        for col, (val_old, val_new) in enumerate(zip(running_totals, new_totals)):
            # This partition holds global ranks val_old+1 .. val_new of `col`.
            for rank in target_ranks:
                if val_old < rank <= val_new:
                    curr_result.append((col, rank - val_old))

        ranks_locations[partition] = curr_result
        running_totals = new_totals

    def get_target_ranks_values(idx, x_iter):
        ranks_here = ranks_locations.get(idx, [])
        if not ranks_here:
            return

        # Gather, in sorted order, the cells of only the columns needed here.
        values_by_col = {col: [] for col, _ in ranks_here}
        for val, col_idx in x_iter:
            if col_idx in values_by_col:
                values_by_col[col_idx].append(val)

        for col, j_order in ranks_here:
            yield (col, values_by_col[col][j_order - 1])

    # Values of one column may arrive from several partitions in any order;
    # sorting them restores rank order because higher ranks hold larger values.
    final_results = sorted_column_pairs.mapPartitionsWithIndex(get_target_ranks_values)\
        .groupByKey().map(lambda x: (x[0], sorted(x[1]))).collectAsMap()

    return final_results

In [39]:
# Run the sort-on-cell-values solution on the sample DataFrame.
solution_v3 = get_solution_v3(df, ranks)

In [40]:
# Display the result: keys here are 0-based column indices.
solution_v3

{0: [3.0, 15.0], 1: [2.0, 1000.0], 2: [35.4, 2467.0], 3: [0.0, 98.0]}