In [1]:
import json
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import (col, collect_list, concat,
                                   create_map, lit, map_concat,
                                   map_from_arrays, substring_index,
                                   to_json, when)

In [2]:
# Create (or reuse) the notebook's Spark session: local mode with the
# 'local[*]' master and a 10G driver-memory setting.
spark = (
    SparkSession.builder
    .appName('generate-statistics')
    .master('local[*]')
    .config('spark.driver.memory', '10G')
    .getOrCreate()
)

# Bare expression so the notebook renders the session as the cell output.
spark

### 500 Column Dataframe

In [3]:
%%timeit -n1 -r1
# Read the df_500 parquet dataset.
df_500 = spark.read.parquet('../data/df_500/')

# Statistics requested from DataFrame.summary(); percentiles are given as
# '<n>%' strings.
stats = ['count', 'mean', 'stddev', 'min', 'max',
         '1%','5%', '25%', '50%', '75%', '95%', '99%']

df_500_summary = (
    df_500
    .summary(*stats)
    # Relabel percentile rows such as '25%' as 'p25'; every other label
    # ('count', 'mean', ...) is kept as-is.
    .withColumn('summary',
                when(col('summary').contains('%'),
                     concat(lit('p'),
                            substring_index(col('summary'),
                                            '%', 1)))
                .otherwise(col('summary'))))

df_500_summary_cols = df_500_summary.columns

# Every column except the 'summary' label column holds metric values.
# The loop variables below are deliberately not named `col`, so the imported
# pyspark.sql.functions.col is never shadowed.
df_500_summary_metric_cols = [c for c in df_500_summary_cols if c != 'summary']

# Reshape the wide summary (one row per statistic) into long form: one
# (name, metric, value) row per input column and statistic.
df_500_metrics_long = spark.createDataFrame(
    Row(name=col_name, metric=m_row['summary'], value=m_row[col_name])
    for m_row in df_500_summary.toLocalIterator()
    for col_name in df_500_summary_metric_cols)

def update_metrics_map(row):
    """Return the row's metrics map with the row's name added under 'name'.

    Args:
        row: Record supporting key access (e.g. a pyspark Row) with a
            'name' entry and a 'metrics_map' entry that is a mapping.

    Returns:
        A new dict holding every entry of ``row['metrics_map']`` plus
        ``'name': row['name']``. The map stored in ``row`` is not modified.
    """
    # Build a fresh dict rather than calling .update() on the row's own map,
    # so the input record is never mutated.
    return {**row['metrics_map'], 'name': row['name']}

# Convert the metric values to doubles before aggregating.
metrics_as_double = df_500_metrics_long.withColumn(
    'value', col('value').cast('double'))

# Gather, per input column name, the metric labels and their values.
metrics_per_name = (
    metrics_as_double
    .groupBy('name')
    .agg(collect_list('metric').alias('metric_array'),
         collect_list('value').alias('value_array')))

# Pair labels with values as a map, keeping one row per column name, sorted.
metrics_by_name = (
    metrics_per_name
    .withColumn('metrics_map',
                map_from_arrays(col('metric_array'), col('value_array')))
    .select('name', 'metrics_map')
    .orderBy('name'))

# One dict per column: its metrics plus a 'name' entry, collected locally.
result_list = metrics_by_name.rdd.map(update_metrics_map).collect()

with open('../data/metrics_for_500.json', 'w') as out_file:
    json.dump(result_list, out_file, indent=1)

2min 40s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**For the Distributed Stage in the Summary Task:** Input Size / Records	53.2 MB / 25000

**Timing Information for whole job:** 2min 40s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### 3000 Column Dataframe

In [4]:
%%timeit -n1 -r1
# Read the df_3000 parquet dataset.
df_3000 = spark.read.parquet('../data/df_3000/')

# Statistics requested from DataFrame.summary(); percentiles are given as
# '<n>%' strings.
stats = ['count', 'mean', 'stddev', 'min', 'max',
         '1%','5%', '25%', '50%', '75%', '95%', '99%']

df_3000_summary = (
    df_3000
    # Split the data into 48 partitions before summarizing; per the
    # notebook's conclusions this was needed so a partition fits in memory.
    .repartition(48)
    .summary(*stats)
    # Relabel percentile rows such as '25%' as 'p25'; every other label
    # ('count', 'mean', ...) is kept as-is.
    .withColumn('summary',
                when(col('summary').contains('%'),
                     concat(lit('p'),
                            substring_index(col('summary'),
                                            '%', 1)))
                .otherwise(col('summary'))))

df_3000_summary_cols = df_3000_summary.columns

# Every column except the 'summary' label column holds metric values.
# The loop variables below are deliberately not named `col`, so the imported
# pyspark.sql.functions.col is never shadowed.
df_3000_summary_metric_cols = [c for c in df_3000_summary_cols if c != 'summary']

# Reshape the wide summary (one row per statistic) into long form: one
# (name, metric, value) row per input column and statistic.
df_3000_metrics_long = spark.createDataFrame(
    Row(name=col_name, metric=m_row['summary'], value=m_row[col_name])
    for m_row in df_3000_summary.toLocalIterator()
    for col_name in df_3000_summary_metric_cols)

def update_metrics_map(row):
    """Return the row's metrics map with the row's name added under 'name'.

    Args:
        row: Record supporting key access (e.g. a pyspark Row) with a
            'name' entry and a 'metrics_map' entry that is a mapping.

    Returns:
        A new dict holding every entry of ``row['metrics_map']`` plus
        ``'name': row['name']``. The map stored in ``row`` is not modified.
    """
    # Build a fresh dict rather than calling .update() on the row's own map,
    # so the input record is never mutated.
    return {**row['metrics_map'], 'name': row['name']}

# Convert the metric values to doubles before aggregating.
metrics_as_double = df_3000_metrics_long.withColumn(
    'value', col('value').cast('double'))

# Gather, per input column name, the metric labels and their values.
metrics_per_name = (
    metrics_as_double
    .groupBy('name')
    .agg(collect_list('metric').alias('metric_array'),
         collect_list('value').alias('value_array')))

# Pair labels with values as a map, keeping one row per column name, sorted.
metrics_by_name = (
    metrics_per_name
    .withColumn('metrics_map',
                map_from_arrays(col('metric_array'), col('value_array')))
    .select('name', 'metrics_map')
    .orderBy('name'))

# One dict per column: its metrics plus a 'name' entry, collected locally.
result_list = metrics_by_name.rdd.map(update_metrics_map).collect()

with open('../data/metrics_for_3000.json', 'w') as out_file:
    json.dump(result_list, out_file, indent=1)

1h 18min 21s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**For the Distributed Stage in the Summary Task:** Input Size / Records	18.0 MB / 4168

**Timing Information for whole job:** 1h 18min 21s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [3]:
%%timeit -n1 -r1
# Read the full df_3000 parquet dataset; a column subset is selected below.
df_500_of_3000 = spark.read.parquet('../data/df_3000/')

# Statistics requested from DataFrame.summary(); percentiles are given as
# '<n>%' strings.
stats = ['count', 'mean', 'stddev', 'min', 'max',
         '1%','5%', '25%', '50%', '75%', '95%', '99%']

# Keep only the columns whose name ends in '_1'. The loop variable is
# deliberately not named `col`, so the imported pyspark.sql.functions.col
# is never shadowed.
subset_cols = [c for c in df_500_of_3000.columns if c.endswith('_1')]

df_500_of_3000_summary = (
    df_500_of_3000
    .select(*subset_cols)
    .repartition(8)
    .summary(*stats)
    # Relabel percentile rows such as '25%' as 'p25'; every other label
    # ('count', 'mean', ...) is kept as-is.
    .withColumn('summary',
                when(col('summary').contains('%'),
                     concat(lit('p'),
                            substring_index(col('summary'),
                                            '%', 1)))
                .otherwise(col('summary'))))

df_500_of_3000_summary_cols = df_500_of_3000_summary.columns

# Every column except the 'summary' label column holds metric values.
df_500_of_3000_summary_metric_cols = [
    c for c in df_500_of_3000_summary_cols if c != 'summary']

# Reshape the wide summary (one row per statistic) into long form: one
# (name, metric, value) row per selected column and statistic.
df_500_of_3000_metrics_long = spark.createDataFrame(
    Row(name=col_name, metric=m_row['summary'], value=m_row[col_name])
    for m_row in df_500_of_3000_summary.toLocalIterator()
    for col_name in df_500_of_3000_summary_metric_cols)

def update_metrics_map(row):
    """Return the row's metrics map with the row's name added under 'name'.

    Args:
        row: Record supporting key access (e.g. a pyspark Row) with a
            'name' entry and a 'metrics_map' entry that is a mapping.

    Returns:
        A new dict holding every entry of ``row['metrics_map']`` plus
        ``'name': row['name']``. The map stored in ``row`` is not modified.
    """
    # Build a fresh dict rather than calling .update() on the row's own map,
    # so the input record is never mutated.
    return {**row['metrics_map'], 'name': row['name']}

# Convert the metric values to doubles before aggregating.
metrics_as_double = df_500_of_3000_metrics_long.withColumn(
    'value', col('value').cast('double'))

# Gather, per input column name, the metric labels and their values.
metrics_per_name = (
    metrics_as_double
    .groupBy('name')
    .agg(collect_list('metric').alias('metric_array'),
         collect_list('value').alias('value_array')))

# Pair labels with values as a map, keeping one row per column name, sorted.
metrics_by_name = (
    metrics_per_name
    .withColumn('metrics_map',
                map_from_arrays(col('metric_array'), col('value_array')))
    .select('name', 'metrics_map')
    .orderBy('name'))

# One dict per column: its metrics plus a 'name' entry, collected locally.
result_list = metrics_by_name.rdd.map(update_metrics_map).collect()

with open('../data/metrics_for_500_of_3000.json', 'w') as out_file:
    json.dump(result_list, out_file, indent=1)

2min 36s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


**For the Distributed Stage in the Summary Task:** Input Size / Records	47.2 MB / 11182

**Timing Information for whole job:** 2min 36s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Conclusions

In its current implementation, summary() seems to depend partly on the number of columns.

In the 3000-column run, the runtime (1h 18min 21s) is roughly 30 times that of the 500-column run (2min 40s), i.e. on the order of 6 × 6 = 36. A factor of about 6 appears to come from the 6x repartitioning I needed to do so that each partition fits in memory. The remaining factor of about 6 appears to come from the 6x increase in the number of columns.

The choice of Parquet as the storage format helps in the 500_of_3000 run: because Parquet is columnar, there seems to be no appreciable performance degradation in picking out the 500 columns from the 3000, as would probably have been the case with non-columnar storage such as CSV files.

But the summary method implementation is not able to benefit from the parquet files.

# Next Steps

The dataframe itself can be transformed to a long format (i.e. *col_name*, *value*) before the summarizing step; this should make it possible to scale linearly w.r.t. the number of columns.