In [1]:
from math import log10
from random import choices, randint, random

from pyspark.sql import Row, SparkSession

In [2]:
# Build (or reuse, via getOrCreate) the Spark session used by every
#  later cell. The 'local[*]' master runs Spark inside this machine.
spark = (SparkSession.builder
         .master('local[*]')
# Optional driver-memory setting, currently disabled:
#          .config("spark.driver.memory", "4g")
         .appName('create_500_row_dataframe')
         .getOrCreate())

# Bare expression so the notebook renders the session as cell output.
spark

In [3]:
# Global caps for the randomly generated column bounds. They are read
#  by get_random_column_bounds() when it draws a non-binary 'int'
#  range or a non-scaled 'double' range.
OVERALL_MIN_INT = 0
OVERALL_MAX_INT = 1000
OVERALL_MIN_DOUBLE = 0.0
OVERALL_MAX_DOUBLE = 1e5

In [4]:
def get_random_column_bounds(datatype):
    """Pick a random (lower, upper) value range for a column.

    Params:
        datatype (string) - Either 'int' or 'double'

    Returns:
        Tuple of (<lower bound>, <upper bound>)

    Raises:
        AttributeError - when datatype is neither 'int' nor 'double'

    Details:
        To resemble real-world data, every call picks at random
         between two flavours for the requested datatype:
        - 'int': binary data (0, 1), or an integer range
        - 'double': scaled data (-1.0, 1.0), or a real-valued range

        The non-fixed ranges are capped by the module-level globals:
        - OVERALL_MIN_INT & OVERALL_MAX_INT
        - OVERALL_MIN_DOUBLE & OVERALL_MAX_DOUBLE
    """
    if datatype not in ('int', 'double'):
        raise AttributeError(
            "get_random_column_bounds() not defined for datatype: {}"
            .format(datatype))

    if datatype == 'int':
        flavour = choices(['binary', 'discrete'])[0]
        if flavour == 'binary':
            return (0, 1)
        # The lower bound comes from the bottom half of the allowed
        #  range; the upper bound is anywhere from there up to the cap.
        low = randint(OVERALL_MIN_INT, int(OVERALL_MAX_INT*0.5))
        high = randint(low, OVERALL_MAX_INT)
        return (low, high)

    flavour = choices(['scaled', 'real'])[0]
    if flavour == 'scaled':
        return (-1.0, 1.0)
    # Same scheme as the integer case, using uniform random fractions.
    half_range = (OVERALL_MAX_DOUBLE * 0.5) - OVERALL_MIN_DOUBLE
    low = OVERALL_MIN_DOUBLE + (random() * half_range)
    high = low + (random() * (OVERALL_MAX_DOUBLE - low))
    return (low, high)

In [5]:
def create_columns(number_of_columns):
    """Create dictionary of column names and their metadata.

    Params:
        number_of_columns (int) - Number of columns to generate;
            must be non-negative

    Returns:
        {'<Column Name>': {'dtype': 'int'|'double',
                           'bounds': (<lower_bound>, <upper_bound>)}}
        An empty dictionary when number_of_columns is 0.

    Raises:
        ValueError - when number_of_columns is negative

    Details:
        Column names are 'col_<index>', where the zero-based index is
         zero-padded to the number of digits in number_of_columns.
        There is an equal probability of a column being marked
         either an int or a double.
    """
    if number_of_columns < 0:
        raise ValueError(
            "number_of_columns must be non-negative, got: {}"
            .format(number_of_columns))

    # Count the digits on the integer itself: unlike int(log10(n)) + 1
    #  this is defined for 0 and involves no floating point rounding.
    number_of_digits = len(str(number_of_columns))
    column_name_format = "col_{{:0{}d}}".format(number_of_digits)

    # Draw every dtype up front, then attach random bounds per column.
    dtypes = choices(population=['int', 'double'], k=number_of_columns)
    return {
        column_name_format.format(i):
        {'dtype': d,
         'bounds': get_random_column_bounds(d)}
        for i, d in enumerate(dtypes)}

In [6]:
def create_random_observation(column_metadata):
    """Create a Random Observation given Column Metadata.

    Params:
        column_metadata (dict):
            Any of the values from dictionary returned by
             create_columns, i.e. a dict with keys 'dtype'
             ('int' or 'double') and 'bounds' (<lower>, <upper>)

    Returns:
        A random value based on datatype and bounds defined
         in column metadata:
        - 'int': an integer between the bounds, both inclusive
        - 'double': lower bound plus a random fraction of the
            distance between the bounds

    Raises:
        AttributeError - when the 'dtype' is neither 'int' nor
         'double' (same error type as get_random_column_bounds())
    """
    dtype = column_metadata['dtype']
    # Fail loudly on unknown dtypes instead of falling through with
    #  no value to return.
    if dtype not in ('int', 'double'):
        raise AttributeError(
            "create_random_observation() not defined for datatype: {}"
            .format(dtype))

    lower_bound, upper_bound = column_metadata['bounds']
    if dtype == 'int':
        return randint(lower_bound, upper_bound)
    return lower_bound + (random() * (upper_bound - lower_bound))

In [7]:
def create_row_dict(columns):
    """Build one row as a plain dictionary of random values.

    Params:
        columns (dict):
            Dictionary as generated by create_columns()

    Returns:
        Dictionary keyed by column name, where each value is a fresh
         random observation from create_random_observation() for
         that column's metadata
    """
    row = {}
    for name, metadata in columns.items():
        row[name] = create_random_observation(metadata)
    return row

In [8]:
def create_dataframe(columns, num_rows):
    """Build a Spark DataFrame of num_rows random rows.

    Params:
        - columns (dict):
            Dictionary as generated by create_columns()
        - num_rows (int):
            Number of rows to create the Dataframe for

    Returns:
        Spark Dataframe

    Details:
        Uses the notebook-level `spark` session. A range of num_rows
         elements is parallelized and each element is mapped to a Row
         built from create_row_dict(columns); toDF() is called
         without an explicit schema.
    """
    def _random_row(_index):
        # The index only drives the row count; its value is unused.
        return Row(**create_row_dict(columns))

    row_indices = spark.sparkContext.parallelize(range(num_rows))
    return row_indices.map(_random_row).toDF()

In [None]:
# Metadata (dtype and bounds) for the 500 columns of the dataset.
columns_500 = create_columns(500)

# Single-shot alternative to the batched creation below (disabled):
# df_500 = create_dataframe(columns_500, 200000)
# df_500.write.parquet('../data/df_500', mode='overwrite')

############################################################
# If running on a system where spinning up a 4G java       #
# process is not an option, the dataset creation can be    #
# batched using the following by tweaking num_batches      #
############################################################
def batch_create(path, columns, num_rows, num_batches):
    """Write a random dataset to parquet at `path` in several batches.

    Params:
        - path (string):
            Parquet output location. The first batch written uses
             mode 'overwrite'; every later batch uses 'append'
        - columns (dict):
            Dictionary as generated by create_columns()
        - num_rows (int):
            Total number of rows to write across all batches
        - num_batches (int):
            Number of batches to split the rows into; expected to be
             a positive integer (0 raises ZeroDivisionError)

    Details:
        Rows are spread as evenly as possible: when num_rows is not a
         multiple of num_batches, the first (num_rows % num_batches)
         batches carry one extra row, so exactly num_rows rows are
         written. Batches that would hold no rows are skipped, so
         nothing is written when num_rows is 0.
    """
    batch_size, leftover_rows = divmod(num_rows, num_batches)

    write_mode = 'overwrite'
    for batch in range(num_batches):
        # Hand one leftover row to each of the first batches.
        rows_in_batch = batch_size + (1 if batch < leftover_rows else 0)
        if rows_in_batch <= 0:
            # Nothing to generate for this batch (num_rows < num_batches).
            continue
        batch_df = create_dataframe(columns, rows_in_batch)
        batch_df.write.parquet(path, mode=write_mode)
        # Only the first batch actually written may replace old data.
        write_mode = 'append'
        
# Generate 200000 rows for the 500 columns, written in 5 batches.
batch_create('../data/df_500', columns_500, 200000, 5)

In [None]:
# Sanity check: number of rows in the parquet dataset on disk.
spark.read.parquet("../data/df_500/").count()

In [None]:
# Sanity check: number of columns in the parquet dataset on disk.
len(spark.read.parquet("../data/df_500/").columns)

In [None]:
# Preview: pull at most 50 rows into a pandas DataFrame for display.
spark.read.parquet("../data/df_500/").limit(50).toPandas()