In [1]:
from math import log10
from random import choices, randint, random

from pyspark.sql import Row, SparkSession

In [2]:
# Start (or reuse) the notebook's Spark session with a 'local[*]' master.
spark = (
    SparkSession.builder
    .appName('create_500_row_dataframe')
    .master('local[*]')
    # Optional: re-enable to request 4g of driver memory.
    # .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Bare expression so the session object is rendered as the cell output.
spark

In [3]:
# Global caps for the randomly generated column bounds in
#  get_random_column_bounds(): the *_INT pair limits non-binary 'int'
#  columns and the *_DOUBLE pair limits unscaled 'double' columns.
OVERALL_MIN_INT, OVERALL_MAX_INT = 0, 1000
OVERALL_MIN_DOUBLE, OVERALL_MAX_DOUBLE = 0.0, 1e5

In [4]:
def get_random_column_bounds(datatype):
    """Pick a random (lower, upper) value range for a column.

    Params:
        datatype (string) - Either 'int' or 'double'

    Returns:
        Tuple of (<lower bound>, <upper bound>)

    Raises:
        AttributeError - if datatype is neither 'int' nor 'double'

    Details:
        Each datatype has two flavours, selected with one call to
         random.choices():
        - 'int': 'binary' gives (0, 1); 'discrete' draws the lower
            bound between OVERALL_MIN_INT and half of
            OVERALL_MAX_INT, then the upper bound between that
            lower bound and OVERALL_MAX_INT
        - 'double': 'scaled' gives (-1.0, 1.0); 'real' draws the
            lower bound between OVERALL_MIN_DOUBLE and half of
            OVERALL_MAX_DOUBLE, then the upper bound between that
            lower bound and OVERALL_MAX_DOUBLE
    """
    if datatype == 'int':
        flavour = choices(['binary', 'discrete'])[0]
        if flavour == 'binary':
            return (0, 1)
        low = randint(OVERALL_MIN_INT, int(OVERALL_MAX_INT*0.5))
        high = randint(low, OVERALL_MAX_INT)
        return (low, high)

    if datatype == 'double':
        flavour = choices(['scaled', 'real'])[0]
        if flavour == 'scaled':
            return (-1.0, 1.0)
        # Width of the interval the lower bound may fall in.
        lower_span = (OVERALL_MAX_DOUBLE * 0.5) - OVERALL_MIN_DOUBLE
        low = OVERALL_MIN_DOUBLE + (random() * lower_span)
        high = low + (random() * (OVERALL_MAX_DOUBLE - low))
        return (low, high)

    raise AttributeError(
        "get_random_column_bounds() not defined for datatype: {}"
        .format(datatype))

In [5]:
def create_columns(number_of_columns):
    """Create dictionary of column names and datatypes.

    Params:
        number_of_columns (int) - Number of Columns to generate.
            Must be >= 0; 0 yields an empty dictionary.

    Returns:
        {'<Column Name>': {'dtype': 'int'|'double',
                           'bounds': (<lower_bound>, <upper_bound>)},
         ...}

    Raises:
        ValueError - if number_of_columns is negative

    Details:
        Column names are 'col_<index>' with the index starting at 0 and
         zero-padded to the number of digits in number_of_columns
         (e.g. 'col_000' .. 'col_499' for 500 columns).
        There is an equal probability of a column being marked
         either an int or a double.
    """
    if number_of_columns < 0:
        raise ValueError(
            "number_of_columns must be >= 0, got: {}"
            .format(number_of_columns))

    # Same width as int(log10(n)) + 1 for positive n, but also defined
    #  for n == 0 and free of floating point rounding.
    number_of_digits = len(str(number_of_columns))
    column_name_format = "col_{{:0{}d}}".format(number_of_digits)
    dtypes = choices(population=['int', 'double'],
                     k=number_of_columns)
    return {
        column_name_format.format(i):
        {'dtype': d,
         'bounds': get_random_column_bounds(d)}
        for i, d in enumerate(dtypes)}

In [6]:
def create_random_observation(column_metadata):
    """Create a Random Observation given Column Metadata.

    Params:
        column_metadata (dict):
            Any of the values from dictionary returned by
             create_columns, i.e. a dict with keys 'dtype'
             ('int' or 'double') and 'bounds'
             ((<lower_bound>, <upper_bound>))

    Returns:
        A random value based on datatype and bounds defined
         in column metadata:
        - 'int': an integer between the bounds, both inclusive
        - 'double': lower_bound plus a random fraction of the
            distance to upper_bound

    Raises:
        ValueError - if 'dtype' is neither 'int' nor 'double'
    """
    dtype = column_metadata['dtype']
    lower_bound, upper_bound = column_metadata['bounds']

    if dtype == 'int':
        return randint(lower_bound, upper_bound)
    if dtype == 'double':
        return lower_bound + (random() * (upper_bound - lower_bound))

    # Without this branch an unknown dtype surfaced as an
    #  UnboundLocalError on the return statement.
    raise ValueError(
        "create_random_observation() not defined for datatype: {}"
        .format(dtype))

In [7]:
def create_row_dict(columns):
    """Build one row of random values, keyed by column name.

    Params:
        columns (dict):
            Dictionary as generated by create_columns()

    Returns:
        Dictionary with one entry per column: the column name mapped
         to a value produced by create_random_observation() from that
         column's metadata
    """
    row = {}
    for name, metadata in columns.items():
        row[name] = create_random_observation(metadata)
    return row

In [8]:
def create_dataframe(columns, num_rows):
    """Build a Spark DataFrame of num_rows random rows.

    Params:
        - columns (dict):
            Dictionary as generated by create_columns()
        - num_rows (int):
            Number of rows to create the Dataframe for

    Returns:
        Spark Dataframe

    Details:
        Uses the module-level `spark` session. range(num_rows) is
         parallelized and every element is mapped to a Row built from
         create_row_dict(columns); the element itself is ignored.
    """
    def make_row(_index):
        # The index only determines how many rows exist, not their content.
        return Row(**create_row_dict(columns))

    row_indices = spark.sparkContext.parallelize(range(num_rows))
    random_rows = row_indices.map(make_row)
    return random_rows.toDF()

In [9]:
# Metadata (dtype and value bounds) for the 500 columns of the dataset.
columns_500 = create_columns(500)

# One-shot alternative to the batched write below (kept for reference):
# df_500 = create_dataframe(columns_500, 200000)
# df_500.write.parquet('../data/df_500', mode='overwrite')

############################################################
# If running on a system where spinning up a 4G Java       #
# process is not an option, the dataset can instead be     #
# created in batches with batch_create(); tune num_batches #
# to control how many rows are generated per write.        #
############################################################
def _split_into_batches(num_rows, num_batches):
    """Return the non-empty batch sizes, in order, that sum to num_rows."""
    base_size, leftover = divmod(num_rows, num_batches)
    # The first `leftover` batches take one extra row so that no row is lost.
    sizes = [base_size + 1 if index < leftover else base_size
             for index in range(num_batches)]
    return [size for size in sizes if size > 0]


def batch_create(path, columns, num_rows, num_batches):
    """Create a random dataset in batches and write it as parquet.

    Params:
        - path (string):
            Parquet output location
        - columns (dict):
            Dictionary as generated by create_columns()
        - num_rows (int):
            Total number of rows to write across all batches (>= 0)
        - num_batches (int):
            Maximum number of batches to split the rows into (>= 1)

    Returns:
        None

    Raises:
        ValueError - if num_batches < 1 or num_rows < 0

    Details:
        Rows are spread as evenly as possible: batch sizes differ by at
         most one and always add up to num_rows. Empty batches are
         skipped (so fewer than num_batches writes happen when
         num_rows < num_batches), because a zero-row batch gives
         create_dataframe() no rows to build a DataFrame from.
        The first batch overwrites whatever is at path; later batches
         append. When num_rows is 0 nothing is written and path is
         left untouched.
    """
    if num_batches < 1:
        raise ValueError(
            "num_batches must be >= 1, got: {}".format(num_batches))
    if num_rows < 0:
        raise ValueError(
            "num_rows must be >= 0, got: {}".format(num_rows))

    batch_sizes = _split_into_batches(num_rows, num_batches)
    for batch, batch_size in enumerate(batch_sizes):
        # Use the `columns` argument, not a notebook-level global.
        batch_df = create_dataframe(columns, batch_size)
        iter_mode = 'overwrite' if batch == 0 else 'append'
        batch_df.write.parquet(path, mode=iter_mode)
        
# Generate the 200000-row, 500-column dataset in 5 parquet writes.
batch_create(
    path='../data/df_500',
    columns=columns_500,
    num_rows=200000,
    num_batches=5,
)

In [10]:
# Sanity check: the row count should match the 200000 rows requested above.
spark.read.parquet("../data/df_500/").count()

200000

In [11]:
# Sanity check: the column count should match the 500 entries in columns_500.
len(spark.read.parquet("../data/df_500/").columns)

500

In [12]:
# Eyeball the data: convert up to 50 rows to a pandas DataFrame for display.
spark.read.parquet("../data/df_500/").limit(50).toPandas()

Unnamed: 0,col_000,col_001,col_002,col_003,col_004,col_005,col_006,col_007,col_008,col_009,...,col_490,col_491,col_492,col_493,col_494,col_495,col_496,col_497,col_498,col_499
0,1,0.905863,-0.358027,0.85592,1,22777.012599,847,0.420119,59458.151017,48531.587747,...,1,0.578925,57805.129384,57882.974428,75610.45052,472,0.017659,35645.892765,-0.119702,100
1,1,-0.716329,-0.433568,0.677467,0,18077.919345,184,0.898818,50386.847307,46267.788668,...,1,-0.290999,41717.495418,48082.246481,67150.341545,570,-0.817736,35297.310233,-0.688858,258
2,1,0.993869,0.692001,-0.016711,1,14475.938939,578,-0.620685,41416.291514,58704.915629,...,0,0.232067,18940.272596,72656.443095,43763.807799,926,-0.637249,34963.965518,-0.638401,356
3,0,-0.150916,-0.131692,0.07048,1,52643.546885,897,-0.749421,70657.5467,49048.936374,...,0,-0.305226,30399.118125,91595.079456,65735.465953,634,0.254985,36004.046666,0.335832,280
4,1,-0.599298,0.989905,0.672086,0,28582.835171,371,-0.722171,21627.957178,58401.934541,...,1,0.599199,49720.30911,78897.154001,12895.079207,825,0.726262,35885.241193,-0.994152,377
5,1,0.371638,-0.086629,-0.598051,0,79515.745853,520,-0.315437,84406.101795,48014.307375,...,0,0.936679,67700.990751,39949.693289,64989.226429,673,-0.295487,34937.643594,-0.935011,287
6,1,-0.310775,0.13726,0.126408,0,42210.496685,541,-0.437835,69143.959118,51699.312179,...,0,-0.981933,34213.234186,54552.951179,90896.294655,893,-0.779399,35158.202555,0.203566,99
7,0,-0.066862,-0.652302,-0.165722,1,38344.563615,661,-0.434044,25254.96221,55201.956618,...,1,0.413793,59024.993979,66246.363076,11698.651208,837,-0.476148,35544.166347,0.122339,335
8,0,0.274597,-0.305064,0.087287,1,15405.286484,239,-0.457087,67361.370091,48125.221254,...,1,-0.836537,53911.20457,65362.132354,28457.469457,815,0.138424,35556.690841,-0.683545,152
9,1,-0.370873,-0.718838,-0.751675,1,81032.664495,511,0.137925,21898.760888,46108.48737,...,1,-0.908981,37746.180161,87647.296068,28849.565554,684,-0.153722,34953.541058,0.928781,84
