# Fake Data Generator

## Imports

In [75]:
import findspark
findspark.init()

In [76]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("fake_data_generator").master("local[*]").getOrCreate()
sc = spark.sparkContext

In [160]:
import random, string
from pyspark.sql import Row
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import rand

## Fake Data Generator Functions

In [167]:
def get_rand_int(lower_bound = 0, upper_bound = 150):
    return random.randint(lower_bound, upper_bound)

def get_rand_str(str_length = 10):
    return ''.join(random.choice(string.ascii_letters) for _ in range(str_length))

def get_rand_float(lower_bound = 0, upper_bound = 1.0):
    return random.uniform(lower_bound, upper_bound)

In [176]:
COLUMNS_CONFIG_DICT = {"rand_str":{"generator_function":get_rand_str, "sql_type": StringType()}, 
                       "rand_int":{"generator_function":get_rand_int, "sql_type": IntegerType()},
                       "rand_float":{"generator_function":get_rand_float, "sql_type": FloatType()}}

In [177]:
def get_column_generators(columns_config):
    return [(COLUMNS_CONFIG_DICT[config_dict["generator_function"]]["generator_function"],
             config_dict["config"]) for name, config_dict in columns_config]

def get_data_rdd(data_length, columns_config):
    return sc.parallelize(range(1, data_length+1)).map(lambda idx: tuple([idx] + [f(**config) for f, config in get_column_generators(columns_config)]))

def get_schema(columns_config):
    return StructType([StructField("id", StringType(), True)] + 
                      [StructField(name, COLUMNS_CONFIG_DICT[config_dict["generator_function"]]["sql_type"], True) for name, config_dict in columns_config])

def get_data_df(data_length, columns_config):
    return spark.createDataFrame(get_data_rdd(data_length, columns_config), get_schema(columns_config)).orderBy(rand())

In [180]:
data_length = 100

columns_config1 = [("name", {"generator_function":"rand_str", "config":{"str_length":25}}),
                   ("age", {"generator_function":"rand_int", "config":{"lower_bound":3, "upper_bound":25}})]

columns_config2 = [("balance", {"generator_function":"rand_float", "config":{"lower_bound":0, "upper_bound":1000}})]

In [183]:
df1 = get_data_df(data_length, columns_config1)
df2 = get_data_df(data_length, columns_config2)

In [184]:
df1.show(25)

+---+--------------------+---+
| id|                name|age|
+---+--------------------+---+
| 13|IDqPIyzqbHaZiqtPZ...| 25|
| 61|qjaHDqzFytaLxircK...|  6|
| 14|uhFuJZeCKcQiXGWbR...| 15|
| 75|hqMTlfiCbgLWQdmJX...| 16|
| 32|jDIjaRJwyMvPYVEAz...| 10|
| 35|oPSJyCXunbSyTwdrX...| 23|
| 62|HItIbSxzuGWykLQxD...| 17|
| 66|aqqCuUhwvvMhXSwgz...| 22|
| 87|HItIbSxzuGWykLQxD...| 17|
| 59|CAfIzyyiouQQUoHJO...| 10|
| 46|adjEgZuTqzvPweUho...| 24|
| 67|MreJYLmAQMVDOqBIV...| 15|
| 29|SntRqTjCvqYsALNmh...| 16|
| 41|aqqCuUhwvvMhXSwgz...| 22|
| 17|MreJYLmAQMVDOqBIV...| 15|
| 12|HItIbSxzuGWykLQxD...| 17|
| 30|ZvIvtddTuqBHYkScf...| 20|
| 73|czboCtKlJSnBipqIl...| 22|
| 11|qjaHDqzFytaLxircK...|  6|
|  4|SntRqTjCvqYsALNmh...| 16|
| 49|ZFigsvqGqvglqStxy...|  3|
|  9|CAfIzyyiouQQUoHJO...| 10|
| 40|IJxGWguEoxmhPYJyG...| 20|
| 83|FAxokPllKTICPjsYm...|  3|
| 18|JCnPUTNDNcdsBLgZQ...| 15|
+---+--------------------+---+
only showing top 25 rows



In [185]:
df2.show(25)

+---+---------+
| id|  balance|
+---+---------+
| 72| 747.4107|
| 51|748.49664|
|  8| 909.6728|
|  9|379.19287|
| 50|237.09807|
| 25|237.09807|
|  6| 937.0201|
| 35| 529.8583|
| 89| 889.9851|
| 80|340.68436|
| 60| 529.8583|
| 86| 661.9113|
| 74| 468.6085|
| 12| 406.9189|
| 49| 468.6085|
| 43|346.36746|
| 20|343.05225|
| 87| 406.9189|
| 77|249.29778|
| 58| 909.6728|
| 11| 661.9113|
| 95|343.05225|
| 21|10.488886|
|  2|249.29778|
| 36| 661.9113|
+---+---------+
only showing top 25 rows



In [190]:
final_df = df1.join(df2, "id", "inner")

In [191]:
final_df.show(100)

+---+--------------------+---+---------+
| id|                name|age|  balance|
+---+--------------------+---+---------+
| 51|VApiYcvbmVTyOHPQA...| 14|748.49664|
|  7|jDIjaRJwyMvPYVEAz...| 10| 738.0561|
| 15|IJxGWguEoxmhPYJyG...| 20| 901.6078|
| 54|SntRqTjCvqYsALNmh...| 16|791.04425|
| 11|qjaHDqzFytaLxircK...|  6| 661.9113|
| 29|SntRqTjCvqYsALNmh...| 16|791.04425|
| 69|xlYPjLbjsHQZrqUFK...|  7|484.66693|
| 42|MreJYLmAQMVDOqBIV...| 15|700.56085|
| 73|czboCtKlJSnBipqIl...| 22| 215.9689|
| 87|HItIbSxzuGWykLQxD...| 17| 406.9189|
| 64|uhFuJZeCKcQiXGWbR...| 15| 889.9851|
|  3|fbGIAVRRyeXzcfNxf...| 15|140.59113|
| 30|ZvIvtddTuqBHYkScf...| 20|340.68436|
| 34|CAfIzyyiouQQUoHJO...| 10|379.19287|
| 59|CAfIzyyiouQQUoHJO...| 10|379.19287|
|  8|FAxokPllKTICPjsYm...|  3| 909.6728|
| 22|MjsDIXnLviqRwxxfu...| 19| 747.4107|
| 28|fbGIAVRRyeXzcfNxf...| 15|140.59113|
| 85|oPSJyCXunbSyTwdrX...| 23| 529.8583|
| 16|aqqCuUhwvvMhXSwgz...| 22|266.98596|
| 35|oPSJyCXunbSyTwdrX...| 23| 529.8583|
| 52|AFNvMayVkns

## Fake Data Generator Class [UNDER CONSTRUCTION]

In [None]:
import random, string
from pyspark.sql import Row
import numpy as np
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType


def get_rand_int(self, lower_bound = 0, upper_bound = 150):
    return random.randint(lower_bound, upper_bound)

def get_rand_str(self, str_length = 10):
    return ''.join(random.choice(string.ascii_letters) for _ in range(str_length))

class FakeDataGenerator():
    
    def __init__(self, spark, sc, length):
        self.length = length
        self.spark = spark
        self.sc = sc

#     def get_rand_int(self, lower_bound = 0, upper_bound = 150):
#         return random.randint(lower_bound, upper_bound)

#     def get_rand_str(self, str_length = 10):
#         return ''.join(random.choice(string.ascii_letters) for _ in range(str_length))
    
#     def set_column_generators(self):
#         self.column_generators = [self.get_rand_int, self.get_rand_str]
    
#     def get_data_rdd(self):
#         return sc.parallelize(range(1, self.length+1)).map(lambda idx: tuple([idx] + [f() for f in self.column_generators]))

    @staticmethod
    def my_random(self):
        return random.randint
    
    def get_data_rdd(self):
        return self.sc.parallelize(range(1, self.length+1)).map(lambda idx: tuple([idx] + [self.my_random(1, 10)]))
    
    def set_schema(self):
        self.schema = StructType([StructField("id", IntegerType(), True),
                                  StructField("name", FloatType(), True)])
    
    def get_data_df(self):
#         self.set_column_generators()
        self.set_schema()
        return self.spark.createDataFrame(self.get_data_rdd(), self.schema)
        
        

In [None]:
myDataGenerator = FakeDataGenerator(spark, sc, 10)

In [None]:
myDataGenerator.get_data_df().show(5)

In [None]:
random.random()