In [2]:
import os
# Options handed to the PySpark launcher through the PYSPARK_SUBMIT_ARGS
# environment variable (an earlier variant of this cell used --num-executors 8).
pyspark_submit_args = (
    '--master local[*] '
    '--executor-memory 2G '
    '--driver-memory 2G '
    '--num-executors 4 '
    'pyspark-shell'
)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

In [3]:
import pandas as pd
from tqdm import tqdm
import csv
import random
import string
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os,datetime

from typing import List

In [4]:
class NAMES :
    """Build the file paths of the four datasets (products, sellers, sales, orders).

    One directory holds the CSV files and another the Parquet files; the
    directory is created on demand whenever its paths are requested.
    """

    __DIR_CSV : str
    __DIR_PARQUET: str

    def __init__(self,csv='csv',parquet='parquet') :
        """Remember the CSV and Parquet output directories.

        Args:
            csv: directory that will contain the ``*.csv`` files.
            parquet: directory that will contain the ``*.parquet`` outputs.
        """
        self.__DIR_CSV = csv
        self.__DIR_PARQUET = parquet

    def __getnames(self,dir : str, ext: str) :
        """Ensure ``dir`` exists and return the four dataset paths inside it.

        Args:
            dir: target directory; missing parents are created as well.
            ext: file extension without the leading dot.

        Returns:
            Tuple ``(products, sellers, sales, orders)`` of paths in ``dir``.

        Raises:
            OSError: if the directory cannot be created (e.g. permission
                denied, or the path exists but is not a directory).
        """
        names : List[str] = ['products','sellers','sales','orders']
        if os.path.isdir(dir):
            print(f"{dir} already exists")
        else:
            # exist_ok guards against another process creating the directory
            # between the isdir() check and this call; genuine failures are
            # no longer misreported as "already exists".
            os.makedirs(dir, exist_ok=True)
        return tuple(os.path.join(dir, name + '.' + ext) for name in names)

    def getcsv(self) :
        """Return the ``(products, sellers, sales, orders)`` CSV paths."""
        return self.__getnames(self.__DIR_CSV,'csv')

    def getparquet(self) :
        """Return the ``(products, sellers, sales, orders)`` Parquet paths."""
        return self.__getnames(self.__DIR_PARQUET,'parquet')
                                                                  

In [5]:
class GENERATOR :
    """Generate the synthetic products / sellers / orders / sales datasets.

    The CSV files are produced with the standard ``random`` module, which is
    re-seeded in ``__init__`` so that a fresh instance followed by the same
    sequence of ``gen_*`` calls yields the same data. ``gen_parquet`` then
    converts the CSV files to Parquet through Spark.
    """

    letters : str
    product_ids : List[int]
    seller_ids : List[int]
    orders_ids : List[int]

    # Dataset sizes; lower PROD_NUMBER (e.g. to 7500) for a quick trial run.
    PROD_NUMBER = 7500000
    SELLER_NUMBER = 10
    TOTAL_ORDERS = 50000
    # Pending sales rows are flushed to disk once more than BUFFER are queued.
    BUFFER = 100
    # Upper bound (inclusive) on the number of sales lines drawn per order.
    MAX_ITEMS = 24

    productscsv : str
    sellerscsv : str
    salescsv : str
    orderscsv : str

    productsparquet : str
    sellersparquet : str
    salesparquet : str
    ordersparquet : str

    def __init__(self,N) :
        """Seed the RNG, build the id lists and resolve the output paths.

        Args:
            N: object exposing ``getcsv()`` and ``getparquet()``, each
                returning the four paths in the order
                (products, sellers, sales, orders).
        """
        random.seed(1999)
        # Character pool for the random bill text: the lowercase alphabet
        # repeated 2**10 times followed by the uppercase alphabet 10 times.
        self.letters = string.ascii_lowercase * 2**10 + string.ascii_uppercase * 10

        self.product_ids = list(range(self.PROD_NUMBER))
        self.seller_ids = list(range(self.SELLER_NUMBER))
        self.orders_ids = list(range(self.TOTAL_ORDERS))

        self.productscsv,self.sellerscsv,self.salescsv,self.orderscsv = N.getcsv()
        self.productsparquet,self.sellersparquet,self.salesparquet,self.ordersparquet = N.getparquet()

    def __random_string(self,stringLength=10):
        """Return ``stringLength`` characters sampled from ``self.letters``.

        Positions are drawn without replacement (``random.sample``), so
        ``stringLength`` may not exceed ``len(self.letters)``.
        """
        return ''.join(random.sample(self.letters, stringLength))

    def gen_products(self) :
        """Write the products CSV: one row per product id with a price in 1..150."""
        products = [
            [p, "product_{}".format(p), random.randint(1, 150)]
            for p in tqdm(self.product_ids)
        ]
        df = pd.DataFrame(products, columns=["product_id", "product_name", "price"])
        df.to_csv(self.productscsv, index=False)
        print("Done")

    def gen_sellers(self) :
        """Write the sellers CSV: one row per seller id with a random daily target."""
        sellers = [
            [s, "seller_{}".format(s), random.randint(12000, 2000000)]
            for s in tqdm(self.seller_ids)
        ]
        df = pd.DataFrame(sellers, columns=["seller_id", "seller_name", "daily_target"])
        df.to_csv(self.sellerscsv, index=False)
        print("Done")

    def gen_orders(self) :
        """Write the orders CSV: one row per order id with a random seller and date."""
        datestart = datetime.date(2021,1,1)
        # NOTE(review): randint(0, 365) is inclusive, so dates range from
        # 2021-01-01 up to and including 2022-01-01 -- confirm this is intended.
        orders = [
            [p, random.choice(self.seller_ids), datestart + datetime.timedelta(random.randint(0,365))]
            for p in tqdm(self.orders_ids)
        ]
        df = pd.DataFrame(orders, columns=["order_id", "seller_id", "date"])
        df.to_csv(self.orderscsv, index=False)
        print("Done")

    def __write_sales(self,df_array,mode) :
        """Write ``df_array`` (a list of rows) to the sales CSV.

        Args:
            df_array: rows to write.
            mode: file mode passed to ``open`` ('w' to start the file,
                'a' to append to it).
        """
        with open(self.salescsv, mode, newline='') as f:
            csv.writer(f).writerows(df_array)

    def gen_sales(self) :
        """Write the sales CSV: between 1 and MAX_ITEMS lines for every order.

        Rows are buffered and appended in batches so the whole dataset (each
        row carries a 500-character random text) never sits in memory at once.
        """
        header = [["order_id", "product_id", "num_pieces_sold", "bill_raw_text"]]
        self.__write_sales(header,'w')

        pending = []
        for order_id in tqdm(self.orders_ids):
            for _ in range(random.randint(1,self.MAX_ITEMS)) :
                pending.append([order_id, random.choice(self.product_ids), random.randint(1, 10), self.__random_string(500)])

            # The check runs once per order, so a batch can exceed BUFFER by
            # up to MAX_ITEMS rows.
            if len(pending) > self.BUFFER :
                self.__write_sales(pending,'a')
                pending = []

        # Flush whatever is left after the last order.
        self.__write_sales(pending,'a')

        print("Done")

    def __write_parquet(self,spark,csv_path,parquet_path) :
        """Read one CSV file with Spark, preview it and save it as Parquet.

        Args:
            spark: Spark session used for reading and writing.
            csv_path: source CSV file (with header; schema is inferred and
                malformed rows are dropped).
            parquet_path: Parquet destination, overwritten if present.
        """
        frame = spark.read.csv(csv_path, header=True, inferSchema=True,mode="DROPMALFORMED")
        frame.show()
        frame.write.parquet(parquet_path, mode="overwrite")

    def gen_parquet(self) :
        """Convert the four CSV datasets to Parquet.

        The active Spark session is reused (or created on first use), so this
        method can be called repeatedly in the same process; constructing a
        new SparkContext on every call fails the second time.
        """
        spark = pyspark.sql.SparkSession.builder.getOrCreate()
        conversions = (
            ("Products", self.productscsv, self.productsparquet),
            ("Sales", self.salescsv, self.salesparquet),
            ("Sellers", self.sellerscsv, self.sellersparquet),
            ("Orders", self.orderscsv, self.ordersparquet),
        )
        for label, csv_path, parquet_path in conversions:
            print(label)
            self.__write_parquet(spark, csv_path, parquet_path)
        print("Done")

        

In [6]:
class READDF:
    """Load the generated Parquet datasets and expose them as Spark temp views."""

    def __init__(self,N,spark=None) :
        """Store the path provider and, optionally, the Spark session to use.

        Args:
            N: object whose ``getparquet()`` returns the Parquet paths in the
                order (products, sellers, sales, ...).
            spark: Spark session used for reading; when omitted, the active
                session is obtained (or created) at read time.
        """
        self.N = N
        self.spark = spark

    def readdf(self) :
        """Read products, sales and sellers and register them as temp views.

        Returns:
            Tuple ``(products, sales, sellers)`` of Spark DataFrames, also
            registered as the temp views "products", "sales" and "sellers".
        """
        # getparquet() may return more than three paths (e.g. a trailing
        # orders path); only the first three are needed here.
        productsname,sellersname,salesname,*_ = self.N.getparquet()

        spark = self.spark
        if spark is None:
            # Do not depend on a notebook-level global named `spark`.
            spark = SparkSession.builder.getOrCreate()

        products = spark.read.parquet(productsname)
        sales = spark.read.parquet(salesname)
        sellers = spark.read.parquet(sellersname)
        products.createOrReplaceTempView("products")
        sales.createOrReplaceTempView("sales")
        sellers.createOrReplaceTempView("sellers")
        return (products,sales,sellers)

In [7]:
# Path provider for the CSV / Parquet outputs, then the seeded data generator
# (its constructor resolves both sets of paths, creating the directories).
N = NAMES(csv='/tmp/csv', parquet='/tmp/parquet')
G = GENERATOR(N=N)

/tmp/csv already exists
parquet already exists


In [None]:
# Write the four CSV datasets. The order is kept fixed because every step
# draws from the same seeded random stream.
for build_csv in (G.gen_products, G.gen_sellers, G.gen_orders, G.gen_sales):
    build_csv()

100%|██████████| 7500000/7500000 [00:18<00:00, 399086.87it/s]


Done


100%|██████████| 10/10 [00:00<00:00, 47339.77it/s]


Done


100%|██████████| 50000/50000 [00:00<00:00, 416233.72it/s]


Done


 46%|████▌     | 22991/50000 [01:07<01:09, 388.44it/s]

In [8]:
# Convert the generated CSV files to Parquet with Spark; a preview of each
# dataset is printed along the way.
G.gen_parquet()

Products
+----------+------------+-----+
|product_id|product_name|price|
+----------+------------+-----+
|         0|   product_0|   30|
|         1|   product_1|   91|
|         2|   product_2|   37|
|         3|   product_3|  145|
|         4|   product_4|  128|
|         5|   product_5|   66|
|         6|   product_6|  145|
|         7|   product_7|   51|
|         8|   product_8|   44|
|         9|   product_9|   53|
|        10|  product_10|   13|
|        11|  product_11|  104|
|        12|  product_12|  102|
|        13|  product_13|   24|
|        14|  product_14|   14|
|        15|  product_15|   38|
|        16|  product_16|   72|
|        17|  product_17|   16|
|        18|  product_18|   46|
|        19|  product_19|   94|
+----------+------------+-----+
only showing top 20 rows

Sales
+--------+----------+---------------+--------------------+
|order_id|product_id|num_pieces_sold|       bill_raw_text|
+--------+----------+---------------+--------------------+
|       0|    