In [0]:
%pip install dbldatagen

In [0]:
%pip install Faker

In [0]:
import random
import re
from string import ascii_letters

import dbldatagen as dg
import numpy as np
import pyspark.sql.functions as F
import requests
from bs4 import BeautifulSoup as bs
from faker import Faker
from pyspark.sql.functions import col, date_format
from pyspark.sql.types import FloatType, IntegerType, StringType,TimestampType, DecimalType

In [0]:
def create_random_string(num_characters: int) -> str:
    """Return a string of `num_characters` ASCII letters drawn at random.

    Letters are picked with replacement from `string.ascii_letters`
    (upper and lower case), so characters may repeat.
    """
    picked_letters = random.choices(ascii_letters, k=num_characters)
    return ''.join(picked_letters)

# Draw one random length (10 to 14 characters) for each of the demo strings.
sample_size = 10
list_string_lengths = [random.randrange(10, 15, 1) for _ in range(sample_size)]

# Build one random string per drawn length, keeping the same order as the lengths.
random_strings = [create_random_string(length) for length in list_string_lengths]

print(random_strings)

In [0]:
def create_random_username(num_characters: int, username_base: str) -> str:
    """Build a username of the form ``<base without spaces>_<random letters>``.

    Args:
        num_characters: Number of random ASCII letters in the suffix.
        username_base: Text used as the prefix; every space character is removed.

    Returns:
        The space-free base, an underscore, and the random suffix.
    """
    suffix = ''.join(random.choices(ascii_letters, k=num_characters))
    prefix = username_base.replace(' ', '')
    return prefix + "_" + suffix

# Create UDF to use in PySpark DataFrame column creation.
# create_random_username draws random letters, so the UDF is flagged as
# non-deterministic: Spark otherwise assumes UDFs are deterministic and may
# eliminate or duplicate invocations while optimising the query plan.
usernames_udf = F.udf(create_random_username, StringType()).asNondeterministic()

In [0]:
# Function to hit the MIT word list page and return its words as a list.
def get_word_list() -> list:
    """Download the MIT word list and return its unique, capitalised words.

    Returns:
        A list of str, sorted alphabetically. Each word is decoded from the
        response bytes, stripped of surrounding whitespace and capitalised.
        Blank lines are dropped and duplicates are removed after
        capitalisation, so the result never contains repeated entries.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    word_site = "https://www.mit.edu/~ecprice/wordlist.10000"

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(word_site, timeout=30)
    # Fail loudly instead of treating an HTML error page as a word list.
    response.raise_for_status()

    words = {
        raw_word.decode().strip().capitalize()
        for raw_word in response.content.splitlines()
    }
    words.discard("")

    return sorted(words)

# Number of words to show in the preview below.
sample_size = 10

# Fetch the vocabulary, then randomise its order in place.
words_unique = get_word_list()
random.shuffle(words_unique)

# The head of the shuffled list is a random sample of `sample_size` words.
sample_words = words_unique[0:sample_size]
print(sample_words)

In [0]:
# Documentation at https://restcountries.com/
import requests
from pprint import pprint

def get_countries() -> list:
    """Fetch the common name of every country from the REST Countries API.

    Returns:
        A list of str holding the ``name.common`` value of each entry in the
        JSON response, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    countries_site = "https://restcountries.com/v3.1/all?fields=name"

    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(countries_site, timeout=30)
    # Surface HTTP errors directly rather than failing later while indexing
    # into an error payload.
    response.raise_for_status()

    return [country['name']['common'] for country in response.json()]

# Fetch all country names and show them alphabetically
# (sorted() returns a copy, so countries_list keeps the API order here).
countries_list = get_countries()
pprint(sorted(countries_list))

# Number of countries to show in the random preview.
sample_size = 10

# Randomise the list in place; its head is then a random sample.
random.shuffle(countries_list)
sample_countres = countries_list[0:sample_size]
print(sample_countres)

In [0]:
#https://realpython.com/api-integration-in-python/

def get_string_from_api(page_id: int) -> str:
    """Return the title of one todo item from the JSONPlaceholder API.

    Args:
        page_id: Zero-based index of the item; the API is queried for todo
            number ``page_id + 1``.

    Returns:
        The ``title`` field of the JSON response.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    # Use the argument itself, not a loop variable from the calling cell, so
    # the function works no matter what the caller names its loop index.
    api_url = f"https://jsonplaceholder.typicode.com/todos/{page_id + 1}"
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    return response.json()['title']

# Collect the titles of todos 1-10 from the placeholder API, one request per item.
api_words = []
# NOTE(review): `i` is left as a module-level loop variable on purpose; confirm
# that get_string_from_api does not rely on it before turning this loop into a
# comprehension.
for i in range(10):
    api_words.append(get_string_from_api(i))
print(api_words)

In [0]:
def web_scrape_japanese(page_number: int, list_japanese: list) -> list:
    """Scrape one jisho.org JLPT-N5 results page and append its words.

    Args:
        page_number: Page of the search results to request.
        list_japanese: List to extend in place with the words that were found.

    Returns:
        The same ``list_japanese`` object, extended with the stripped text of
        every element matched by the word selector on that page.

    Raises:
        requests.HTTPError: If the site answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    page_url = f"https://jisho.org/search/%23jlpt-n5%20%23words?page={page_number}"

    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(page_url, timeout=30)
    # Without this check an error page would be parsed as "zero words found".
    r.raise_for_status()

    soup = bs(r.content, features="html.parser")
    word_spans = soup.select(
        'div.concepts div.concept_light-representation span.text')
    list_japanese.extend(span.text.strip() for span in word_spans)
    return list_japanese

# Scrape result pages one after another until at least `sample_size` words
# have been collected.
list_japanese = []
page_number = 1
sample_size = 25
while len(list_japanese) < sample_size:
    # web_scrape_japanese extends the list in place, so capture the size first.
    words_before = len(list_japanese)
    list_japanese = web_scrape_japanese(page_number, list_japanese)
    if len(list_japanese) == words_before:
        # The page contributed nothing (no more results, or the page layout
        # changed); stop here instead of requesting pages forever.
        break
    page_number += 1
print(list_japanese)

In [0]:
# Single Faker generator reused for every name created in this notebook.
fake = Faker()


def create_faker_name():
    """Return one full name produced by the module-level Faker generator."""
    full_name = fake.name()
    return full_name


# Pre-generate the pool of names that later cells look up by position.
sample_size = 100
list_names = [create_faker_name() for _ in range(sample_size)]
print(list_names)

def faker_udf_func(list_names: list, person_id: int):
    """Pick the name that belongs to `person_id` from `list_names`.

    Args:
        list_names: Pool of names to choose from.
        person_id: Integer id of the row; used as an index into the pool.

    Returns:
        ``list_names[person_id]`` for every id that is a valid index. Ids
        beyond the end of the pool wrap around (modulo the pool size) so a
        dataset with more rows than names reuses names instead of raising
        IndexError. Returns None when the pool is empty or the id is None.
    """
    if not list_names or person_id is None:
        return None
    # For any valid index (including negative ones) the modulo leaves the
    # selected element unchanged; it only affects out-of-range ids.
    return list_names[person_id % len(list_names)]

# Spark UDF wrapper around faker_udf_func; the result column is a string.
faker_udf = F.udf(f=faker_udf_func, returnType=StringType())

In [0]:
def generate_unique_names(num_names):
    """Return up to `num_names` distinct first names in random order.

    Args:
        num_names: How many names are wanted. Values above the size of the
            built-in pool are capped at the pool size; zero or negative
            values yield an empty list.

    Returns:
        A new list of unique names drawn from the built-in sample pool.
    """
    # List of sample names
    sample_names = ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Henry', 'Ivy', 'Jack']

    # Clamp the request to what the pool can supply. The lower bound matters:
    # a negative count used in a slice would silently drop names from the end
    # rather than return nothing.
    num_names = max(0, min(num_names, len(sample_names)))

    # random.sample draws without replacement, so the names are unique.
    return random.sample(sample_names, num_names)

# Draw ten unique names and show them.
result = generate_unique_names(num_names=10)
print(result)


In [0]:
# Example Dimension

min_number = 0
partitions_requested = 4

data_rows = 100

# Base dataset: an id column plus a country picked at random from countries_list.
dataspec = (dg.DataGenerator(spark, name="dataset",  rows=data_rows, partitions=partitions_requested)
            .withIdOutput()
            .withColumn("Country", "string", values=countries_list, random=True)
            )

df = (dataspec.build().cache())

# Build the array-of-names literal element by element. F.lit() only accepts a
# Python list from PySpark 3.4 onwards, whereas F.array() over individual
# literals produces the same column on older versions too.
# faker_udf looks names up by `id`, so list_names should hold at least
# `data_rows` entries.
names_array = F.array(*[F.lit(name) for name in list_names])

df = df.withColumn("Full Name", faker_udf(names_array, df["id"]))
# Username = full name without spaces + "_" + 5 random letters.
df = df.withColumn("Username", usernames_udf(F.lit(5), df["Full Name"]))

display(df)