In [24]:
# Install the Spark, synthetic-data and fake-data packages this notebook imports (pyspark, dbldatagen, faker).
%pip install pyspark dbldatagen Faker

Note: you may need to restart the kernel to use updated packages.


In [15]:
# Install supporting packages. numpy and requests are imported directly by this notebook;
# jmespath, pandas and pyparsing are not imported in the visible cells - presumably they are
# runtime requirements of the libraries installed above; TODO confirm.
%pip install jmespath numpy pandas requests pyparsing

Collecting pyparsing
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pyparsing-3.2.0-py3-none-any.whl (106 kB)
Installing collected packages: pyparsing
Successfully installed pyparsing-3.2.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
from pyspark.sql.types import FloatType, IntegerType, StringType,TimestampType, DecimalType
from pyspark.sql.functions import col, date_format
import pyspark.sql.functions as F
import dbldatagen as dg
import re
import random
import requests
from string import ascii_letters
import numpy as np
from pprint import pprint

In [17]:
def create_random_string(num_characters: int) -> str:
    """Build a random string of ASCII letters.

    Args:
        num_characters: Number of letters to draw (with replacement).

    Returns:
        A string of ``num_characters`` characters taken from ``string.ascii_letters``.
    """
    drawn_letters = random.choices(ascii_letters, k=num_characters)
    return ''.join(drawn_letters)

# Preview: draw `sample_size` random lengths between 10 and 14, then build one random
# string per length (all lengths are drawn before any string is generated).
sample_size = 10
list_string_lengths = [random.randrange(10, 15, 1) for _ in range(sample_size)]
random_strings = [create_random_string(length) for length in list_string_lengths]

print(random_strings)

['CtEdmRVDrtqsF', 'JEvVTlzNQgAATu', 'zrwzWQKwsyLCg', 'ycfTAhchOILsNR', 'brLYCjNvtgGp', 'NfZTLCVECRWSn', 'LnfrGLOdHa', 'djCgsZHrDvHj', 'rLROIysqzHIq', 'crwqCPAagKCp']


In [18]:
def create_random_username(num_characters: int, username_base: str) -> str:
    """Derive a username from a display name plus a random letter suffix.

    Args:
        num_characters: Length of the random ASCII-letter suffix.
        username_base: Name to build on; every space character is removed.

    Returns:
        ``"<username_base without spaces>_<random suffix>"``.
    """
    suffix = ''.join(random.choices(ascii_letters, k=num_characters))
    compact_base = username_base.replace(' ', '')
    return f"{compact_base}_{suffix}"

# Create UDF to use in PySpark DataFrame column creation.
# create_random_username draws from `random`, so the UDF is marked non-deterministic:
# Spark treats UDFs as deterministic by default and may otherwise eliminate or repeat
# invocations while optimizing the query.
usernames_udf = F.udf(create_random_username, StringType()).asNondeterministic()

In [None]:
# Function to hit the MIT word list page, retrieving 10000 words as a list.
def get_word_list() -> list[str]:
    """Download the MIT word list and return its unique words, capitalized.

    Returns:
        The distinct lines of the downloaded list, decoded to ``str`` and
        capitalized, in the sorted order produced by ``np.unique``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or when the timeout expires.
    """
    word_site = "https://www.mit.edu/~ecprice/wordlist.10000"

    # Bound the wait and fail loudly, so an error page is never parsed as vocabulary.
    response = requests.get(word_site, timeout=30)
    response.raise_for_status()

    # np.unique both de-duplicates and sorts the raw byte lines.
    raw_words = np.unique(np.array(response.content.splitlines()))
    return [word.decode().capitalize() for word in raw_words]

# Fetch the word list once for this cell.
words_unique = get_word_list()

# Number of words to show in the preview.
sample_size = 10

# random.shuffle reorders words_unique in place, so the list stays shuffled afterwards;
# the preview is simply the first `sample_size` entries of the shuffled list.
random.shuffle(words_unique)
sample_words = words_unique[:sample_size]
print(sample_words)

['Theoretical', 'Funding', 'Voice', 'Allows', 'Douglas', 'Tee', 'Counter', 'Posing', 'Andrew', 'Religious']


In [20]:
# Documentation at https://restcountries.com/
def get_countries() -> list[str]:
    """Fetch the common name of every entry returned by the REST Countries API.

    Returns:
        One ``name.common`` string per element of the JSON response, in response order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or when the timeout expires.
    """
    countries_site = "https://restcountries.com/v3.1/all?fields=name"

    # Bound the wait and surface HTTP errors instead of indexing into an error payload.
    response = requests.get(countries_site, timeout=30)
    response.raise_for_status()

    return [country['name']['common'] for country in response.json()]

# Fetch the country names once for this cell.
countries_list = get_countries()

# Print the full list alphabetically; sorted() returns a new list, so countries_list
# itself is not reordered by this call.
pprint(sorted(countries_list))

# Number of countries to show in the preview.
sample_size = 10

# random.shuffle reorders countries_list in place, so it stays shuffled afterwards;
# the preview is the first `sample_size` entries of the shuffled list.
random.shuffle(countries_list)
sample_countres = countries_list[:sample_size]
print(sample_countres)

['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Cape Verde',
 'Caribbean Netherlands',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos (Keeling) Islands',
 'Colombia',
 'Comoros',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Curaçao',
 'Cyprus',
 'Czechia',
 'DR Congo',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia'

In [21]:
#https://realpython.com/api-integration-in-python/

def get_string_from_api(page_id: int) -> str:
    """Return the title of one JSONPlaceholder todo item.

    Args:
        page_id: Index of the item to fetch; the todo requested is ``page_id + 1``.

    Returns:
        The ``title`` field of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or when the timeout expires.
    """
    # Build the URL from the argument (not from any notebook-level loop variable) so the
    # result depends only on what the caller passes in.
    api_url = f"https://jsonplaceholder.typicode.com/todos/{page_id + 1}"
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    return response.json()['title']

# Collect the titles returned for page ids 0-9, one HTTP request per id.
api_words = []
for i in range(10):
    api_words.append(get_string_from_api(i))
print(api_words)

['delectus aut autem', 'quis ut nam facilis et officia qui', 'fugiat veniam minus', 'et porro tempora', 'laboriosam mollitia et enim quasi adipisci quia provident illum', 'qui ullam ratione quibusdam voluptatem quia omnis', 'illo expedita consequatur quia in', 'quo adipisci enim quam ut ab', 'molestiae perspiciatis ipsa', 'illo est ratione doloremque quia maiores aut']


In [25]:
from faker import Faker

# Single Faker generator instance; create_faker_name() reads it as a module-level global.
fake = Faker()

def create_faker_name():
    """Return one name produced by the module-level ``fake`` generator's ``name()``."""
    generated_name = fake.name()
    return generated_name

# Pre-generate the pool of fake names, one create_faker_name() call per entry.
sample_size = 100
list_names = [
    create_faker_name()
    for _ in range(sample_size)
]
print(list_names)

def faker_udf_func(list_names: list[str], person_id: int):
    """Pick the name for a row id from a pre-generated pool of names.

    Args:
        list_names: Pool of names to choose from.
        person_id: Row identifier used as the index into ``list_names``.

    Returns:
        ``list_names[person_id]`` for ids inside the pool. Ids past the end wrap
        around (modulo the pool size), so generating more rows than names reuses
        names instead of raising ``IndexError``. ``None`` when the id is null or the
        pool is empty.
    """
    if person_id is None or not list_names:
        return None
    # Modulo keeps in-range ids (including negative indices down to -len) unchanged.
    return list_names[person_id % len(list_names)]

# Wrap the name lookup as a Spark UDF that produces string values.
faker_udf = F.udf(faker_udf_func, returnType=StringType())

['Benjamin Henry', 'Jessica Ward', 'Jennifer Garrett', 'Kevin Burch', 'Ian Daniel', 'Sean Marshall', 'Michael Johnson', 'Gregory Kelley', 'Deborah Powell', 'James Brown', 'Sharon Obrien', 'Sara Young', 'Jose Weiss', 'Jacob Spears', 'Robert Kelly', 'Jennifer Perez', 'Deborah Richardson', 'David Choi', 'Gary Mcguire', 'Jeremy Gallagher', 'Kristen Carter', 'Rhonda Odom', 'Toni Harvey', 'Douglas Hernandez', 'Mary Montgomery', 'James Kidd', 'William Riggs', 'Tina Fitzgerald', 'Ryan Evans', 'Jasmine Garcia', 'Taylor Orr', 'Robert Young', 'Anna Miller', 'John Lopez', 'Ian Smith', 'Gregory Lawson', 'Jonathan Kirk', 'Melvin Rodriguez', 'Sarah Thomas', 'David Bonilla', 'Jose Moore', 'Heather Poole', 'Kendra Winters', 'Elizabeth Hopkins', 'Laura Robles', 'Brian James', 'Teresa Potter', 'Carol Becker', 'Dr. Stacey Matthews', 'Michael Torres', 'Michelle Watkins', 'Linda Gonzalez', 'Nancy Hanson', 'Sherri Martin', 'Hannah Stephens', 'Timothy Franklin', 'Anthony Gibbs', 'Jonathan Moran', 'Karen Garci

In [26]:
def get_bored_activity():
    """Fetch one activity suggestion from the Bored API, best-effort.

    Returns:
        The ``activity`` field of the JSON response ("No activity found" when the
        field is absent), or ``None`` when the status code is not 200 or any error
        occurs. Failures are reported with ``print`` rather than raised.
    """
    # Bored API endpoint
    api_url = "https://www.boredapi.com/api/activity"

    try:
        # Bound the wait so an unresponsive server cannot hang the notebook.
        response = requests.get(api_url, timeout=10)

        # Anything other than 200 is reported and treated as "no activity".
        if response.status_code != 200:
            print(f"Error: Unable to fetch activity. Status code: {response.status_code}")
            return None

        # Parse the JSON response and extract the activity.
        return response.json().get("activity", "No activity found")
    except Exception as e:
        # Deliberately broad: this helper never raises, it only reports the problem.
        print(f"An error occurred: {e}")
        return None

# Example usage: prints the fetched activity, or None when the request did not succeed.
bored_activity = get_bored_activity()
print(bored_activity)


Error: Unable to fetch activity. Status code: 503
None


In [28]:
# Example Dimension
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Generation settings.
min_number = 0
data_rows = 100
partitions_requested = 4

# Base spec: an id column plus a Country value drawn at random from countries_list.
dataspec = (
    dg.DataGenerator(spark, name="dataset", rows=data_rows, partitions=partitions_requested)
    .withIdOutput()
    .withColumn("Country", "string", values=countries_list, random=True)
)

# Build and cache the generated rows, then derive the name columns from them:
#   Full Name - entry of list_names selected by the row id
#   Username  - Full Name without spaces plus a 5-letter random suffix
df = (
    dataspec.build()
    .cache()
    .withColumn("Full Name", faker_udf(F.lit(list_names), F.col("id")))
    .withColumn("Username", usernames_udf(F.lit(5), F.col("Full Name")))
)

display(df)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/10 16:29:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame[id: bigint, Country: string, Full Name: string, Username: string]