# Generating raw dataset using Python Faker Library.📊🐍

Importing all the necessary libraries..

In [1]:
import pandas as pd
from faker import Faker
import random
import numpy as np
import sqlite3

Creating a Faker class object and seeding the random generators so runs are repeatable..

In [None]:
# Single Faker instance used by the generator functions defined below.
fake = Faker()
# Seed Faker and the stdlib `random` module with the same fixed value so that
# repeated runs are expected to produce the same synthetic data.
# NOTE(review): NumPy's global generator is not seeded here — confirm whether
# any later `np.random` call needs to be reproducible.
Faker.seed(42)
random.seed(42)

Define constants like number of rows and noisy rows in your dataset..

In [None]:
# Dataset sizing: rows to generate, and the number of rows earmarked as noisy.
num_rows = 300_000
noise_rows = 50_000

# Closed vocabularies that the categorical columns draw from.
product_categories = [
    "Electronics",
    "Clothing",
    "Books",
    "Toys",
    "Groceries",
]
payment_methods = [
    "Credit Card",
    "Debit Card",
    "UPI",
    "Cash",
    "Bank Transfer",
]

# Probability (3%) that a nullable field is replaced by None.
null_chance = 0.03

Helper function that randomly replaces a value with `None` (`NULL`) in your dataset..

In [5]:
def maybe_null(value, null_probability=null_chance):
    """Return *value* unchanged, or ``None`` to simulate a missing field.

    One ``random.random()`` draw is made per call; the value is kept only
    when that draw is strictly greater than ``null_probability``.
    """
    if random.random() > null_probability:
        return value
    return None

Function that generates one row with all the necessary columns filled with dummy data, followed by the code that builds the full dataset from it..

In [6]:
def generate_row():
    """Build one synthetic invoice record as a plain dict.

    Most fields are passed through ``maybe_null`` and may therefore come back
    as ``None``. The gender, both e-mail fields, ``inventory_id`` and
    ``supplier_id`` are never nulled. ``payment_id`` is derived from the
    (possibly null) ``payment_method`` and is always the last key.

    The sections are evaluated in the same top-to-bottom order as the keys
    appear in the result, so the sequence of random draws is fixed.
    """
    nullable = maybe_null

    invoice = {
        "invoice_id": nullable(fake.uuid4()),
        "invoice_date": nullable(
            fake.date_between(start_date="-2y", end_date="today")
        ),
    }

    product = {
        "product_id": nullable(fake.uuid4()),
        "product_name": nullable(
            f"{fake.word().capitalize()} {random.choice(product_categories)}"
        ),
        "product_category": nullable(random.choice(product_categories)),
        "price": nullable(round(random.uniform(1, 10000), 2)),
        "quantity": nullable(random.randint(1, 200)),
    }

    payment = {
        "payment_method": nullable(random.choice(payment_methods)),
    }

    customer = {
        "customer_id": nullable(fake.uuid4()),
        "customer_name": nullable(fake.name()),
        "customer_age": nullable(random.randint(18, 80)),
        "customer_gender": random.choice(["Male", "Female", "Other"]),
        "customer_email": fake.email(),
        "city": nullable(fake.city()),
        "country": nullable(fake.country()),
    }

    inventory = {
        "inventory_id": fake.uuid4(),
        "stock_available": nullable(random.randint(0, 100)),
        "restock_date": nullable(
            fake.date_between(start_date="today", end_date="+60d")
        ),
    }

    supplier = {
        "supplier_id": fake.uuid4(),
        "supplier_name": nullable(fake.company()),
        "supplier_email": fake.company_email(),
        "supplier_phone": nullable(fake.msisdn()),
    }

    # Merging keeps the key order: invoice, product, payment, customer,
    # inventory, supplier.
    row = {**invoice, **product, **payment, **customer, **inventory, **supplier}

    # The payment identifier format depends on how the invoice was paid.
    method = row["payment_method"]
    if method == "UPI":
        payment_id = fake.bothify(text="??????").upper()
    elif method in ("Credit Card", "Debit Card"):
        payment_id = fake.credit_card_number(card_type=None)[:12]
    else:
        # Cash, Bank Transfer, or a nulled payment method.
        payment_id = "N/A"
    row["payment_id"] = payment_id

    return row

# Materialise `num_rows` synthetic records, then load them into a DataFrame.
data = [generate_row() for _row_number in range(num_rows)]
df = pd.DataFrame(data=data)

Define a helper that injects some noise into a row, so there is something to clean up during further pre-processing..

In [7]:
def inject_realistic_noise(row):
    """Corrupt one randomly chosen column of *row* and return the row.

    A column name is drawn from the module-level ``df.columns`` and a noise
    type is drawn from ``"null"`` / ``"wrong"``:

    * ``"null"``  -- the chosen cell is set to ``None``.
    * ``"wrong"`` -- ``price`` is made negative, ``quantity`` becomes a random
      negative integer between -10 and -1, ``product_category`` becomes
      ``"UnknownCategory"`` and ``customer_name`` becomes ten random letters.
      Any other column is left untouched.

    Args:
        row: A record supporting ``row[col]`` reads and writes. It is
            modified in place.

    Returns:
        The same ``row`` object that was passed in.
    """
    col = random.choice(df.columns)
    noise_type = random.choice(["null", "wrong"])

    if noise_type == "null":
        row[col] = None
        return row

    # From here on noise_type is "wrong".
    if col == "price":
        price = row[col]
        # The price may already be None from generation-time null injection;
        # abs(None) would raise TypeError, so an already-missing price is
        # left as it is.
        if price is not None:
            row[col] = -abs(price)  # negative price
    elif col == "quantity":
        row[col] = -random.randint(1, 10)  # negative quantity
    elif col == "product_category":
        row[col] = "UnknownCategory"  # invalid category
    elif col == "customer_name":
        row[col] = "".join(fake.random_letters(length=10))  # garbled name

    return row

Inject some duplicate rows as well..

In [8]:
# Step 1: Pick 10,000 existing rows (fixed random_state) to serve as duplicates.
duplicate_rows = df.sample(n=10_000, random_state=42)

# Step 2: Draw 10,000 distinct index labels from the original frame.
# These are not referenced again in this cell.
insert_indices = np.random.choice(df.index, size=10_000, replace=False)

# Step 3: Give both frames a clean 0..n-1 index before combining them.
df.reset_index(drop=True, inplace=True)
duplicate_rows.reset_index(drop=True, inplace=True)

# Step 4: Append the duplicates to the original rows, then shuffle the
# combined frame (fixed random_state) so originals and copies are interleaved.
df_with_dupes = (
    pd.concat([df, duplicate_rows], ignore_index=True)
    .sample(frac=1, random_state=99)
    .reset_index(drop=True)
)

# Step 5: Report completion.
print("Duplication completed!♊")

Duplication completed!♊


In [9]:
# The shuffled frame `df_with_dupes` is split into four chunks, one per
# output format. The size is derived from the actual row count instead of
# hard-coding 77500 (= 310000 / 4), so the split stays correct if the number
# of generated or duplicated rows changes. Ceiling division makes the fourth
# slice pick up any remainder rows rather than dropping them.
chunk_size = -(-len(df_with_dupes) // 4)

Save a chunk of your original dataset in `csv` format..

In [10]:
# Chunk 1 → CSV: the first `chunk_size` rows, written without the index column.
chunk1 = df_with_dupes.iloc[:chunk_size]
chunk1.to_csv(path_or_buf="chunk_1.csv", index=False)
print("Chunk 1 saved as CSV.📝")

Chunk 1 saved as CSV.📝


Save a chunk of your original dataset in `xlsx` format..

In [11]:
# Chunk 2 → Excel: the second `chunk_size` rows, written without the index column.
chunk2 = df_with_dupes.iloc[chunk_size : 2 * chunk_size]
chunk2.to_excel(excel_writer="chunk_2.xlsx", index=False)
print("Chunk 2 saved as Excel.📈")

Chunk 2 saved as Excel.📈


Save a chunk of your original dataset in `json` format..

In [12]:
# Chunk 3 → JSON: the third `chunk_size` rows, one JSON record per line.
chunk3 = df_with_dupes.iloc[2 * chunk_size : 3 * chunk_size]
chunk3.to_json(path_or_buf="chunk_3.json", orient="records", lines=True)
print("Chunk 3 saved as JSON.🏷️")

Chunk 3 saved as JSON.🏷️


Save a chunk of your original dataset in `DB(sql)` format..

In [13]:
# Chunk 4 → SQL (SQLite): the fourth `chunk_size` rows go into an `invoices`
# table in chunk_4.db, which is then dumped as SQL statements to chunk_4.sql.
chunk4 = df_with_dupes.iloc[chunk_size*3:chunk_size*4]
conn = sqlite3.connect("chunk_4.db")
try:
    chunk4.to_sql("invoices", conn, if_exists="replace", index=False)

    # Dump as .sql file. The encoding is pinned so that non-ASCII text in the
    # generated data does not depend on the platform's default encoding.
    with open("chunk_4.sql", "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in conn.iterdump())
finally:
    # Always release the database handle, even if the write or dump fails.
    conn.close()
print("Chunk 4 saved as SQL.🗃️")

Chunk 4 saved as SQL.🗃️


# DATA GENERATION COMPLETED. TIME FOR ETL..