Data Generation

In [1]:
# Importing the 'random' module, which provides functions to generate random numbers, shuffle data, and make random selections.
import random

# Importing the 'Faker' class from the 'faker' library, which allows the generation of fake data such as names, addresses, emails, and more.
from faker import Faker

# Initialize Faker instance
fake = Faker()

# Seed both random sources so the random draws are repeatable across runs.
# Faker draws from its own generator, so seeding the stdlib `random` module
# alone would leave the names, cities and UUIDs unseeded.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
Faker.seed(RANDOM_SEED)

# Lookup tables that the record generator samples from.
# Order matters: random.choice indexes into these lists, so reordering them
# changes which value a given random draw selects.
data = dict(
    payment_types=['Card', 'Internet Banking', 'UPI', 'Wallet'],
    countries=['USA', 'UK', 'Germany', 'India', 'Canada'],
    categories=['Electronics', 'Clothing', 'Home & Kitchen', 'Books', 'Sports'],
    sites=['Amazon', 'eBay', 'Flipkart', 'Walmart', 'Shopify'],
    # 'Y' = payment succeeded, 'N' = payment failed
    payment_status=['Y', 'N'],
    # None is a valid entry here, i.e. a failure may carry no recorded reason
    failure_reasons=[
        'Insufficient funds',
        'Payment gateway error',
        'Card expired',
        None,
    ],
    product_name=[
        # Groceries
        "Milk", "Bread", "Coffee", "Eggs", "Juice",
        # Personal care
        "Toothpaste", "Shampoo", "Soap", "Deodorant", "Lotion",
        # Household
        "Detergent", "Trash Bags", "Paper Towels", "Cleaning Spray", "Light Bulbs",
        # Electronics and power
        "Smartphones", "Laptops", "Headphones", "Chargers", "Batteries",
        # Transport
        "Gasoline",
        "Car Maintenance Products (e.g., motor oil)",
        "Public Transport Cards",
        # Stationery
        "Pens", "Notebooks", "Printer Paper", "Sticky Notes", "Stapler",
        # Miscellaneous
        "Hand Sanitizer", "Tissues", "Aluminum Foil",
        # Append further product names below this line.
    ],
)

# Build one synthetic order/transaction record
def generate_record():
    """Return a single randomly generated order record as a dict.

    Identifiers, the customer name, the city and the order timestamp are
    produced by the module-level ``fake`` instance. Categorical fields are
    picked with ``random.choice`` from the lists held in ``data``. Quantity is
    an integer from 1 to 5 and price is a value drawn between 10 and 1000,
    rounded to two decimals.

    Returns:
        dict: 16 keys, in the order they are inserted below. The last key,
        'Reason for payment failure', is None when the payment status is 'Y'.
    """
    # Drawn first so the failure reason below can depend on it.
    status = random.choice(data['payment_status'])

    # NOTE(review): product name and category are drawn independently, so
    # combinations such as a grocery item in 'Electronics' can occur.
    record = {
        'Order Id': fake.uuid4(),
        'Customer Id': fake.uuid4(),
        'Customer Name': fake.name(),
        'Product Id': fake.uuid4(),
        'Product Name': random.choice(data['product_name']),
        'Product Category': random.choice(data['categories']),
        'Payment Type': random.choice(data['payment_types']),
        'Quantity ordered': random.randint(1, 5),
        'Price': round(random.uniform(10, 1000), 2),
        'Date and time when order was placed': fake.date_time_this_decade(),
        'Customer Country': random.choice(data['countries']),
        'Customer City': fake.city(),
        'Site from where order was placed': random.choice(data['sites']),
        'Payment Transaction Confirmation Id': fake.uuid4(),
        'Payment Success or Failure': status,
    }

    # A reason is only sampled for failed payments.
    # NOTE(review): data['failure_reasons'] itself contains None, so a failed
    # payment can still end up without a reason - confirm this is intended.
    if status == 'N':
        record['Reason for payment failure'] = random.choice(data['failure_reasons'])
    else:
        record['Reason for payment failure'] = None

    return record

In [2]:
# Importing the 'csv' module, which provides functionality to read from and write to CSV (Comma-Separated Values) files in Python.
import csv

# Generate records and save them to a CSV file (10,000 by default)
def save_records_to_csv(filename, num_records=10000):
    """Write ``num_records`` freshly generated records to ``filename`` as CSV.

    The file is created (or overwritten) with a header row followed by one
    row per record returned by ``generate_record()``.

    Args:
        filename: Path of the CSV file to write.
        num_records: Number of data rows to write after the header.
            A value of 0 produces a header-only file.
    """
    # One throwaway record is generated purely to learn the column names;
    # it is not written to the file.
    fieldnames = list(generate_record())

    # encoding='utf-8' makes the output independent of the platform's default
    # text encoding, so the file reads back correctly with UTF-8 readers even
    # if a generated name or city contains non-ASCII characters.
    # newline='' leaves line-ending handling to the csv module.
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        # Records are streamed from a generator, so they are never all held
        # in memory at once.
        writer.writerows(generate_record() for _ in range(num_records))

# Write the default batch of 10,000 records to transaction_data.csv
save_records_to_csv(filename='transaction_data.csv')

In [3]:
# Importing the 'pandas' library with alias name pd
import pandas as pd


# Read the generated CSV back from disk with every column included
df = pd.read_csv('transaction_data.csv')

# Plain-text preview of the first five rows
print(df.head())


                               Order Id                           Customer Id  \
0  9893ef2a-a3e8-49a9-865b-a6a5e65d529d  cd8fbd77-60ac-4a57-8809-7007cfaaf03a   
1  9fe07613-8679-4975-acfd-a7b9426b3611  f01fe095-cabb-4472-8d81-9e63a6867a86   
2  f5dfe05f-3116-4733-b85c-9de04e6d69f2  ee9215d2-abad-4b75-8072-c1fd02ae2616   
3  edc2adc0-bd94-4d69-8247-a4ee2533d184  49e33d15-33ba-4167-bb7b-d4c61c9e0f65   
4  a2f9cfe3-470f-4dc8-b46b-6f5204101378  7b91edf1-b005-48c4-9078-4a8abe5fd059   

       Customer Name                            Product Id  \
0        Tina Thomas  e87b9479-5cf2-4d03-b938-85cf8fc03e4c   
1       Dawn Bridges  af51211c-79c4-459e-95be-a356d520e0dc   
2  Alexander Johnson  1bb30b02-31c3-4b42-8edc-905a8afa8f3a   
3   Kathleen Mcguire  1ddd61d3-5601-405a-bf54-4014a1f86c51   
4      Cindy Leonard  95b292d4-91cc-46fd-87ec-deefea61dddc   

                                 Product Name Product Category  \
0                                      Lotion      Electronics   
1       

In [4]:
# Print a summary of the DataFrame: row count, column names,
# non-null counts and the dtype pandas inferred for each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Order Id                             10000 non-null  object 
 1   Customer Id                          10000 non-null  object 
 2   Customer Name                        10000 non-null  object 
 3   Product Id                           10000 non-null  object 
 4   Product Name                         10000 non-null  object 
 5   Product Category                     10000 non-null  object 
 6   Payment Type                         10000 non-null  object 
 7   Quantity ordered                     10000 non-null  int64  
 8   Price                                10000 non-null  float64
 9   Date and time when order was placed  10000 non-null  object 
 10  Customer Country                     10000 non-null  object 
 11  Customer City                