In [3]:
pip install faker

Collecting faker
  Downloading Faker-19.6.1-py3-none-any.whl (1.7 MB)
Installing collected packages: faker
Successfully installed faker-19.6.1
Note: you may need to restart the kernel to use updated packages.


Dataset 01: Retail Dataset

In [5]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every run produces the same synthetic data
np.random.seed(42)

# Month-end dates covering 3 years (2020-01-31 through 2022-12-31).
# An explicit MonthEnd offset is used instead of the 'M' string alias, which
# pandas 2.2 deprecated in favour of 'ME'; the offset object works on both.
date_range = pd.date_range(start='2020-01-01', end='2022-12-31', freq=pd.offsets.MonthEnd())

# Number of rows (one for each month)
num_rows = len(date_range)

# Generate random data for each KPI metric.
# NOTE: the order of the np.random calls in this cell determines which values
# are drawn under the fixed seed; keep it stable to keep the CSV reproducible.
# np.random.randint excludes its upper bound.
data = {
    'Date': date_range,
    'Monthly Sales': np.random.randint(100000, 200000, size=num_rows),
    'Average Transaction Value': np.random.randint(50, 200, size=num_rows),
    'Customer Retention Rate': np.random.uniform(0.5, 0.9, size=num_rows),
    'New Customers Acquired': np.random.randint(100, 1000, size=num_rows),
}

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Generate random data for product category sales (one column per category)
product_categories = ['Electronics', 'Furniture', 'Groceries', 'Clothing', 'Accessories']
for category in product_categories:
    df[f'{category} Sales'] = np.random.randint(10000, 50000, size=num_rows)

# Generate random data for customer demographics.
# Each column is drawn independently, so the age-band and gender counts are
# not constrained to add up to a common total.
demographics = {
    'Age 18-25': np.random.randint(50, 200, size=num_rows),
    'Age 26-35': np.random.randint(100, 300, size=num_rows),
    'Age 36-45': np.random.randint(80, 250, size=num_rows),
    'Age 46-60': np.random.randint(60, 200, size=num_rows),
    'Age 60+': np.random.randint(30, 100, size=num_rows),
    'Male': np.random.randint(100, 500, size=num_rows),
    'Female': np.random.randint(100, 500, size=num_rows),
    'Other': np.random.randint(10, 50, size=num_rows),
}
for demo, values in demographics.items():
    df[demo] = values

# Generate random data for peak shopping hours (one column per time window)
peak_hours = ['Morning (6-12)', 'Afternoon (12-18)', 'Evening (18-24)', 'Night (0-6)']
for hour in peak_hours:
    df[hour] = np.random.randint(100, 1000, size=num_rows)

# Save the DataFrame to a CSV file
df.to_csv('Retail_Dashboard_Data.csv', index=False)

# Preview the first few rows; this has to be the cell's last expression,
# otherwise the notebook does not display it.
df.head()


Dataset 02: Finance Dataset

In [13]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from faker import Faker

# Initialize Faker for generating fake data
fake = Faker()

# Generate month-end dates for 36 months (3 years).
# `months` is defined here so the cell runs on a fresh kernel; every column
# below is sized from it, keeping all columns the same length as the frame.
# A MonthEnd offset object is used rather than the 'M' alias, which pandas 2.2
# deprecated in favour of 'ME'.
start_date = '2021-01-01'
end_date = '2023-12-31'
months = pd.date_range(start=start_date, end=end_date, freq=pd.offsets.MonthEnd())
num_months = len(months)

# Start the DataFrame with one row per month
data = pd.DataFrame({'Month': months})

# Generate Assets Under Management data
data['Assets Under Management'] = np.random.randint(1000000, 10000000, size=num_months)

# Generate Net Profit Margin data (as a percentage)
data['Net Profit Margin'] = np.random.uniform(5, 20, size=num_months)

# Generate Customer Churn rate (as a percentage)
data['Customer Churn'] = np.random.uniform(1, 10, size=num_months)

# Generate Cost Per Acquisition data
data['Cost Per Acquisition'] = np.random.randint(100, 1000, size=num_months)

# Generate Investment Types data (one column per investment type)
investment_types = ['Stocks', 'Bonds', 'Real Estate', 'Mutual Funds']
for inv_type in investment_types:
    data[inv_type] = np.random.randint(50000, 500000, size=num_months)

# Generate Monthly Inflow and Outflow data
data['Monthly Inflow'] = np.random.randint(100000, 500000, size=num_months)
data['Monthly Outflow'] = np.random.randint(80000, 450000, size=num_months)

# Generate Customer Age data
data['Customer Age'] = [fake.random_int(min=18, max=70) for _ in range(num_months)]

# Generate Customer Age Group data.
# pd.cut bins are right-inclusive by default: (0, 30], (30, 45], (45, 60], (60, 100]
age_bins = [0, 30, 45, 60, 100]
age_labels = ['18-30', '31-45', '46-60', '61+']
data['Customer Age Group'] = pd.cut(data['Customer Age'], bins=age_bins, labels=age_labels)

# Generate Customer Feedback data
feedback_options = ['Excellent', 'Good', 'Neutral', 'Poor']
data['Customer Feedback'] = [random.choice(feedback_options) for _ in range(num_months)]

# Save the generated data to a CSV file
data.to_csv('financial_data.csv', index=False)

Dataset 03: Portfolio Performance Dataset

In [15]:
import pandas as pd
import numpy as np
from faker import Faker

# Initialize Faker for generating fake data
fake = Faker()

# Base window: 84 months (7 years) of month-end dates.
# A MonthEnd offset object is used rather than the 'M' alias, which pandas 2.2
# deprecated in favour of 'ME'.
start_date = '2016-01-01'
end_date = '2022-12-31'
month_end = pd.offsets.MonthEnd()
months = pd.date_range(start=start_date, end=end_date, freq=month_end)

# Ensure we have at least 1000 rows. When the base window is too short the
# monthly series is extended past `end_date` in a single date_range call
# (instead of repeatedly concatenating small frames), giving exactly
# `min_rows` consecutive month-ends starting from the same first month.
min_rows = 1000
if len(months) < min_rows:
    months = pd.date_range(start=start_date, periods=min_rows, freq=month_end)

# Start the DataFrame with one row per month
data = pd.DataFrame({'Month': months})

# Generate ROI (Return on Investment) data as a percentage
data['ROI'] = np.random.uniform(0.01, 0.10, size=len(data)) * 100

# Generate IRR (Internal Rate of Return) data as a percentage
data['IRR'] = np.random.uniform(0.05, 0.15, size=len(data)) * 100

# Generate Investment Performance data (scaled to a percentage like ROI/IRR)
data['Investment Performance'] = np.random.uniform(0.02, 0.08, size=len(data)) * 100

# Generate fake investment breakdown data (one column per category)
investment_categories = ['Stocks', 'Bonds', 'Real Estate', 'Mutual Funds', 'Cryptocurrency']
for category in investment_categories:
    data[category] = np.random.randint(10000, 1000000, size=len(data))

# Generate fake risk assessment data
risk_scores = ['Low', 'Medium', 'High']
data['Risk Assessment'] = [fake.random_element(elements=risk_scores) for _ in range(len(data))]

# Save the generated data to a CSV file
data.to_csv('portfolio_performance_data_monthly.csv', index=False)


Dataset 04: Education Dataset

In [2]:
import pandas as pd
from faker import Faker
import random
import numpy as np
import datetime

# Initialize Faker
fake = Faker()

# Define the number of rows (between 500 and 1000, both inclusive)
num_rows = random.randint(500, 1000)

# Column order of the exported table
columns = [
    'Date',
    'Program',
    'Student_Enrollment',
    'Course_Completion_Rate',
    'Average_Test_Scores',
    'Alumni_Donation_Rate'
]

# Programs a row can belong to
programs = ('Mathematics', 'Computer Science', 'Physics', 'Biology', 'Chemistry')

# Generate one record per row. Records are collected in a plain list and the
# DataFrame is built once afterwards: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row copies it on every iteration.
records = []
for _ in range(num_rows):
    records.append({
        # Random date within the last year
        'Date': fake.date_between(start_date='-365d', end_date='today'),
        # Program name
        'Program': fake.random_element(elements=programs),
        # Student enrollment
        'Student_Enrollment': random.randint(20, 200),
        # Course completion rate (percentage)
        'Course_Completion_Rate': random.uniform(60, 100),
        # Average test scores (between 0 and 100)
        'Average_Test_Scores': random.uniform(0, 100),
        # Alumni donation rate (percentage)
        'Alumni_Donation_Rate': random.uniform(0, 10),
    })

# Build the DataFrame; `columns` fixes the column order explicitly
data = pd.DataFrame(records, columns=columns)

# Sort the data by date
data = data.sort_values(by='Date')

# Export the data to a CSV file
data.to_csv('education_dashboard_data.csv', index=False)

Dataset 05: IT Analytics Dataset

In [3]:
import pandas as pd
import random

# One timestamp per calendar day, 2022-01-01 through 2023-07-31 inclusive
date_range = pd.date_range(start='2022-01-01', end='2023-07-31', freq='D')

# Pools of fake department names and business units to sample from
departments = ['Marketing', 'Sales', 'HR', 'Finance', 'IT', 'Operations']
business_units = ['BU1', 'BU2', 'BU3', 'BU4']

num_days = len(date_range)


def _draw_column(sampler):
    """Call `sampler` once per date and return the results as a list."""
    return [sampler() for _ in range(num_days)]


# Columns are filled one after another (not row by row), each with one
# independent draw per date.
data = {
    'Date': date_range,
    'Department': _draw_column(lambda: random.choice(departments)),
    'Business Unit': _draw_column(lambda: random.choice(business_units)),
    'Data Integration Success Rate': _draw_column(lambda: random.uniform(80, 100)),
    'Data Anomaly Detection': _draw_column(lambda: random.randint(0, 20)),
    'Analytics Implementation Impact': _draw_column(lambda: random.uniform(0, 1)),
    'Big Data Adoption Rate': _draw_column(lambda: random.uniform(0, 100)),
    'Analytics Reporting Accuracy': _draw_column(lambda: random.uniform(90, 100)),
}

# Assemble the table
df = pd.DataFrame(data)

# Write it out as CSV without the index column
df.to_csv('it_analytics_monthly_dataset.csv', index=False)

print("Data has been generated and saved to 'it_analytics_monthly_dataset.csv'")

Data has been generated and saved to 'it_analytics_monthly_dataset.csv'


Dataset 06: E-commerce Dataset

In [4]:
import pandas as pd
import random
from faker import Faker

# Initialize Faker for generating fake data
fake = Faker()

# Column order of the exported table
columns = [
    'Date',
    'Device Type',
    'Time Spent (minutes)',
    'Product',
    'Conversion Rate',
    'Cart Abandonment Rate',
    'Average Order Value',
    'Customer Lifetime Value',
    'Sales'
]

# Generate data for 1000 rows. Records are collected in a list and the
# DataFrame is built once afterwards: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the frame on every iteration.
records = []
for _ in range(1000):
    records.append({
        # Random date within the last 12 months
        'Date': fake.date_between(start_date='-12M', end_date='today'),
        'Device Type': random.choice(['Desktop', 'Mobile', 'Tablet']),
        'Time Spent (minutes)': random.uniform(1, 60),
        # Product names are arbitrary fake words
        'Product': fake.word(),
        'Conversion Rate': random.uniform(0.01, 0.5),
        'Cart Abandonment Rate': random.uniform(0.01, 0.5),
        'Average Order Value': random.uniform(20, 200),
        'Customer Lifetime Value': random.uniform(50, 500),
        'Sales': random.uniform(100, 1000),
    })

# Build the DataFrame; `columns` fixes the column order explicitly
data = pd.DataFrame(records, columns=columns)

# Export data to CSV
data.to_csv('ecommerce_data.csv', index=False)


Dataset 07: Real Estate Dataset

In [5]:
import pandas as pd
import random
from faker import Faker
import datetime

# Faker instance; in this cell it is only used to draw the random dates
fake = Faker()


def _random_listing():
    """Return one row of fake values, ordered to match the DataFrame columns below."""
    return [
        random.choice(['Apartment', 'House', 'Condo']),          # Property_Type
        random.uniform(0.7, 0.95),                               # Occupancy_Rate
        random.randint(5000, 10000),                             # Monthly_Revenue
        random.uniform(6, 12),                                   # Avg_Rental_Length
        random.randint(500, 2000),                               # Maintenance_Costs
        random.randint(0, 20),                                   # Maintenance_Issues
        random.uniform(3.0, 5.0),                                # Customer_Satisfaction
        fake.date_between(start_date='-1y', end_date='today'),   # Date
    ]


# 1000 rows of fake data, each a list of values in column order
data = [_random_listing() for _ in range(1000)]

# Turn the rows into a labelled table
df = pd.DataFrame(
    data,
    columns=[
        'Property_Type',
        'Occupancy_Rate',
        'Monthly_Revenue',
        'Avg_Rental_Length',
        'Maintenance_Costs',
        'Maintenance_Issues',
        'Customer_Satisfaction',
        'Date',
    ],
)

# Write the table to CSV without the index column
df.to_csv('real_estate_data.csv', index=False)

print("Data has been generated and exported to 'real_estate_data.csv'.")


Data has been generated and exported to 'real_estate_data.csv'.


Dataset 08: Manufacturing KPI Dataset

In [6]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Define the date range for the data: every calendar day from start_date to
# end_date, both endpoints included (hence the "+ 1")
start_date = datetime(2022, 1, 1)
end_date = datetime(2023, 12, 31)
date_range = [start_date + timedelta(days=i) for i in range((end_date - start_date).days + 1)]

# Column order of the exported table
columns = ["Date", "Production Efficiency", "Inventory Turnover", "Product Quality", "Time to Market"]

# Generate fake data for each metric, one record per day. The records are
# collected in a list and the DataFrame is built once afterwards:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the frame on every iteration.
records = [
    {
        "Date": date,
        "Production Efficiency": random.uniform(0.6, 0.9),  # Adjust the range as needed
        "Inventory Turnover": random.uniform(4, 8),         # Adjust the range as needed
        "Product Quality": random.uniform(85, 99),          # Adjust the range as needed
        "Time to Market": random.uniform(15, 30),           # Adjust the range as needed
    }
    for date in date_range
]

# Build the DataFrame; `columns` fixes the column order explicitly
data = pd.DataFrame(records, columns=columns)

# Export the data to a CSV file
data.to_csv("manufacturing_kpi_data.csv", index=False)

Dataset 09: Risk Management Dataset

In [7]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Helper that samples random dates between two endpoints
def generate_random_dates(start_date, end_date, num_dates):
    """Return `num_dates` random datetimes between `start_date` and `end_date`.

    Each result is `start_date` plus a whole number of days chosen with
    `random.randint`, so both endpoints can be returned and the time-of-day of
    `start_date` is carried over. Results are in draw order (not sorted) and
    may contain duplicates.
    """
    def _pick_one():
        # randint includes both bounds: offset 0 -> start_date, max offset -> end_date's day
        offset_days = random.randint(0, (end_date - start_date).days)
        return start_date + timedelta(days=offset_days)

    return [_pick_one() for _ in range(num_dates)]

# Window the random dates are drawn from
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)

# Number of rows in the generated table
row_count = 1000

# Random dates anywhere inside the window, one per row
dates = generate_random_dates(start_date, end_date, row_count)

# KPI columns, each filled with integers from 1 to 100 inclusive
kpi_names = [
    "Risk Identification",
    "Risk Tracking",
    "Risk Mitigation",
    # Add more KPI metrics as needed
]

# Assemble the column mapping: dates first, then one list per KPI in order
data = {"Date": dates}
for kpi in kpi_names:
    data[kpi] = [random.randint(1, 100) for _ in range(row_count)]

# Build the table
df = pd.DataFrame(data)

# Write it out as CSV without the index column
df.to_csv("risk_management_data.csv", index=False)

print("Fake data generated and exported to risk_management_data.csv")

Fake data generated and exported to risk_management_data.csv


Dataset 10: Startup Valuation Dataset

In [8]:
import pandas as pd
from faker import Faker
import random
import datetime

# Initialize Faker for generating fake startup names
fake = Faker()

# Column order of the exported table
columns = ['Startup Name', 'Valuation Date', 'DCF Valuation', 'Comparables Valuation', 'Risk-Adjusted Returns']

# Generate 1000 rows of fake data. Records are collected in a list and the
# DataFrame is built once afterwards: DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the frame on every iteration.
records = []
for _ in range(1000):
    records.append({
        'Startup Name': fake.company(),
        # Random date within the last 2 years
        'Valuation Date': fake.date_between(start_date='-2y', end_date='today'),
        # Fake DCF Valuation between 1 million and 100 million
        'DCF Valuation': round(random.uniform(1e6, 1e8), 2),
        # Fake Comparables Valuation (same range, drawn independently)
        'Comparables Valuation': round(random.uniform(1e6, 1e8), 2),
        # Fake Risk-Adjusted Returns between 5% and 20%, stored as a fraction
        'Risk-Adjusted Returns': round(random.uniform(0.05, 0.2), 4),
    })

# Build the DataFrame; `columns` fixes the column order explicitly
data = pd.DataFrame(records, columns=columns)

# Export the data to a CSV file
data.to_csv('startup_valuation_data.csv', index=False)

print("Data generated and exported to startup_valuation_data.csv")


Data generated and exported to startup_valuation_data.csv
