In [1]:
import pandas as pd
from datetime import datetime
from numpy import random, ceil
from pandas_profiling import ProfileReport

In [2]:
# Generate a simple 1000 row sales_data dataset to demonstrate using ProfileReport

first_names = ["Ben", "Claire", "Steve", "Jenny", "Simon", "Geoff", "Sandra", "Terry", "Janet", "Josephine", "Gillian", "Barry"]
surnames = ["Smith", "Jones", "Wilson", "Cox", "Jackson", "Davis", "Carter", "Brown", "Green", "Harper", "Gill", "Sedgewick"]
in_store_or_online = ["In Store", "Online"]

sales_data = []

for i in range(1,1000):
    sales_data_row = []
    
    order_id = i
    year = random.randint(2019,2022)
    month = random.randint(1,13)
    
    if month == 2:
        day = random.randint(1,29)
    elif month in [4,6,9,11]:
        day = random.randint(1,31)
    else:
        day = 31
        
    date = datetime(year, month, day)
    
    store_or_online = in_store_or_online[random.randint(0,2)]
    
    customer_name = f"{first_names[random.randint(0,len(first_names))]} {surnames[random.randint(0,len(surnames))]}"
        
    random_product_id_number = random.randint(1,500)
    
    product_name = f"Product {random_product_id_number}"
    
    # The majority of products (1-350 out of 500) will be in Category 1
    
    if random_product_id_number < 350:
        category_name = "Category 1"
    else:
        category_name = "Category 2"
        
    sales_amount = random.randint(10,500)
    quantity = random.randint(1,10)
    
    # Profit per sale is improved if the sale is Online and Category 2 - 30-50%
    # Profit per sale is less if the sale is In Store or Category 1 - -5-20%
    
    if store_or_online == "Online" and category_name == "Category 2":
        profit_based_on_sales_amount = ceil( sales_amount * ( random.randint( 30, 50 ) / 100 ) )
    else:
        profit_based_on_sales_amount = ceil( sales_amount * ( random.randint( -5, 20 ) / 100 ) )
    
    sales_data_row = [
        order_id, 
        date, 
        store_or_online, 
        customer_name, 
        random_product_id_number, 
        product_name, 
        category_name, 
        sales_amount, 
        quantity, 
        profit_based_on_sales_amount
    ]
    sales_data.append(sales_data_row)
    
    
df = pd.DataFrame(sales_data, 
    columns=[
        "Order ID",
        "Order Date",
        "In Store or Online",
        "Customer Name",
        "Product ID",
        "Product Name",
        "Category",
        "Sales",
        "Quantity",
        "Profit"
])
df.head(10)

Unnamed: 0,Order ID,Order Date,In Store or Online,Customer Name,Product ID,Product Name,Category,Sales,Quantity,Profit
0,1,2021-03-31,Online,Ben Smith,214,Product 214,Category 1,98,6,17.0
1,2,2021-04-11,In Store,Ben Wilson,401,Product 401,Category 2,139,2,2.0
2,3,2020-07-31,In Store,Geoff Sedgewick,265,Product 265,Category 1,46,2,8.0
3,4,2019-05-31,Online,Jenny Brown,91,Product 91,Category 1,436,2,14.0
4,5,2020-07-31,In Store,Janet Cox,455,Product 455,Category 2,497,9,70.0
5,6,2021-12-31,Online,Terry Harper,340,Product 340,Category 1,71,1,14.0
6,7,2021-08-31,Online,Gillian Brown,368,Product 368,Category 2,392,8,118.0
7,8,2021-03-31,In Store,Ben Wilson,133,Product 133,Category 1,419,8,63.0
8,9,2021-09-20,Online,Terry Davis,377,Product 377,Category 2,185,9,56.0
9,10,2019-12-31,Online,Ben Davis,257,Product 257,Category 1,237,4,27.0


- Advanced Profiling can be done with the ProfileReport class.
- Profiling tells you if there are any missing values, columns with unique values, the cardinality level, correlation between columns, mean, max, min and also the memory size for each column.
- The detailed Correlation charts are a great starting point for further analysis.

# 1. Advanced Profiling

In [3]:
# Profile Report class setup
profile_report = ProfileReport(df, title="Sales Data Profiling")

- The to_widgets method will print an advanced interactive profiling report here in the notebook.

In [4]:
# View profile report in this notebook
profile_report.to_widgets()

- You can also send the report to an html file that will give you an interactive web page.

In [5]:
# The report can also be published as a web page (with menu options)
profile_report.to_file("sales_data_profiling.html")

**The dataset above has been created so that:**
- Higher sales amounts will have a relatively high profit amount (within a range)
- Sales that are both Online and products that are in Category 2 produce the best profit.

**The advanced analysis report on the above dataset would tell us:**
- Customer Name and Product Name columns have High Cardinality
- Category is 'Categorical' (you can perform further grouping for insights using this column)
- In Store or Online is also 'Categorical'
- Order ID has unique fields
- Profit has a small percentage of 0 fields
- There is a correlation between Sales and Profit (as programmed in the sales_data dataset above)
- There is a correlation between Product ID and Profit (the products in Category 2 have higher profit margins)
- There is a correlation between Category and Profit
- There is a correlation between In Store or Online and Profit (Online along with Category 2 products have higher profit margins)