In [1]:
import pandas as pd
from datetime import datetime
from numpy import random, ceil
from pandas_profiling import ProfileReport

In [2]:
# Generate a simple 1000 row sales_data dataset to demonstrate using ProfileReport

first_names = ["Ben", "Claire", "Steve", "Jenny", "Simon", "Geoff", "Sandra", "Terry", "Janet", "Josephine", "Gillian", "Barry"]
surnames = ["Smith", "Jones", "Wilson", "Cox", "Jackson", "Davis", "Carter", "Brown", "Green", "Harper", "Gill", "Sedgewick"]
in_store_or_online = ["In Store", "Online"]

sales_data = []

for i in range(1,1000):
    sales_data_row = []
    
    order_id = i
    year = random.randint(2019,2022)
    month = random.randint(1,13)
    
    if month == 2:
        day = random.randint(1,29)
    elif month in [4,6,9,11]:
        day = random.randint(1,31)
    else:
        day = 31
        
    date = datetime(year, month, day)
    
    store_or_online = in_store_or_online[random.randint(0,2)]
    
    customer_name = f"{first_names[random.randint(0,len(first_names))]} {surnames[random.randint(0,len(surnames))]}"
        
    random_product_id_number = random.randint(1,500)
    
    product_name = f"Product {random_product_id_number}"
    
    if random_product_id_number < 100:
        category_name = "Category 1"
    else:
        category_name = f"Category {int(ceil(random_product_id_number / 100))}"
        
    sales_amount = random.randint(10,500)
    quantity = random.randint(1,10)
    
    # Random profit per sale between -5% and 20%
    profit_based_on_sales_amount = ceil( sales_amount * ( random.randint( -5, 20 ) / 100 ) )
    
    sales_data_row = [
        order_id, 
        date, 
        store_or_online, 
        customer_name, 
        random_product_id_number, 
        product_name, 
        category_name, 
        sales_amount, 
        quantity, 
        profit_based_on_sales_amount
    ]
    sales_data.append(sales_data_row)
    
    
df = pd.DataFrame(sales_data, 
    columns=[
        "Order ID",
        "Order Date",
        "In Store or Online",
        "Customer Name",
        "Product ID",
        "Product Name",
        "Category",
        "Sales",
        "Quantity",
        "Profit"
])
df.head(10)

Unnamed: 0,Order ID,Order Date,In Store or Online,Customer Name,Product ID,Product Name,Category,Sales,Quantity,Profit
0,1,2021-05-31,Online,Jenny Cox,370,Product 370,Category 4,315,3,26.0
1,2,2021-09-10,Online,Ben Cox,201,Product 201,Category 3,463,4,38.0
2,3,2020-05-31,Online,Geoff Sedgewick,339,Product 339,Category 4,238,4,-4.0
3,4,2021-12-31,In Store,Josephine Davis,77,Product 77,Category 1,337,1,-3.0
4,5,2021-03-31,Online,Sandra Jones,99,Product 99,Category 1,282,7,12.0
5,6,2019-10-31,Online,Ben Wilson,353,Product 353,Category 4,372,8,12.0
6,7,2020-12-31,In Store,Simon Jones,101,Product 101,Category 2,431,2,35.0
7,8,2019-09-27,Online,Claire Cox,207,Product 207,Category 3,182,7,22.0
8,9,2021-12-31,In Store,Ben Jackson,384,Product 384,Category 4,343,5,55.0
9,10,2019-09-01,Online,Janet Sedgewick,94,Product 94,Category 1,433,6,48.0


- Advanced Profiling can be done with the ProfileReport class.
- You can get valuable and detailed information about each column in a dataframe such as correlation between columns, missing values, columns with uniform distribution, cardinality level, if each column is categorical, distinct counts, zero counts, mean size, max size, min size and also details about the memory size for each column.

# 1. Advanced Profiling

In [3]:
# Profile Report class setup
profile_report = ProfileReport(df, title="Sales Data Profiling")

- The to_widgets method will print an advanced interactive profiling report here in the notebook.

In [4]:
# View profile report in this notebook
profile_report.to_widgets()

- You can also send the report to an html file that will give you an interactive web page.

In [5]:
# The report can also be published as a web page (with menu options)
profile_report.to_file("sales_data_profiling.html")

- The dataset above has been created so that although the profit amount is based on sales multipled by a random profit margin, the higher the sale, the higher the profit will be in general.

The advanced analysis report on the above dataset would tell you:
- Customer Name and Product Name columns have High Cardinality
- Category is 'Categorical' (you can perform further grouping for insights using this column)
- In Store or Online is also 'Categorical'
- Order ID has unique fields
- Profit has a small percentage of 0 fields
- There is a correlation between sales and profit (as programmed in the sales_data dataset above)