## Exploring the raw data files.

In [19]:
import os
import numpy as np
import pandas as pd

In [4]:
# Preview every file in the raw data folder whose name contains "olist":
# print a banner, the file name, the first five rows and the column index.
raw_dir = "../../Data/raw"

for file in os.listdir(raw_dir):
    # Guard clause: ignore entries that do not carry the "olist" marker.
    if "olist" not in file:
        continue

    print("################################", file, sep="\n")
    curr_df = pd.read_csv(f"{raw_dir}/{file}")
    print(curr_df.head(5), curr_df.columns, sep="\n")

################################
olist_closed_deals_dataset.csv
                             mql_id                         seller_id  \
0  5420aad7fec3549a85876ba1c529bd84  2c43fb513632d29b3b58df74816f1b06   
1  a555fb36b9368110ede0f043dfc3b9a0  bbb7d7893a450660432ea6652310ebb7   
2  327174d3648a2d047e8940d7d15204ca  612170e34b97004b3ba37eae81836b4c   
3  f5fee8f7da74f4887f5bcae2bafb6dd6  21e1781e36faf92725dde4730a88ca0f   
4  ffe640179b554e295c167a2f6be528e0  ed8cb7b190ceb6067227478e48cf8dde   

                             sdr_id                             sr_id  \
0  a8387c01a09e99ce014107505b92388c  4ef15afb4b2723d8f3d81e51ec7afefe   
1  09285259593c61296eef10c734121d5b  d3d1e91a157ea7f90548eef82f1955e3   
2  b90f87164b5f8c2cfa5c8572834dbe3f  6565aa9ce3178a5caf6171827af3a9ba   
3  56bf83c4bb35763a51c2baab501b4c67  d3d1e91a157ea7f90548eef82f1955e3   
4  4b339f9567d060bcea4f5136b9f5949e  d3d1e91a157ea7f90548eef82f1955e3   

              won_date business_segment      lead_type lea

In [6]:
# Data Paths
# Logical dataset name -> CSV path. The shared folder prefix is written once
# and joined onto each file name; insertion order of the entries is preserved.
raw_data_dir = "../../Data/raw"

data_files = {
    dataset_name: f"{raw_data_dir}/{csv_name}"
    for dataset_name, csv_name in (
        ("closed_deals_data", "olist_closed_deals_dataset.csv"),
        ("customer_data", "olist_customers_dataset.csv"),
        ("geolocation_data", "olist_geolocation_dataset.csv"),
        ("marketing_leads_data", "olist_marketing_qualified_leads_dataset.csv"),
        ("orders_data", "olist_orders_dataset.csv"),
        ("orders_items_data", "olist_order_items_dataset.csv"),
        ("payments_data", "olist_order_payments_dataset.csv"),
        ("reviews_data", "olist_order_reviews_dataset.csv"),
        ("products_data", "olist_products_dataset.csv"),
        ("sellers_data", "olist_sellers_dataset.csv"),
        ("products_names_data", "product_category_name_translation.csv"),
    )
}

### Checking Dataset Level Statistics

In [18]:
# Dataset-level overview: for each registered CSV print its shape, the number
# of nulls per column, and how many rows are exact duplicates of an earlier row.
for dataset_name, csv_path in data_files.items():
    print("********************************")
    print(dataset_name)

    curr_df = pd.read_csv(csv_path)
    null_counts = curr_df.isna().sum()
    # duplicated() flags every repeat after the first occurrence, so its sum
    # equals "total rows minus rows left after drop_duplicates()".
    duplicate_rows = int(curr_df.duplicated().sum())

    print(curr_df.shape, null_counts, sep="\n")
    print("Dataset Level Duplicate Rows: ", duplicate_rows)

********************************
closed_deals_data
(842, 14)
mql_id                             0
seller_id                          0
sdr_id                             0
sr_id                              0
won_date                           0
business_segment                   1
lead_type                          6
lead_behaviour_profile           177
has_company                      779
has_gtin                         778
average_stock                    776
business_type                     10
declared_product_catalog_size    773
declared_monthly_revenue           0
dtype: int64
Dataset Level Duplicate Rows:  0
********************************
customer_data
(99441, 5)
customer_id                 0
customer_unique_id          0
customer_zip_code_prefix    0
customer_city               0
customer_state              0
dtype: int64
Dataset Level Duplicate Rows:  0
********************************
geolocation_data
(1000163, 5)
geolocation_zip_code_prefix    0
geolocation_lat          

### Initial Observations:

- There are no duplicate rows present in any dataset.
- Some datasets contain null values, e.g. closed deals, orders, and reviews.
- The datasets are small enough that pandas can be used efficiently for data processing and modeling.

### Generating Data Integrity and Data Quality Report for Feature Level Analysis

In [20]:
output_file = "../../Reports/didq_report.xls"
# NOTE(review): the workbook is written with the xlsxwriter engine, i.e. in
# .xlsx format, while the path ends in ".xls" - Excel may warn about the
# mismatch when opening it. The path is left unchanged here so that anything
# already pointing at this file keeps working; consider renaming to ".xlsx".

# Fixed header layout so that every sheet of the report has the same columns
# in the same order, regardless of which kind of column a dataset starts with.
FREQUENCY_STATS = (
    "most_frequent_value",
    "most_frequent_count",
    "least_frequent_value",
    "least_frequent_count",
)
NUMERIC_STATS = ("mean", "min", "max", "variance")
DIDQ_COLUMNS = [
    "column_name",
    "dtype",
    "null_count",
    "null_pct",
    "unique_count",
    "duplicate_count",
    *FREQUENCY_STATS,
    *NUMERIC_STATS,
]


def _profile_column(col, col_data: pd.Series) -> dict:
    """Build one report row describing a single column.

    Parameters
    ----------
    col : column label, stored as "column_name".
    col_data : the column's values.

    Returns
    -------
    dict with exactly the keys in DIDQ_COLUMNS. Numeric columns get
    mean/min/max/variance and NaN for the frequency statistics; all other
    columns get the most/least frequent non-null value and its count, and NaN
    for the numeric statistics. "null_pct" is a fraction in [0, 1].
    """
    null_mask = col_data.isna()
    row = {
        "column_name": col,
        # Stored as text so the Excel writer receives a plain string rather
        # than a dtype object.
        "dtype": str(col_data.dtype),
        "null_count": null_mask.sum(),
        "null_pct": null_mask.mean(),
        "unique_count": col_data.nunique(dropna=True),
        "duplicate_count": col_data.duplicated().sum(),
    }
    # Start with every statistic blank; only the applicable group is filled in.
    row.update(dict.fromkeys(FREQUENCY_STATS + NUMERIC_STATS, np.nan))

    if pd.api.types.is_numeric_dtype(col_data):
        row.update({
            "mean": col_data.mean(),
            "min": col_data.min(),
            "max": col_data.max(),
            "variance": col_data.var(),
        })
    else:
        value_counts = col_data.value_counts(dropna=True)
        # An all-null column has no value counts; its statistics stay NaN.
        if not value_counts.empty:
            row.update({
                "most_frequent_value": value_counts.idxmax(),
                "most_frequent_count": value_counts.max(),
                "least_frequent_value": value_counts.idxmin(),
                "least_frequent_count": value_counts.min(),
            })

    return row


def _build_didq_report(df: pd.DataFrame) -> pd.DataFrame:
    """Return the feature-level report for `df`: one row per column of `df`."""
    report_rows = [_profile_column(col, df[col]) for col in df.columns]
    return pd.DataFrame(report_rows, columns=DIDQ_COLUMNS)


with pd.ExcelWriter(output_file, engine="xlsxwriter") as writer:
    for key, csv_path in data_files.items():
        print(f"Processing {key}")
        df = pd.read_csv(csv_path)

        report_df = _build_didq_report(df)

        # One sheet per dataset, named after the dataset key. (Previously the
        # name came from a `file` variable left over from an earlier cell, so
        # every dataset was written to the same sheet and the cell failed on a
        # fresh kernel.) Sheet names are capped at 31 characters.
        sheet_name = key[:31]

        report_df.to_excel(writer, sheet_name=sheet_name, index=False)

print("DIDQ report generated:", output_file)

Processing closed_deals_data
Processing customer_data
Processing geolocation_data
Processing marketing_leads_data
Processing orders_data
Processing orders_items_data
Processing payments_data
Processing reviews_data
Processing products_data
Processing sellers_data
Processing products_names_data
DIDQ report generated: ../../Reports/didq_report.xls
