In [None]:
import commons
import functions as func
import pandas as pd

# Notebook-wide pandas display settings: never truncate columns, cap printed rows at 40.
pd.options.display.max_columns = None
pd.options.display.max_rows = 40

In [None]:
# Read the three parquet tables whose locations are configured in `commons`
# (products first, then users, then transactions).
products, users, transactions = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        commons.PRODUCTS_CLEAN_PARQUET_PATH,
        commons.USERS_CLEAN_PARQUET_PATH,
        commons.TRANSACTIONS_CLEAN_PARQUET_PATH,
    )
)

# Summarise missing values for each table with the shared project helper,
# in the same order the tables were loaded.
products_nulls, users_nulls, transactions_nulls = map(
    func.get_null_counts, (products, users, transactions)
)

In [None]:
# One null-percentage chart per table: (null summary, chart title, output path).
null_charts = (
    (transactions_nulls, "Transactions Table", commons.TRANSACTIONS_NULL_PCT_CHART_PATH),
    (users_nulls, "Users Table", commons.USERS_NULL_PCT_CHART_PATH),
    (products_nulls, "Products Table", commons.PRODUCTS_NULL_PCT_CHART_PATH),
)

# Draw each chart, title it, and write it out as a 300-dpi PNG.
for table_nulls, chart_title, chart_path in null_charts:
    plot = func.plot_null_percentages(table_nulls)
    plot.title(chart_title)
    plot.savefig(chart_path, dpi=300, format="png")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


def _null_stats_by_year(df, col: str):
    """Tally null and non-null cells of ``df`` per calendar year of ``df[col]``.

    Returns a new frame with the columns ``year`` (four-digit string),
    ``null_count`` and ``non_null_count``, ordered by year. Only the columns
    already present in ``df`` are counted, and ``df`` itself is not modified.
    Rows whose date is missing do not belong to any year and are left out.
    """
    # Convert the date column on a copy so the caller's frame keeps its
    # original dtypes and does not gain helper columns.
    frame = df.assign(**{col: pd.to_datetime(df[col])})
    years = frame[col].dt.strftime("%Y")

    # Row-wise tallies; `.to_numpy()` keeps the three columns positionally
    # aligned even when the caller's index contains duplicates.
    per_row = pd.DataFrame(
        {
            "year": years.to_numpy(),
            "null_count": frame.isnull().sum(axis=1).to_numpy(),
            "non_null_count": frame.notnull().sum(axis=1).to_numpy(),
        }
    )

    # groupby sorts by key and drops rows with a missing key by default.
    return per_row.groupby("year", as_index=False)[
        ["null_count", "non_null_count"]
    ].sum()


def plot_null_stats_by_date(df, col: str = "CREATED_DATE"):
    """Draw a stacked bar chart of null vs. non-null cell counts per year.

    Args:
        df: DataFrame to inspect. It is read only; no column is converted or
            added on the caller's object.
        col: Name of the date-like column used to bucket rows by year.
            Defaults to ``"CREATED_DATE"``.

    Returns:
        The ``plt`` (matplotlib.pyplot) module with the figure drawn as the
        current figure, so callers can chain ``.title(...)``, ``.savefig(...)``
        or ``.show()``.

    Raises:
        KeyError: if ``col`` is not a column of ``df``.
    """
    stats = _null_stats_by_year(df, col)

    plt.figure(figsize=(12, 6))
    sns.barplot(x="year", y="null_count", data=stats, color="red", label="Null")
    sns.barplot(
        x="year",
        y="non_null_count",
        data=stats,
        color="green",
        label="Non-Null",
        # Extra keyword handed through seaborn to the underlying bars: starts
        # each non-null segment where the null segment ends (stacked bars).
        bottom=stats["null_count"],
    )

    plt.xticks(rotation=45, ha="right")
    plt.xlabel("Year")
    plt.ylabel("Count")
    plt.legend()
    plt.tight_layout()

    return plt


# Yearly null vs. non-null breakdown for the users table. The date column must
# be passed explicitly: `col` is a required parameter of the function.
plot_null_stats_by_date(users, "CREATED_DATE")
plt.show()

In [None]:
# Display the null summary that func.get_null_counts produced for the products table.
products_nulls