In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, avg, col, monotonically_increasing_id

# Obtain the Spark session for this notebook: getOrCreate() returns the
# already-active session when there is one, otherwise it starts a new one
# under the application name "GoldLayerCreation".
session_builder = SparkSession.builder.appName("GoldLayerCreation")
spark = session_builder.getOrCreate()

In [0]:
# Load the four silver-layer tables that feed the gold table.
# The order of the names on the left matches the order of the table names
# on the right.
silver_sellers, silver_buyers, silver_users, silver_countries = [
    spark.read.table(table_name)
    for table_name in (
        "seller_df_silver",
        "buyer_df_silver",
        "user_df_silver",
        "countries_df_silver",
    )
]

In [0]:
# Preview the first five rows of the country-level table to check its columns.
silver_countries.show(n=5)

+---------+-------+----------+--------------+------------------+---------------------+-------------+-----------+----------------+--------------+----------------+-------------+--------------------+-----------------+----------------------+-------------------+-------------------+---------------------+----------------+------------------+---------------+------------------+-------------+-------------+----------------+----------------+
|  country|sellers|topsellers|topsellerratio|femalesellersratio|topfemalesellersratio|femalesellers|malesellers|topfemalesellers|topmalesellers|countrysoldratio|bestsoldratio|toptotalproductssold|totalproductssold|toptotalproductslisted|totalproductslisted|topmeanproductssold|topmeanproductslisted|meanproductssold|meanproductslisted|meanofflinedays|topmeanofflinedays|meanfollowers|meanfollowing|topmeanfollowers|topmeanfollowing|
+---------+-------+----------+--------------+------------------+---------------------+-------------+-----------+----------------+-----

In [0]:
# Build the gold-layer table: rows of silver_users combined with the
# country, buyer and seller tables through full outer joins on `country`.
#
# Passing the key as a list of names (["country"]) makes Spark emit a single
# merged `country` column in the join result instead of one per input.
#
# NOTE(review): silver_sellers exposes a `sex` column, which suggests it holds
# more than one row per country. If so, each matching row is repeated once per
# seller row — confirm that this fan-out is intended.
joined_by_country = (
    silver_users
        .join(silver_countries, ["country"], "outer")
        .join(silver_buyers, ["country"], "outer")
        .join(silver_sellers, ["country"], "outer")
)


def _aliased(source_df, renames):
    """Return `source_df[src].alias(dst)` for each (src, dst) pair, in order."""
    return [source_df[src].alias(dst) for src, dst in renames]


# Select and alias columns from each dataframe to ensure uniqueness.
# Columns are looked up through their source dataframe because some names
# (e.g. `meanproductssold`) are selected from silver_sellers while
# silver_countries carries a column with the same name.
comprehensive_user_table = joined_by_country.select(
    # Country: take the merged join key rather than silver_users["country"].
    # The users-side key is null on rows that exist only in the country,
    # buyer or seller tables, which would discard the country name for
    # exactly the rows the outer join was meant to keep.
    col("country").alias("Country"),

    # From silver_users
    *_aliased(silver_users, [
        ("productsSold", "Users_productsSold"),
        ("productsWished", "Users_productsWished"),
        ("account_age_years", "Users_account_age_years"),
        ("account_age_group", "Users_account_age_group"),
        ("hasanyapp", "Users_hasanyapp"),
        ("socialnbfollowers", "Users_socialnbfollowers"),
        ("flag_long_title", "Users_flag_long_title"),
    ]),

    # From silver_countries
    *_aliased(silver_countries, [
        ("sellers", "Countries_Sellers"),
        ("topsellers", "Countries_TopSellers"),
        ("femalesellers", "Countries_FemaleSellers"),
        ("malesellers", "Countries_MaleSellers"),
        ("topfemalesellers", "Countries_TopFemaleSellers"),
        ("topmalesellers", "Countries_TopMaleSellers"),
    ]),

    # From silver_buyers (extend this list to expose more buyer columns)
    *_aliased(silver_buyers, [
        ("buyers", "Buyers_Total"),
        ("topbuyers", "Buyers_Top"),
        ("femalebuyers", "Buyers_Female"),
        ("malebuyers", "Buyers_Male"),
        ("topfemalebuyers", "Buyers_TopFemale"),
        ("topmalebuyers", "Buyers_TopMale"),
    ]),

    # From silver_sellers (extend this list to expose more seller columns)
    *_aliased(silver_sellers, [
        ("nbsellers", "Sellers_Total"),
        ("sex", "Sellers_Sex"),
        ("meanproductssold", "Sellers_MeanProductsSold"),
        ("meanproductslisted", "Sellers_MeanProductsListed"),
    ]),
)


In [0]:
# Print a sample of the joined table; 20 rows with truncated cell values are
# the defaults of show(), spelled out here for the reader.
comprehensive_user_table.show(n=20, truncate=True)

+-------+------------------+--------------------+-----------------------+-----------------------+---------------+-----------------------+---------------------+-----------------+--------------------+-----------------------+---------------------+--------------------------+------------------------+------------+----------+-------------+-----------+----------------+--------------+-------------+-----------+------------------------+--------------------------+
|Country|Users_productsSold|Users_productsWished|Users_account_age_years|Users_account_age_group|Users_hasanyapp|Users_socialnbfollowers|Users_flag_long_title|Countries_Sellers|Countries_TopSellers|Countries_FemaleSellers|Countries_MaleSellers|Countries_TopFemaleSellers|Countries_TopMaleSellers|Buyers_Total|Buyers_Top|Buyers_Female|Buyers_Male|Buyers_TopFemale|Buyers_TopMale|Sellers_Total|Sellers_Sex|Sellers_MeanProductsSold|Sellers_MeanProductsListed|
+-------+------------------+--------------------+-----------------------+-------------

In [0]:
# Session-level settings for the ecomadlsrakesh storage account endpoint:
# the auth type is set to OAuth and the token provider to the
# MsiTokenProvider class.
abfs_oauth_settings = {
    "fs.azure.account.auth.type.ecomadlsrakesh.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.ecomadlsrakesh.dfs.core.windows.net":
        "org.apache.hadoop.fs.azurebfs.oauth2.MsiTokenProvider",
}

# Apply the settings in the order they are listed above.
for setting_name, setting_value in abfs_oauth_settings.items():
    spark.conf.set(setting_name, setting_value)




In [0]:

# Mark the table for caching. cache() returns the same DataFrame it was
# called on, so the name does not need to be rebound.
comprehensive_user_table.cache()

# count() is an action: it runs the joins (filling the cache) and its result
# is shown as the cell output.
comprehensive_user_table.count()


93284

In [0]:
# Expose the DataFrame to Spark SQL under the view name
# "comprehensive_user_table", replacing any existing temp view of that name.
comprehensive_user_table.createOrReplaceTempView(name="comprehensive_user_table")


In [0]:
# Materialise the temp view as the Delta table `ecom_data`, replacing the
# table if it already exists.
create_gold_table_sql = """
CREATE OR REPLACE TABLE ecom_data
USING DELTA
AS
SELECT * FROM comprehensive_user_table
"""

# Kept as the last expression so the resulting DataFrame is the cell output.
spark.sql(create_gold_table_sql)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]