In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, avg, col,monotonically_increasing_id 

# Start (or reuse) the Spark session used for the gold-layer build
spark = (
    SparkSession.builder
    .appName("GoldLayerCreation")
    .getOrCreate()
)


def _read_delta(table_path):
    """Load the Delta table stored at ``table_path`` as a DataFrame."""
    return spark.read.format("delta").load(table_path)


# Silver-layer inputs for the gold table
silver_sellers = _read_delta("/mnt/delta/tables/silver/sellers")
silver_buyers = _read_delta("/mnt/delta/tables/silver/buyers")
silver_users = _read_delta("/mnt/delta/tables/silver/users")
silver_countries = _read_delta("/mnt/delta/tables/silver/countries")

In [0]:
# Row count of each silver table, printed in the order:
# buyers, countries, sellers, users
for silver_df in (silver_buyers, silver_countries, silver_sellers, silver_users):
    print(silver_df.count())

62
19
73
19783


In [0]:
# Number of distinct Country values in each silver table, printed in the order:
# buyers, countries, sellers, users
for silver_df in (silver_buyers, silver_countries, silver_sellers, silver_users):
    distinct_countries = silver_df.select(col("Country")).distinct()
    print(distinct_countries.count())

62
19
48
150


In [0]:
# Build the gold "one big table": full-outer join the user rows with the
# country-level countries / buyers / sellers tables on the shared country key.
# NOTE(review): the row counts printed earlier in this notebook show
# silver_sellers has 73 rows for only 48 distinct countries, so joining on
# country alone repeats each user row once per matching sellers row
# (presumably one per sex) -- confirm this fan-out is intended.
joined_silver = (
    silver_users
    .join(silver_countries, ["country"], "outer")
    .join(silver_buyers, ["country"], "outer")
    .join(silver_sellers, ["country"], "outer")
)

# Select and alias columns from each dataframe to ensure uniqueness
comprehensive_user_table = joined_silver.select(
    # Use the join's own (coalesced) key column rather than
    # silver_users["country"]: in a full outer join the users-side key is
    # null for rows that exist only in countries / buyers / sellers, which
    # would leave those rows without a Country value.
    col("country").alias("Country"),

    # From silver_users
    silver_users["productsSold"].alias("Users_productsSold"),
    silver_users["productsWished"].alias("Users_productsWished"),
    silver_users["account_age_years"].alias("Users_account_age_years"),
    silver_users["account_age_group"].alias("Users_account_age_group"),
    silver_users["hasanyapp"].alias("Users_hasanyapp"),
    silver_users["socialnbfollowers"].alias("Users_socialnbfollowers"),
    silver_users["flag_long_title"].alias("Users_flag_long_title"),
    # Continue with other silver_users columns as needed...

    # From silver_countries
    silver_countries["sellers"].alias("Countries_Sellers"),
    silver_countries["topsellers"].alias("Countries_TopSellers"),
    silver_countries["femalesellers"].alias("Countries_FemaleSellers"),
    silver_countries["malesellers"].alias("Countries_MaleSellers"),
    silver_countries["topfemalesellers"].alias("Countries_TopFemaleSellers"),
    silver_countries["topmalesellers"].alias("Countries_TopMaleSellers"),
    # Continue with other silver_countries columns as needed...

    # From silver_buyers
    silver_buyers["buyers"].alias("Buyers_Total"),
    silver_buyers["topbuyers"].alias("Buyers_Top"),
    silver_buyers["femalebuyers"].alias("Buyers_Female"),
    silver_buyers["malebuyers"].alias("Buyers_Male"),
    silver_buyers["topfemalebuyers"].alias("Buyers_TopFemale"),
    silver_buyers["topmalebuyers"].alias("Buyers_TopMale"),
    # Continue with other silver_buyers columns as needed...

    # From silver_sellers
    silver_sellers["nbsellers"].alias("Sellers_Total"),
    silver_sellers["sex"].alias("Sellers_Sex"),
    silver_sellers["meanproductssold"].alias("Sellers_MeanProductsSold"),
    silver_sellers["meanproductslisted"].alias("Sellers_MeanProductsListed"),
    # Continue with other silver_sellers columns as needed...
)


In [0]:
# Persist the gold table as Delta, replacing whatever is already at this path
(
    comprehensive_user_table.write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/delta/tables/gold/ecom_one_big_table")
)

In [0]:
%sql
-- Preview the gold table by querying its Delta path directly
SELECT * FROM delta.`/mnt/delta/tables/gold/ecom_one_big_table`

Country,Users_productsSold,Users_productsWished,Users_account_age_years,Users_account_age_group,Users_hasanyapp,Users_socialnbfollowers,Users_flag_long_title,Countries_Sellers,Countries_TopSellers,Countries_FemaleSellers,Countries_MaleSellers,Countries_TopFemaleSellers,Countries_TopMaleSellers,Buyers_Total,Buyers_Top,Buyers_Female,Buyers_Male,Buyers_TopFemale,Buyers_TopMale,Sellers_Total,Sellers_Sex,Sellers_MeanProductsSold,Sellers_MeanProductsListed
Singapour,0,2,8.78,Experienced,False,4,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,True,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6
Singapour,0,0,8.78,Experienced,False,3,False,,,,,,,16.0,1.0,12.0,4.0,0.0,1.0,5.0,FEMALE,2.2,1.6


In [0]:
OneBig_table_path = "/mnt/delta/tables/gold/ecom_one_big_table"


def get_row_count(delta_table_path):
    """Return the number of rows in the Delta table at ``delta_table_path``.

    Reads the table through the notebook-level ``spark`` session and
    returns the result of ``count()`` on the loaded DataFrame.
    """
    return spark.read.format("delta").load(delta_table_path).count()

# Count the rows of the gold table, then report the result
num_rows = get_row_count(OneBig_table_path)
row_count_message = f"The Delta table at {OneBig_table_path} has {num_rows} rows."
print(row_count_message)
