# 💎 Notebook: Criação de Features - Camada Platinum
Este notebook tem como objetivo consolidar métricas e atributos dos criadores, preparando uma tabela de **features na camada Platinum** para análises avançadas.


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Reuse the already-active Spark session when there is one; otherwise create it.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Source tables: creator posts and the scraped Wikipedia creator pages.
df_posts = spark.read.table("default.posts_creator")
df_wiki = spark.read.table("default.creators_scrape_wiki")

In [0]:
# Normalize the join key on the posts side: rename `yt_user` to `creator`
# and lower-case it.
df_posts = (
    df_posts
    .withColumnRenamed("yt_user", "creator")
    .withColumn("creator", F.lower(F.col("creator")))
)

# Only the lower-cased page name is kept from the wiki table. Lower-casing can
# collapse several pages onto the same key, and a duplicated key on this side
# would multiply the matching rows in a join on `creator`, so keep exactly one
# row per creator.
df_wiki = (
    df_wiki
    .select(F.lower(F.col("wiki_page")).alias("creator"))
    .dropDuplicates(["creator"])
)

## 📊 Cálculo de métricas de engajamento
Agregação de métricas por criador, como quantidade de posts e interações.


In [0]:
# Ratio of total likes to total views; only used when the creator has views.
has_views = F.col("sum_views") > 0
likes_per_view = F.col("sum_likes") / F.col("sum_views")

# One row per creator: post count plus sum / mean / standard deviation of
# likes and views.
posts_by_creator = df_posts.groupBy("creator")
engagement = posts_by_creator.agg(
    F.count("*").alias("qtd_posts"),
    F.sum("likes").alias("sum_likes"),
    F.sum("views").alias("sum_views"),
    F.avg("likes").alias("avg_likes"),
    F.avg("views").alias("avg_views"),
    F.stddev("likes").alias("std_likes"),
    F.stddev("views").alias("std_views"),
)

# Creators whose view total is not positive get an engagement rate of 0
# instead of a division by zero.
engagement = engagement.withColumn(
    "engagement_rate",
    F.when(has_views, likes_per_view).otherwise(0),
)

## 🔗 Junção com dados de criadores da Wikipedia
Faz um *left join* das métricas de engajamento com a lista de criadores extraída da Wikipedia (apenas a chave `creator` é utilizada) e substitui por 0 os valores nulos das métricas.


In [0]:
# Metric columns whose nulls are replaced with 0 after the join.
null_to_zero_columns = [
    "sum_likes", "sum_views", "avg_likes", "avg_views",
    "std_likes", "std_views", "engagement_rate",
]

# Left join: every creator with engagement metrics is kept, whether or not it
# has a matching row in the wiki table. Null metric values are then set to 0.
df_features = (
    engagement
    .join(df_wiki, on="creator", how="left")
    .na.fill(0, subset=null_to_zero_columns)
)

## 🏆 Ranking de criadores
Uso de janelas analíticas para ranquear os criadores por número de posts, média de views, média de likes e taxa de engajamento.


In [0]:
# (rank column, metric it is derived from), in the order the columns are added.
ranking_specs = [
    ("rank_posts", "qtd_posts"),
    ("rank_views", "avg_views"),
    ("rank_likes", "avg_likes"),
    ("rank_engagement", "engagement_rate"),
]

# Each ranking is a dense rank over the whole table (the window has no
# partitioning), ordered from the highest metric value to the lowest, so
# rank 1 is the top creator and ties share the same rank.
for rank_column, metric_column in ranking_specs:
    descending_window = Window.orderBy(F.col(metric_column).desc())
    df_features = df_features.withColumn(
        rank_column, F.dense_rank().over(descending_window)
    )



In [0]:
# Make sure the target schema exists before writing into it.
spark.sql("CREATE SCHEMA IF NOT EXISTS platinum")

# Persist the feature table in Delta format, replacing any previous contents.
features_writer = df_features.write.format("delta").mode("overwrite")
features_writer.saveAsTable("platinum.creators_features")



In [0]:
# Count the rows of the table that was just persisted rather than calling
# count() on df_features: df_features is not cached in this notebook, so
# counting it would re-run the whole aggregation and ranking pipeline, and the
# number reported could differ from what was actually written.
total_creators = spark.table("platinum.creators_features").count()

# Execution date as returned by Spark's current_date().
exec_date = spark.sql("SELECT current_date()").collect()[0][0]

print("✅ Tabela 'platinum.creators_features' criada com sucesso 🚀")
print(f"📊 Processados {total_creators} creators em {exec_date}")



✅ Tabela 'platinum.creators_features' criada com sucesso 🚀
📊 Processados 14 creators em 2025-08-20


In [0]:
%sql
-- Display the contents of the persisted feature table.
SELECT * FROM platinum.creators_features

creator,qtd_posts,sum_likes,sum_views,avg_likes,avg_views,std_likes,std_views,engagement_rate,rank_posts,rank_views,rank_likes,rank_engagement
pirulla25,66,459317,4386983,6959.348484848485,66469.43939393939,6713.221353286991,74828.15751473582,0.1046999726235547,10,13,9,1
felipeneto,374,30842412,389793199,82466.3422459893,1042227.8048128342,47725.13375037449,763564.4644155223,0.0791250644678385,4,8,5,2
raywilliamjohnson,646,191717764,2641789893,296776.72445820435,4089458.03869969,375883.8740853597,5249644.585367253,0.0725711626454466,2,4,1,3
pewdiepie,33,7776421,108025064,235649.1212121212,3273486.787878788,250094.94842618745,2249700.2024825965,0.0719871917872689,12,5,2,4
portadosfundos,312,13768422,206194181,44129.557692307695,660878.7852564103,53377.925583562894,1426437.3631013432,0.0667740570234617,7,9,7,5
morezoella,43,1111331,28379209,25844.906976744187,659981.6046511628,11601.153230470449,231287.3684267443,0.0391600414232828,11,10,8,6
canalkondzilla,334,964551,28325511,2887.877245508982,84806.91916167665,11715.525805306226,425725.1312338873,0.0340523777311554,5,12,12,7
luisitocomunica,127,20148369,617971894,158648.57480314962,4865920.425196851,206064.1654115394,4070503.993067633,0.0326040216320906,9,3,3,8
luccasneto,492,33365504,1146428009,67816.06504065041,2330138.2296747966,131174.48153622408,6174286.197369049,0.0291038806955736,3,6,6,9
tedtalksdirector,326,1580830,58251164,4849.171779141105,178684.55214723927,19038.225470181776,622692.7341693891,0.0271381701488402,6,11,11,10
