In [1]:
# Instala o PySpark (se ainda não estiver no ambiente)
!pip install -q pyspark


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col


In [3]:
spark = SparkSession.builder \
    .appName("Videos Stats") \
    .getOrCreate()


In [4]:
from google.colab import files

uploaded = files.upload()  # Vai abrir um seletor para escolher o arquivo CSV


Saving comments.csv to comments.csv
Saving USvideos.csv to USvideos.csv
Saving videos-stats.csv to videos-stats.csv


In [5]:
df_video = spark.read.csv("videos-stats.csv", header=True, inferSchema=True)
df_video.show()
df_video.printSchema()


+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [6]:
df_video_clean = df_video.fillna({
    "Likes": 0,
    "Comments": 0,
    "Views": 0
})
df_video_clean.show()


+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [7]:
df_comentario = spark.read.csv("comments.csv", header=True, inferSchema=True)
df_comentario.show()
df_comentario.printSchema()


+--------------+-----------+--------------------+------+---------+
|           _c0|   Video ID|             Comment| Likes|Sentiment|
+--------------+-----------+--------------------+------+---------+
|             0|wAZZ-UWGVHI|Let's not forget ...|  95.0|      1.0|
|             1|wAZZ-UWGVHI|Here in NZ 50% of...|  19.0|      0.0|
|             2|wAZZ-UWGVHI|I will forever ac...| 161.0|      2.0|
|             3|wAZZ-UWGVHI|Whenever I go to ...|   8.0|      0.0|
|             4|wAZZ-UWGVHI|Apple Pay is so c...|  34.0|      2.0|
|             5|wAZZ-UWGVHI|We’ve been houndi...|   8.0|      1.0|
|             6|wAZZ-UWGVHI|We only got Apple...|  29.0|      2.0|
|             7|wAZZ-UWGVHI|For now, I need b...|   7.0|      1.0|
|             8|wAZZ-UWGVHI|In the United Sta...|   2.0|      2.0|
|             9|wAZZ-UWGVHI|In Cambodia, we h...|  28.0|      1.0|
|            10|b3x28s61q3c|Wow, you really w...|1344.0|      2.0|
|            11|b3x28s61q3c|The lab is the mo...| 198.0|      

In [8]:
print("Registros em df_video:", df_video.count())
print("Registros em df_comentario:", df_comentario.count())


Registros em df_video: 1881
Registros em df_comentario: 30036


In [9]:
df_video = df_video.filter(col("Video ID").isNotNull())
df_comentario = df_comentario.filter(col("Video ID").isNotNull())

print("Registros em df_video (sem nulos):", df_video.count())
print("Registros em df_comentario (sem nulos):", df_comentario.count())


Registros em df_video (sem nulos): 1881
Registros em df_comentario (sem nulos): 22555


In [10]:
df_video = df_video.dropDuplicates(["Video ID"])
print("Registros em df_video (sem duplicatas):", df_video.count())


Registros em df_video (sem duplicatas): 1869


In [11]:
df_video = df_video.withColumn("Likes", col("Likes").cast("int")) \
                   .withColumn("Comments", col("Comments").cast("int")) \
                   .withColumn("Views", col("Views").cast("int"))
df_video.printSchema()


root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: integer (nullable = true)
 |-- Comments: integer (nullable = true)
 |-- Views: integer (nullable = true)



In [12]:
df_comentario = df_comentario.withColumn("Likes", col("Likes").cast("int")) \
                             .withColumn("Sentiment", col("Sentiment").cast("int")) \
                             .withColumnRenamed("Likes", "Likes Comment")
df_comentario.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Likes Comment: integer (nullable = true)
 |-- Sentiment: integer (nullable = true)



In [13]:
from pyspark.sql.functions import col

df_video = df_video.withColumn(
    "Interaction",
    col("Likes") + col("Comments") + col("Views")
)
df_video.select("Video ID", "Interaction").show()


+-----------+-----------+
|   Video ID|Interaction|
+-----------+-----------+
|115amzVdV44|   53053975|
|m7Jw3a7CpNA|     458123|
|V6hofBnlJLY|      83140|
|Rk4bAofG8xE|     263672|
|zdsdEVngg7Y|    2946362|
|U3DNz5asasA|    2678233|
|UI9I2p71ct0|      74931|
|sZ12W9uqxR4|    4774149|
|c1aKaQu6LRE|      25104|
|l7R4EyRoAD0|      35982|
|glL1QjeCH80|      21280|
|qvit3SUUmQE|     650176|
|_AWZZP3J5Nw|     174867|
|2WPA1L9uJqo|     609218|
|XWv_4L1_Z7Q|    8962007|
|nKW8Ndu7Mjw|     888321|
|dwV04XuiWq4|   58168145|
|gzeDQDbJMAU|     633058|
|j8u_IK6_OI8|      62497|
|-EDUPj7Vg7o|    1436302|
+-----------+-----------+
only showing top 20 rows



In [14]:
from pyspark.sql.functions import to_date

df_video = df_video.withColumn("Published At", to_date(col("Published At")))
df_video.select("Video ID", "Published At").show()


+-----------+------------+
|   Video ID|Published At|
+-----------+------------+
|115amzVdV44|  2020-08-18|
|m7Jw3a7CpNA|  2021-12-03|
|V6hofBnlJLY|  2022-08-18|
|Rk4bAofG8xE|  2022-08-18|
|zdsdEVngg7Y|  2021-04-22|
|U3DNz5asasA|  2022-05-11|
|UI9I2p71ct0|  2021-05-28|
|sZ12W9uqxR4|  2022-05-04|
|c1aKaQu6LRE|  2022-08-24|
|l7R4EyRoAD0|  2022-08-24|
|glL1QjeCH80|  2022-08-24|
|qvit3SUUmQE|  2022-08-02|
|_AWZZP3J5Nw|  2019-12-03|
|2WPA1L9uJqo|  2022-04-14|
|XWv_4L1_Z7Q|  2019-05-07|
|nKW8Ndu7Mjw|  2017-08-31|
|dwV04XuiWq4|  2019-04-22|
|gzeDQDbJMAU|  2020-09-09|
|j8u_IK6_OI8|  2022-08-24|
|-EDUPj7Vg7o|  2022-08-23|
+-----------+------------+
only showing top 20 rows



In [15]:
from pyspark.sql.functions import year

df_video = df_video.withColumn("Year", year(col("Published At")))
df_video.select("Video ID", "Year").show()


+-----------+----+
|   Video ID|Year|
+-----------+----+
|115amzVdV44|2020|
|m7Jw3a7CpNA|2021|
|V6hofBnlJLY|2022|
|Rk4bAofG8xE|2022|
|zdsdEVngg7Y|2021|
|U3DNz5asasA|2022|
|UI9I2p71ct0|2021|
|sZ12W9uqxR4|2022|
|c1aKaQu6LRE|2022|
|l7R4EyRoAD0|2022|
|glL1QjeCH80|2022|
|qvit3SUUmQE|2022|
|_AWZZP3J5Nw|2019|
|2WPA1L9uJqo|2022|
|XWv_4L1_Z7Q|2019|
|nKW8Ndu7Mjw|2017|
|dwV04XuiWq4|2019|
|gzeDQDbJMAU|2020|
|j8u_IK6_OI8|2022|
|-EDUPj7Vg7o|2022|
+-----------+----+
only showing top 20 rows



In [16]:
df_join_video_comments = df_video.join(df_comentario, on="Video ID", how="inner")
df_join_video_comments.show()


+-----------+---+--------------------+------------+-------+-----+--------+-------+-----------+----+---+--------------------+-------------+---------+
|   Video ID|_c0|               Title|Published At|Keyword|Likes|Comments|  Views|Interaction|Year|_c0|             Comment|Likes Comment|Sentiment|
+-----------+---+--------------------+------------+-------+-----+--------+-------+-----------+----+---+--------------------+-------------+---------+
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  0|Let's not forget ...|           95|        1|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  1|Here in NZ 50% of...|           19|        0|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  2|I will forever ac...|          161|        2|
|wAZZ-UWGVHI|  0|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672| 135612|     139691|2022|  3|Whe

In [17]:
df_us_videos = spark.read.csv("USvideos.csv", header=True, inferSchema=True)
df_us_videos.show()
df_us_videos.printSchema()


+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [18]:
df_join_video_usvideos = df_video.join(df_us_videos, on="Title", how="inner")
df_join_video_usvideos.show()


+--------------------+----+-----------+------------+----------------+------+--------+---------+-----------+----+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|               Title| _c0|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|   video_id|trending_date|    channel_title|category_id|        publish_time|                tags|   views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+--------------------+----+-----------+------------+----------------+------+--------+---------+-----------+----+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+------------

In [19]:
from pyspark.sql.functions import col, sum as _sum, when

df_video.select([
    _sum(when(col(c).isNull(), 1).otherwise(0)).alias(c)
    for c in df_video.columns
]).show()


+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|_c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|  0|    0|       0|           0|      0|    2|       2|    2|          2|   0|
+---+-----+--------+------------+-------+-----+--------+-----+-----------+----+



In [20]:
# Remover a coluna _c0, se existir
if "_c0" in df_video.columns:
    df_video = df_video.drop("_c0")

# Salvar o DataFrame
df_video.write \
    .mode("overwrite") \
    .option("header", True) \
    .parquet("videos-tratados-parquet")


In [21]:
if "_c0" in df_join_video_comments.columns:
    df_join_video_comments = df_join_video_comments.drop("_c0")

df_join_video_comments.write \
    .mode("overwrite") \
    .option("header", True) \
    .parquet("videos-comments-tratados-parquet")


In [24]:
from google.colab import files
files.download('tratamento.ipynb')


FileNotFoundError: Cannot find file: tratamento.ipynb