Nessa parte, utilizei outro dataset de domínio público: https://www.kaggle.com/datasets/datasnaek/youtube-new?select=USvideos.csv
/
For this part, I used another public domain dataset: https://www.kaggle.com/datasets/datasnaek/youtube-new?select=USvideos.csv

In [41]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.getOrCreate()

In [42]:
# Lendo os arquivos com cabeçalho e inferindo o schema
# Reading files with headers and inferring the schema
df_video = spark.read.option("header", "true").option("inferSchema", "true").csv("videos-stats.csv")

# A leitura de comentários deve ser mais robusta, devido a possíveis vírgulas nos comentários que podem embaralhar o arquivo CSV
# The reading of comments must be more robust, due to possible commas in the comments that can scramble the CSV file
df_comentario = (
    spark.read
        .option("header", "true")
        .option("sep", ",")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("multiLine", "true")
        .option("mode", "PERMISSIVE")
        .option("inferSchema", "true")
        .csv("comments.csv")
)

df_us_videos = spark.read.option("header", "true").option("inferSchema", "true").csv("USvideos.csv")

In [43]:
# Alterando os valores nulos dos campos 'Likes', 'Comments' e 'Views' para o valor 0
# Changing null values of 'Likes', 'Comments', and 'Views' fields to 0
df_video = df_video.fillna({'Likes': 0, 'Comments': 0, 'Views': 0})
df_comentario = df_comentario.fillna({'Likes' : 0, 'Sentiment' : 0})

In [44]:
# Calculando a quantidade de registros do df_video e df_comentario
# Calculating the number of records for df_video and df_comentario
df_video_count = df_video.select([count(col(c)).alias(c) for c in df_video.columns])
df_comentario_count = df_comentario.select([count(col(c)).alias(c) for c in df_comentario.columns])

df_video_count.show()
df_comentario_count.show()

+----+-----+--------+------------+-------+-----+--------+-----+
| _c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|
+----+-----+--------+------------+-------+-----+--------+-----+
|1881| 1881|    1881|        1881|   1881| 1881|    1881| 1881|
+----+-----+--------+------------+-------+-----+--------+-----+

+-----+--------+-------+-----+---------+
|  _c0|Video ID|Comment|Likes|Sentiment|
+-----+--------+-------+-----+---------+
|18409|   18409|  18408|18409|    18409|
+-----+--------+-------+-----+---------+



In [45]:
# Removendo os registros do df_video e df_comentario quee possuem o campo 'Video ID' nulos e calculando novamente a quantidade de registros
# Removing records from df_video and df_comentario that have null 'Video ID' field and recalculating the number of records
df_video = df_video.na.drop(subset=['Video ID'])
df_comentario = df_comentario.na.drop(subset=['Video ID'])

df_video_count_drop = df_video.select([count(col(c)).alias(c) for c in df_video.columns])
df_comentario_count_drop = df_comentario.select([count(col(c)).alias(c) for c in df_comentario.columns])

df_video_count_drop.show()

df_comentario_count_drop.show()

+----+-----+--------+------------+-------+-----+--------+-----+
| _c0|Title|Video ID|Published At|Keyword|Likes|Comments|Views|
+----+-----+--------+------------+-------+-----+--------+-----+
|1881| 1881|    1881|        1881|   1881| 1881|    1881| 1881|
+----+-----+--------+------------+-------+-----+--------+-----+

+-----+--------+-------+-----+---------+
|  _c0|Video ID|Comment|Likes|Sentiment|
+-----+--------+-------+-----+---------+
|18409|   18409|  18408|18409|    18409|
+-----+--------+-------+-----+---------+



In [46]:
# Removendo os registros apenas do df_video que possuem o campo 'Video ID' duplicados
# Removing only the records from df_video that have duplicate 'Video ID' field
df_video_unicos = df_video.dropDuplicates(subset=['Video ID'])

In [47]:
# Convertendo os campos Likes, Comments e Views para 'long' no dataframe df_video
# Converting Likes, Comments, and Views fields to 'long' in the df_video dataframe
df_video = df_video.\
  withColumn('Likes', col('Likes').cast('long')).\
  withColumn('Comments', col('Comments').cast('long')).\
  withColumn('Views', col('Views').cast('long'))

In [48]:
# Convertendo os campos Likes e Sentiment para 'int' no dataframe df_comentario, além disso, altere o nome do campo Likes para 'Likes Comment'
# Converting Likes and Sentiment fields to 'int' in the df_comentario dataframe, and renaming the Likes field to 'Likes Comment'
df_comentario = df_comentario.\
  withColumnRenamed('Likes', 'Likes Comment').\
  withColumn('Likes Comment', col('Likes Comment').cast('int')).\
  withColumn('Sentiment', col('Sentiment').cast('int'))

In [49]:
# Criando o campo 'Interaction' no dataframe df_video, com a soma dos campos Likes, Comments e Views
# Creating the 'Interaction' field in the df_video dataframe, with the sum of Likes, Comments, and Views fields
df_video = df_video.withColumn('Interaction', col('Likes') + col('Comments') + col('Views'))
df_video = df_video.withColumn('Interaction', col('Interaction').cast('long'))

df_video.show(10)

+---+--------------------+-----------+------------+-------+------+--------+-------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword| Likes|Comments|  Views|Interaction|
+---+--------------------+-----------+------------+-------+------+--------+-------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407|     672| 135612|     139691|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779|    4306|1758063|    1839148|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825|    3338|1564007|    1631170|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566|    1426| 922918|     995910|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513|    5155|1855644|    1957312|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570|    1643| 943119|     978332|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047|    9367|5937790|    6082204|
|  7|I bought the STRA...|2TqOmtTAMRY|  

In [50]:
# Convertendo os campos 'Published At' para 'date' no dataframe df_video
# Converting 'Published At' fields to 'date' in the df_video dataframe
df_video = df_video.withColumn('Published At', col('Published At').cast('date'))

df_video.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Video ID: string (nullable = true)
 |-- Published At: date (nullable = true)
 |-- Keyword: string (nullable = true)
 |-- Likes: long (nullable = true)
 |-- Comments: long (nullable = true)
 |-- Views: long (nullable = true)
 |-- Interaction: long (nullable = true)



In [51]:
# Criando o campo 'Year' no dataframe df_video, extraindo apenas o ano do campo 'Published At'
# Creating the 'Year' field in the df_video dataframe, extracting only the year from the 'Published At' field
df_video = df_video.withColumn('Year', year(col('Published At')))

In [52]:
# Removendo as colunas '_c0'
# Removing the '_c0' columns
if '_c0' in df_video.columns:
    df_video = df_video.drop('_c0')
if '_c0' in df_comentario.columns:
    df_comentario = df_comentario.drop('_c0')

In [59]:
# Mesclando os dados df_comentario no dataframe df_video em relação ao campo Video ID e crie o dataframe df_join_video_comments
# Merging df_comentario data into the df_video dataframe based on the Video ID field and creating the df_join_video_comments dataframe
df_join_video_comments = df_video.join(df_comentario, 'Video ID')

df_join_video_comments.show(10)

+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+-------------+---------+
|   Video ID|               Title|Published At|Keyword|Likes|Comments| Views|Interaction|Year|             Comment|Likes Comment|Sentiment|
+-----------+--------------------+------------+-------+-----+--------+------+-----------+----+--------------------+-------------+---------+
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Let's not forget ...|           95|        1|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Here in NZ 50% of...|           19|        0|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|I will forever ac...|          161|        2|
|wAZZ-UWGVHI|Apple Pay Is Kill...|  2022-08-23|   tech| 3407|     672|135612|     139691|2022|Whenever I go to ...|            8|        0|
|wAZZ-UWGVHI|Apple P

In [54]:
# Mesclando os dados df_us_videos no dataframe df_video em relação ao campo Title e crie e visualize o dataframe df_join_video_usvideos
# Merging df_us_videos data into the df_video dataframe based on the Title field and creating and displaying the df_join_video_usvideos dataframe
df_join_video_usvideos = df_us_videos.join(df_video, 'Title')

df_join_video_usvideos.show(10)

+--------------------+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+-----------+------------+----------------+------+--------+---------+-----------+----+
|               title|   video_id|trending_date|    channel_title|category_id|        publish_time|                tags|   views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|   Video ID|Published At|         Keyword| Likes|Comments|    Views|Interaction|Year|
+--------------------+-----------+-------------+-----------------+-----------+--------------------+--------------------+--------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+-----------+------------+----------------+------+--

In [55]:
# Verificando a quantidade de campos nulos em todos os campos do dataframe df_video
# Checking the number of null fields in all columns of the df_video dataframe
df_video.select([count(when(col(c).isNull(), 1)).alias(c) for c in df_video.columns]).show()

+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|Title|Video ID|Published At|Keyword|Likes|Comments|Views|Interaction|Year|
+-----+--------+------------+-------+-----+--------+-----+-----------+----+
|    0|       0|           0|      0|    0|       0|    0|          0|   0|
+-----+--------+------------+-------+-----+--------+-----+-----------+----+



In [56]:
# Salvando o dataframe df_video como 'videos-tratados-parquet' no formato parquet e adicione o cabeçalho nos dados
# Saving the df_video dataframe as 'videos-tratados-parquet' in parquet format and adding the header to the data
df_video.write.mode('overwrite').option('header', 'true').parquet('videos-tratados-parquet')

In [57]:
# Salvando o dataframe df_join_video_comments como 'videos-comments-tratados-parquet' no formato parquet e adicione o cabeçalho nos dados
# Saving the df_join_video_comments dataframe as 'videos-comments-tratados-parquet' in parquet format and adding the header to the data
df_join_video_comments.write.mode('overwrite').option('header', 'true').parquet('videos-comments-tratados-parquet')