In [1]:
!pip install pyspark



In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Start the Spark session (getOrCreate returns the active session if one already exists)
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
# Load the CSV file, treating its first line as the column header
df = spark.read.csv('videos-stats.csv', header=True)

In [5]:
# Preview the first 8 rows of the DataFrame
df.show(n=8)

+---+--------------------+-----------+------------+-------+--------+--------+---------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|    Views|
+---+--------------------+-----------+------------+-------+--------+--------+---------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0| 135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0| 922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0| 943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|4782514.0|
+---+--------------------+------

In [6]:
# Inspect the DataFrame schema (bare expression, so the notebook renders its repr).
# At this point the file was read without schema inference, so every column is a string.
df.schema

StructType([StructField('_c0', StringType(), True), StructField('Title', StringType(), True), StructField('Video ID', StringType(), True), StructField('Published At', StringType(), True), StructField('Keyword', StringType(), True), StructField('Likes', StringType(), True), StructField('Comments', StringType(), True), StructField('Views', StringType(), True)])

In [7]:
# Re-read the CSV, this time letting Spark infer the column types from the data
df = spark.read.csv('videos-stats.csv', header=True, inferSchema=True)

In [8]:
# Inspect the schema again to see the types produced by schema inference
df.schema

StructType([StructField('_c0', IntegerType(), True), StructField('Title', StringType(), True), StructField('Video ID', StringType(), True), StructField('Published At', DateType(), True), StructField('Keyword', StringType(), True), StructField('Likes', DoubleType(), True), StructField('Comments', DoubleType(), True), StructField('Views', DoubleType(), True)])

In [9]:
# Save the videos DataFrame in Parquet format.
# - Parquet stores the column names in its own schema, so the CSV-only 'header'
#   option was dropped (it has no meaning for this format).
# - mode('overwrite') replaces an existing 'videos-parquet' directory; the default
#   save mode raises an error when the path already exists, which breaks re-runs.
df.write.mode('overwrite').parquet('videos-parquet')

In [10]:
# Read the Parquet data back into a new DataFrame
df_parquet = spark.read.parquet('videos-parquet')
# Display df_parquet (not the CSV-backed df) so the output verifies the Parquet round trip
df_parquet.show()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [11]:
# Persist the Parquet-backed DataFrame as the catalog table 'tb_videos'.
# mode('overwrite') replaces the table when it is already registered in the catalog;
# the default save mode raises an error if the table exists, so re-running this cell
# in the same session would fail.
df_parquet.write.mode('overwrite').saveAsTable('tb_videos')

In [12]:
# List the tables registered in the Spark catalog
spark.catalog.listTables()

[Table(name='tb_videos', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False)]

In [13]:
# Query the saved table through Spark SQL and display the result
videos_query = 'select * from tb_videos'
spark.sql(videos_query).show()

+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|_c0|               Title|   Video ID|Published At|Keyword|   Likes|Comments|      Views|
+---+--------------------+-----------+------------+-------+--------+--------+-----------+
|  0|Apple Pay Is Kill...|wAZZ-UWGVHI|  2022-08-23|   tech|  3407.0|   672.0|   135612.0|
|  1|The most EXPENSIV...|b3x28s61q3c|  2022-08-24|   tech| 76779.0|  4306.0|  1758063.0|
|  2|My New House Gami...|4mgePWWCAmA|  2022-08-23|   tech| 63825.0|  3338.0|  1564007.0|
|  3|Petrol Vs Liquid ...|kXiYSI7H2b0|  2022-08-23|   tech| 71566.0|  1426.0|   922918.0|
|  4|Best Back to Scho...|ErMwWXQxHp0|  2022-08-08|   tech| 96513.0|  5155.0|  1855644.0|
|  5|Brewmaster Answer...|18fwz9Itbvo|  2021-11-05|   tech| 33570.0|  1643.0|   943119.0|
|  6|Tech Monopolies: ...|jXf04bhcjbg|  2022-06-13|   tech|135047.0|  9367.0|  5937790.0|
|  7|I bought the STRA...|2TqOmtTAMRY|  2022-08-07|   tech|216935.0| 12605.0|  4782514.0|
|  8|15 Em

In [14]:
# Load the comments CSV (header row, inferred column types) and display it.
# NOTE(review): the `_c0` column is rendered much wider than its visible values, which
# suggests some rows may be mis-parsed (e.g. multi-line comment text) - confirm against the file.
df_comments = spark.read.csv('comments.csv', header=True, inferSchema=True)
df_comments.show()

+--------------+-----------+--------------------+------+---------+
|           _c0|   Video ID|             Comment| Likes|Sentiment|
+--------------+-----------+--------------------+------+---------+
|             0|wAZZ-UWGVHI|Let's not forget ...|  95.0|      1.0|
|             1|wAZZ-UWGVHI|Here in NZ 50% of...|  19.0|      0.0|
|             2|wAZZ-UWGVHI|I will forever ac...| 161.0|      2.0|
|             3|wAZZ-UWGVHI|Whenever I go to ...|   8.0|      0.0|
|             4|wAZZ-UWGVHI|Apple Pay is so c...|  34.0|      2.0|
|             5|wAZZ-UWGVHI|We’ve been houndi...|   8.0|      1.0|
|             6|wAZZ-UWGVHI|We only got Apple...|  29.0|      2.0|
|             7|wAZZ-UWGVHI|For now, I need b...|   7.0|      1.0|
|             8|wAZZ-UWGVHI|In the United Sta...|   2.0|      2.0|
|             9|wAZZ-UWGVHI|In Cambodia, we h...|  28.0|      1.0|
|            10|b3x28s61q3c|Wow, you really w...|1344.0|      2.0|
|            11|b3x28s61q3c|The lab is the mo...| 198.0|      

In [15]:
# Save the comments DataFrame in Parquet format.
# - The CSV-only 'header' option was dropped: Parquet keeps column names in its schema.
# - mode('overwrite') replaces an existing 'comments-parquet' directory; the default
#   save mode raises an error when the path already exists, which breaks re-runs.
df_comments.write.mode('overwrite').parquet('comments-parquet')