In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("YouTube Trending Analysis")
    .getOrCreate()
)

In [5]:
# Column layout of the trending-videos CSV file.

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType

# Fields are appended in CSV column order; every column is declared nullable.
# Identifiers, dates and free text stay as strings, the four counters are
# integers, and the two "*_disabled" flags are booleans.
schema = (
    StructType()
    .add("video_id", StringType(), True)
    .add("title", StringType(), True)
    .add("publishedAt", StringType(), True)
    .add("channelId", StringType(), True)
    .add("channelTitle", StringType(), True)
    .add("categoryId", StringType(), True)
    .add("trending_date", StringType(), True)
    .add("tags", StringType(), True)
    .add("view_count", IntegerType(), True)
    .add("likes", IntegerType(), True)
    .add("dislikes", IntegerType(), True)
    .add("comment_count", IntegerType(), True)
    .add("thumbnail_link", StringType(), True)
    .add("comments_disabled", BooleanType(), True)
    .add("ratings_disabled", BooleanType(), True)
    .add("description", StringType(), True)
)

In [6]:
# Relative path of the raw trending-videos CSV.
file_path = 'data/US_youtube_trending_data.csv'  # Adjust the path accordingly

# Read the CSV with the explicit schema instead of letting Spark infer types.
# The first line is a header row. The double quote is used both as the quote
# and as the escape character, and multiLine lets a single record span several
# physical lines (presumably needed for the free-text columns - confirm).
df = (
    spark.read
    .schema(schema)
    .options(header=True, quote='"', escape='"', multiLine=True)
    .csv(file_path)
)

# Preview the loaded rows (show() prints at most 20 rows by default).
df.show()

                                                                                

+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+
|   video_id|               title|         publishedAt|           channelId|        channelTitle|categoryId|       trending_date|                tags|view_count| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|         description|
+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+----------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+
|3C66w5Z0ixs|I ASKED HER TO BE...|2020-08-11T19:20:14Z|UCvtRTOMP2TqYqu51...|            Brawadis|        22|2020-08-12T00:00:00Z|brawadis|prank|ba...|   1514614|156908|    5855|        35313|ht

In [7]:
# List every column with its data type and nullability.
df.printSchema()

# Preview only the first five rows.
df.show(n=5)


root
 |-- video_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- publishedAt: string (nullable = true)
 |-- channelId: string (nullable = true)
 |-- channelTitle: string (nullable = true)
 |-- categoryId: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- view_count: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: integer (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: boolean (nullable = true)
 |-- ratings_disabled: boolean (nullable = true)
 |-- description: string (nullable = true)

+-----------+--------------------+--------------------+--------------------+-------------+----------+--------------------+--------------------+----------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+
|   video_id|               title|         publi

## Data Type Conversion

In [8]:
from pyspark.sql.functions import col

# Counter columns that must be integers for the numeric steps further down.
# The explicit read schema already declares these four columns as IntegerType,
# so with the current loader this cast changes nothing; it only matters if the
# CSV is ever read with an all-string or inferred schema instead.
columns_to_convert = ['view_count', 'likes', 'dislikes', 'comment_count']

# Apply every cast in one projection rather than calling withColumn() in a
# loop: each withColumn() call adds another projection to the query plan,
# whereas a single select() keeps the plan flat. Column order and names are
# preserved because we iterate over df.columns and alias each cast back to
# its original name.
df = df.select([
    col(name).cast('int').alias(name) if name in columns_to_convert else col(name)
    for name in df.columns
])

### Date Formatting

In [9]:
# NOTE(review): this whole cell is currently disabled - every line below is
# commented out, so publishedAt and trending_date stay plain strings in `df`.
# NOTE(review): the trending_date sample printed by df.show() earlier in the
# notebook is '2020-08-12T00:00:00Z', which does not look like it matches the
# "yyyy-MM-dd" pattern used below - confirm the pattern before re-enabling.

# from pyspark.sql.functions import to_date

# # Convert date columns to date type
# df = df.withColumn("publishedAt", to_date(col("publishedAt"), "yyyy-MM-dd'T'HH:mm:ss'Z'"))
# df = df.withColumn("trending_date", to_date(col("trending_date"), "yyyy-MM-dd"))

# # Show the first few rows of the DataFrame
# df.show(5)

### Handling Missing Values

In [10]:
# Option A: keep only the rows in which no column is null.
df_cleaned = df.dropna(how="any")

# Option B: keep every row and replace nulls with 0 in the listed counter
# columns only; nulls in all other columns are left as they are.
df_filled = df.fillna({"likes": 0, "dislikes": 0, "comment_count": 0})  # Example for numeric columns


## Saving Processed Data


In [11]:
# Persist the DataFrame as Parquet, making sure the target directory exists.
import os

# Build the destination from the current working directory instead of a
# machine-specific absolute home path, so the notebook is not tied to one
# user's machine.
# NOTE(review): assumes the notebook is run from the project's `youtube/`
# folder, i.e. the same place the relative input path 'data/...' is resolved
# from - confirm.
output_path = os.path.abspath(
    os.path.join("data", "processed", "processed_US_youtube_trending_data.parquet")
)

# Create the parent directory if it is missing; exist_ok avoids an error on re-runs.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# NOTE(review): this writes `df`, not `df_cleaned` or `df_filled` from the
# missing-values step - confirm which frame is meant to be saved.
# mode('overwrite') replaces any output already present at this path.
df.write.mode('overwrite').parquet(output_path)

                                                                                

24/03/29 18:29:44 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2984655 ms exceeds timeout 120000 ms
24/03/29 18:29:44 WARN SparkContext: Killing executors is not supported by current scheduler.
24/03/29 18:29:44 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$