# Data Preparation

In [1]:
# Setup - Run only once per Kernel App
# Install a JDK into the kernel's conda environment
%conda install openjdk -y

# install PySpark (version pinned)
%pip install pyspark==3.2.0

# HTML is used at the bottom of this cell to emit the kernel-restart script
from IPython.core.display import HTML

# install numpy (version pinned)
%pip install  numpy==1.23.1

# restart kernel so the freshly installed packages are picked up;
# this is the cell's last expression, so the HTML object is what the cell displays
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.3.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrad

In [3]:
# Import pyspark and build (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

# Configure the builder step by step: app name, the hadoop-aws package, and
# the credentials provider class used for s3a access.
session_builder = SparkSession.builder.appName("PySparkApp")
session_builder = session_builder.config(
    "spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2"
)
session_builder = session_builder.config(
    "fs.s3a.aws.credentials.provider",
    "com.amazonaws.auth.ContainerCredentialsProvider",
)
spark = session_builder.getOrCreate()

# Print the Spark version of the active session
print(spark.version)



:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c450c22c-b6f3-41da-8688-b0b6340d43e7;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 420ms :: artifacts dl 23ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------

3.2.0


In [4]:
## other libs
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql.types import StringType
from urllib.parse import urlparse
import regex as re
import numpy as np

In [5]:
%%time
import sagemaker

# Resolve the default SageMaker bucket and build the s3a glob over the
# yyyy=* folders under the submissions prefix.
session = sagemaker.Session()
bucket = session.default_bucket()
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading submissions from {s3_path}")
# `header` is a CSV-reader option and has no meaning for Parquet, so it is not passed.
posts = spark.read.parquet(s3_path)
# NOTE: count() triggers a full scan of the dataset (the cell is timed with %%time).
print(f"shape of the posts dataframe is {posts.count():,}x{len(posts.columns)}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sagemaker-us-east-1-839279087569/project/submissions/yyyy=*


24/04/03 16:51:09 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/04/03 16:51:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

shape of the posts dataframe is 1,847,138x68
CPU times: user 2.83 s, sys: 298 ms, total: 3.12 s
Wall time: 3min 34s


                                                                                

In [6]:
# reading comments: glob over the yyyy=* folders under the comments prefix
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
# `header` is a CSV-reader option and has no meaning for Parquet, so it is not passed.
comments = spark.read.parquet(s3_path)

## Create Source column

In [8]:
def extract_refined_domain(url):
    """Reduce a URL to a short source label.

    The label is the first meaningful part of the host name (a leading
    ``www`` or ``amp`` label is skipped), lower-cased. Two special cases:

    * Google AMP / redirect links (host contains ``www.google.com``) are
      looked through: the wrapped target after ``amp/s/`` or ``url=`` is
      used instead. If no wrapped target is present, the Google host
      itself is used.
    * Twitter links return the first path segment (the account name),
      with its original casing.

    Parameters
    ----------
    url : str or None
        The URL to label.

    Returns
    -------
    str or None
        The label, or ``None`` when ``url`` is ``None`` or cannot be
        processed (for example a host consisting only of ``www``).
    """
    # Local import keeps the helper self-contained when it is used as a UDF
    from urllib.parse import unquote

    if url is None:
        return None

    try:
        parsed_url = urlparse(url)
        # By default the label comes from the URL's own host
        effective = parsed_url

        # Handling Google AMP / redirect URLs: look through to the wrapped target
        if "www.google.com" in parsed_url.netloc:
            target = None
            if 'amp/s/' in url:
                target = url.split('amp/s/')[1]
            elif 'url=' in url:
                # redirect targets may be percent-encoded
                target = unquote(url.split('url=')[1])
            if target:
                # Only prepend a scheme when the target does not already carry one,
                # otherwise the scheme itself would be parsed as the host
                if not target.split('/', 1)[0].endswith(':'):
                    target = 'http://' + target
                effective = urlparse(target)

        domain = effective.netloc

        # Splitting the domain and returning the first meaningful part
        domain_parts = domain.split('.')
        domain_part = domain_parts[1] if domain_parts[0] in ['www', 'amp'] else domain_parts[0]

        # For Twitter, returning the username (taken from the same URL the domain came from)
        if "twitter.com" in domain:
            path_parts = effective.path.split('/')
            return path_parts[1] if len(path_parts) > 1 else domain_part

        return domain_part.lower()
    except Exception:
        # Best-effort: unparseable input yields no label rather than raising
        return None
    

# Wrap the Python helper as a Spark UDF that returns a string column
extract_refined_domain_udf = f.udf(extract_refined_domain, StringType())

# Derive the source label from each post's url
posts = posts.withColumn("source", extract_refined_domain_udf("url"))

# filter out source=reddit (per the original note, this means the post is deleted)
# and empty sources, then rows whose url is empty or missing
source_col = f.col("source")
url_col = f.col("url")
posts = posts.filter((source_col != "reddit") & (source_col != ""))
posts = posts.filter((url_col != "") & (url_col.isNotNull()))

# Keep only the columns used by the rest of the notebook
kept_columns = ["title", "url", "subreddit", "score", "source", "created_utc", "num_comments"]
posts = posts.select(*kept_columns)

n_rows = posts.count()
n_cols = len(posts.columns)
print(f"shape of the posts dataframe is {n_rows:,}x{n_cols}")
posts.show(5, truncate=False)

                                                                                

shape of the posts dataframe is 1,753,936x7


[Stage 8:>                                                          (0 + 1) / 1]

+----------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+-----+----------+-------------------+------------+
|title                                                                             |url                                                                                                                                                                                                                      |subreddit|score|source    |created_utc        |num_comments|
+----------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 663, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 564, in read_int
    raise EOFError
EOFError
Traceback (most recent call last):                                              
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/opt/conda/lib/python3.10/site-packages/pyspark/python/lib/pyspark.zip/p

### Top sources across all subreddits

In [9]:
# Aggregate posts per source: number of posts and summed score.
# The aggregates are referenced through the `f` alias: a bare `count` is not
# defined in this notebook, and a bare `sum` is Python's builtin rather than
# the Spark aggregate.
agg_df = posts.groupBy('source').agg(
    f.count('source').alias('frequency'),
    f.sum('score').alias('total_score')
)

# Order by frequency and total_score in descending order and take the top 100
top_agg_df = agg_df.orderBy(f.col('frequency').desc(), f.col('total_score').desc()).limit(100)


In [10]:
# Destination of the top-100 source table
csv_path = '../data/csv/top100_source.csv'

# Write the DataFrame as CSV with a header row, replacing any previous output
top_agg_df.write.mode("overwrite").option("header", True).csv(csv_path)

                                                                                

## Top sources for each subreddit

In [11]:
# Aggregate posts per (source, subreddit): number of posts and summed score.
# The aggregates are referenced through the `f` alias: a bare `count` is not
# defined in this notebook, and a bare `sum` is Python's builtin rather than
# the Spark aggregate.
agg_df = posts.groupBy('source', 'subreddit').agg(
    f.count('source').alias('frequency'),
    f.sum('score').alias('total_score')
)

# Order by frequency in descending order
top_agg_df = agg_df.orderBy('frequency', ascending=False)

# Show the results
top_agg_df.show(truncate=False)



+----------------+----------+---------+-----------+
|source          |subreddit |frequency|total_score|
+----------------+----------+---------+-----------+
|Reuters         |newsbotbot|319430   |326981     |
|rajacreator     |news      |68724    |68724      |
|AP              |newsbotbot|63863    |67048      |
|bluzz           |worldnews |57339    |57337      |
|BBCWorld        |newsbotbot|33872    |34721      |
|tellygupshup    |news      |31600    |31596      |
|popularnews     |news      |30489    |30488      |
|reuters         |worldnews |30239    |22228485   |
|news            |news      |26432    |3752334    |
|youtube         |news      |24798    |24722      |
|bbc             |worldnews |20050    |9322004    |
|newsnationglobal|news      |17184    |17183      |
|theguardian     |worldnews |16865    |13074555   |
|roknonline      |news      |15960    |15960      |
|BBCNews         |newsbotbot|15717    |16134      |
|cnn             |news      |13803    |21969606   |
|youtube    

                                                                                

In [12]:
# Report rows x columns of the per-(source, subreddit) aggregate
agg_rows = agg_df.count()
agg_cols = len(agg_df.columns)
print(f"shape of the agg_df dataframe is {agg_rows:,}x{agg_cols}")



shape of the agg_df dataframe is 60,670x4


                                                                                

In [13]:
# Show the aggregate ordered by summed score, highest first
agg_df.orderBy(f.col('total_score').desc()).show()



+---------------+---------+---------+-----------+
|         source|subreddit|frequency|total_score|
+---------------+---------+---------+-----------+
|        reuters|worldnews|    30239|   22228485|
|            cnn|     news|    13803|   21969606|
|        nbcnews|     news|     8055|   18513337|
|         apnews|     news|    10870|   17623972|
|businessinsider|worldnews|     4554|   15379993|
|    theguardian|worldnews|    16865|   13074555|
|           cnbc|     news|     4539|   13032172|
|        cbsnews|     news|     3748|   12349286|
|        reuters|     news|     9508|   12002004|
|    theguardian|     news|     7783|   11855783|
|           news|worldnews|     8096|   10240638|
|            bbc|worldnews|    20050|    9322004|
|        abcnews|     news|     5304|    9320267|
|       newsweek|worldnews|     3512|    7927665|
|         pravda|worldnews|     2457|    6852223|
|            bbc|     news|     8984|    6051611|
|         apnews|worldnews|     9537|    5923777|


                                                                                

In [14]:
# Define a window specification: one partition per subreddit, ordered by
# frequency and then total_score, both descending
windowSpec = Window.partitionBy("subreddit").orderBy(f.col("frequency").desc(), f.col("total_score").desc())

# Rank each row within its subreddit based on frequency and score.
# row_number is referenced through the `f` alias; no bare `row_number` is
# imported in this notebook.
ranked_df = agg_df.withColumn("rank", f.row_number().over(windowSpec))

# Filter for the top 100 in each subreddit
top_100_each_subreddit = ranked_df.filter(f.col("rank") <= 100)

# Show the results
top_100_each_subreddit.show(truncate=False)



+----------------+---------+---------+-----------+----+
|source          |subreddit|frequency|total_score|rank|
+----------------+---------+---------+-----------+----+
|rajacreator     |news     |68724    |68724      |1   |
|tellygupshup    |news     |31600    |31596      |2   |
|popularnews     |news     |30489    |30488      |3   |
|news            |news     |26432    |3752334    |4   |
|youtube         |news     |24798    |24722      |5   |
|newsnationglobal|news     |17184    |17183      |6   |
|roknonline      |news     |15960    |15960      |7   |
|cnn             |news     |13803    |21969606   |8   |
|cde             |news     |13240    |13276      |9   |
|newspotng       |news     |12812    |12811      |10  |
|apnews          |news     |10870    |17623972   |11  |
|youtu           |news     |10686    |10672      |12  |
|timesofindia    |news     |9784     |9788       |13  |
|reuters         |news     |9508     |12002004   |14  |
|laquilablog     |news     |9118     |9118      

                                                                                

In [15]:
# Report rows x columns of the ranked table (the label names the frame actually measured)
print(f"shape of the top_100_each_subreddit dataframe is {top_100_each_subreddit.count():,}x{len(top_100_each_subreddit.columns)}")



shape of the agg_df dataframe is 204x5


                                                                                

## Creating aggregated CSV files

In [7]:
# comments containing misinformation-related keywords
# The pattern matches both the correct spelling "propaganda" and the
# misspelling "propoganda" that the original pattern used on its own.
# NOTE: matching is case-sensitive.
misinfo_pattern = r'fake news|bullshit|misinfo|clickbait|unreliable|prop[ao]ganda'
misinfo = (
    comments
    .withColumn('misinfo_class', f.when(comments.body.rlike(misinfo_pattern), True).otherwise(False))
    .groupBy('misinfo_class')
    .count()
    .toPandas()
)

                                                                                

In [8]:
# Save the misinformation class counts without the pandas index
misinfo.to_csv(path_or_buf='../data/csv/misinformation.csv', index=False)

In [21]:
# comments per year: take the year of created_utc, count rows per year, collect to pandas
comments_year = (
    comments
    .withColumn('year', f.year(f.col('created_utc')))
    .groupBy(['year'])
    .count()
    .toPandas()
)

                                                                                

In [22]:
# Save the per-year comment counts without the pandas index
comments_year.to_csv(path_or_buf='../data/csv/comments_per_year.csv', index=False)

In [23]:
# comments on day of the week: f.dayofweek of created_utc, count rows per day, collect to pandas
day_of_week = (
    comments
    .withColumn('day', f.dayofweek(f.col('created_utc')))
    .groupBy(['day'])
    .count()
    .toPandas()
)

                                                                                

In [24]:
# Save the per-weekday comment counts without the pandas index
day_of_week.to_csv(path_or_buf='../data/csv/comments_day.csv', index=False)

In [25]:
# submissions per year: take the year of created_utc, count rows per year, collect to pandas
submissions_year = (
    posts
    .withColumn('year', f.year(f.col('created_utc')))
    .groupBy('year')
    .count()
    .toPandas()
)

                                                                                

In [26]:
# Save the per-year submission counts without the pandas index
submissions_year.to_csv(path_or_buf='../data/csv//submissions_per_year.csv', index=False)

In [11]:
# submissions per day of the week: f.dayofweek of created_utc, count rows per day, collect to pandas
submissions_day = (
    posts
    .withColumn('day', f.dayofweek(f.col('created_utc')))
    .groupBy('day')
    .count()
    .toPandas()
)

                                                                                

In [12]:
# Save the per-weekday submission counts without the pandas index
submissions_day.to_csv(path_or_buf='../data/csv/submissions_day.csv', index=False)

In [28]:
# summary on comments score: Spark's summary() over the score column, collected to pandas
score_column = comments.select('score')
score = score_column.summary().toPandas()

                                                                                

In [29]:
# Save the score summary table without the pandas index
score.to_csv(path_or_buf='../data/csv/comments_score.csv', index=False)

## Analyze top users

In [15]:
# reading submissions: glob over the yyyy=* folders under the submissions prefix
# (plain string: the prefix has no placeholders, so no f-string is needed)
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
# `header` is a CSV-reader option and has no meaning for Parquet, so it is not passed.
submissions = spark.read.parquet(s3_path)

In [16]:
# Keep the columns needed for the author analysis and add the year of created_utc
author_df = (
    submissions
    .select("subreddit", "created_utc", "author")
    .withColumn("year", f.year("created_utc"))
)

In [17]:
# One frame per subreddit of interest
subreddit_col = author_df["subreddit"]
news_df = author_df.filter(subreddit_col == "news")
worldnews_df = author_df.filter(subreddit_col == "worldnews")

In [20]:
#function to aggregate data
def author_agg(df, num):
    """Count rows per author and keep the authors with more than ``num`` rows.

    Parameters
    ----------
    df : Spark DataFrame with an ``author`` column.
    num : threshold; only authors whose row count is strictly greater are kept.

    Returns
    -------
    Spark DataFrame with columns ``author`` and ``submission_count``.
    """
    per_author = df.groupBy("author").agg(f.count("*").alias("submission_count"))
    return per_author.filter(f.col("submission_count") > num)

In [21]:
# Authors in the news frame with more than 5000 rows
news_df_filtered = author_agg(df=news_df, num=5000)

In [22]:
#getting top users: order by submission_count descending, keep the first 11 rows, collect to pandas
top_news_authors = (
    news_df_filtered
    .orderBy(f.col("submission_count").desc())
    .limit(11)
    .toPandas()
)

                                                                                

In [24]:
# Save the top news authors without the pandas index
top_news_authors.to_csv(path_or_buf='../data/csv/top_news_authors.csv', index=False)

In [25]:
#same as above, for the worldnews frame: authors with more than 5000 rows
worldnews_df_filtered = author_agg(df=worldnews_df, num=5000)

In [26]:
# Order by submission_count descending, keep the first 11 rows, collect to pandas
top_worldnews_authors = (
    worldnews_df_filtered
    .orderBy(f.col("submission_count").desc())
    .limit(11)
    .toPandas()
)

                                                                                

In [27]:
# Display the collected top worldnews authors (bare last expression, so the cell renders the frame)
top_worldnews_authors

Unnamed: 0,author,submission_count
0,[deleted],114086
1,harryg888,57339
2,theworldnnews,5509
3,AdrienSergent,5139
4,Infoseven7,5107


In [28]:
# Save the top worldnews authors without the pandas index
top_worldnews_authors.to_csv(path_or_buf='../data/csv/top_worldnews_authors.csv', index=False)