In [1]:
import org.apache.spark

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.226.1:4040
SparkContext available as 'sc' (version = 3.5.1, master = local[*], app id = local-1739448461710)
SparkSession available as 'spark'


import org.apache.spark


In [2]:
// Base directories, relative to the notebook's working directory.
val path_to_bigDatasets: String = "../../../../datasets/big/"
val path_to_datasets: String = "../../../../datasets/"

// Full Reddit COVID dataset files (posts and comments).
val path_Fullml_posts: String = s"${path_to_bigDatasets}the-reddit-covid-dataset-posts.csv"
val path_Fullml_comments: String = s"${path_to_bigDatasets}the-reddit-covid-dataset-comments.csv"

// Smaller sample files used by the cells below.
val path_sample_posts: String = s"${path_to_datasets}postsSample.csv"
val path_sample_comments: String = s"${path_to_datasets}commentsSample.csv"

path_to_bigDatasets: String = ../../../../datasets/big/
path_to_datasets: String = ../../../../datasets/
path_Fullml_posts: String = ../../../../datasets/big/the-reddit-covid-dataset-posts.csv
path_Fullml_comments: String = ../../../../datasets/big/the-reddit-covid-dataset-comments.csv
path_sample_posts: String = ../../../../datasets/postsSample.csv
path_sample_comments: String = ../../../../datasets/commentsSample.csv


In [3]:
import java.util.Calendar
import org.apache.spark.sql.SaveMode
import org.apache.spark.HashPartitioner

/** Parsers for the CSV lines of the Reddit COVID posts and comments datasets.
  *
  * Lines are split on a plain comma (no CSV quoting rules are applied), so a
  * line whose fields contain embedded commas ends up with an unexpected column
  * count and is reported through the sentinel tuple described on each parser.
  */
object CovidConversationsParser {

    import java.time.{Instant, ZoneOffset}
    import scala.util.Try

    val commaSplit = ","
    val quotes = "\""

    /** Convert an epoch-seconds timestamp (String) to its day of month (Int).
    *
    *  The day is computed in UTC, so the result does not depend on the JVM's
    *  default time zone.
    *
    *  @param timestamp epoch seconds as text; surrounding whitespace is ignored
    *  @return day of month in the range 1..31
    *  @throws NumberFormatException if the text is not a valid Long
    *  @throws java.time.DateTimeException if the value is outside the range supported by Instant
    */
    def dayFromTimestamp(timestamp: String): Int =
        Instant.ofEpochSecond(timestamp.trim.toLong).atZone(ZoneOffset.UTC).getDayOfMonth

    /** Trimmed field, or the literal "None" when the field is blank. */
    private def orNone(field: String): String = {
        val trimmed = field.trim
        if (trimmed.nonEmpty) trimmed else "None"
    }

    /** Integer value of a field after trimming and stripping double quotes. */
    private def unquotedInt(field: String): Int =
        field.trim.replace(quotes, "").toInt

    /** Function to parse reddit posts
    *
    *  The first column (index 0) is ignored. Blank url/selftext fields are
    *  replaced by the string "None".
    *
    *  @param line line that has to be parsed
    *  @return tuple containing id,subreddit.id,subreddit.name,subreddit.nsfw,day of month of created_utc,permalink,domain,url,selftext,title,score.
    *          When the line does not have exactly 12 columns, the sentinel
    *          Some(("", "", "", false, -1, "", "", "", "", "", -1)) is returned.
    *          None when a field cannot be converted (e.g. the header line).
    */
    def parseRedditPost(line: String): Option[(String, String, String, Boolean, Int, String, String, String, String, String, Int)] =
        Try {
            val input = line.split(commaSplit)
            if (input.length != 12) {
                ("", "", "", false, -1, "", "", "", "", "", -1)
            } else {
                (
                    input(1).trim,
                    input(2).trim,
                    input(3).trim,
                    input(4).trim.toBoolean,
                    dayFromTimestamp(input(5)),
                    input(6).trim,
                    input(7).trim,
                    orNone(input(8)),
                    orNone(input(9)),
                    input(10).trim,
                    unquotedInt(input(11))
                )
            }
        }.toOption

    /** Function to parse reddit comments
    *
    *  The first column (index 0) is ignored.
    *
    *  @param line line that has to be parsed
    *  @return tuple containing id,subreddit.id,subreddit.name,subreddit.nsfw,day of month of created_utc,permalink,body,sentiment,score.
    *          When the line does not have exactly 10 columns, the sentinel
    *          Some(("", "", "", false, -1, "", "", -0.1, -1)) is returned.
    *          None when a field cannot be converted (e.g. the header line).
    */
    def parseRedditComment(line: String): Option[(String, String, String, Boolean, Int, String, String, Double, Int)] =
        Try {
            val input = line.split(commaSplit)
            if (input.length != 10) {
                ("", "", "", false, -1, "", "", -0.1, -1)
            } else {
                (
                    input(1).trim,
                    input(2).trim,
                    input(3).trim,
                    input(4).trim.toBoolean,
                    dayFromTimestamp(input(5)),
                    input(6).trim,
                    input(7).trim,
                    input(8).trim.toDouble,
                    unquotedInt(input(9))
                )
            }
        }.toOption
}

import java.util.Calendar
import org.apache.spark.sql.SaveMode
import org.apache.spark.HashPartitioner
defined object CovidConversationsParser


In [None]:
// Extracts a random sample of the full dataset files and writes it as CSV.
// Posts variant (5% sample), currently disabled:
//sc.textFile(path_Fullml_posts).sample(false, 0.05).coalesce(1).toDF().write.format("csv").mode(SaveMode.Overwrite).save("../../../../datasets/sample")
// Comments variant (2% sample), collapsed to one partition so a single file is written.
sc.textFile(path_Fullml_comments)
    .sample(withReplacement = false, fraction = 0.02)
    .coalesce(1)
    .toDF()
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .save("../../../../datasets/sample")

In [4]:
/* Build the RDDs of posts and comments from the sample CSV files.
   Lines the parser rejects (None) are dropped by flatMap; lines with an
   unexpected column count come back as a sentinel tuple and are filtered out. */
val rddPosts = sc.textFile(path_sample_posts)
    .flatMap(line => CovidConversationsParser.parseRedditPost(line))
    .filter { post => post != ("","","",false,-1,"","","","","",-1) }
val rddComments = sc.textFile(path_sample_comments)
    .flatMap(line => CovidConversationsParser.parseRedditComment(line))
    .filter { comment => comment != ("", "", "", false, -1, "", "", -0.1, -1) }
// Peek at the first parsed comment as a sanity check.
rddComments.first()

rddPosts: org.apache.spark.rdd.RDD[(String, String, String, Boolean, Int, String, String, String, String, String, Int)] = MapPartitionsRDD[3] at filter at <console>:32
rddComments: org.apache.spark.rdd.RDD[(String, String, String, Boolean, Int, String, String, Double, Int)] = MapPartitionsRDD[7] at filter at <console>:34
res0: (String, String, String, Boolean, Int, String, String, Double, Int) = (hi1vjyo,2y77d,antiwork,false,26,https://old.reddit.com/r/antiwork/comments/qft1w1/i_work_for_a_lab_doing_covid_testing_everyday_ive/hi1vjyo/,Can I come love with you? I can do Covid testing for free and take up very little space.,0.8176,11)


In [5]:
// PART 1: aggregate posts on the temporal dimension and compute the percentage flagged as NSFW.
// Posts are: (id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc day, permalink, domain, url, selftext, title, score)
/* reduceByKey is preferred here because it:
    - avoids collecting all values of a key into memory before processing
    - performs the aggregation while the data is being processed
    - reduces network shuffle by combining data locally first
*/
val percentageNSFWPosts = rddPosts
    .map { post =>
        // key: day (field 5); value: (post count, 1 if NSFW else 0)
        val nsfwFlag = if (post._4) 1 else 0
        (post._5, (1, nsfwFlag))
    }
    .reduceByKey((left, right) => (left._1 + right._1, left._2 + right._2))
    .mapValues { counts =>
        val (total, nsfw) = counts
        // Share of NSFW posts for this key, as a percentage
        val percentage = (nsfw.toDouble * 100) / total
        "Percentage NSFW Posts: " + percentage + "%"
    }

//.coalesce(1) // reduce to a single partition (for analysis: one partition means a single output file to inspect)
//.collect()

percentageNSFWPosts: org.apache.spark.rdd.RDD[(Int, String)] = MapPartitionsRDD[10] at mapValues at <console>:39


In [6]:
// PART 2: aggregate comments on the temporal dimension and compute the average sentiment
// and the percentage of comments flagged as NSFW.
// Comments are: (id, subreddit.id, subreddit.name, subreddit.nsfw, created_utc day, permalink, body, sentiment, score)

val avgSentimentWithNSFWComment = rddComments
    // key: day (field 5); value: (1 if NSFW else 0, comment count, sentiment)
    .map { comment => (comment._5, (if (comment._4) 1 else 0, 1, comment._8)) }
    .reduceByKey { case ((nsfwA, countA, sentimentA), (nsfwB, countB, sentimentB)) =>
        (
            nsfwA + nsfwB,           // NSFW counters
            countA + countB,         // total comment counters
            sentimentA + sentimentB  // sentiment sums
        )
    }
    .mapValues { case (nsfwCount, totalCount, totalSentiment) =>
        val nsfwPercentage = (nsfwCount * 100.0) / totalCount
        val avgSentiment = totalSentiment / totalCount
        ("Percentage NSFW Comments: "+ nsfwPercentage + "%", "Average Sentiment: " + avgSentiment)
    }
//.coalesce(1) // reduce to a single partition (for analysis: one partition means a single output file to inspect)
//.collect()

avgSentimentWithNSFWComment: org.apache.spark.rdd.RDD[(Int, (String, String))] = MapPartitionsRDD[13] at mapValues at <console>:37


In [7]:
// Destination directory of the final CSV result.
val path_output = "../../../../output/covidPostCommentResults"
// Unpersist every RDD the SparkContext currently reports as persisted.
sc.getPersistentRDDs.foreach { case (_, persistedRdd) => persistedRdd.unpersist() }

path_output: String = ../../../../output/covidPostCommentResults


In [8]:
// PART 3: join the two aggregates on the temporal key, then write the output as CSV.
// Note: the chain ends with a write, so finalResult is Unit.

val finalResult = percentageNSFWPosts
    .join(avgSentimentWithNSFWComment)
    // The join yields (Int, (String, (String, String))) records; flatten them into a 4-tuple before writing.
    .map { case (day, (postsNsfw, (commentsNsfw, avgSentiment))) =>
        (day, postsNsfw, commentsNsfw, avgSentiment)
    }
    .coalesce(1) // reduce to a single partition (for analysis: one partition means a single output file to inspect)
    .toDF()
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .save(path_output)

finalResult: Unit = ()
