In [18]:
import org.apache.spark

import org.apache.spark


In [19]:
val path_to_datasets = "../../../../datasets/big/"

val path_ml_posts = path_to_datasets + "the-reddit-covid-dataset-posts.csv"
val path_ml_comments = path_to_datasets + "the-reddit-covid-dataset-comments.csv"

path_to_datasets: String = ../../../../datasets/big/
path_ml_posts: String = ../../../../datasets/big/the-reddit-covid-dataset-posts.csv
path_ml_comments: String = ../../../../datasets/big/the-reddit-covid-dataset-comments.csv


In [20]:
import java.util.Calendar
import org.apache.spark.sql.SaveMode
import org.apache.spark.HashPartitioner

object CovidConversationsParser {

    val commaRegex = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
    val pipeRegex = "\\|(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)"
    val quotes = "\""

    /** Convert from timestamp (String) to year (Int) */
    def yearFromTimestamp(timestamp: String): Int = {
        val cal = Calendar.getInstance()
        cal.setTimeInMillis(timestamp.trim.toLong * 1000L)
        cal.get(Calendar.YEAR)
    }
    
    /** Combines splits created into their respective string values */
    def combineText(input: Array[String], initialSplit: Int): (String, Int) = {
        if (initialSplit >= input.length) return ("", initialSplit)
    
        var fullText = input(initialSplit)
        var nSplit = initialSplit + 1
    
        if (fullText.startsWith(quotes)) {
            while (nSplit < input.length && !input(nSplit).trim.endsWith(quotes)) {
                fullText += input(nSplit).trim
                nSplit += 1
            }
            if (nSplit < input.length) {
                fullText += input(nSplit).trim
                nSplit += 1
            }
        }
        (fullText, nSplit)
    }

    /** Function to parse reddit posts
    *
    *  @param line line that has to be parsed
    *  @return tuple containing id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score. none in case of input errors
    */
    def parseRedditPost(line: String): Option[(String, String, String, Boolean, Int, String, String, java.io.Serializable, java.io.Serializable, String, String)] = {
        try {
            val input = line.split(commaRegex)
            val url = if(input(8).trim.nonEmpty) input(8).trim else None
            val selftext = if(input(9).trim.nonEmpty) combineText(input, 9)._1 else None
            var n = combineText(input, 9)._2
            val title = combineText(input, n)._1
            n = combineText(input, n)._2
            val score = combineText(input, n)._1
            Some((input(1).trim, input(2).trim, input(3).trim, input(4).trim.toBoolean, yearFromTimestamp(input(5)), input(6).trim, input(7).trim, url, selftext, title, score))   
        } catch {
            case _: Exception => None
        }
    }

    /** Function to parse reddit comments
    *
    *  @param line line that has to be parsed
    *  @return tuple containing id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score. none in case of input errors
    */
    def parseRedditComment(line: String): Option[(String, String, String, Boolean, Int, String, String, Double, Int)] = {
        try {
            val input = line.split(commaRegex)
            val body = combineText(input, 7)._1
            var n = combineText(input, 7)._2
            val sentiment = combineText(input, n)._1.toDouble
            n = combineText(input, n)._2
            val score = combineText(input, n)._1.toInt
            Some((input(1).trim, input(2).trim, input(3).trim, input(4).trim.toBoolean, yearFromTimestamp(input(5)), input(6).trim, body, sentiment, score))
        } catch {
            case _: Exception => None
        }
    }
}

import java.util.Calendar
import org.apache.spark.sql.SaveMode
import org.apache.spark.HashPartitioner
defined object CovidConversationsParser


In [15]:
//case class CovidPost(id:String, subredditId:String, subredditName:String, subredditNSFW:boolean, created_UTC: Calendar,permalink:String, domain:String, URL:String, selftext:Option[String], title:String, score:Int)

//case class CovidComment(id:String, subredditId:String, subredditName:String, subredditNSFW:boolean, created_UTC: Calendar,permalink:String, body:String, sentiment:Float, score:Int)

In [22]:
//per recuperare dei samples dei files
//sc.textFile(path_ml_posts).sample(false, 0.2).coalesce(1).toDF().write.format("csv").mode(SaveMode.Overwrite).save("../../../../datasets/")

In [38]:
/*getting RDDs of Comments and Posts*/
val rddComments = sc.textFile(path_ml_posts).flatMap(CovidConversationsParser.parseRedditComment)
val rddPosts = sc.textFile(path_ml_comments).flatMap(CovidConversationsParser.parseRedditPost)
rddPosts.collect()

rddComments: org.apache.spark.rdd.RDD[(String, String, String, Boolean, Int, String, String, Double, Int)] = MapPartitionsRDD[74] at flatMap at <console>:52
rddPosts: org.apache.spark.rdd.RDD[(String, String, String, Boolean, Int, String, String, java.io.Serializable, java.io.Serializable, String, String)] = MapPartitionsRDD[77] at flatMap at <console>:53
res9: Array[(String, String, String, Boolean, Int, String, String, java.io.Serializable, java.io.Serializable, String, String)] = Array()


In [42]:
//PART 1: Aggregate on temporal dimension and obtain percentage of posts classified as NSFW
//Posts are :(id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score)
val percentageNSFWPosts = rddPosts.map(x => (x._5, if (x._4) 1 else 0))  // (created_utc, nsfw flag)
.reduceByKey(_ + _)  // count NSFW posts per timestamp
.mapValues(nsfwCount => {
    val totalPostsAtTime = rddPosts.filter(y => y._5 == y._5).count()
    (nsfwCount.toDouble / totalPostsAtTime) * 100
})

//.coalesce(1) //riduzione a singola partizione (per analisi, con singola partizione abbiamo singolo file da analizzare)
//.collect()

percentageNSFWPosts: org.apache.spark.rdd.RDD[(Int, Double)] = MapPartitionsRDD[87] at mapValues at <console>:51


In [41]:
//PART 2: Aggregate on temporal dimension and obtain average sentiment in comments and percentage of comments classified as NSFW
//Comments are: (id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,body,sentiment,score)
val avgSentimentWithNSFWPost = rddComments.map(x => (x._5, (if (x._4) 1 else 0, x._8)))  // (created_utc, (nsfw flag, sentiment))
.reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))  // aggregate NSFW count and sentiment
.mapValues(avgSentimentAndNSFWCount => {
    val totalCommentsAtTime = rddComments.filter(y => y._5 == y._5).count()
    val nsfwPercentage = (avgSentimentAndNSFWCount._1.toDouble / totalCommentsAtTime) * 100
    val avgSentiment = avgSentimentAndNSFWCount._2 / totalCommentsAtTime
    (nsfwPercentage, avgSentiment)
})

avgSentimentWithNSFWPost: org.apache.spark.rdd.RDD[(Int, (Double, Double))] = MapPartitionsRDD[84] at mapValues at <console>:51


In [43]:
// PART 3: Join on Temporal Dimension
val finalResult = percentageNSFWPosts.join(avgSentimentWithNSFWPost)

finalResult: org.apache.spark.rdd.RDD[(Int, (Double, (Double, Double)))] = MapPartitionsRDD[90] at join at <console>:49
