In [1]:
import sys

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
35,application_1671409217564_0057,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [1]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import *
from operator import add

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
28,application_1671409217564_0050,pyspark3,idle,Link,Link,✔


SparkSession available as 'spark'.


In [16]:
filePath = 'abfs://dda-2022-12-15t21-04-18-212z@ddasta.dfs.core.windows.net/reddit/2020/'
years = ['RC_2020-10.json', 'RC_2020-11.json', 'RC_2020-12.json']
# years = ['RC_2020-10.json']
jsonFiles = []

for y in years:
    p = filePath+y
    jsonFiles.append(p)


schema = StructType([
    StructField('subreddit', StringType(), nullable=True),
    StructField('body', StringType(), nullable=True)
])

df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

# Loop through the list of JSON files and read them
for file in jsonFiles:
    temp_df = spark.read.json(file, schema=schema)
    df = df.union(temp_df)

# Print the schema of the DataFrame
df.printSchema()

root
 |-- subreddit: string (nullable = true)
 |-- body: string (nullable = true)

In [17]:
df.take(1)

[Row(subreddit='ApexUncovered', body='[deleted]')]

# Text Analysis

## 1 - Write a job to find subreddits where users scream a lot

### Approach 

The idea is that first we can groupBy the subreddits, so now our keys are the subreddits. Then we can use a forEach and go through the body.
Here we need to keep a track of the total number of word in this subreddit, and total number of capitalized words. We then take the percentage
If the percentage is less than 20%, we then do not consider this subreddit as one where users scream alot, and can filter this out
Out of the remaining subreddits, we can then sort, or take the top 5 subreddits where people scream a lot

A good idea might be to remove subreddits below a certain number of posts or comments, so that way we can ensure that we do not get some small or random subreddit where all people do is shout

orrrr

Group by subreddits, go through each comment, assign a weighted score, if 80% of the body is capitalized, then assign a score of 1, if 60% is capitalized assign a score of 0.8, and so on

Get the subreddits with the max score

Lets try finding subreddits which are not so popular but people scream alot, so subreddits with less than a million posts and more than 50,000 posts
This will ensure that we get some esoteric subreddit

In [49]:
# body is an array
def screamScore (body):
    if (not isinstance(body, str) and body == '[deleted]'):
        return 0
    
    splitWords = body.split()
    totalWords = len(splitWords)
    
    capitalizedWords = 0
    
    for word in splitWords:
        if word.isupper():
            capitalizedWords += 1
    
    if (capitalizedWords == 0):
        return 0
    percentageOfCapitalizedWords = capitalizedWords/totalWords
    return percentageOfCapitalizedWords

# screamScore_udf = udf(screamScore)

In [25]:
subredditScreamScore = df.rdd.map(lambda row: (row.subreddit, screamScore(row.body)))
type(subredditScreamScore)

<class 'pyspark.rdd.PipelinedRDD'>

In [26]:
subredditScreamScore.take(1)

[('ApexUncovered', 0)]

In [27]:
summedUpScore = subredditScreamScore.reduceByKey(add)

In [28]:
# else we get a divide by zero error
filterOutZeros = summedUpScore.filter(lambda row: row[1] != 0)

In [29]:
filterOutZeros.cache()

PythonRDD[94] at RDD at PythonRDD.scala:54

In [30]:
topScores = filterOutZeros.sortBy(lambda row: row[1], ascending=False)

In [31]:
topScores.take(10)

[('AskReddit', 691090.521987397), ('memes', 394380.4005751417), ('AskOuija', 374777.1909179756), ('wallstreetbets', 299794.6291793922), ('politics', 272700.8543230486), ('AmItheAsshole', 219580.70116264967), ('nfl', 208655.37449750188), ('teenagers', 207598.3396334076), ('RedditSessions', 177383.2983844934), ('fantasyfootball', 138710.6550371505)]

# Readability Score

### Github is not showing the graph in the jupyer notebook. We have added an image separately in the repository.

Same approach as before, but now we have a different score

In [18]:
import spacy
# from spacy_syllables import SpacySyllables

nlp = spacy.load("en_core_web_sm")

In [19]:
def fleshKinkaid (body):
    splitWord = body.split()
    totalWords = len(splitWord)
    
    complexWords = 0
    vowels = "aeiouy"
    totalSyllables = 0
    
    for word in splitWord:  
        # https://stackoverflow.com/questions/46759492/syllable-count-in-python
        count = 0
        if (word[0] in vowels):
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if (word.endswith("e")):
            count -= 1
        if (count == 0):
            count += 1
        
        if (count >= 3):
            complexWords += 1
        
        totalSyllables += count
    
    doc = nlp(body)
    numberOfSentences = len(list(doc.sents))
    
    fleshKincaid = 206.835 - ( (1.015* (totalWords/numberOfSentences )) - ( 84.6 * (totalSyllables/totalWords)))
    return fleshKincaid

def gunningFrogIndex(body):
    splitWord = body.split()
    totalWords = len(splitWord)
    
    complexWords = 0
    vowels = "aeiouy"
    
    for word in splitWord:  
        # https://stackoverflow.com/questions/46759492/syllable-count-in-python
        count = 0
        if (word[0] in vowels):
            count += 1
        for index in range(1, len(word)):
            if word[index] in vowels and word[index - 1] not in vowels:
                count += 1
        if (word.endswith("e")):
            count -= 1
        if (count == 0):
            count += 1
        
        if (count >= 3):
            complexWords += 1
    
    doc = nlp(body)
    numberOfSentences = len(list(doc.sents))
    
    gunningFrogIndex = 0.4*((totalWords/numberOfSentences) + (100 * (complexWords/totalWords)))
    return gunningFrogIndex

In [20]:
gfScoreUDF = udf(gunningFrogIndex)
fkScoreUDF = udf(fleshKinkaid)

In [21]:
df_filtered = df.filter((df.subreddit == 'soccer') | (df.subreddit == 'nba'))

In [22]:
df_readability_scores = df_filtered.withColumn("flesch_kincaid_grade", fkScoreUDF(df_filtered.body)) \
                                   .withColumn("gunning_fog_index", gfScoreUDF(df_filtered.body)).cache()

In [50]:
df_readability_scores.take(1)

[Row(subreddit='soccer', body='[Streamable mirror](https://streamable.com/62rm14)[](/9pasfs)', flesch_kincaid_grade='585.505', gunning_fog_index='20.8')]

In [23]:
dfSoccer = df_readability_scores.filter(df_readability_scores.subreddit == 'soccer').cache()
dfNba = df_readability_scores.filter(df_readability_scores.subreddit == 'nba').cache()

In [24]:
take1000Soccer = dfSoccer.take(1000)

In [25]:
take1000Nba = dfNba.take(1000)

In [32]:
take1000Soccer

[Row(subreddit='soccer', body='[Streamable mirror](https://streamable.com/62rm14)[](/9pasfs)', flesch_kincaid_grade='585.505', gunning_fog_index='20.8'), Row(subreddit='soccer', body="If this isn't sarcasm ol mate Howard is batting above his average.", flesch_kincaid_grade='328.605', gunning_fog_index='8.133333333333333'), Row(subreddit='soccer', body='Lol why do you keep using tbvh you dont have to be very honest just tbh works too', flesch_kincaid_grade='287.265', gunning_fog_index='7.2'), Row(subreddit='soccer', body='[deleted]', flesch_kincaid_grade='459.62', gunning_fog_index='40.400000000000006'), Row(subreddit='soccer', body='Sheffield United defense probably stronger than the omnishambles we have', flesch_kincaid_grade='357.42499999999995', gunning_fog_index='12.0'), Row(subreddit='soccer', body='Midtjylland, Basaksehir, Krasnodar and... you know the other one', flesch_kincaid_grade='357.5', gunning_fog_index='16.933333333333334'), Row(subreddit='soccer', body='Müller ist a tea

In [43]:
type(take1000Soccer[0].body)

<class 'str'>

In [26]:
take1000Nba

[Row(subreddit='nba', body='lmao', flesch_kincaid_grade='290.42', gunning_fog_index='0.4'), Row(subreddit='nba', body='I think letting the person who has coached this team the whole season coach would give them the best chance.', flesch_kincaid_grade='292.285', gunning_fog_index='8.0'), Row(subreddit='nba', body="You're tripping\n\n[https://www.youtube.com/watch?v=PoFbqjZR2BI](https://www.youtube.com/watch?v=PoFbqjZR2BI)", flesch_kincaid_grade='626.79', gunning_fog_index='14.533333333333331'), Row(subreddit='nba', body="Your greatest ability is availability. MJ couldn't handle the pressure so he quit twice. Dont give me that crap he wanted to play baseball. He quit. If i was an employer I'd rather have someone who is great and always available, rather one who quit everytime something got hard.", flesch_kincaid_grade='328.10432653061224', gunning_fog_index='8.81795918367347'), Row(subreddit='nba', body="Gotta play if off by complimenting her skin, y'all need to think on your feet.", fle

In [4]:
import matplotlib.pyplot as plt

In [5]:
from matplotlib.pyplot import figure
figure(figsize=(80, 80), dpi=100)

<Figure size 8000x8000 with 0 Axes>

# Plot results

The legends are the following :

blue - r/soccer flesh kinkaid score

red - r/nba flesh kinkaid score



green - r/soccer gunning score

black - r/nba gunning score


Plot is kinda cluttered, but it is clearly visible nba fans have a much higher gunning frog score and a lower fk score

In [6]:
xAxisValues = []

for i in range(1, 1001, 1):
    xAxisValues.append(i)

fkScoreSoccer = []
fkScoreNba = []

gfScoreSoccer = []
gfScoreNba = []

# Append the scores for each subreddit to the appropriate lists
for row in take1000Soccer:
    gfScoreSoccer.append(row[3])
    fkScoreSoccer.append(row[2])

for row in take1000Nba:
    gfScoreNba.append(row[3])
    fkScoreNba.append(row[2])

# Plot the Flesch-Kincaid scores
plt.plot(xAxisValues, fkScoreSoccer, color="blue", label="fkScoreSoccer")
plt.plot(xAxisValues, fkScoreNba, color="red", label="fkScoreNba")

# Plot the Gunning Fog Index scores
plt.plot(xAxisValues, gfScoreSoccer, color="green", label="gfScoreSoccer")
plt.plot(xAxisValues, gfScoreNba, color="black", label="gfScoreNba")

plt.legend()
plt.title("Readability score accross subreddits")
plt.show()

HTML(value=u'<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAGPAAABhQCAYAAABnTQ9bAAAAOXRFWHRTb2Z0d2FyZ…

In [34]:
import matplotlib.rcsetup as rcsetup
print(rcsetup.all_backends)

['GTK3Agg', 'GTK3Cairo', 'GTK4Agg', 'GTK4Cairo', 'MacOSX', 'nbAgg', 'QtAgg', 'QtCairo', 'Qt5Agg', 'Qt5Cairo', 'TkAgg', 'TkCairo', 'WebAgg', 'WX', 'WXAgg', 'WXCairo', 'agg', 'cairo', 'pdf', 'pgf', 'ps', 'svg', 'template', 'ms_inline']

In [8]:
minScoreFkSoccer = min(fkScoreSoccer)
maxScoreFkSoccer = max(fkScoreSoccer)

print(minScoreFkSoccer)
print(maxScoreFkSoccer)

1052.3274999999999
882.62

In [9]:
minScoreFkNba = min(fkScoreNba)
maxScoreFkNba = max(fkScoreNba)

print(minScoreFkNba)
print(maxScoreFkNba)

233.73860759493672
798.02