# NLP Topic Modeling and Sentiment 

In [1]:
# Setup - Run only once per Kernel App
%conda install https://anaconda.org/conda-forge/openjdk/11.0.1/download/linux-64/openjdk-11.0.1-hacce0ff_1021.tar.bz2

# install PySpark
!pip install sagemaker_pyspark

%pip install spark-nlp==5.1.3

%pip install numpy==1.23.1

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName("sagemaker-spark") \
.master("local[*]") \
.config("spark.driver.memory", "8G") \
.config("spark.driver.maxResultSize", "0") \
.config("spark.kryoserializer.buffer.max", "2000M") \
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2") \
.config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.ContainerCredentialsProvider") \
.getOrCreate()

print(spark.version)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d26c86fb-f692-410e-be68-e60da5fffae9;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in c

24/04/28 21:55:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


3.3.0


In [3]:
sc = spark.sparkContext

In [4]:
import sagemaker
session = sagemaker.Session()
bucket = session.default_bucket()
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading submissions from {s3_path}")
posts = spark.read.parquet(s3_path, header=True)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sagemaker-us-east-1-562166416351/project/submissions/yyyy=*
24/04/28 21:55:35 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


                                                                                

24/04/28 21:55:40 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [5]:
#reading comments
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
comments = spark.read.parquet(s3_path, header=True)

# Adding variables

In [7]:
import pyspark.sql.functions as f

In [8]:
comments = comments.withColumn('misinfo_class', 
                    f.when(comments.body.rlike(r'fake news|bullshit|misinfo|clickbait|unreliable|propoganda'), True)\
                    .otherwise(False))

# LDA

In [9]:
#from pyspark.ml.feature import Tokenizer, StopWordsRemover
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Lemmatizer, Normalizer, Tokenizer, StopWordsCleaner
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import chain

In [10]:
#create small df to use for LDA
small_df = posts.select('title', 'id')

In [11]:
documentAssembler = DocumentAssembler()       
documentAssembler.setInputCol('title')      
documentAssembler.setOutputCol('document')

DocumentAssembler_b9e0027956a5

In [12]:
#create tokenizer
tokenizer = Tokenizer() 
tokenizer.setInputCols(["document"]) 
tokenizer.setOutputCol("token")

Tokenizer_a6894445bce4

In [13]:
#normalize
normalizer = Normalizer() \
     .setInputCols(['token']) \
     .setOutputCol('normalized') \
     .setLowercase(True)

In [14]:
#remove stop words
StopWords = stopwords.words("english")
#adding news sources and stopwords in other languages
StopWords += ['reuters', 'في','از', 'ap', 'says', 'bbcworld', 'amp', 'rt', 'apentertainment', 'la', 'user', 'deleted',
             'क', 'di','در','آهنگ', 'de', 'el', 'en','دانلود', 'به', 'म']
stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setStopWords(StopWords)\
      .setCaseSensitive(False)

In [15]:
finisher = Finisher() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCols(["tokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(False)

In [16]:
#count vectorizer
cv = CountVectorizer(inputCol="tokens", outputCol="raw_features", vocabSize=5000, minDF=25)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")

In [17]:
#lda model 
lda = LDA()
lda.setK(8)
lda.setMaxIter(20)
lda.setSeed(13)

LDA_bc3b20696163

In [18]:
pipeline = Pipeline(stages=[documentAssembler, 
                            tokenizer, 
                            normalizer, 
                            stopwords_cleaner, 
                            finisher, 
                            cv, 
                            idf, 
                            lda])

In [19]:
model = pipeline.fit(small_df)

                                                                                

24/04/28 22:11:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/04/28 22:11:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

In [20]:
small_df = model.transform(small_df)

In [21]:
topics = model.stages[-1].describeTopics()
terms = model.stages[-3].vocabulary

In [22]:
#get word from index of term 
def indices_to_terms(indices, terms=terms):
        terms_subset = [terms[index] for index in indices]
        return terms_subset
# Defining Spark UDF from above function
udf_indices_to_terms = f.udf(indices_to_terms, ArrayType(StringType()))

topics = (
    topics
       .withColumn("terms", udf_indices_to_terms(f.col("termIndices")))
    )

In [23]:
topics.take(10)

                                                                                

[Row(topic=0, termIndices=[3, 5, 4, 16, 0, 59, 44, 73, 75, 1], termWeights=[0.01523288802545935, 0.010485966412544834, 0.010254985354030934, 0.008019252593690828, 0.007197787751295313, 0.00663928369728547, 0.005761071866242008, 0.004795471554043201, 0.004670295470041335, 0.0046590487674986625], terms=['ukraine', 'russian', 'russia', 'war', 'us', 'putin', 'minister', 'eu', 'media', 'new']),
 Row(topic=1, termIndices=[33, 1, 74, 127, 2, 223, 209, 7, 343, 432], termWeights=[0.009090685817965462, 0.0070490682605516995, 0.00697780724115689, 0.005716209232198239, 0.005388807972733376, 0.005135984878955494, 0.005023314583232904, 0.004732374604808212, 0.0043867201924320314, 0.0039042769001899473], terms=['video', 'new', 'twitter', 'youtube', 'covid', 'elon', 'musk', 'news', 'hong', 'kong']),
 Row(topic=2, termIndices=[9, 19, 95, 124, 93, 30, 108, 0, 48, 77], termWeights=[0.008739848346906, 0.00573235920679256, 0.005584539395288148, 0.0049359615855023315, 0.004845691343441333, 0.004172595441602

In [24]:
#naming topics 
topic_dict = {0: 'russia&ukraine', 1: 'social media', 2: 'current events', 3: 'tv shows', 4: 'covid', 
              5: 'foriegn relations', 6: 'emerging tech', 7: 'demographic info'}
              

In [25]:
#map to topics
mapping_expr = f.create_map([f.lit(x) for x in chain(*topic_dict.items())])

In [26]:
#udf to get the top topic 
max_topic = f.udf(lambda v:float(v.argmax()),FloatType())
#using mao and udf to create a topic column
topic = small_df.withColumn('topic_num', max_topic("topicDistribution"))\
.withColumn("topic", mapping_expr[f.col("topic_num")]).select('id','topic')

In [27]:
mini_posts = posts.select('created_utc', 'title', 'id')

In [28]:
#merging relevant columns wuth the topic column
merged_df = mini_posts.join(topic, 'id')

#### Adding comments

In [29]:
#renaming columns and removing the t3_ from the link id to get the post id on the comment
mini_comments = comments\
.withColumn('comment_created', f.col('created_utc')).withColumn('comment_id', f.col('id'))\
.withColumn('id', f.regexp_extract('link_id', 't3_(.*)$', 1))\
.select('comment_created','body','misinfo_class', 'comment_id', 'id')

In [30]:
#merging dataframes
total_df = merged_df.join(mini_comments, 'id')

In [31]:
#save clean df
bucket = session.default_bucket()
s3_path = f"s3a://{bucket}/project/clean_topic_data.parquet"
total_df.write.mode("overwrite").parquet(s3_path)

                                                                                

### Analysis

In [6]:
output_prefix_data_submissions = "project/clean_topic_data.parquet"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
total_df = spark.read.parquet(s3_path, header=True)

In [9]:
topic_misinfo_counts = total_df.filter(f.col('misinfo_class') == True).groupBy('topic').count().toPandas()

                                                                                

In [10]:
topic_misinfo_counts.to_csv('../data/csv/topic_misinfo_true_count.csv', index = False)

In [11]:
topic_counts = total_df.groupBy('topic').count().toPandas()

                                                                                

In [12]:
topic_counts.to_csv('../data/csv/topic_counts.csv', index = False)

## Sentiment Analysis

In [32]:
!pip install vaderSentiment textblob
# reference for VADER:
# https://medium.com/@tom.bailey.courses/sentiment-analysis-in-snowflake-using-python-31d7296abe1a
# https://github.com/cjhutto/vaderSentiment

[0m

In [33]:
output_prefix_data_submissions = "project/clean_topic_data.parquet"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
total_df = spark.read.parquet(s3_path, header=True)

In [34]:
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = true)
 |-- comment_id: string (nullable = true)



In [35]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyspark.sql.types as T
from textblob import TextBlob

In [36]:
# Use two libraries comparing the sentiment result
def vader_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    vader_score = analyzer.polarity_scores(text)
    return vader_score['compound']  


def textblob_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [37]:
# UDF
vader_udf = f.udf(vader_sentiment, T.FloatType())
textblob_udf = f.udf(textblob_sentiment, T.FloatType())

In [38]:
# Add vader score to total df
total_df = total_df.withColumn("vader_score", vader_udf(f.col("body")))

# Add textblob score to total df
total_df = total_df.withColumn("textblob_score", textblob_udf(f.col("body")))

## Analyze Sentiment

In [15]:
# display a sample to manually inspect differences
total_df.select("body", "vader_score", "textblob_score").show()

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+-----------+--------------+
|                body|vader_score|textblob_score|
+--------------------+-----------+--------------+
| 8 dead, 10 injured.|    -0.7906|          -0.2|
|Very little infor...|        0.0|      -0.24375|
|Says military sid...|    -0.4019|         -0.15|
|Who's going to pr...|        0.0|           0.0|
|True. They're try...|     0.3919|          0.35|
|Fair point. Talib...|     0.5423|         0.475|
|           [removed]|        0.0|           0.0|
|           [removed]|        0.0|           0.0|
|In an alternate u...|     0.6908|           0.2|
|&gt;Police chief ...|    -0.7531|          -0.3|
|Take his pension ...|    -0.6908|   -0.33333334|
|Probably forgot h...|        0.0|           0.0|
|Forget it, Jake. ...|    -0.2263|           0.0|
|&gt; I'd bet he h...|    -0.5423|          -0.4|
|           [removed]|        0.0|           0.0|
|By his own logic,...|        0.0|           0.6|
|Wait...they were ...|     0.3252|           0.0|


                                                                                

In [40]:
misinfo_comments = total_df.filter(f.col("misinfo_class") == True)

In [16]:
# aggregating misinformation comments by topic
misinfo_comments_count = misinfo_comments.groupBy("vader_score").count().toPandas()

                                                                                

In [17]:
# save to csv file
misinfo_comments_count.to_csv('../data/csv/misinfo_comments_count.csv', index = False)

In [41]:
# Topics comment counts and percentage of vadar score <0

misinfo_comments = misinfo_comments.withColumn("vader_neg", (f.col("vader_score") < 0).cast("int"))

# group by the 'topic' to calculate the total comments
neg_comments_count = misinfo_comments.groupBy("topic").agg(f.count("comment_id").alias("total_comments"),
                                                           f.sum("vader_neg").alias("negative_vader_count")  
    )

# Calculate the percentage of negative VADER scores
neg_comments_count = neg_comments_count.withColumn("percentage_neg_vader", 
                           (f.col("negative_vader_count") / f.col("total_comments")) * 100)

neg_comments_count.show()



+-----------------+--------------+--------------------+--------------------+
|            topic|total_comments|negative_vader_count|percentage_neg_vader|
+-----------------+--------------+--------------------+--------------------+
|    emerging tech|         47188|               34707|   73.55047893532254|
|     social media|         39298|               29771|   75.75703598147489|
|   current events|         64453|               51426|   79.78837292290505|
|            covid|         40208|               29852|   74.24393155590927|
|   russia&ukraine|        113047|               85636|   75.75256309322671|
| demographic info|         34852|               26276|   75.39309078388615|
|         tv shows|         13166|                9594|   72.86951238037369|
|foriegn relations|         54915|               41612|   75.77528908312847|
+-----------------+--------------+--------------------+--------------------+



                                                                                

In [42]:
# Adjusting the condition to check for VADER score less than -0.8 and renaming the column
misinfo_comments = misinfo_comments.withColumn("vader_below_neg_0_8", (f.col("vader_score") < -0.8).cast("int"))

# Group by the 'topic' to calculate the total comments and the sum of negative comments with the new column name
neg_comments_count = misinfo_comments.groupBy("topic").agg(
    f.count("comment_id").alias("total_comments"),
    f.sum("vader_below_neg_0_8").alias("count_below_neg_0_8")  
)

# Calculate the percentage of negative VADER scores and rename the percentage column appropriately
neg_comments_count = neg_comments_count.withColumn("percentage_below_neg_0_8", 
                           (f.col("count_below_neg_0_8") / f.col("total_comments")) * 100)

# Display the results
neg_comments_count.show()



+-----------------+--------------+-------------------+------------------------+
|            topic|total_comments|count_below_neg_0_8|percentage_below_neg_0_8|
+-----------------+--------------+-------------------+------------------------+
|    emerging tech|         47188|              14998|       31.78350428074934|
|     social media|         39298|              13419|       34.14677591734948|
|   current events|         64453|              25635|      39.773168044931964|
|            covid|         40208|              12893|       32.06575805809789|
|   russia&ukraine|        113047|              37486|       33.15965925676931|
| demographic info|         34852|              10035|       28.79318260071158|
|         tv shows|         13166|               4131|       31.37627221631475|
|foriegn relations|         54915|              18437|      33.573704816534644|
+-----------------+--------------+-------------------+------------------------+



                                                                                

In [43]:
# save to csv file
neg_comments_count_pd = neg_comments_count.toPandas()
neg_comments_count_pd.to_csv('../data/csv/neg_comments_count.csv', header=True, index=False)

                                                                                