# NLP

In [2]:
# Setup - Run only once per Kernel App
%conda install openjdk -y

# install PySpark
%pip install pyspark==3.4.0

# install spark-nlp
%pip install spark-nlp==5.1.3

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")

Collecting package metadata (current_repodata.json): done
Solving environment: done


  current version: 23.3.1
  latest version: 24.3.0

Please update conda by running

    $ conda update -n base -c defaults conda

Or to minimize the number of packages updated during conda update use

     conda install conda=24.3.0



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrad

In [3]:
# Import pyspark and build Spark session
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("PySparkApp")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.2.2")
    .config(
        "fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.ContainerCredentialsProvider",
    )
    .getOrCreate()
)

print(spark.version)



:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-14c1e9cc-2235-4fd2-9604-c15e907ca378;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.2.2 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.563 in central
:: resolution report :: resolve 329ms :: artifacts dl 21ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.11.563 from central in [default]
	org.apache.hadoop#hadoop-aws;3.2.2 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------------------

3.4.0


In [4]:
import sagemaker
session = sagemaker.Session()
bucket = session.default_bucket()
output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading submissions from {s3_path}")
posts = spark.read.parquet(s3_path, header=True)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
reading submissions from s3a://sagemaker-us-east-1-851725257127/project/submissions/yyyy=*


24/04/17 01:51:17 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/04/17 01:51:24 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [5]:
#reading comments
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
comments = spark.read.parquet(s3_path, header=True)

In [6]:
#import packages
import pyspark.sql.functions as f
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

## Adding variables

In [7]:
## Clean the comments body content
# define a function
def clean_text(df):
    # Lowercase all text
    df = df.withColumn("body", f.lower(f.col("body")))
    # Remove special characters (keeping only alphanumeric and spaces)
    df = df.withColumn("body", f.regexp_replace(f.col("body"), "[^a-zA-Z0-9\\s]", ""))
    # Trim spaces
    df = df.withColumn("body", f.trim(f.col("body")))
    return df

# Apply the cleaning function
comments = clean_text(comments)

In [8]:
# Beside correct typing, also adding the potential typos
comments = comments.withColumn('misinfo_class', 
                    f.when(comments.body.rlike(r'fake news|bullshit|misinfo|clickbait|unreliable|propoganda|propaganda|fraud|deceptive|fabricated|deep state|wake up|truth about'), True)\
                    .otherwise(False))

## LDA

In [9]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA
import pyspark.sql.functions as f
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import chain

In [10]:
#create small df to use for LDA
small_df = posts.select('title', 'id')

In [11]:
#create tokenizer
tokenizer = Tokenizer(outputCol="words")
tokenizer.setInputCol("title")

Tokenizer_3729b217de3d

In [12]:
#remove stop words 
StopWords = stopwords.words("english")
#removing stop words in other languages and other common words
additional = ['@reuters:', '–' '&amp;', '@ap:', 'rt', ':', 'از', 'آهنگ', 'دانلود', 'در', 'به', 'جدید', '@apentertainment:',
             '|', 'के', 'में', 'و', 'في', 'من', '@bbcworld:', 'de', 'la', 'di', 'की', 'से', 'bio', 'many','know', 'age', 'says', 'one',
             'net', 'user]', '[deleted', 'look', '–']
StopWords = StopWords + additional
remover = StopWordsRemover(stopWords=StopWords)
remover.setInputCol("words")
remover.setOutputCol("filtered")

StopWordsRemover_3e06c0745692

In [13]:
#count vectorizer
cv = CountVectorizer(inputCol="filtered", outputCol="raw_features", vocabSize=5000, minDF=25)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")

In [14]:
#lda model 
lda = LDA()
lda.setK(8)
lda.setMaxIter(10)
lda.setSeed(13)

LDA_8a38a4826146

In [15]:
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda])

In [None]:
model = pipeline.fit(small_df)

24/04/17 02:00:13 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS

In [39]:
topics = model.stages[-1].describeTopics()
terms = model.stages[-3].vocabulary

In [40]:
#get word from index of term 
def indices_to_terms(indices, terms=terms):
        terms_subset = [terms[index] for index in indices]
        return terms_subset
# Defining Spark UDF from above function
udf_indices_to_terms = f.udf(indices_to_terms, ArrayType(StringType()))

topics = (
    topics
       .withColumn("terms", udf_indices_to_terms(f.col("termIndices")))
    )

In [None]:
#topics.take(20)

In [41]:
#naming topics 
topic_dict = {0: 'economics/russia&ukraine', 1: 'presidental news', 2: 'supreme court/law', 3: 'global politics', 4: 'us politics', 
              5: 'covid/russia&ukraine', 6: 'crime/protest', 7: 'tv shows'}
              

In [42]:
small_df.cache()

DataFrame[title: string, id: string]

In [43]:
small_transform = model.transform(small_df)

In [44]:
small_df.unpersist()


DataFrame[title: string, id: string]

In [45]:
#map to topics
mapping_expr = f.create_map([f.lit(x) for x in chain(*topic_dict.items())])

In [46]:
#udf to get the top topic 
max_topic = f.udf(lambda v:float(v.argmax()),FloatType())
#using mao and udf to create a topic column
topic = small_transform.withColumn('topic_num', max_topic("topicDistribution"))\
.withColumn("topic", mapping_expr[f.col("topic_num")]).select('id','topic')

In [47]:
mini_posts = posts.select('created_utc', 'title', 'id')

In [48]:
#merging relevant columns wuth the topic column
merged_df = mini_posts.join(topic, 'id')

In [30]:
#merged_df.take(5)

                                                                                

[Row(id='10000r8', created_utc=datetime.datetime(2022, 12, 31, 18, 10, 56), title='Who dares bins? Councils in England use ex-SAS soldiers to catch fly-tippers', topic='us politics'),
 Row(id='10004rz', created_utc=datetime.datetime(2022, 12, 31, 18, 16, 6), title='Iran tests military drones in wargame near Strait of Hormuz', topic='covid/russia&ukraine'),
 Row(id='1000b7b', created_utc=datetime.datetime(2022, 12, 31, 18, 24, 30), title='Wise Registry Cleaner Pro 11.3.4 Crack Here [2023]', topic='presidental news'),
 Row(id='1000e7x', created_utc=datetime.datetime(2022, 12, 31, 18, 28, 22), title='ai-Marketing', topic='economics/russia&ukraine'),
 Row(id='1000o6u', created_utc=datetime.datetime(2022, 12, 31, 18, 41, 24), title='VuzeVPN Not Responding Big Sur 1.0.8.1 Crack Activation Key', topic='presidental news')]

## Adding comments

In [49]:
#renaming columns and removing the t3_ from the link id to get the post id on the comment
mini_comments = comments.select('created_utc','body','misinfo_class', 'link_id', 'id')\
.withColumn('comment_created', f.col('created_utc')).withColumn('comment_id', f.col('id'))\
.withColumn('id', f.regexp_extract('link_id', 't3_(.*)$', 1))

In [50]:
#merging dataframes
total_df = merged_df.join(mini_comments, 'id')

In [33]:
#total_df.take(10)

                                                                                

[Row(id='1005crh', created_utc=datetime.datetime(2022, 12, 31, 22, 29, 9), title='North Korea opens New Year with missile launch', topic='economics/russia&ukraine', created_utc=datetime.datetime(2022, 12, 31, 23, 55, 54), body="Kind of? I'm not sure if they're up and running right now what with COVID and all, but for years it was possible to book guided tours and stay in Pyongyang. Not exactly a smart thing to do (just ask the family of Otto Warmbier), but you could do it. The Kim family also loves to entertain celebrities, Dennis Rodman is probably the most famous example these days and before he died Billy Graham loved to visit Pyongyang.", misinfo_class=False, link_id='t3_1005crh', comment_created=datetime.datetime(2022, 12, 31, 23, 55, 54), comment_id='j2fx4ag'),
 Row(id='1005crh', created_utc=datetime.datetime(2022, 12, 31, 22, 29, 9), title='North Korea opens New Year with missile launch', topic='economics/russia&ukraine', created_utc=datetime.datetime(2022, 12, 31, 23, 57, 5), b

## Counting by topic

In [None]:
topic_misinfo_counts = total_df.filter(f.col('misinfo_class') == True).groupBy('topic').count().toPandas()

In [134]:
topic_misinfo_counts

Unnamed: 0,topic,count
0,us politics,24261
1,economics/russia&ukraine,47848
2,crime/protest,90683
3,global politics,35838
4,presidental news,42871
5,tv shows,14998
6,covid/russia&ukraine,91546
7,supreme court/law,59082


In [135]:
topic_misinfo_counts.to_csv('../data/csv/topic_misinfo_true_count.csv', index = False)

In [136]:
topic_counts = total_df.groupBy('topic').count().toPandas()

24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:35 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 04:56:36 WARN RowBasedKeyValueBatch: Calling spill() on

In [137]:
topic_counts

Unnamed: 0,topic,count
0,us politics,2761048
1,economics/russia&ukraine,5721107
2,crime/protest,11658072
3,global politics,4542845
4,presidental news,5028581
5,tv shows,2008234
6,covid/russia&ukraine,11446475
7,supreme court/law,6738106


In [138]:
topic_counts.to_csv('../data/csv/topic_counts.csv', index = False)

In [139]:
topic_misinfo_total = total_df.groupBy(['topic', 'misinfo_class']).count().toPandas()

24/04/13 05:19:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/04/13 05:19:04 WARN RowBasedKeyValueBatch: Calling spill() on

In [140]:
topic_misinfo_total

Unnamed: 0,topic,misinfo_class,count
0,economics/russia&ukraine,True,47848
1,covid/russia&ukraine,False,11354929
2,supreme court/law,True,59082
3,global politics,True,35838
4,crime/protest,False,11567389
5,economics/russia&ukraine,False,5673259
6,supreme court/law,False,6679024
7,us politics,True,24261
8,crime/protest,True,90683
9,tv shows,True,14998


In [141]:
topic_misinfo_total.to_csv('../data/csv/topic_misinfo_total.csv', index = False)

## Sentiment Analysis

In [51]:
!pip install vaderSentiment textblob
# reference for VADER:
# https://medium.com/@tom.bailey.courses/sentiment-analysis-in-snowflake-using-python-31d7296abe1a
# https://github.com/cjhutto/vaderSentiment

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [52]:
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)



### VADER and TextBlob

In [53]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyspark.sql.types as T
from textblob import TextBlob


In [54]:
# Use two libraries comparing the sentiment result
def vader_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    vader_score = analyzer.polarity_scores(text)
    return vader_score['compound']  


def textblob_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [55]:
# UDF
vader_udf = f.udf(vader_sentiment, T.FloatType())
textblob_udf = f.udf(textblob_sentiment, T.FloatType())

In [56]:
# Add vader score to total df
total_df = total_df.withColumn("vader_score", vader_udf(f.col("body")))

# Add textblob score to total df
total_df = total_df.withColumn("textblob_score", textblob_udf(f.col("body")))


### Comparison

In [50]:
total_df.cache()

DataFrame[id: string, created_utc: timestamp, title: string, topic: string, created_utc: timestamp, body: string, misinfo_class: boolean, link_id: string, comment_created: timestamp, comment_id: string, vader_score: float, textblob_score: float]

In [51]:
# display a sample to manually inspect differences
total_df.select("body", "vader_score", "textblob_score").show()


[Stage 48:>                                                         (0 + 1) / 1]

+--------------------+-----------+--------------+
|                body|vader_score|textblob_score|
+--------------------+-----------+--------------+
|kind of im not su...|     0.7254|    0.27857143|
|id be up for some...|     0.0772|          0.25|
|dont forget about...|    -0.3682|           0.0|
|its how he gets r...|     -0.296|           0.0|
|i thought they re...|     0.1779|           0.0|
|start it off with...|        0.0|           0.0|
|i have seen a doc...|        0.0|           0.0|
|fear of missillin...|    -0.4939|           0.0|
|you make it sound...|    -0.7184|    0.13333334|
|are we sure it ju...|     0.3182|           0.3|
|and less rockets ...|    -0.6369| -0.0033333334|
|pretty sure my ne...|     0.6705|          0.25|
|             deleted|        0.0|           0.0|
|they wanted to up...|     0.5719|    0.46818182|
|we bring joyous c...|     0.6249|    0.13636364|
| fear of missing out|    -0.6597|          -0.2|
|i hope atlantis s...|     0.4404|           0.0|


                                                                                

In [53]:
total_df.unpersist()

DataFrame[id: string, created_utc: timestamp, title: string, topic: string, created_utc: timestamp, body: string, misinfo_class: boolean, link_id: string, comment_created: timestamp, comment_id: string, vader_score: float, textblob_score: float]

In [None]:
total_df.describe(['vader_score', 'textblob_score']).show()


In [None]:
# compute correlation between the scores
total_df.stat.corr("sentiment_score", "textblob_score")


### Top Topics that people comment they are fake news

In [58]:
misinfo_comments = total_df.filter((f.col("vader_score") < 0) & (f.col("misinfo_class") == True))

In [59]:
misinfo_comments.cache()

DataFrame[id: string, created_utc: timestamp, title: string, topic: string, created_utc: timestamp, body: string, misinfo_class: boolean, link_id: string, comment_created: timestamp, comment_id: string, vader_score: float, textblob_score: float]

In [60]:
# aggregating misinformation comments by topic
misinfo_comments_count = misinfo_comments.groupBy("vader_score").count().toPandas()


                                                                                

In [64]:
# save to csv file
misinfo_comments_count.to_csv('../data/csv/misinfo_comments_count.csv', index = False)

In [73]:
misinfo_comments.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- vader_score: float (nullable = true)
 |-- textblob_score: float (nullable = true)





In [60]:
# Topics comment counts and percentage of vadar score <0

misinfo_comments = misinfo_comments.withColumn("vader_neg", (f.col("vader_score") < 0).cast("int"))

# group by the 'topic' to calculate the total comments
neg_comments_count = misinfo_comments.groupBy("topic").agg(f.count("comment_id").alias("total_comments"),
                                                           f.sum("vader_neg").alias("negative_vader_count")  
    )

# Calculate the percentage of negative VADER scores
neg_comments_count = neg_comments_count.withColumn("percentage_neg_vader", 
                           (f.col("negative_vader_count") / f.col("total_comments")) * 100)

neg_comments_count.show()

                                                                                

+--------------------+--------------+--------------------+--------------------+
|               topic|total_comments|negative_vader_count|percentage_neg_vader|
+--------------------+--------------+--------------------+--------------------+
|         us politics|         48503|               48503|               100.0|
|economics/russia&...|         91577|               91577|               100.0|
|       crime/protest|        111858|              111858|               100.0|
|     global politics|         53414|               53414|               100.0|
|    presidental news|         52837|               52837|               100.0|
|            tv shows|         24860|               24860|               100.0|
|covid/russia&ukraine|        134401|              134401|               100.0|
|   supreme court/law|         74628|               74628|               100.0|
+--------------------+--------------+--------------------+--------------------+



AttributeError: 'DataFrame' object has no attribute 'to_csv'

In [70]:
misinfo_comments.cache()

DataFrame[id: string, created_utc: timestamp, title: string, topic: string, created_utc: timestamp, body: string, misinfo_class: boolean, link_id: string, comment_created: timestamp, comment_id: string, vader_score: float, textblob_score: float, vader_neg: int, vader_below_neg_0_5: int, vader_below_neg_0_6: int, vader_below_neg_0_7: int]

In [85]:
from pyspark.sql import functions as f

# Adjusting the condition to check for VADER score less than -0.8 and renaming the column
misinfo_comments = misinfo_comments.withColumn("vader_below_neg_0_8", (f.col("vader_score") < -0.8).cast("int"))

# Group by the 'topic' to calculate the total comments and the sum of negative comments with the new column name
neg_comments_count = misinfo_comments.groupBy("topic").agg(
    f.count("comment_id").alias("total_comments"),
    f.sum("vader_below_neg_0_8").alias("count_below_neg_0_8")  
)

# Calculate the percentage of negative VADER scores and rename the percentage column appropriately
neg_comments_count = neg_comments_count.withColumn("percentage_below_neg_0_8", 
                           (f.col("count_below_neg_0_8") / f.col("total_comments")) * 100)

# Display the results
neg_comments_count.show()




+--------------------+--------------+-------------------+------------------------+
|               topic|total_comments|count_below_neg_0_8|percentage_below_neg_0_8|
+--------------------+--------------+-------------------+------------------------+
|         us politics|         48503|              19239|      39.665587695606455|
|economics/russia&...|         91577|              35744|      39.031634580735336|
|       crime/protest|        111858|              48657|       43.49890039156788|
|     global politics|         53414|              20609|      38.583517429887294|
|    presidental news|         52837|              21466|       40.62683346897061|
|            tv shows|         24860|              10391|       41.79806918744971|
|covid/russia&ukraine|        134401|              49112|       36.54139478128883|
|   supreme court/law|         74628|              30347|      40.664361901699095|
+--------------------+--------------+-------------------+------------------------+



                                                                                

In [88]:
# save to csv file
neg_comments_count_pd = neg_comments_count.toPandas()
neg_comments_count_pd.to_csv('../data/csv/neg_comments_count.csv', header=True, index=False)


In [89]:
misinfo_comments.unpersist()

DataFrame[id: string, created_utc: timestamp, title: string, topic: string, created_utc: timestamp, body: string, misinfo_class: boolean, link_id: string, comment_created: timestamp, comment_id: string, vader_score: float, textblob_score: float, vader_neg: int, vader_below_neg_0_5: int, vader_below_neg_0_6: int, vader_below_neg_0_7: int, vader_below_neg_0_8: int]