# NLP

## Part 0 - import and setup

In [2]:
# Setup - Run only once per Kernel App
%conda install https://anaconda.org/conda-forge/openjdk/11.0.1/download/linux-64/openjdk-11.0.1-hacce0ff_1021.tar.bz2

# install PySpark
!pip install sagemaker_pyspark
%pip install pyspark==3.4.0

# restart kernel
from IPython.core.display import HTML
HTML("<script>Jupyter.notebook.kernel.restart()</script>")


Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

Note: you may need to restart the kernel to use updated packages.
Collecting pyspark==3.3.0 (from sagemaker_pyspark)
  Using cached pyspark-3.3.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.5 (from pyspark==3.3.0->sagemaker_pyspark)
  Using cached py4j-0.10.9.5-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
    Uninstalling py4j-0.10.9.7:
      Successfully uninstalled py4j-0.10.9.7
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.4.0
    Uninstalling pyspark-3.4.0:
      Successfully uninstalled pyspark-3.4.0
Successfully installed py4j-0.10.9.5 pyspark-3.3.0
[0mCollecting pyspark==3.4.0
  Using cached pyspark-3.4.0-py2.py3-none-any.whl
Collecting py4j==0.10.9.7 (from 

In [3]:
import sagemaker
sess = sagemaker.Session()
bucket = sess.default_bucket() 
print(f"the default SageMaker region specific bucket for this account is {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
the default SageMaker region specific bucket for this account is sagemaker-us-east-1-165729782536


In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder \
    .appName("sagemaker-spark") \
    .master("local[*]") \
    .config("spark.driver.memory", "8G") \
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:5.1.3,org.apache.hadoop:hadoop-aws:3.2.2") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.ContainerCredentialsProvider") \
    .getOrCreate()

print(spark.version)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bd825ab8-451b-4f69-ac33-8f5691d5c4a8;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;5.1.3 in central
	found com.typesafe#config;1.4.2 in central
	found org.rocksdb#rocksdbjni;6.29.5 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.828 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found com.google.cloud#google-cloud-storage;2.20.1 in central
	found com.google.guava#guava;31.1-jre in c

3.4.0


In [6]:
import sagemaker
session = sagemaker.Session()
bucket = session.default_bucket()

output_prefix_data_submissions = "project/submissions/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_submissions}"
print(f"reading submissions from {s3_path}")

posts = spark.read.parquet(s3_path, header=True)
posts = posts.filter(
    (posts.subreddit == "news") | (posts.subreddit == "worldnews")
)


reading submissions from s3a://sagemaker-us-east-1-165729782536/project/submissions/yyyy=*


24/04/27 16:25:04 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/04/27 16:25:13 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [7]:
#reading comments
output_prefix_data_comments = "project/comments/yyyy=*"
s3_path = f"s3a://{bucket}/{output_prefix_data_comments}"
comments = spark.read.parquet(s3_path, header=True)
comments = comments.filter(
    (comments.subreddit == "news") | (comments.subreddit == "worldnews")
)

In [8]:
#import packages
import pyspark.sql.functions as f
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

## Adding variables

In [9]:
## Clean the comments body content
# define a function
def clean_text(df):
    # Lowercase all text
    df = df.withColumn("body", f.lower(f.col("body")))
    # Remove special characters (keeping only alphanumeric and spaces)
    df = df.withColumn("body", f.regexp_replace(f.col("body"), "[^a-zA-Z0-9\\s]", ""))
    # Trim spaces
    df = df.withColumn("body", f.trim(f.col("body")))
    return df

# Apply the cleaning function
comments = clean_text(comments)

In [10]:
# Beside correct typing, also adding the potential typos
comments = comments.withColumn('misinfo_class', 
                    f.when(comments.body.rlike(r'fake news|bullshit|misinfo|clickbait|unreliable|propoganda|propaganda|fraud|deceptive|fabricated|deep state|wake up|truth about'), True)\
                    .otherwise(False))

## LDA

In [11]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml import Pipeline
from nltk.corpus import stopwords
from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA
import pyspark.sql.functions as f
from pyspark.sql.types import StringType, ArrayType, FloatType
from itertools import chain

In [12]:
#create small df to use for LDA
small_df = posts.select('title', 'id')

In [13]:
#create tokenizer
tokenizer = Tokenizer(outputCol="words")
tokenizer.setInputCol("title")

Tokenizer_1da53e5b9680

In [14]:
#remove stop words 
StopWords = stopwords.words("english")
#removing stop words in other languages and other common words
additional = ['@reuters:', '–' '&amp;', '@ap:', 'rt', ':', 'از', 'آهنگ', 'دانلود', 'در', 'به', 'جدید', '@apentertainment:',
             '|', 'के', 'में', 'و', 'في', 'من', '@bbcworld:', 'de', 'la', 'di', 'की', 'से', 'bio', 'many','know', 'age', 'says', 'one',
             'net', 'user]', '[deleted', 'look', '–']
StopWords = StopWords + additional
remover = StopWordsRemover(stopWords=StopWords)
remover.setInputCol("words")
remover.setOutputCol("filtered")

StopWordsRemover_c19b1ff60d6b

In [15]:
#count vectorizer
cv = CountVectorizer(inputCol="filtered", outputCol="raw_features", vocabSize=5000, minDF=25)
# IDF
idf = IDF(inputCol="raw_features", outputCol="features")

In [16]:
#lda model 
lda = LDA()
lda.setK(8)
lda.setMaxIter(10)
lda.setSeed(13)

LDA_e101364a1462

In [17]:
pipeline = Pipeline(stages=[tokenizer, remover, cv, idf, lda])

In [18]:
model = pipeline.fit(small_df)

                                                                                

In [19]:
topics = model.stages[-1].describeTopics()
terms = model.stages[-3].vocabulary

In [20]:
#get word from index of term 
def indices_to_terms(indices, terms=terms):
        terms_subset = [terms[index] for index in indices]
        return terms_subset
# Defining Spark UDF from above function
udf_indices_to_terms = f.udf(indices_to_terms, ArrayType(StringType()))

topics = (
    topics
       .withColumn("terms", udf_indices_to_terms(f.col("termIndices")))
    )

In [21]:
#topics.take(20)

In [22]:
#naming topics 
topic_dict = {0: 'economics/russia&ukraine', 1: 'presidental news', 2: 'supreme court/law', 3: 'global politics', 4: 'us politics', 
              5: 'covid/russia&ukraine', 6: 'crime/protest', 7: 'tv shows'}
              

In [23]:
small_df.cache()

DataFrame[title: string, id: string]

In [24]:
small_transform = model.transform(small_df)

In [25]:
small_df.unpersist()


DataFrame[title: string, id: string]

In [26]:
#map to topics
mapping_expr = f.create_map([f.lit(x) for x in chain(*topic_dict.items())])

In [27]:
#udf to get the top topic 
max_topic = f.udf(lambda v:float(v.argmax()),FloatType())
#using mao and udf to create a topic column
topic = small_transform.withColumn('topic_num', max_topic("topicDistribution"))\
.withColumn("topic", mapping_expr[f.col("topic_num")]).select('id','topic')

In [28]:
mini_posts = posts.select('created_utc', 'title', 'id')

In [29]:
#merging relevant columns wuth the topic column
merged_df = mini_posts.join(topic, 'id')

In [30]:
# merged_df.take(5)

                                                                                

[Row(id='10000r8', created_utc=datetime.datetime(2022, 12, 31, 18, 10, 56), title='Who dares bins? Councils in England use ex-SAS soldiers to catch fly-tippers', topic='presidental news'),
 Row(id='10004rz', created_utc=datetime.datetime(2022, 12, 31, 18, 16, 6), title='Iran tests military drones in wargame near Strait of Hormuz', topic='global politics'),
 Row(id='1000b7b', created_utc=datetime.datetime(2022, 12, 31, 18, 24, 30), title='Wise Registry Cleaner Pro 11.3.4 Crack Here [2023]', topic='economics/russia&ukraine'),
 Row(id='1000e7x', created_utc=datetime.datetime(2022, 12, 31, 18, 28, 22), title='ai-Marketing', topic='economics/russia&ukraine'),
 Row(id='1000o6u', created_utc=datetime.datetime(2022, 12, 31, 18, 41, 24), title='VuzeVPN Not Responding Big Sur 1.0.8.1 Crack Activation Key', topic='economics/russia&ukraine')]

## Adding comments

In [31]:
#renaming columns and removing the t3_ from the link id to get the post id on the comment
mini_comments = comments.select('created_utc','body','misinfo_class', 'link_id', 'id')\
.withColumn('comment_created', f.col('created_utc')).withColumn('comment_id', f.col('id'))\
.withColumn('id', f.regexp_extract('link_id', 't3_(.*)$', 1))

In [32]:
#merging dataframes
total_df = merged_df.join(mini_comments, 'id')

In [33]:
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)



## Counting by topic

In [None]:
topic_misinfo_counts = total_df.filter(f.col('misinfo_class') == True).groupBy('topic').count().toPandas()

In [None]:
topic_misinfo_counts

In [None]:
topic_misinfo_counts.to_csv('../data/csv/topic_misinfo_true_count.csv', index = False)

In [None]:
topic_counts = total_df.groupBy('topic').count().toPandas()

In [None]:
topic_counts

In [None]:
topic_counts.to_csv('../data/csv/topic_counts.csv', index = False)

In [None]:
topic_misinfo_total = total_df.groupBy(['topic', 'misinfo_class']).count().toPandas()

In [None]:
topic_misinfo_total

In [None]:
topic_misinfo_total.to_csv('../data/csv/topic_misinfo_total.csv', index = False)

## Sentiment Analysis

In [34]:
!pip install vaderSentiment textblob
# reference for VADER:
# https://medium.com/@tom.bailey.courses/sentiment-analysis-in-snowflake-using-python-31d7296abe1a
# https://github.com/cjhutto/vaderSentiment

Collecting vaderSentiment
  Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Collecting textblob
  Using cached textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Using cached vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Using cached textblob-0.18.0.post0-py3-none-any.whl (626 kB)
Installing collected packages: vaderSentiment, textblob
Successfully installed textblob-0.18.0.post0 vaderSentiment-3.3.2
[0m

In [35]:
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)



### VADER and TextBlob

In [36]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pyspark.sql.types as T
from textblob import TextBlob


In [37]:
# Use two libraries comparing the sentiment result
def vader_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()
    vader_score = analyzer.polarity_scores(text)
    return vader_score['compound']  


def textblob_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [38]:
# UDF
vader_udf = f.udf(vader_sentiment, T.FloatType())
textblob_udf = f.udf(textblob_sentiment, T.FloatType())

In [39]:
# Add vader score to total df
total_df = total_df.withColumn("vader_score", vader_udf(f.col("body")))

# Add textblob score to total df
total_df = total_df.withColumn("textblob_score", textblob_udf(f.col("body")))


In [40]:
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- created_utc: timestamp (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- vader_score: float (nullable = true)
 |-- textblob_score: float (nullable = true)



In [65]:
total_df = total_df.drop("created_utc")

### output the result

In [None]:
total_df.printSchema()

In [None]:
# Add a year column
total_df = total_df.withColumn("year", f.year("comment_created"))

In [None]:
# Get distinct years
year_list = [2023, 2021] 

In [None]:
def save_par

In [None]:
for y in year_list:
    # Filter the DataFrame for one year
    df_year = total_df.filter(total_df.year == y)
    print(f"--- start saving {y} ------")

    # Define the output path for this particular year
    output_path = f"s3a://{bucket}/project/output/year={y}"

    # Write the filtered DataFrame to Parquet
    df_year.drop("year").write.mode('overwrite').parquet(output_path)
    print(f"{y}'s data now saved in {output_path} ------")


In [None]:
total_df.count()

In [None]:
comments.count()

In [None]:
output_path = f"s3a://{bucket}/project/output/total.parquet"

# Write the DataFrame to Parquet on S3
total_df.write.mode('overwrite').parquet(output_path)
# total_df.save(path=output_path, source='parquet', mode='overwrite')

### Comparison

In [None]:
total_df.cache()

In [None]:
# display a sample to manually inspect differences
total_df.select("body", "vader_score", "textblob_score").show()


In [None]:
total_df.unpersist()

In [None]:
total_df.describe(['vader_score', 'textblob_score']).show()


In [None]:
# compute correlation between the scores
total_df.stat.corr("sentiment_score", "textblob_score")


### Top Topics that people comment they are fake news

In [None]:
misinfo_comments = total_df.filter((f.col("vader_score") < 0) & (f.col("misinfo_class") == True))

In [None]:
misinfo_comments.cache()

In [None]:
# aggregating misinformation comments by topic
misinfo_comments_count = misinfo_comments.groupBy("vader_score").count().toPandas()


In [None]:
# save to csv file
misinfo_comments_count.to_csv('../data/csv/misinfo_comments_count.csv', index = False)

In [None]:
misinfo_comments.printSchema()

In [None]:
# Topics comment counts and percentage of vadar score <0

misinfo_comments = misinfo_comments.withColumn("vader_neg", (f.col("vader_score") < 0).cast("int"))

# group by the 'topic' to calculate the total comments
neg_comments_count = misinfo_comments.groupBy("topic").agg(f.count("comment_id").alias("total_comments"),
                                                           f.sum("vader_neg").alias("negative_vader_count")  
    )

# Calculate the percentage of negative VADER scores
neg_comments_count = neg_comments_count.withColumn("percentage_neg_vader", 
                           (f.col("negative_vader_count") / f.col("total_comments")) * 100)

neg_comments_count.show()

In [None]:
misinfo_comments.cache()

In [None]:
from pyspark.sql import functions as f

# Adjusting the condition to check for VADER score less than -0.8 and renaming the column
misinfo_comments = misinfo_comments.withColumn("vader_below_neg_0_8", (f.col("vader_score") < -0.8).cast("int"))

# Group by the 'topic' to calculate the total comments and the sum of negative comments with the new column name
neg_comments_count = misinfo_comments.groupBy("topic").agg(
    f.count("comment_id").alias("total_comments"),
    f.sum("vader_below_neg_0_8").alias("count_below_neg_0_8")  
)

# Calculate the percentage of negative VADER scores and rename the percentage column appropriately
neg_comments_count = neg_comments_count.withColumn("percentage_below_neg_0_8", 
                           (f.col("count_below_neg_0_8") / f.col("total_comments")) * 100)

# Display the results
neg_comments_count.show()


In [None]:
# save to csv file
neg_comments_count_pd = neg_comments_count.toPandas()
neg_comments_count_pd.to_csv('../data/csv/neg_comments_count.csv', header=True, index=False)


In [None]:
misinfo_comments.unpersist()

## part 3 - Covid Data

In [41]:
# Define the path to the CSV file
s3_path = f"s3a://{bucket}/project/covid_data/epidemiology.csv"

# Read the CSV file
epi_data = spark.read.csv(s3_path, header=True, inferSchema=True)

# Show the DataFrame to verify the content
epi_data.show()


                                                                                

+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|      date|location_key|new_confirmed|new_deceased|new_recovered|new_tested|cumulative_confirmed|cumulative_deceased|cumulative_recovered|cumulative_tested|
+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|2020-01-01|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-02|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-03|          AD|            0|           0|         null|      null|                   0|                  0|                null|             null|
|2020-01-04|          AD|            0|           0|

In [43]:
# Define the path to the CSV file
s3_path = f"s3a://{bucket}/project/covid_data/vaccinations.csv"

# Read the CSV file
vac_data = spark.read.csv(s3_path, header=True, inferSchema=True)

# Show the DataFrame to verify the content
vac_data.printSchema()




root
 |-- date: date (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_persons_vaccinated: integer (nullable = true)
 |-- cumulative_persons_vaccinated: integer (nullable = true)
 |-- new_persons_fully_vaccinated: integer (nullable = true)
 |-- cumulative_persons_fully_vaccinated: integer (nullable = true)
 |-- new_vaccine_doses_administered: integer (nullable = true)
 |-- cumulative_vaccine_doses_administered: long (nullable = true)
 |-- new_persons_vaccinated_pfizer: integer (nullable = true)
 |-- cumulative_persons_vaccinated_pfizer: integer (nullable = true)
 |-- new_persons_fully_vaccinated_pfizer: integer (nullable = true)
 |-- cumulative_persons_fully_vaccinated_pfizer: integer (nullable = true)
 |-- new_vaccine_doses_administered_pfizer: integer (nullable = true)
 |-- cumulative_vaccine_doses_administered_pfizer: integer (nullable = true)
 |-- new_persons_vaccinated_moderna: integer (nullable = true)
 |-- cumulative_persons_vaccinated_moderna: integer (null

                                                                                

In [42]:
file = "google-search-trends.csv"

s3_path = f"s3a://{bucket}/project/covid_data/{file}"

# Read the CSV file
search_data = spark.read.csv(s3_path, header=True, inferSchema=True)

# Show the DataFrame to verify the content
# search_data.show()
# search_data.printSchema()

                                                                                

### data preprocess

In [51]:
import pandas as pd

#### create functions

In [44]:
def add_time_columns(df, date_col='date'):

    return df.withColumn("year", f.year(date_col)) \
             .withColumn("month", f.month(date_col)) \
             .withColumn("week", f.weekofyear(date_col))

In [45]:
def group_by_weekly(df, cols):

    # Prepare aggregation expressions
    aggregations = {col: "sum" for col in cols}
    

    return df.groupBy("year", "week").agg(
        *(f.sum(c).alias(f"sum_{c}") for c in cols)
    ).orderBy("year", "week")

In [46]:
def group_by_monthly(df, cols):

    aggregations = {col: "sum" for col in cols}
    
    # Group by year and month, then aggregate based on the provided columns
    return df.groupBy("year", "month").agg(
        *(f.sum(c).alias(f"sum_{c}") for c in cols)
    ).orderBy("year", "month")


#### start data process

In [60]:
# Extract the month from the date and group by it
epi_data = add_time_columns(epi_data)

epi_agg_cols =  ["new_confirmed", "new_deceased"]
monthly_epi_global = group_by_monthly(epi_data, epi_agg_cols)
weekly_epi_global = group_by_weekly(epi_data, epi_agg_cols)

monthly_epi_global_df = monthly_epi_global.toPandas()
weekly_epi_global_df = weekly_epi_global.toPandas()

                                                                                

In [61]:
# Extract the month from the date and group by it
vac_data = add_time_columns(vac_data)

vac_agg_cols =  ["new_persons_vaccinated", "new_persons_fully_vaccinated", "new_vaccine_doses_administered"]
monthly_vac_global = group_by_monthly(vac_data, vac_agg_cols)
weekly_vac_global = group_by_weekly(vac_data, vac_agg_cols)

monthly_vac_global_df = monthly_vac_global.toPandas()
weekly_vac_global_df = weekly_vac_global.toPandas()


                                                                                

In [62]:
monthly_covid = pd.merge(
    monthly_epi_global_df,
    monthly_vac_global_df,
    on=['year', 'month'],  # Columns to join on
    how='outer'  # Full outer join to ensure all data from both sides is kept
)
monthly_covid.head()

Unnamed: 0,year,month,sum_new_confirmed,sum_new_deceased,sum_new_persons_vaccinated,sum_new_persons_fully_vaccinated,sum_new_vaccine_doses_administered
0,2019,12,,,,,
1,2020,1,54666.0,1686.0,,0.0,
2,2020,2,-141599.0,5353.0,,0.0,
3,2020,3,2195224.0,110044.0,0.0,0.0,0.0
4,2020,4,5900737.0,465675.0,0.0,0.0,0.0


In [63]:
weekly_covid = pd.merge(
    weekly_epi_global_df,
    weekly_vac_global_df,
    on=['year', 'week'],  # Columns to join on
    how='outer'  # Full outer join to ensure all data from both sides is kept
)
weekly_covid.head()

Unnamed: 0,year,week,sum_new_confirmed,sum_new_deceased,sum_new_persons_vaccinated,sum_new_persons_fully_vaccinated,sum_new_vaccine_doses_administered
0,2019,1,,,,,
1,2020,1,15214.0,1120.0,,,
2,2020,2,8594.0,1.0,,,
3,2020,3,3078.0,5.0,,,
4,2020,4,8116.0,172.0,,0.0,


In [64]:
monthly_covid.to_csv('../data/csv/monthly_covid_info.csv', index = False)
weekly_covid.to_csv('../data/csv/weekly_covid_info.csv', index = False)

In [66]:
total_df = add_time_columns(total_df, 'comment_created')
total_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- body: string (nullable = true)
 |-- misinfo_class: boolean (nullable = false)
 |-- link_id: string (nullable = true)
 |-- comment_created: timestamp (nullable = true)
 |-- comment_id: string (nullable = true)
 |-- vader_score: float (nullable = true)
 |-- textblob_score: float (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- week: integer (nullable = true)

