## Load the tweet table and select the text column

In [None]:
!pip install nltk
!pip install textblob

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pandas as p
import nltk
from textblob import TextBlob

# Create (or reuse) the Spark session; only the master URL and the
# application name are configured here.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Assignment2")
    .getOrCreate()
)

# Load the tweet table through the "bigquery" data source.
bigquery_reader = spark.read.format("bigquery")
tweet_table = bigquery_reader.load("de2021labs.twitter_data.twitter_data")

# Report the schema and the number of tweets in the dataset.
print("Table schema: {}\n".format(tweet_table.schema))
print("Number of tweets: {}\n".format(tweet_table.count()))

# Keep only the tweet text and peek at the first ten rows, untruncated.
tweet_text_table = tweet_table.select(['text'])
tweet_text_table.show(10, truncate=False)

ModuleNotFoundError: No module named 'nltk'

### Cleaning

In [63]:
# Drop rows whose tweet text is null.
tweet_text_table = tweet_text_table.na.drop()
print("Number of tweets after removing null: {}\n".format(tweet_text_table.count()))

# Ordered (pattern, replacement) pairs; each step runs on the output of the
# previous one. All patterns are raw strings so that regex escapes such as
# `\s` are not treated as (invalid) Python string escapes.
cleaning_steps = [
    (r'@[^\s]+', ""),            # Twitter handles
    (r'\B#\S+', ""),             # hashtags
    (r"http\S+", ""),            # URLs
    # Special characters. NOTE(review): the '+' is inside the character class,
    # so literal '+' characters are kept; `[^\w]+` may have been intended.
    # Left unchanged here to keep the cleaned text identical - confirm.
    (r'[^\w+]', " "),
    (r'\s+[a-zA-Z]\s+', " "),    # single letters surrounded by whitespace
    (r'\s+', " "),               # collapse runs of whitespace
]

df_clean = tweet_text_table.select('text')
for pattern, replacement in cleaning_steps:
    df_clean = df_clean.withColumn('text', regexp_replace('text', pattern, replacement))

# Attach a row identifier. monotonically_increasing_id() yields unique,
# increasing values, but they are not guaranteed to be consecutive.
df_clean = df_clean.withColumn("id", monotonically_increasing_id()) \
                   .select("id", "text")

# Peek
df_clean.show(10, truncate=False)

Number of tweets after removing null: 7632

+---+------------------------------------------------------------------------------------------------------------------+
|id |text                                                                                                              |
+---+------------------------------------------------------------------------------------------------------------------+
|0  |Same folks said daikon paste could treat cytokine storm                                                           |
|1  |While the world has been on the wrong side of history this year hopefully the biggest vaccination effort we ve ev |
|2  | Russian vaccine is created to last 2 4 years                                                                     |
|3  |Facts are immutable Senator even when you re not ethically sturdy enough to acknowledge them 1 You were born      |
|4  |Explain to me again why we need vaccine                                                                 

## Sentiment analysis

In [61]:
# There was an issue with the textblob module such that it did not allow for
# application of a udf to an entire column. To work around this, the cleaned
# tweets are collected into a pandas df and scored on the driver.
pandas_df = df_clean.toPandas()

# TextBlob(...).sentiment[0] is the polarity score of the text.
sentiment_list = [TextBlob(text).sentiment[0] for text in pandas_df['text']]

# Build the result directly from the collected rows so that every score stays
# attached to the id/text it was computed from. Generating a second
# monotonically_increasing_id() on a separate DataFrame and joining on it is
# not reliable: those ids depend on partitioning and are not consecutive, so
# rows could be mismatched or dropped by the join.
sentiment_schema = StructType([
    StructField("id", LongType()),
    StructField("text", StringType()),
    StructField("value", DoubleType()),
])
scored_pdf = pandas_df[["id", "text"]].assign(value=sentiment_list)

# The explicit schema also lets this work when there are no rows to infer from.
# `value` is cast back to string so the column type written downstream stays
# the same as before.
# NOTE(review): consider keeping it numeric if no consumer relies on a string.
tweet_sentiments_df = spark.createDataFrame(scored_pdf, schema=sentiment_schema) \
                           .withColumn("value", col("value").cast("string"))

# Kept for backward compatibility with any later cell that uses this name.
sentiments_df = tweet_sentiments_df.select("value", "id")

# Peek
tweet_sentiments_df.show(10, truncate=False)


+---+------------------------------------------------------------------------------------------------------------------+------------------+
|id |text                                                                                                              |value             |
+---+------------------------------------------------------------------------------------------------------------------+------------------+
|0  |Same folks said daikon paste could treat cytokine storm                                                           |0.0               |
|1  |While the world has been on the wrong side of history this year hopefully the biggest vaccination effort we ve ev |-0.5              |
|2  | Russian vaccine is created to last 2 4 years                                                                     |0.0               |
|3  |Facts are immutable Senator even when you re not ethically sturdy enough to acknowledge them 1 You were born      |-0.05             |
|4  |Explain to me a

The next step is to save the sentiment analysis df to BigQuery.

In [None]:
# Register the GCS filesystem implementations on the Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
gcs_filesystem_settings = (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
)
for setting_key, implementation in gcs_filesystem_settings:
    conf.set(setting_key, implementation)

# Bucket name handed to Spark as 'temporaryGcsBucket'
# (presumably the staging bucket for the BigQuery write - confirm).
bucket = "dejadsdejads_group10assignment2"
spark.conf.set('temporaryGcsBucket', bucket)

# Overwrite the sentiments table with the scored tweets.
sentiment_writer = tweet_sentiments_df.write.format('bigquery')
sentiment_writer = sentiment_writer.option('table', 'de2021labs.twitter_data.twitter_sentiments')
sentiment_writer.mode("overwrite").save()

Finally, stop the Spark session.

In [None]:
# Stop the Spark session (`spark`) that this notebook has been using.
spark.stop()