# In [1]:
"""To run PySpark correctly, follow the tutorial at https://www.youtube.com/watch?v=DznteGdeJoA.
   To avoid possible errors, the path names for the Java and Spark installations should not
   contain any whitespace. The version of findspark used comes from pip rather than Anaconda.
   The SparkSession is configured to run locally with 4 cores.
   Versions of packages used:
   Anaconda3 2021.05
   Python 3.8.8 64-bit
   findspark 1.4.2
   pandas 1.2.4
   pyspark 3.2.0
   nltk 3.6.1
   textblob 0.15.3
   Spark 3.2.0
   OpenJDK 1.8.0_41
   re and os are built-in Python packages
"""

# https://github.com/minrk/findspark
# https://pypi.org/project/findspark/
import findspark
findspark.init()

import pandas as pd
import re
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from textblob import TextBlob
from textblob import Word
from nltk.corpus import stopwords
from os.path import exists

# Preprocessing parameters
# Maximum number of rows read from the input csv file
nrows = 1000
# Name of the csv file holding the raw tweets
data_filename = "tweets.csv"
# Name of the csv file the preprocessed tweets are written to
preprocessed_data_filename = "preprocessed_tweets.csv"
# Columns selected from the input data
cols = ["timestamp", "text"]
# Accumulator for the preprocessed rows that are written to the output:
# one separate, initially empty list per selected column
timestamp_text_dict = {column: [] for column in cols}

# https://towardsdatascience.com/pyspark-and-sparksql-basics-6cb4bf967e53
# https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.SparkSession.html
# Using Spark for better scalability with big data and multiple machines
# Currently cannot setup a cluster of machines to run in parallel, so the
# session runs locally with 4 cores
# Initializing a SparkSession, important for creating DataFrames
sc = (
    SparkSession.builder
    .appName("BitcoinTweetPreprocessing")
    .master("local[4]")
    .config("spark.sql.shuffle.partitions", "50")
    .config("spark.driver.maxResultSize", "5g")
    # "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
    # the PySpark-specific key below replaces it
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

# Check if output file already exists; the preprocessing only runs when it does not
if not exists(preprocessed_data_filename):
    # Need to use Pandas to load a small chunk of the very large csv file correctly
    pandas_df = pd.read_csv(data_filename, sep=';', nrows=nrows, lineterminator="\r")
    # https://sparkbyexamples.com/pyspark/convert-pandas-to-pyspark-dataframe/
    # Converting the Pandas DataFrame into a Spark SQL DataFrame
    spark_df = sc.createDataFrame(pandas_df)
    # https://www.geeksforgeeks.org/how-to-loop-through-each-row-of-dataframe-in-pyspark/
    # Select the columns and put the data in a list
    spark_df_rows = spark_df.select(cols).collect()

    # Regular expression from https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
    # Matches usernames, special characters (including the '#' of hashtags) and hyperlinks.
    # Written as a raw string so "\w" and "\S" are not invalid string escapes,
    # and compiled once instead of being looked up for every row.
    noise_pattern = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    # https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
    # Build the stop word lookup once: calling stopwords.words() for every word
    # and scanning the resulting list dominated the running time.
    # NOTE(review): without a language argument this presumably covers the stop
    # words of every language in the NLTK corpus, not only English - confirm
    # whether stopwords.words("english") was intended.
    stop_words = set(stopwords.words())

    for row in spark_df_rows:
        timestamp, text = row[0], row[1]
        # Skip rows whose tweet text is not a string or contains non-ASCII characters
        if not (isinstance(text, str) and text.isascii()):
            continue
        # Replace every match with a space, then collapse runs of whitespace
        cleaned = " ".join(noise_pattern.sub(" ", text).split())
        # https://textblob.readthedocs.io/en/dev/quickstart.html
        # Remove stop words and lemmatize the remaining words
        lemmatized_words = [
            Word(word).lemmatize()
            for word in TextBlob(cleaned).words
            if word not in stop_words
        ]
        # Preprocessed text and timestamp are added to the dictionary
        timestamp_text_dict["text"].append(" ".join(lemmatized_words))
        timestamp_text_dict["timestamp"].append(timestamp)

    # Conversion of dictionary to Pandas DataFrame for writing to output csv file
    df = pd.DataFrame.from_dict(timestamp_text_dict)
    df.to_csv(preprocessed_data_filename, index=False)