In [1]:
import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running context/session when there is one, so that
# re-running this cell in a live kernel does not fail with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [2]:
# Read the labelled tweet CSV (first row is the header, '"' is the escape
# character) and keep only the tweet text and its label column.
raw_tweets = spark.read.csv("trumptweet-mod.csv", header=True, escape='"')
text_and_class = raw_tweets.select("text", "Class")

# Keep only the rows whose label is exactly '0' or '1'.
is_class_zero = text_and_class.Class == '0'
is_class_one = text_and_class.Class == '1'
labeled_data = text_and_class.filter(is_class_zero | is_class_one)

In [3]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

# Standalone "RT" token only. The word boundaries keep the pattern from
# eating the "RT" inside upper-case words such as "SUPPORT" or "START"
# (the previous lookahead-on-optional-whitespace form matched "RT" anywhere).
rt_regex = r"\bRT\b"
# "@" followed by any run of non-whitespace characters.
user_regex = r"@\S+"
# http:// or https:// followed by any run of non-whitespace characters.
url_regex = r"http[s]?:\/\/\S+"
# Only the "#" character is matched; the tag word itself is kept.
hashtag_regex = r"#"
# Literal "<ed>" and "<U+....>" markers left in the text.
unicode_regex = r"<ed>|<U\+[^>]*>"
# A single leading punctuation character followed by whitespace, any run of
# two or more whitespace characters, or a line feed.
space_etc_regex = r"^[\.'\",]\s+|\s{2,}|\n"

# Pass 1: remove RT, users (@foo), URLs, #s, odd unicode 'tags'
uber_regex =  "|".join([rt_regex, user_regex, url_regex, hashtag_regex, unicode_regex])

labeled_data = labeled_data.withColumn("norm_text", regexp_replace("text", uber_regex, ""))
# Pass 2: replace leading punctuation, whitespace runs and line feeds with a
# single space, then trim the ends.
labeled_data = labeled_data.withColumn("norm_text", trim(regexp_replace("norm_text", space_etc_regex, " ")))
# Drop tweets that normalised to nothing, then de-duplicate on (norm_text, Class).
labeled_data = labeled_data.filter(labeled_data.norm_text != '')
labeled_data = labeled_data.select("norm_text", "Class").distinct()

In [4]:
# Randomly split the labelled tweets into train/test using 0.8/0.2 weights;
# the seed is fixed so the split is repeatable.
SPLIT_WEIGHTS = [0.8, 0.2]
SPLIT_SEED = 71082
train_data, test_data = labeled_data.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

def write_df(df, dirname, filename):
    """Write ``df`` as a single text file at ``dirname/filename``.

    Spark's text writer produces a directory of part files, so the data is
    first written as one partition into a temporary directory named
    ``filename + ".tmp"``; the single part file is then moved to its final
    location through the Hadoop FileSystem API and the temporary directory
    is removed.

    Args:
        df: DataFrame to write; it is handed to ``DataFrameWriter.text``.
        dirname: Destination directory; created if it does not exist.
        filename: Name of the output file inside ``dirname``. Any existing
            file of that name is replaced.

    Raises:
        IOError: If no ``*.txt`` part file was produced, or the part file
            could not be moved into place.
    """
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

    tmp_name = filename + ".tmp"
    df.coalesce(1).write.mode('overwrite').text(tmp_name)

    # Use the Hadoop configuration Spark itself is running with, so the
    # part file is looked up on the same filesystem it was written to.
    fs = FileSystem.get(sc._jsc.hadoopConfiguration())
    try:
        # coalesce(1) should yield at most one part file.
        part_files = fs.globStatus(Path(tmp_name + "/*.txt"))
        if part_files is None or len(part_files) == 0:
            raise IOError("no part file found under %r" % tmp_name)

        fs.mkdirs(Path(dirname))
        target = Path(dirname + "/" + filename)
        # Hadoop rename does not reliably replace an existing destination,
        # so remove stale output from a previous run first.
        if fs.exists(target):
            fs.delete(target, False)
        # rename signals failure through its return value, not an exception.
        if not fs.rename(part_files[0].getPath(), target):
            raise IOError("could not move part file to %r" % (dirname + "/" + filename))
    finally:
        # Always remove the temporary directory, even when the move failed.
        fs.delete(Path(tmp_name), True)
       
# split training data into positive/negative
positive_data = train_data.filter(train_data.Class == "1").select("norm_text")
write_df(positive_data, "train-data.txt", "positive.txt")

# Filter train_data here (not labeled_data): filtering the full labelled set
# would put rows from the held-out test split into the training file.
negative_data = train_data.filter(train_data.Class == "0").select("norm_text")
write_df(negative_data, "train-data.txt", "negative.txt")

# Save the held-out split, replacing any output from a previous run.
test_data.write.mode("overwrite").parquet("test-data.parquet")