In [1]:
%%bash
# Build trumptweet_mod.csv from trumptweet.csv (skipped when the output already exists):
#   1. insert an "X_copy" field after the first "X", on each line,
#   2. rename every "X.1", to "X_1", (the '.' matches any single character),
#   3. turn backslash-escaped quotes (\") into doubled quotes ("").
# The result is written to a temporary file and moved into place only when sed
# succeeds, so a failed run (e.g. missing trumptweet.csv) can no longer leave an
# empty or partial trumptweet_mod.csv that later runs would mistake for a finished one.
set -euo pipefail

if [ ! -f trumptweet_mod.csv ]; then
    tmp_file="trumptweet_mod.csv.tmp"
    # Remove the temporary file on any exit; after a successful mv this is a no-op.
    trap 'rm -f "$tmp_file"' EXIT
    sed -e 's/"X",/"X","X_copy",/' \
        -e 's/"X.1",/"X_1",/g' \
        -e 's/\\"/""/g' \
        trumptweet.csv > "$tmp_file"
    mv "$tmp_file" trumptweet_mod.csv
else
    echo "trumptweet_mod.csv already exists"
fi

trumptweet_mod.csv already exists


In [2]:
import pyspark
from pyspark.sql import SparkSession

# Memory settings requested for the Spark executor and driver.
conf = pyspark.SparkConf()
conf.set("spark.executor.memory", "4g")
conf.set("spark.driver.memory", "4g")

# getOrCreate() returns the already-running context/session when there is one, so
# re-executing this cell no longer fails with "Cannot run multiple SparkContexts
# at once". An existing context keeps the configuration it was started with;
# restart the kernel to change the memory settings above.
sc = pyspark.SparkContext.getOrCreate(conf = conf)
spark = SparkSession.builder.config(conf = conf).getOrCreate()

In [21]:
# Read the preprocessed CSV. The quote character doubles as the escape character,
# FAILFAST is requested as the parse mode, and multiLine is enabled for the reader.
labeled_data = spark.read.csv(
    "trumptweet_mod.csv",
    header = True,
    escape = '"',
    encoding = "windows-1252",
    mode = "FAILFAST",
    multiLine = "true",
)

# Quick sanity checks: summary statistics and the per-class row counts.
text_and_class = labeled_data.select("text","Class")
text_and_class.describe().show()
class_counts = labeled_data.groupBy('Class').count()
class_counts.show()

# Keep only rows whose Class is exactly '0' or '1'.
is_class_zero = labeled_data.Class == '0'
is_class_one = labeled_data.Class == '1'
labeled_data = labeled_data.filter(is_class_zero | is_class_one)

+-------+--------------------+------------------+
|summary|                text|             Class|
+-------+--------------------+------------------+
|  count|                4602|              4602|
|   mean|                null|0.5084745762711864|
| stddev|                null|0.4999825016554307|
|    min|"#RedState hypocr...|                 0|
|    max|�trying to draft ...|                 1|
+-------+--------------------+------------------+

+-----+-----+
|Class|count|
+-----+-----+
|    0| 2262|
|    1| 2340|
+-----+-----+



In [22]:
# Normalize tweets
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import trim

# Patterns are evaluated by Spark's regexp_replace (Java regex syntax).
# Retweet marker: match "RT" only as a standalone token. The previous pattern
# wrapped it in lookaheads on *optional* whitespace, which always succeed, so
# "RT" was also stripped from inside words (e.g. "SUPPORT" -> "SUPPO").
rt_regex = r"\bRT\b"
user_regex = r"@\S+"                 # @mentions
url_regex = r"http[s]?:\/\/\S+"      # http/https URLs
hashtag_regex = r"#"                 # only the '#' sign; the tag word is kept
unicode_regex = r"<ed>|<U\+[^>]*>"   # literal "<ed>" and "<U+....>" tags
# Leading punctuation followed by whitespace, runs of 2+ whitespace, and line feeds.
space_etc_regex = r"^[\.'\",]\s+|\s{2,}|\n"

# Remove RT, users (@foo), URLs, #s, odd unicode 'tags'
uber_regex =  "|".join([rt_regex, user_regex, url_regex, hashtag_regex, unicode_regex])

# First pass deletes the unwanted tokens; second pass replaces the whitespace-like
# matches with a single space and trims the ends.
labeled_data = labeled_data.withColumn("norm_text", regexp_replace("text", uber_regex, ""))
labeled_data = labeled_data.withColumn("norm_text", trim(regexp_replace("norm_text", space_etc_regex, " ")))
# Drop tweets that became empty after normalization.
labeled_data = labeled_data.filter(labeled_data.norm_text != '')
labeled_data = labeled_data.select("norm_text", "Class")
labeled_data.describe().show()
labeled_data.show()

+-------+--------------------+------------------+
|summary|           norm_text|             Class|
+-------+--------------------+------------------+
|  count|                4602|              4602|
|   mean|                null|0.5084745762711864|
| stddev|                null|0.4999825016554307|
|    min|" An FYI to all t...|                 0|
|    max|�trying to draft ...|                 1|
+-------+--------------------+------------------+

+--------------------+-----+
|           norm_text|Class|
+--------------------+-----+
|Illegals must be ...|    1|
|is there any othe...|    0|
|Caring - The GOP ...|    0|
|So much stupid go...|    0|
|THE TRUMP IMMIGRA...|    1|
|Christie on Donal...|    0|
|Not a Trump fan, ...|    1|
|Court Has To Step...|    0|
|Trump is correct ...|    1|
|"I�m going to pre...|    1|
|I really hope peo...|    0|
|Trump is claiming...|    0|
|Latest poll has T...|    1|
|BOOM � Univision ...|    1|
|I am now all in f...|    1|
|"His ratings amon...|    0

In [23]:
# 80/20 train/test split with a fixed seed.
split_parts = labeled_data.randomSplit([0.8, 0.2], seed=71082)
train_data, test_data = split_parts

def write_df(df, dirname, filename):
    """Write ``df`` as a single text file at ``dirname/filename``.

    Spark writes text output as a directory of part files, so the DataFrame is
    first coalesced to one partition and written to the temporary directory
    ``filename + ".tmp"``; the single part file is then moved to its final
    name through the Hadoop FileSystem API and the temporary directory removed.

    Args:
        df: DataFrame to write with ``DataFrameWriter.text``.
        dirname: Destination directory (created if missing).
        filename: Name of the resulting file inside ``dirname``.

    Raises:
        IOError: if Spark produced no ``*.txt`` part file, or if the part file
            could not be moved to ``dirname/filename``.
    """
    Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
    FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
    Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

    tmp_name = filename + ".tmp"
    df.coalesce(1).write.mode('overwrite').text(tmp_name)

    fs = FileSystem.get(Configuration())
    target = Path(dirname + "/" + filename)
    try:
        fs.mkdirs(Path(dirname))
        # coalesce(1) should yield exactly one part file; fail clearly if none exists.
        parts = fs.globStatus(Path(tmp_name + "/*.txt"))
        if parts is None or len(parts) == 0:
            raise IOError("no *.txt part file found under " + tmp_name)
        # rename() is not guaranteed to replace an existing destination (it can
        # simply return False), so clear any previous output first.
        if fs.exists(target):
            fs.delete(target, False)
        # rename() reports failure through its return value; do not ignore it.
        if not fs.rename(parts[0].getPath(), target):
            raise IOError("could not move part file to " + dirname + "/" + filename)
    finally:
        # Always remove the temporary output directory, even after a failure.
        fs.delete(Path(tmp_name), True)
       
# split training data into positive/negative
positive_data = train_data.filter(train_data.Class == "1").select("norm_text")
write_df(positive_data, "train-data.txt", "positive.txt")

# Filter train_data here as well (this previously filtered labeled_data, which
# put the held-out test rows into the negative training file).
negative_data = train_data.filter(train_data.Class == "0").select("norm_text")
write_df(negative_data, "train-data.txt", "negative.txt")

# Persist the held-out split for later evaluation.
test_data.write.mode("overwrite").parquet("test-data.parquet")