In [1]:
#add shuffle and model parameters based on basic model

from pyspark.sql import SparkSession

# supress warnings
import warnings
warnings.filterwarnings('ignore')

spark = SparkSession.builder \
    .appName("Presidential Election Prediction") \
    .getOrCreate()

df = spark.read.parquet("part-00000-bb5ebda7-fafc-4784-a59f-3c66739c28ed-c000.snappy.parquet")
df.printSchema()
df.show()  

24/05/01 03:33:37 WARN Utils: Your hostname, yangjingdeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 192.168.1.7 instead (on interface en0)
24/05/01 03:33:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/01 03:33:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


root
 |-- podcast_name_cleaned: string (nullable = true)
 |-- trump_mention: integer (nullable = true)
 |-- biden_mention: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: string (nullable = true)

+--------------------+-------------+-------------+--------------------+--------+
|podcast_name_cleaned|trump_mention|biden_mention|            features|   label|
+--------------------+-------------+-------------+--------------------+--------+
|Bill OReillys No ...|            1|            0|[-4.0,0.012453206...|Negative|
|    pod save america|            0|            1|[10.0,0.015965217...| Neutral|
|Bill OReillys No ...|            1|            0|[29.0,0.005646936...|Positive|
|The Charlie Kirk ...|            1|            1|[20.0,0.029572529...| Neutral|
|The Clay Travis a...|            1|            1|[19.0,0.024224661...| Neutral|
|      The Wilderness|            0|            1|[54.0,0.036696821...|Positive|
|      The Wilderness|            0|        

In [2]:
df.dtypes

[('podcast_name_cleaned', 'string'),
 ('trump_mention', 'int'),
 ('biden_mention', 'int'),
 ('features', 'vector'),
 ('label', 'string')]

In [3]:
from pyspark.sql.functions import when

# Assuming your DataFrame is named df and the label column is named "label"
#notes: the labels should ideally be non-negative integers starting from 0.
df = df.withColumn(
    "label_numeric",
    when(df["label"] == "Positive", 2)
    .when(df["label"] == "Neutral", 1)
    .otherwise(0)
)
df.dtypes

# Drop the original string label column if not needed
# df = df.drop("label")

[('podcast_name_cleaned', 'string'),
 ('trump_mention', 'int'),
 ('biden_mention', 'int'),
 ('features', 'vector'),
 ('label', 'string'),
 ('label_numeric', 'int')]

In [4]:
from pyspark.ml.classification import NaiveBayes, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler

In [5]:
# Handle Negative Values: since our features columns contains negative values, we use scaling techniques such as MinMaxScaler or StandardScaler to transform or standarize it.
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [6]:
# check the podcast_counts frequency distribution 
from pyspark.sql.functions import col, ceil
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession

podcast_counts = df.groupBy("podcast_name_cleaned").count()
podcast_counts_ordered = podcast_counts.orderBy(col("count").desc(), col("podcast_name_cleaned"))
podcast_counts_ordered.show(podcast_counts_ordered.count(), truncate=False)

podcast_counts = podcast_counts.withColumn("total_count", ceil(col("count") * 0.8))  # Assuming 80% training rate
podcast_counts.show(podcast_counts.count(), truncate=False)

+---------------------------------------+-----+
|podcast_name_cleaned                   |count|
+---------------------------------------+-----+
|pod save america                       |123  |
|mark levin                             |77   |
|FiveThirtyEight Politics               |75   |
|The Clay Travis and Buck Sexton Show   |70   |
|dan bongino                            |68   |
|ben shapiro                            |67   |
|Steve Deace                            |63   |
|The Wilderness                         |55   |
|Common Sense with Dan Carlin           |34   |
|The Rubin Report                       |34   |
|npr politics podcast                   |34   |
|Bill OReillys No Spin News and Analysis|33   |
|Stay Tuned with Preet                  |29   |
|THE SAVAGE NATION                      |29   |
|joe rogan                              |22   |
|the daily                              |15   |
|The Charlie Kirk Show                  |14   |
|transcripts                            

In [11]:
#split the dataset using stratified sampling strategies to avoid bias and training rate is 80%
train_data = df.join(podcast_counts, "podcast_name_cleaned", "left") \
    .filter(col("count") <= col("total_count"))
test_data = df.join(podcast_counts, "podcast_name_cleaned", "left") \
    .filter(col("count") > col("total_count"))

In [12]:
# Train the model(LogisticRegression)
lr = LogisticRegression(featuresCol="scaled_features", labelCol="label_numeric")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

Logistic Regression Accuracy: 0.375


In [None]:
# notes: for binary classification tasks like SVM, the label conlum should contain only binary values (0 or 1).
# Train the model(SVM)
svm = LinearSVC(featuresCol="scaled_features", labelCol="label_numeric")
svm_model = svm.fit(train_data)
svm_predictions = svm_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
svm_accuracy = evaluator.evaluate(svm_predictions)
print("SVM Accuracy:", svm_accuracy)

In [13]:
# Train the model(NaiveBayes)
nb = NaiveBayes(featuresCol="scaled_features", labelCol="label_numeric")
nb_model = nb.fit(train_data)
nb_predictions = nb_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)

Naive Bayes Accuracy: 0.3830275229357798
