In [1]:
#add shuffle and model parameters based on basic model

from pyspark.sql import SparkSession

# supress warnings
import warnings
warnings.filterwarnings('ignore')

spark = SparkSession.builder \
    .appName("Presidential Election Prediction") \
    .getOrCreate()

df = spark.read.parquet("part-00000-9d9060a9-37be-4b14-b8ce-98a3efe55429-c000.snappy.parquet")
df.printSchema()
df.show()  

24/05/08 10:57:59 WARN Utils: Your hostname, yangjingdeMacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.17.186.112 instead (on interface en0)
24/05/08 10:57:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/08 10:58:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/08 10:58:00 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


root
 |-- segment_id: long (nullable = true)
 |-- podcast_name_cleaned: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: string (nullable = true)

+----------+--------------------+--------------------+-----+
|segment_id|podcast_name_cleaned|            features|label|
+----------+--------------------+--------------------+-----+
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.8|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.5|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.8|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|    1|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.7|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.2|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.4|
|         0|Bill OReillys No ...|[-4.0,-0.00676251...|  0.5|
|         1|    pod save america|[10.0,9.362422279...|  0.5|
|         1|    pod save america|[10.0,9.362422279...|  0.7|
|         1|    pod save america

In [2]:
df.dtypes

[('segment_id', 'bigint'),
 ('podcast_name_cleaned', 'string'),
 ('features', 'vector'),
 ('label', 'string')]

In [3]:
from pyspark.sql.functions import when

# Assuming your DataFrame is named df and the label column is named "label"
#notes: the labels should ideally be non-negative integers starting from 0.
df = df.withColumn(
    "label_numeric",
    when(df["label"] == "Positive", 2)
    .when(df["label"] == "Neutral", 1)
    .otherwise(0)
)
df.dtypes

# Drop the original string label column if not needed
# df = df.drop("label")

[('segment_id', 'bigint'),
 ('podcast_name_cleaned', 'string'),
 ('features', 'vector'),
 ('label', 'string'),
 ('label_numeric', 'int')]

In [4]:
from pyspark.ml.classification import NaiveBayes, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler

In [5]:
# Handle Negative Values: since our features columns contains negative values, we use scaling techniques such as MinMaxScaler or StandardScaler to transform or standarize it.
scaler = MinMaxScaler(inputCol="features", outputCol="scaled_features")
scaler_model = scaler.fit(df)
df = scaler_model.transform(df)

In [6]:
# check the podcast_counts frequency distribution 
from pyspark.sql.functions import col, ceil
from pyspark.sql import DataFrame
from pyspark.sql import SparkSession

podcast_counts = df.groupBy("podcast_name_cleaned").count()
podcast_counts_ordered = podcast_counts.orderBy(col("count").desc(), col("podcast_name_cleaned"))
podcast_counts_ordered.show(podcast_counts_ordered.count(), truncate=False)

podcast_counts = podcast_counts.withColumn("total_count", ceil(col("count") * 0.8))  # Assuming 80% training rate
podcast_counts.show(podcast_counts.count(), truncate=False)

+---------------------------------------+-----+
|podcast_name_cleaned                   |count|
+---------------------------------------+-----+
|pod save america                       |120  |
|mark levin                             |92   |
|FiveThirtyEight Politics               |86   |
|ben shapiro                            |66   |
|dan bongino                            |62   |
|The Clay Travis and Buck Sexton Show   |58   |
|Steve Deace                            |57   |
|Bill OReillys No Spin News and Analysis|51   |
|The Wilderness                         |46   |
|Stay Tuned with Preet                  |40   |
|Common Sense with Dan Carlin           |39   |
|The Charlie Kirk Show                  |27   |
|joe rogan                              |26   |
|THE SAVAGE NATION                      |22   |
|npr politics podcast                   |15   |
|Today Explained                        |14   |
|transcripts                            |13   |
|The Rubin Report                       

In [7]:
#split the dataset using stratified sampling strategies to avoid bias and training rate is 80%
train_data = df.join(podcast_counts, "podcast_name_cleaned", "left") \
    .filter(col("count") <= col("total_count"))
test_data = df.join(podcast_counts, "podcast_name_cleaned", "left") \
    .filter(col("count") > col("total_count"))

In [8]:
# Train the model(LogisticRegression)
lr = LogisticRegression(featuresCol="scaled_features", labelCol="label_numeric")
lr_model = lr.fit(train_data)
lr_predictions = lr_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
lr_accuracy = evaluator.evaluate(lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

24/05/08 10:58:05 ERROR Instrumentation: java.lang.IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.
	at scala.Predef$.require(Predef.scala:281)
	at org.apache.spark.ml.stat.SummarizerBuffer.mean(Summarizer.scala:624)
	at org.apache.spark.ml.classification.LogisticRegression.$anonfun$train$1(LogisticRegression.scala:519)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:497)
	at org.apache.spark.ml.classification.LogisticRegression.train(LogisticRegression.scala:287)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:114)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:78)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal

IllegalArgumentException: requirement failed: Nothing has been added to this summarizer.

In [None]:
# notes: for binary classification tasks like SVM, the label conlum should contain only binary values (0 or 1).
# Train the model(SVM)
svm = LinearSVC(featuresCol="scaled_features", labelCol="label_numeric")
svm_model = svm.fit(train_data)
svm_predictions = svm_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
svm_accuracy = evaluator.evaluate(svm_predictions)
print("SVM Accuracy:", svm_accuracy)

In [None]:
# Train the model(NaiveBayes)
nb = NaiveBayes(featuresCol="scaled_features", labelCol="label_numeric")
nb_model = nb.fit(train_data)
nb_predictions = nb_model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label_numeric", predictionCol="prediction", metricName="accuracy")
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)