In [1]:
# Setup Spark Session
from pyspark.sql import SparkSession

# Get Imports Needed
from pyspark.sql.functions import col, udf, regexp_replace, lower
from pyspark.sql import functions as F


# Get Datatypes needed for DataFrame manipulation
from pyspark.sql.types import IntegerType, StringType, ArrayType

# Other Imports
import re  # Import the "re" module for regular expressions


# Build (or reuse) a Spark session running in local mode.
# Note: despite the short name, `sc` is a SparkSession, not a SparkContext.
sc = (
    SparkSession.builder
    .master("local[*]")
    .appName("Qualyfing-Exam")
    .getOrCreate()
)

# Report the Spark version this notebook is running against
print("Spark V: ", sc.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/08/08 17:06:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark V:  3.3.2


In [2]:
# Load the English and German datasets from CSV into DataFrames.
# Both files are read with a header row and with Spark inferring column types.
csv_options = {"header": True, "inferSchema": True}
df_en = sc.read.csv("data/pd_en_translated.csv", **csv_options)
df_de = sc.read.csv("data/pd_de_translated.csv", **csv_options)

# Row counts as loaded, before any filtering
print(f"English Row Count: {df_en.count()}")
print(f"German Row Count: {df_de.count()}")

# Add a numerical label for each emotion name.
# Codes are assigned by position in this sequence: "boredom" -> 0 ... "---" -> 13.
emotion_key = {
    name: code
    for code, name in enumerate((
        "boredom",
        "love",
        "relief",
        "fun",
        "hate",
        "neutral",
        "anger",
        "happiness",
        "surprise",
        "sadness",
        "worry",
        "enthusiasm",
        "empty",
        "---",
    ))
}


def map_emotion(label):
    """Return the integer code stored in ``emotion_key`` for ``label``.

    Raises:
        KeyError: if ``label`` is not a key of ``emotion_key``.
    """
    code = emotion_key[label]
    return code

# Wrap the emotion lookup in an integer-returning UDF, then derive the numeric
# "label" column from "emotion_en" and store it as a double in both DataFrames.
map_emotion_udf = F.udf(map_emotion, IntegerType())
df_en = df_en.withColumn("label", map_emotion_udf("emotion_en").cast("double"))
df_de = df_de.withColumn("label", map_emotion_udf("emotion_en").cast("double"))


# Keep only rows where both sentence columns are present.
# dropna drops a row when any column listed in `subset` is NULL.
df_en = df_en.dropna(subset=["sentence_en", "sentence_de"])
df_de = df_de.dropna(subset=["sentence_de", "sentence_en"])


# Function to clean a sentence
def clean_sentence(sentence):
    """Normalise a raw sentence for tokenisation.

    Steps, applied in this order:
      1. remove mentions (``@user``) and hashtags (``#weekend``) entirely,
         including the word that follows the marker;
      2. remove URLs (``http(s)://...``, ``www....``, ``bit.ly/...``);
      3. remove every character that is neither a word character nor whitespace;
      4. lowercase the text;
      5. collapse each whitespace run (spaces, tabs, newlines) into one space
         and strip leading/trailing whitespace.

    Args:
        sentence: the text to clean, or ``None``.

    Returns:
        The cleaned string, or ``None`` when ``sentence`` is ``None`` so that
        a NULL value passes through instead of raising inside ``re.sub``.
    """
    if sentence is None:
        return None

    sentence = re.sub(r'@\w+', '', sentence)  # Remove mentions (@user)
    sentence = re.sub(r'#\w+', '', sentence)  # Remove hashtags (#weekend)
    sentence = re.sub(r'https?://\S+|www\.\S+|bit\.ly/\S+', '', sentence)  # Remove URLs
    sentence = re.sub(r"[^\w\s]", "", sentence)  # Remove special characters and symbols
    sentence = sentence.lower()  # Convert to lowercase
    # `\s+` also matches tabs, so no separate tab replacement is needed afterwards.
    return re.sub(r'\s+', ' ', sentence).strip()

# Expose clean_sentence as a string-returning Spark UDF and overwrite both
# sentence columns of each DataFrame with their cleaned versions.
clean_sentence_udf = udf(clean_sentence, StringType())

df_en = (
    df_en
    .withColumn("sentence_en", clean_sentence_udf("sentence_en"))
    .withColumn("sentence_de", clean_sentence_udf("sentence_de"))
)

df_de = (
    df_de
    .withColumn("sentence_en", clean_sentence_udf("sentence_en"))
    .withColumn("sentence_de", clean_sentence_udf("sentence_de"))
)


# Split the English and German DataFrames into training and testing sets.
# The weights request an 85/15 train/test split; the same seed is passed for both.
df_en_train, df_en_test = df_en.randomSplit(weights=[0.85, 0.15], seed=2023)
df_de_train, df_de_test = df_de.randomSplit(weights=[0.85, 0.15], seed=2023)
print(f"English Train Row Count: {df_en_train.count()}")
print(f"German Train Row Count: {df_de_train.count()}")


# Create the extended training DataFrames with translated data: each training
# split is unioned with every row of the other language's DataFrame. union()
# matches columns by position, so the other frame's columns are first selected
# in the training frame's column order.
df_en_train_extended = df_en_train.union(df_de.select(df_en_train.columns))
df_de_train_extended = df_de_train.union(df_en.select(df_de_train.columns))
print(f"English Extended Train Row Count: {df_en_train_extended.count()}")
print(f"German Extended Train Row Count: {df_de_train_extended.count()}")


# Per-label row counts: plain English training split vs. extended version
df_en_train.groupBy("label").count().show()
df_en_train_extended.groupBy("label").count().show()


English Row Count: 1500
German Row Count: 1500
English Train Row Count: 1288
German Train Row Count: 1281
English Extended Train Row Count: 2779
German Extended Train Row Count: 2781
+-----+-----+
|label|count|
+-----+-----+
|  8.0|   81|
|  7.0|  186|
|  1.0|  142|
|  4.0|   42|
| 11.0|   25|
| 10.0|  288|
|  6.0|    8|
|  5.0|  310|
|  9.0|  206|
+-----+-----+

+-----+-----+
|label|count|
+-----+-----+
|  8.0|  291|
|  7.0|  271|
|  1.0|  334|
|  4.0|   62|
| 11.0|  494|
| 10.0|  375|
|  6.0|  140|
|  5.0|  499|
|  9.0|  313|
+-----+-----+



In [19]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF

from transformers import DistilBertTokenizer

# Choose the train/test splits and the sentence column used by this experiment
df_train, df_test = df_en_train, df_en_test
col_sentence = "sentence_en"


# Load the pretrained DistilBERT tokenizer for the multilingual cased checkpoint.
# NOTE(review): the checkpoint is "cased" while clean_sentence lowercases the
# text before it gets here -- confirm this combination is intended.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")


def tokenize_text(text):
    """Return the tokens produced by the module-level ``tokenizer`` for ``text``."""
    tokens = tokenizer.tokenize(text)
    return tokens


# Spark UDF wrapper around tokenize_text; its result type is array<string>
tokenize_udf = udf(tokenize_text, ArrayType(StringType()))


# Add a "words" column holding the tokens of the selected sentence column
df_train = df_train.withColumn("words", tokenize_udf(col(col_sentence)))
df_test = df_test.withColumn("words", tokenize_udf(col(col_sentence)))


# Feature stages, in order:
#   words -> filtered_words   (stop-word removal)
#   filtered_words -> raw_features   (term counts, vocabulary capped at 1000)
#   raw_features -> features   (IDF re-weighting)
# NOTE(review): StopWordsRemover is created without an explicit stop-word list;
# confirm the default list suits the language selected by `col_sentence`.
stop_words_remover = StopWordsRemover(outputCol="filtered_words", inputCol="words")
cv = CountVectorizer(vocabSize=1000, outputCol="raw_features", inputCol="filtered_words")
idf = IDF(outputCol="features", inputCol="raw_features")

# Chain the stages into a single pipeline
from pyspark.ml import Pipeline

feature_stages = [stop_words_remover, cv, idf]
pipeline = Pipeline(stages=feature_stages)

# Fit on the training data only; the fitted model is then applied to both splits
pipeline_model = pipeline.fit(df_train)

# Transform each split and keep just the columns the classifiers consume
df_train = pipeline_model.transform(df_train).select("features", "label")
df_test = pipeline_model.transform(df_test).select("features", "label")


# Train three classifiers on the same "features"/"label" columns and score the
# test split with each of them.

# (A) Random forest. Training is randomised, so a fixed seed is supplied to make
# repeated runs comparable (2023 matches the seed used for the train/test split).
rf = RandomForestClassifier(labelCol="label", featuresCol="features", seed=2023)
model_a = rf.fit(df_train)

# (B) Naive Bayes. `nb_model` is the unfitted estimator; `model_b` is the fitted model.
nb_model = NaiveBayes(labelCol="label", featuresCol="features")
model_b = nb_model.fit(df_train)

# (C) "GLM": logistic regression, run for at most 500 iterations.
# `glm_model` is the unfitted estimator; `model_c` is the fitted model.
glm_model = LogisticRegression(labelCol="label", featuresCol="features", maxIter=500)
model_c = glm_model.fit(df_train)


# Make predictions on the test data with each fitted model
predictions_a = model_a.transform(df_test)
predictions_b = model_b.transform(df_test)
predictions_c = model_c.transform(df_test)


# Score each model's test predictions with two evaluators: one configured for
# the "accuracy" metric and one for the "f1" metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction"
)
evaluator_f1 = MulticlassClassificationEvaluator(
    metricName="f1", labelCol="label", predictionCol="prediction"
)

# Predictions in the order A (random forest), B (naive Bayes), C (GLM)
all_predictions = (predictions_a, predictions_b, predictions_c)

accuracy_a, accuracy_b, accuracy_c = [
    evaluator.evaluate(predicted) for predicted in all_predictions
]
f1_a, f1_b, f1_c = [
    evaluator_f1.evaluate(predicted) for predicted in all_predictions
]

# Summary report, framed by blank lines
print()
print(f"Test Accuracy (A) - Random Forest: {accuracy_a:.4f} with an F1: {f1_a:.4f}")
print(f"Test Accuracy (B) - Naive Bayes: {accuracy_b:.4f} with an F1: {f1_b:.4f}")
print(f"Test Accuracy (C) - GLM: {accuracy_c:.4f} with an F1: {f1_c:.4f}")
print()



Test Accuracy (A) - Random Forest: 0.2877 with an F1: 0.2074
Test Accuracy (B) - Naive Bayes: 0.0566 with an F1: 0.0692
Test Accuracy (C) - GLM: 0.2594 with an F1: 0.2592



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics, MultilabelMetrics


def _distinct_labels(predictions):
    """Collect the distinct values of the "label" column of ``predictions`` into a list."""
    return predictions.select("label").distinct().rdd.flatMap(lambda row: row).collect()


def _print_label_metrics(metrics, labels):
    """Print precision, recall and F1 score from ``metrics`` for each label.

    Labels are reported in ascending order; each label's report is followed by
    a blank line.
    """
    for label in sorted(labels):
        print(f"Label: {label}")
        print(f"Precision: {metrics.precision(label):.4f}")
        print(f"Recall: {metrics.recall(label):.4f}")
        print(f"F1 Score: {metrics.fMeasure(label):.4f}")
        print()


# Build the metrics objects from (prediction, label) pairs of each model
metrics_a = MulticlassMetrics(predictions_a.select("prediction", "label").rdd)
metrics_b = MulticlassMetrics(predictions_b.select("prediction", "label").rdd)
metrics_c = MulticlassMetrics(predictions_c.select("prediction", "label").rdd)

# Calculate the confusion matrices
confusion_matrix_a = metrics_a.confusionMatrix()
confusion_matrix_b = metrics_b.confusionMatrix()
confusion_matrix_c = metrics_c.confusionMatrix()

# Display the results
for heading, matrix in (
    ("Confusion Matrix (A):", confusion_matrix_a),
    ("Confusion Matrix (B):", confusion_matrix_b),
    ("Confusion Matrix (C):", confusion_matrix_c),
):
    print(heading)
    print(matrix)
    print()


# Get additional metrics for each label present in the scored data
labels_a = _distinct_labels(predictions_a)
labels_b = _distinct_labels(predictions_b)
labels_c = _distinct_labels(predictions_c)

_print_label_metrics(metrics_a, labels_a)
_print_label_metrics(metrics_b, labels_b)
_print_label_metrics(metrics_c, labels_c)