In [None]:
# Setup Spark Session
from pyspark.sql import SparkSession

# Get Imports Needed
from pyspark.sql.functions import col, udf, regexp_replace, lower
from pyspark.sql import functions as F


# Get Datatypes needed for DataFrame manipulation
from pyspark.sql.types import IntegerType, StringType, ArrayType

# Other Imports
import re  # Import the "re" module for regular expressions


# Build (or reuse) a Spark session that runs locally on all available cores
sc = (
    SparkSession.builder
    .master("local[*]")
    .appName("Qualyfing-Exam")
    .getOrCreate()
)

# Report which Spark version the session is running
print("Spark V: ", sc.version)

In [None]:
# Load the English and German datasets that are later split for training & testing.
# Both CSV files carry a header row; column types are inferred by Spark.
df_en = sc.read.csv(
    path="data/pd_en_translated.csv",
    header=True,
    inferSchema=True,
)
df_de = sc.read.csv(
    path="data/pd_de_translated.csv",
    header=True,
    inferSchema=True,
)

# Sanity check: how many rows did each file contribute?
print(f"English Row Count: {df_en.count()}")
print(f"German Row Count: {df_de.count()}")

# Add Numerical Label for Emotions
emotion_key = {
    "boredom": 0,
    "love": 1,
    "relief": 2,
    "fun": 3,
    "hate": 4,
    "neutral": 5,
    "anger": 6,
    "happiness": 7,
    "surprise": 8,
    "sadness": 9,
    "worry": 10,
    "enthusiasm": 11,
    "empty": 12,
    "---": 13
}

# Create a mapping function to map emotion_en to label
def map_emotion(label):
    """Translate an emotion name into its numerical class id.

    Args:
        label: Emotion name to look up in ``emotion_key``. May be ``None``
            when the source cell is null.

    Returns:
        The integer id registered in ``emotion_key``, or ``None`` when the
        label is null or not a known emotion. Returning ``None`` (a null in
        the resulting column) keeps a single unexpected value from raising a
        KeyError inside the UDF and aborting the whole Spark job.
    """
    return emotion_key.get(label)

# Wrap the emotion lookup in a UDF so it can be applied column-wise
map_emotion_udf = F.udf(map_emotion, IntegerType())

# For each dataset:
#   1. derive the numerical "label" column from "emotion_en"
#   2. cast the label to double
#   3. drop every row where either sentence column is NULL
df_en = (
    df_en
    .withColumn("label", map_emotion_udf(F.col("emotion_en")))
    .withColumn("label", F.col("label").cast("double"))
    .dropna(how="any", subset=["sentence_en", "sentence_de"])
)

df_de = (
    df_de
    .withColumn("label", map_emotion_udf(F.col("emotion_en")))
    .withColumn("label", F.col("label").cast("double"))
    .dropna(how="any", subset=["sentence_de", "sentence_en"])
)


# Function to clean Sentence
def clean_sentence(sentence):
    """Normalise a raw sentence for tokenization.

    Steps, in order: strip @mentions, #hashtags and URLs, drop every
    character that is neither a word character nor whitespace, lowercase,
    then collapse all whitespace runs (spaces, tabs, newlines) into single
    spaces and trim both ends.

    Args:
        sentence: Raw text, or ``None`` for a null cell.

    Returns:
        The cleaned text, or ``None`` when ``sentence`` is ``None`` (nulls
        reach a Spark UDF as ``None`` and would otherwise make ``re.sub``
        raise a TypeError).
    """
    if sentence is None:
        return None

    sentence = re.sub(r'@\w+', '', sentence) # Remove mentions (@user)
    sentence = re.sub(r'#\w+', '', sentence) # Remove hashtags (#weekend)
    sentence = re.sub(r'https?://\S+|www\.\S+|bit\.ly/\S+', '', sentence) # Remove URLs
    sentence = re.sub(r"[^\w\s]", "", sentence) # Remove special characters and symbols
    sentence = sentence.lower() # Convert to lowercase
    # \s+ also matches tabs, so no separate tab replacement is needed afterwards
    return re.sub(r'\s+', ' ', sentence).strip()

# Wrap the sentence cleaner in a UDF and run it over both sentence columns of each dataset
clean_sentence_udf = udf(clean_sentence, StringType())

df_en = (
    df_en
    .withColumn("sentence_en", clean_sentence_udf("sentence_en"))
    .withColumn("sentence_de", clean_sentence_udf("sentence_de"))
)

df_de = (
    df_de
    .withColumn("sentence_en", clean_sentence_udf("sentence_en"))
    .withColumn("sentence_de", clean_sentence_udf("sentence_de"))
)


# Split the English and German Dataframes for Training and Testing
# Weights are 85% training / 15% testing, with a fixed seed for repeatable splits
df_en_train, df_en_test = df_en.randomSplit(weights=[0.85, 0.15], seed=2023)
df_de_train, df_de_test = df_de.randomSplit(weights=[0.85, 0.15], seed=2023)
print(f"English Train Row Count: {df_en_train.count()}")
print(f"German Train Row Count: {df_de_train.count()}")


# Create the Extended Dataframes with Translated Data:
# each training split is unioned with the complete other-language dataset,
# with the columns re-ordered to match the training split
df_en_train_extended = df_en_train.union(
    df_de.select(*df_en_train.columns)
)
df_de_train_extended = df_de_train.union(
    df_en.select(*df_de_train.columns)
)
print(f"English Extended Train Row Count: {df_en_train_extended.count()}")
print(f"German Extended Train Row Count: {df_de_train_extended.count()}")


# Label distribution before and after extending the English training data
df_en_train.groupBy("label").count().show()
df_en_train_extended.groupBy("label").count().show()


In [None]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, Word2Vec
from transformers import DistilBertTokenizer

import os
import shutil


# Reduce every split to the three columns used below
# (drops columns left over from earlier iterations of this cell)
df_en_train, df_en_test, df_de_train, df_de_test = (
    frame.select("label", "sentence_en", "sentence_de")
    for frame in (df_en_train, df_en_test, df_de_train, df_de_test)
)


# Load the pretrained multilingual DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(
    "distilbert-base-multilingual-cased"
)

# Define a UDF to tokenize the text column
def tokenize_text(text):
    """Split a sentence into tokens using the module-level ``tokenizer``.

    Args:
        text: Sentence to tokenize. ``None`` (a null cell) and the empty
            string are accepted.

    Returns:
        A list of token strings. Null or empty input yields an empty list
        without calling the tokenizer, so a stray null cannot fail the UDF.
    """
    if not text:
        return []
    return tokenizer.tokenize(text)

tokenize_udf = udf(tokenize_text, ArrayType(StringType()))


# Add a "words" column holding the tokens of each sentence.
# English splits are tokenized from "sentence_en" ...
df_en_train, df_en_test = (
    frame.withColumn("words", tokenize_udf("sentence_en"))
    for frame in (df_en_train, df_en_test)
)

# ... and German splits from "sentence_de"
df_de_train, df_de_test = (
    frame.withColumn("words", tokenize_udf("sentence_de"))
    for frame in (df_de_train, df_de_test)
)


# Prepare the feature column by applying various transformations

# StopWordsRemover uses the English stop-word list unless told otherwise,
# so the English pipelines keep the default remover ...
stop_words_remover = StopWordsRemover(inputCol="words",
                                      outputCol="filtered_words")

# ... and the German pipelines get a remover loaded with Spark's German list.
stop_words_remover_de = StopWordsRemover(inputCol="words",
                                         outputCol="filtered_words",
                                         stopWords=StopWordsRemover.loadDefaultStopWords("german"))

# Bag-of-words counts limited to the 1000 most frequent terms
cv = CountVectorizer(inputCol="filtered_words",
                     outputCol="raw_features",
                     vocabSize=1000)

# Re-weight the raw counts by inverse document frequency
idf = IDF(inputCol="raw_features",
          outputCol="features")

# Averaged word embeddings (used by the random forest pipelines only).
# The seed matches the one used for the train/test split so that reruns
# start from the same random state.
word2Vec = Word2Vec(vectorSize=100,
                    minCount=3,
                    seed=2023,
                    inputCol="filtered_words",
                    outputCol="features")

# Models
# Random forests are stochastic as well, hence the explicit seed.
rfc = RandomForestClassifier(labelCol="label",
                             featuresCol="features",
                             maxBins=32,
                             numTrees=100,
                             maxDepth=10,
                             seed=2023)

nb = NaiveBayes(labelCol="label",
                featuresCol="features",
                modelType="multinomial",
                smoothing=1.0)

lr = LogisticRegression(labelCol="label",
                        featuresCol="features",
                        regParam=1.0,
                        elasticNetParam=0.01,
                        maxIter=100)


# Create the pipelines
# English: default (English) stop words
pipeline_rfc = Pipeline(stages=[stop_words_remover, word2Vec, rfc ])
pipeline_nb  = Pipeline(stages=[stop_words_remover, cv,       idf, nb])
pipeline_lr  = Pipeline(stages=[stop_words_remover, cv,       idf, lr])

# German: identical stages except for the German stop-word remover
pipeline_de_rfc = Pipeline(stages=[stop_words_remover_de, word2Vec, rfc ])
pipeline_de_nb  = Pipeline(stages=[stop_words_remover_de, cv,       idf, nb])
pipeline_de_lr  = Pipeline(stages=[stop_words_remover_de, cv,       idf, lr])


# Fit the pipelines to the training data
# This is essentially creating the models
model_en_rfc = pipeline_rfc.fit(df_en_train)
model_en_nb  = pipeline_nb.fit(df_en_train)
model_en_lr  = pipeline_lr.fit(df_en_train)
model_de_rfc = pipeline_de_rfc.fit(df_de_train)
model_de_nb  = pipeline_de_nb.fit(df_de_train)
model_de_lr  = pipeline_de_lr.fit(df_de_train)

# Save Trained Models to Disk
# These will later on be used by the RESTful API

# First Cleanup Old Models
def remove_model_folders(models_dir):
    """Delete every folder located directly inside ``models_dir``.

    Removing a top-level folder already deletes everything nested in it, so
    there is no need to walk the tree and remove (and log) each nested
    directory separately. Plain files inside ``models_dir`` are left alone,
    and symbolic links are skipped because ``shutil.rmtree`` refuses them.

    Args:
        models_dir: Directory that holds the saved model folders. A missing
            directory is treated as "nothing to clean up".

    Returns:
        The paths of the removed folders, sorted by folder name.
    """
    removed = []
    if not os.path.isdir(models_dir):
        return removed

    # Materialise and sort the listing first so deletion order is stable
    with os.scandir(models_dir) as entries:
        candidates = sorted(entries, key=lambda entry: entry.name)

    for entry in candidates:
        if entry.is_dir(follow_symlinks=False):
            shutil.rmtree(entry.path)
            print(f"Removed folder: {entry.path}")
            removed.append(entry.path)
    return removed


remove_model_folders("./models/")

# Persist each fitted pipeline under ./models/, in the same order as they were trained
for fitted_model, target_path in (
    (model_en_rfc, "./models/model_en_rfc.model"),
    (model_en_nb, "./models/model_en_nb.model"),
    (model_en_lr, "./models/model_en_lr.model"),
    (model_de_rfc, "./models/model_de_rfc.model"),
    (model_de_nb, "./models/model_de_nb.model"),
    (model_de_lr, "./models/model_de_lr.model"),
):
    fitted_model.save(target_path)


# Score the held-out test split of each language with every fitted model
predictions_en_rfc, predictions_en_nb, predictions_en_lr = (
    fitted.transform(df_en_test)
    for fitted in (model_en_rfc, model_en_nb, model_en_lr)
)
predictions_de_rfc, predictions_de_nb, predictions_de_lr = (
    fitted.transform(df_de_test)
    for fitted in (model_de_rfc, model_de_nb, model_de_lr)
)


# Evaluators comparing the "prediction" column against "label":
# one reports accuracy, the other the F1 score
eval_ac = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
eval_f1 = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

# English: accuracy first, then F1, each in RFC / NB / LR order
accuracy_en_rfc, accuracy_en_nb, accuracy_en_lr = (
    eval_ac.evaluate(scored)
    for scored in (predictions_en_rfc, predictions_en_nb, predictions_en_lr)
)
f1_en_rfc, f1_en_nb, f1_en_lr = (
    eval_f1.evaluate(scored)
    for scored in (predictions_en_rfc, predictions_en_nb, predictions_en_lr)
)

# German: same metrics, same order
accuracy_de_rfc, accuracy_de_nb, accuracy_de_lr = (
    eval_ac.evaluate(scored)
    for scored in (predictions_de_rfc, predictions_de_nb, predictions_de_lr)
)
f1_de_rfc, f1_de_nb, f1_de_lr = (
    eval_f1.evaluate(scored)
    for scored in (predictions_de_rfc, predictions_de_nb, predictions_de_lr)
)

# Report the English results
print()
print("English Dataset")
print(f"Test Accuracy (RFC): {accuracy_en_rfc:.4f} with an F1: {f1_en_rfc:.4f}")
print(f"Test Accuracy (NB): {accuracy_en_nb:.4f} with an F1: {f1_en_nb:.4f}")
print(f"Test Accuracy (LR): {accuracy_en_lr:.4f} with an F1: {f1_en_lr:.4f}")
print()

# Report the German results
print()
print("German Dataset")
print(f"Test Accuracy (RFC): {accuracy_de_rfc:.4f} with an F1: {f1_de_rfc:.4f}")
print(f"Test Accuracy (NB): {accuracy_de_nb:.4f} with an F1: {f1_de_nb:.4f}")
print(f"Test Accuracy (LR): {accuracy_de_lr:.4f} with an F1: {f1_de_lr:.4f}")
print()


In [None]:
def show_value_counts(title, frame, column):
    """Print ``title``, then show how many rows ``frame`` has per value of ``column``."""
    print(title)
    frame.groupBy(column).count().show()


# German: true label distribution followed by what each model predicted
show_value_counts("DE - Labels", predictions_de_rfc, "label")
show_value_counts("DE - RFC", predictions_de_rfc, "prediction")
show_value_counts("DE - LR", predictions_de_lr, "prediction")
show_value_counts("DE - NB", predictions_de_nb, "prediction")

# English: same breakdown
show_value_counts("EN - Labels", predictions_en_rfc, "label")
show_value_counts("EN - RFC", predictions_en_rfc, "prediction")
show_value_counts("EN - LR", predictions_en_lr, "prediction")
show_value_counts("EN - NB", predictions_en_nb, "prediction")