In [1]:
# Setup Spark Session
from pyspark.sql import SparkSession

# Get Imports Needed
from pyspark.sql.functions import col, udf, regexp_replace, lower
from pyspark.sql import functions as F


# Get Datatypes needed for DataFrame manipulation
from pyspark.sql.types import IntegerType, StringType, ArrayType

# Other Imports
import re  # Import the "re" module for regular expressions


# Setup Spark Session
sc = SparkSession \
        .builder \
        .master("local[*]") \
        .appName("Qualyfing-Exam") \
        .getOrCreate()

# Print Spark Version being run
print("Spark V: ", sc.version)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/08/11 08:46:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Spark V:  3.3.2


In [2]:
# Setup the Dataframes for Training & Testing
# Read the English & German Datasets into Dataframes
df_en = sc.read.csv("data/pd_en_translated.csv", header=True, inferSchema=True)
df_de = sc.read.csv("data/pd_de_translated.csv", header=True, inferSchema=True)
print(f"English Row Count: {df_en.count()}")
print(f"German Row Count: {df_de.count()}")

# Add Numerical Label for Emotions
emotion_key = {
    "boredom": 0,
    "love": 1,
    "relief": 2,
    "fun": 3,
    "hate": 4,
    "neutral": 5,
    "anger": 6,
    "happiness": 7,
    "surprise": 8,
    "sadness": 9,
    "worry": 10,
    "enthusiasm": 11,
    "empty": 12,
    "---": 13
}

# Create a mapping function to map emotion_en to label
def map_emotion(label):
    return emotion_key[label]

# Add a new column "label" to the DataFrame with numerical emotion labels
map_emotion_udf = F.udf(map_emotion, IntegerType())
df_en = df_en.withColumn("label", map_emotion_udf("emotion_en"))
df_de = df_de.withColumn("label", map_emotion_udf("emotion_en"))

# Cast the Label to Double
df_en = df_en.withColumn("label", col("label").cast("double"))
df_de = df_de.withColumn("label", col("label").cast("double"))


# Make Sure there are No Rows with NULLs in the sentence columns
df_en = df_en.dropna(subset=["sentence_en"])
df_en = df_en.dropna(subset=["sentence_de"])

df_de = df_de.dropna(subset=["sentence_de"])
df_de = df_de.dropna(subset=["sentence_en"])


# Function to clean Sentence
def clean_sentence(sentence):
    sentence = re.sub(r'@\w+', '', sentence) # Remove mentions (@user)
    sentence = re.sub(r'#\w+', '', sentence) # Remove hashtags (#weekend)
    sentence = re.sub(r'https?://\S+|www\.\S+|bit\.ly/\S+', '', sentence) # Remove URLs
    sentence = re.sub(r"[^\w\s]", "", sentence) # Remove special characters and symbols
    sentence = sentence.lower() # Convert to lowercase
    sentence = re.sub(r'\s+', ' ', sentence).strip() # Remove multiple spaces and leading/trailing spaces
    sentence = re.sub(r'\t', ' ', sentence) # Remove tabs
    return sentence

# Create a User-Defined Function (UDF) to apply the sentence cleaning function to the DataFrame
clean_sentence_udf = udf(clean_sentence, StringType())
df_en = df_en.withColumn("sentence_en", clean_sentence_udf("sentence_en"))
df_en = df_en.withColumn("sentence_de", clean_sentence_udf("sentence_de"))

df_de = df_de.withColumn("sentence_en", clean_sentence_udf("sentence_en"))
df_de = df_de.withColumn("sentence_de", clean_sentence_udf("sentence_de"))


# Split the English and German Dataframes for Training and Testing
# We are using a 20/80 Split
df_en_train, df_en_test = df_en.randomSplit([0.85, 0.15], seed=2023)
df_de_train, df_de_test = df_de.randomSplit([0.85, 0.15], seed=2023)
print(f"English Train Row Count: {df_en_train.count()}")
print(f"German Train Row Count: {df_de_train.count()}")


# Create the Extended Dataframe wiht Translated Data
df_en_train_extended = df_en_train.union(df_de.select(*df_en_train.columns))
df_de_train_extended = df_de_train.union(df_en.select(*df_de_train.columns))
print(f"English Extended Train Row Count: {df_en_train_extended.count()}")
print(f"German Extended Train Row Count: {df_de_train_extended.count()}")


df_en_train.groupBy("label").count().show()
df_en_train_extended.groupBy("label").count().show()


English Row Count: 1500
German Row Count: 1500
English Train Row Count: 1288
German Train Row Count: 1281
English Extended Train Row Count: 2779
German Extended Train Row Count: 2781
+-----+-----+
|label|count|
+-----+-----+
|  8.0|   81|
|  7.0|  186|
|  1.0|  142|
|  4.0|   42|
| 11.0|   25|
| 10.0|  288|
|  6.0|    8|
|  5.0|  310|
|  9.0|  206|
+-----+-----+

+-----+-----+
|label|count|
+-----+-----+
|  8.0|  291|
|  7.0|  271|
|  1.0|  334|
|  4.0|   62|
| 11.0|  494|
| 10.0|  375|
|  6.0|  140|
|  5.0|  499|
|  9.0|  313|
+-----+-----+



In [16]:
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, StringIndexer
from pyspark.ml.classification import NaiveBayes, RandomForestClassifier, LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, Tokenizer, StopWordsRemover, CountVectorizer, IDF, Word2Vec
from transformers import DistilBertTokenizer

import os
import shutil


# Clean Dataframes (becasue of past iterations)
df_en_train = df_en_train.select("label", "sentence_en", "sentence_de")
df_en_test = df_en_test.select("label", "sentence_en", "sentence_de")
df_de_train = df_de_train.select("label", "sentence_en", "sentence_de")
df_de_test = df_de_test.select("label", "sentence_en", "sentence_de")


# Initialize the AutoTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Define a UDF to tokenize the text column
def tokenize_text(text):
    return tokenizer.tokenize(text)

tokenize_udf = udf(tokenize_text, ArrayType(StringType()))


# Tokenize the text column using the UDF
df_en_train = df_en_train.withColumn("words", tokenize_udf("sentence_en"))
df_en_test = df_en_test.withColumn("words", tokenize_udf("sentence_en"))

df_de_train = df_de_train.withColumn("words", tokenize_udf("sentence_de"))
df_de_test = df_de_test.withColumn("words", tokenize_udf("sentence_de"))


# Prepare the feature column by applying various transformations
stop_words_remover = StopWordsRemover(inputCol="words", 
                                      outputCol="filtered_words")

cv = CountVectorizer(inputCol="filtered_words", 
                     outputCol="raw_features", 
                     vocabSize=1000)

idf = IDF(inputCol="raw_features", 
          outputCol="features")

word2Vec = Word2Vec(vectorSize=100,
                    minCount=3,
                    inputCol="filtered_words",
                    outputCol="features")

# Models
rfc = RandomForestClassifier(labelCol="label", 
                             featuresCol="features", 
                             maxBins=32,
                             numTrees=100, 
                             maxDepth=10)

nb = NaiveBayes(labelCol="label", 
                featuresCol="features", 
                modelType="multinomial",
                smoothing=1.0)

lr = LogisticRegression(labelCol="label", 
                        featuresCol="features", 
                        regParam=1.0,
                        elasticNetParam=0.01,
                        maxIter=100)


# Create the pipeline
pipeline_rfc = Pipeline(stages=[stop_words_remover, word2Vec, rfc ])
pipeline_nb  = Pipeline(stages=[stop_words_remover, cv,       idf, nb])
pipeline_lr  = Pipeline(stages=[stop_words_remover, cv,       idf, lr])


# Fit the pipeline to the training data
# This is essentially creating the model
model_en_rfc = pipeline_rfc.fit(df_en_train)
model_en_nb  = pipeline_nb.fit(df_en_train)
model_en_lr  = pipeline_lr.fit(df_en_train)
model_de_rfc = pipeline_rfc.fit(df_de_train)
model_de_nb  = pipeline_nb.fit(df_de_train)
model_de_lr  = pipeline_lr.fit(df_de_train)

# Save Trained Models to Disk
# These will later on be used by the RESTful API

# First Cleanup Old Models
for root, dirs, files in os.walk("./models/", topdown=False):
    for dir_name in dirs:
        folder_path = os.path.join(root, dir_name)
        shutil.rmtree(folder_path)
        print(f"Removed folder: {folder_path}")

model_en_rfc.save("./models/model_en_rfc.model")
model_en_nb.save("./models/model_en_nb.model")
model_en_lr.save("./models/model_en_lr.model")
model_de_rfc.save("./models/model_de_rfc.model")
model_de_nb.save("./models/model_de_nb.model")
model_de_lr.save("./models/model_de_lr.model")


# Make Predictions
predictions_en_rfc = model_en_rfc.transform(df_en_test)
predictions_en_nb = model_en_nb.transform(df_en_test)
predictions_en_lr = model_en_lr.transform(df_en_test)
predictions_de_rfc = model_de_rfc.transform(df_de_test)
predictions_de_nb = model_de_nb.transform(df_de_test)
predictions_de_lr = model_de_lr.transform(df_de_test)


# Evaluate the model's performance
eval_ac = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
eval_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

# Process English
accuracy_en_rfc = eval_ac.evaluate(predictions_en_rfc)
accuracy_en_nb = eval_ac.evaluate(predictions_en_nb)
accuracy_en_lr = eval_ac.evaluate(predictions_en_lr)

f1_en_rfc = eval_f1.evaluate(predictions_en_rfc)
f1_en_nb = eval_f1.evaluate(predictions_en_nb)
f1_en_lr = eval_f1.evaluate(predictions_en_lr)

# Process German
accuracy_de_rfc = eval_ac.evaluate(predictions_de_rfc)
accuracy_de_nb = eval_ac.evaluate(predictions_de_nb)
accuracy_de_lr = eval_ac.evaluate(predictions_de_lr)

f1_de_rfc = eval_f1.evaluate(predictions_de_rfc)
f1_de_nb = eval_f1.evaluate(predictions_de_nb)
f1_de_lr = eval_f1.evaluate(predictions_de_lr)

print()
print("English Dataset")
print(f"Test Accuracy (RFC): {accuracy_en_rfc:.4f} with an F1: {f1_en_rfc:.4f}")
print(f"Test Accuracy (NB): {accuracy_en_nb:.4f} with an F1: {f1_en_nb:.4f}")
print(f"Test Accuracy (LR): {accuracy_en_lr:.4f} with an F1: {f1_en_lr:.4f}")
print()

print()
print("German Dataset")
print(f"Test Accuracy (RFC): {accuracy_de_rfc:.4f} with an F1: {f1_de_rfc:.4f}")
print(f"Test Accuracy (NB): {accuracy_de_nb:.4f} with an F1: {f1_de_nb:.4f}")
print(f"Test Accuracy (LR): {accuracy_de_lr:.4f} with an F1: {f1_de_lr:.4f}")
print()


23/08/11 09:17:45 WARN DAGScheduler: Broadcasting large task binary with size 1449.0 KiB
23/08/11 09:17:46 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
23/08/11 09:17:47 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB



[Stage 1763:>                                                       (0 + 1) / 1]

                                                                                

23/08/11 09:17:47 WARN DAGScheduler: Broadcasting large task binary with size 4.8 MiB



[Stage 1765:>                                                       (0 + 1) / 1]

                                                                                

23/08/11 09:17:48 WARN DAGScheduler: Broadcasting large task binary with size 6.4 MiB



[Stage 1767:>                                                       (0 + 1) / 1]

                                                                                

23/08/11 09:17:55 WARN DAGScheduler: Broadcasting large task binary with size 1513.8 KiB
23/08/11 09:17:55 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
23/08/11 09:17:56 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB



[Stage 1868:>                                                       (0 + 1) / 1]

                                                                                

23/08/11 09:17:57 WARN DAGScheduler: Broadcasting large task binary with size 5.3 MiB



[Stage 1870:>                                                       (0 + 1) / 1]

                                                                                

23/08/11 09:17:58 WARN DAGScheduler: Broadcasting large task binary with size 7.1 MiB


                                                                                

Removed folder: ./models/model_en_nb.model/stages/0_StopWordsRemover_26ba14aeb971/metadata
Removed folder: ./models/model_en_nb.model/stages/3_NaiveBayes_3aca16295988/data
Removed folder: ./models/model_en_nb.model/stages/3_NaiveBayes_3aca16295988/metadata
Removed folder: ./models/model_en_nb.model/stages/1_CountVectorizer_450bf45e4a27/data
Removed folder: ./models/model_en_nb.model/stages/1_CountVectorizer_450bf45e4a27/metadata
Removed folder: ./models/model_en_nb.model/stages/2_IDF_f91d20a38f2d/data
Removed folder: ./models/model_en_nb.model/stages/2_IDF_f91d20a38f2d/metadata
Removed folder: ./models/model_en_nb.model/stages/0_StopWordsRemover_26ba14aeb971
Removed folder: ./models/model_en_nb.model/stages/3_NaiveBayes_3aca16295988
Removed folder: ./models/model_en_nb.model/stages/1_CountVectorizer_450bf45e4a27
Removed folder: ./models/model_en_nb.model/stages/2_IDF_f91d20a38f2d
Removed folder: ./models/model_en_nb.model/stages
Removed folder: ./models/model_en_nb.model/metadata
Remov

In [23]:
print("DE - Labels")
predictions_de_rfc.groupBy("label").count().show()

print("DE - RFC")
predictions_de_rfc.groupBy("prediction").count().show()

print("DE - LR")
predictions_de_lr.groupBy("prediction").count().show()

print("DE - NB")
predictions_de_nb.groupBy("prediction").count().show()

print("EN - Labels")
predictions_en_rfc.groupBy("label").count().show()

print("EN - RFC")
predictions_en_rfc.groupBy("prediction").count().show()

print("EN - LR")
predictions_en_lr.groupBy("prediction").count().show()

print("EN - NB")
predictions_en_nb.groupBy("prediction").count().show()

DE - Labels
+-----+-----+
|label|count|
+-----+-----+
|  8.0|   24|
|  7.0|   10|
|  1.0|   25|
|  4.0|    5|
| 11.0|   72|
| 10.0|    9|
|  6.0|   23|
|  5.0|   30|
|  9.0|   12|
+-----+-----+

DE - RFC
23/08/11 10:09:35 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
23/08/11 10:09:35 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
+----------+-----+
|prediction|count|
+----------+-----+
|       8.0|   11|
|       7.0|    5|
|       1.0|   13|
|       4.0|    2|
|      11.0|  161|
|      10.0|    4|
|       6.0|    7|
|       5.0|    5|
|       9.0|    2|
+----------+-----+

DE - LR
+----------+-----+
|prediction|count|
+----------+-----+
|      11.0|  209|
|       6.0|    1|
+----------+-----+

DE - NB
+----------+-----+
|prediction|count|
+----------+-----+
|       8.0|   47|
|       0.0|   34|
|       7.0|   20|
|       1.0|    5|
|       4.0|   16|
|       3.0|   16|
|       2.0|   28|
|       6.0|   20|
|       5.0|   24|
+----------+-----

In [22]:
# Setup API 
from flask import Flask, request, jsonify
from googletrans import Translator

# Setup
from transformers import DistilBertTokenizer
from pyspark.ml.pipeline import PipelineModel


# Initialize the AutoTokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")


# Load Saved Models
model_en_rfc = PipelineModel.load("./models/model_en_rfc.model")
model_en_nb = PipelineModel.load("./models/model_en_nb.model")
model_en_lr = PipelineModel.load("./models/model_en_lr.model")
model_de_rfc = PipelineModel.load("./models/model_de_rfc.model")
model_de_nb = PipelineModel.load("./models/model_de_nb.model")
model_de_lr = PipelineModel.load("./models/model_de_lr.model")


app = Flask(__name__)
translator = Translator()

# Setup Emotion Key Reverse
emotions = {v: k for k, v in emotion_key.items()}

@app.route('/predict', methods=['GET'])
def transform_data():
    sentence_en = request.args.get('sentence_en')
    
    if not sentence_en:
        return jsonify({'error': 'No Sentence (EN) provided!'})
    
    sentence_de = translator.translate(sentence_en, dest="de").text
    print(f"English: {sentence_en}")
    print(f"German: {sentence_de}")
    
    words_en = tokenizer.tokenize(sentence_en)
    words_de = tokenizer.tokenize(sentence_de)

    # Create Englsih & German Dataframe
    df_en_api = sc.createDataFrame([(sentence_en, words_en)], ["sentence", "words"])
    df_de_api = sc.createDataFrame([(sentence_de, words_de)], ["sentence", "words"])
    
    predictions_en_rfc = model_en_rfc.transform(df_en_api)
    predictions_en_nb = model_en_nb.transform(df_en_api)
    predictions_en_lr = model_en_lr.transform(df_en_api)
    predictions_de_rfc = model_de_rfc.transform(df_de_api)
    predictions_de_nb = model_de_nb.transform(df_de_api)
    predictions_de_lr = model_de_lr.transform(df_de_api)
    
    #Build Response
    resp = {
        "sentence": {
            "english":sentence_en,
            "german":sentence_de,
        },
        "predictions": {
            "en_rfc": emotions.get(predictions_en_rfc.collect()[0]["prediction"]),
            "en_nb":  emotions.get(predictions_en_nb.collect()[0]["prediction"]),
            "en_lr":  emotions.get(predictions_en_lr.collect()[0]["prediction"]),
            "de_rfc": emotions.get(predictions_de_rfc.collect()[0]["prediction"]),
            "de_nb":  emotions.get(predictions_de_nb.collect()[0]["prediction"]),
            "de_lr":  emotions.get(predictions_de_lr.collect()[0]["prediction"]),
        },
    }
    
    return jsonify(resp)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=8080)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:8080
[33mPress CTRL+C to quit[0m


English: I am so happy
German: ich bin so glücklich
23/08/11 09:31:01 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:31:02 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:31:02] "GET /predict?sentence_en=I%20am%20so%20happy HTTP/1.1" 200 -


English: This is not good
German: Das ist nicht gut
23/08/11 09:31:13 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:31:14 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:31:14] "GET /predict?sentence_en=This%20is%20not%20good HTTP/1.1" 200 -


English: This is not bad
German: Das ist nicht schlecht
23/08/11 09:31:28 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:31:29 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:31:29] "GET /predict?sentence_en=This%20is%20not%20bad HTTP/1.1" 200 -


English: This is not bad lets do it
German: Das ist nicht schlecht, lass es uns tun
23/08/11 09:31:47 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:31:47 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:31:48] "GET /predict?sentence_en=This%20is%20not%20bad%20lets%20do%20it HTTP/1.1" 200 -


English: This is bad lets do it
German: Das ist schlecht, lass es uns tun
23/08/11 09:32:03 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:32:03 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:32:03] "GET /predict?sentence_en=This%20is%20bad%20lets%20do%20it HTTP/1.1" 200 -


English: I would love to participate
German: Ich würde gerne teilnehmen
23/08/11 09:32:19 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:32:20 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:32:20] "GET /predict?sentence_en=I%20would%20love%20to%20participate HTTP/1.1" 200 -


English: I would love
German: Ich würde lieben
23/08/11 09:32:33 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:32:33 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:32:33] "GET /predict?sentence_en=I%20would%20love HTTP/1.1" 200 -


English: Wie intelligent ist künstliche Intelligenz
German: Wie intelligent ist künstliche Intelligenz
23/08/11 09:33:23 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:33:24 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:33:24] "GET /predict?sentence_en=Wie%20intelligent%20ist%20künstliche%20Intelligenz HTTP/1.1" 200 -


English: How smart is AI
German: Wie schlau ist KI
23/08/11 09:33:36 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:33:36 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:33:36] "GET /predict?sentence_en=How%20smart%20is%20AI HTTP/1.1" 200 -


English: How intelligent is artificial intelligence?
German: Wie intelligent ist künstliche Intelligenz?
23/08/11 09:34:05 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:34:06 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:34:06] "GET /predict?sentence_en=How%20intelligent%20is%20artificial%20intelligence? HTTP/1.1" 200 -


English: How intelligent are you 
German: Wie intelligent bist du
23/08/11 09:34:22 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:34:23 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:34:23] "GET /predict?sentence_en=How%20intelligent%20are%20you%20 HTTP/1.1" 200 -


English: I am not looking forward to this
German: Ich freue mich nicht darauf
23/08/11 09:34:44 WARN DAGScheduler: Broadcasting large task binary with size 5.2 MiB
23/08/11 09:34:45 WARN DAGScheduler: Broadcasting large task binary with size 5.5 MiB


127.0.0.1 - - [11/Aug/2023 09:34:45] "GET /predict?sentence_en=I%20am%20not%20looking%20forward%20to%20this HTTP/1.1" 200 -
