In [1]:
import findspark
import os
from dotenv import load_dotenv
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField
# Load environment settings first; SESSION_NAME is read from os.environ below
# (load_dotenv is expected to populate it from a .env file -- confirm the file exists).
load_dotenv()

# Configure the session builder: the application name comes from the
# environment, and executor and driver memory are both set to 5g.
session_builder = (
    SparkSession.builder
    .appName(os.environ["SESSION_NAME"])
    .config("spark.executor.memory", "5g")
    .config("spark.driver.memory", "5g")
)

# Create the SparkSession (or reuse an already running one).
spark = session_builder.getOrCreate()

# Schema for the DataFrame: every column is a nullable string, so the fields
# are generated from the ordered list of column names.
schema_columns = ('Number', 'Description', 'Short_description', 'Resolution_notes')
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in schema_columns]
)

In [2]:
# Fake free-text descriptions used as sample input: one in Italian, one in
# English and one in German. The identifiers they contain are invented.
description1 = '''Si richiede la clonazione anagrafica dell'ndg 9342345 comprensiva dei conti correnti
Il cliente in questone, QUIXA ha richiesto una serie di personalizzazioni che andranno collegate all'NDG ed al conto, per i quali si rendono necessari diversi test in UJ.
Q0 non è fondamentale ma potrebbe essere utile averlo allineato.'''
description2 = '''NEW Customer creation (new ndg) not possible because the customer's passport is already available (see error message)
Customer's passport details: Austrian passport number U223424799
01/16/2018 - 01/15/2028
BH Dornbirn.
Urgent case!
Please research where stored.
Thank you!'''
description3 = '''NEU Kundenanlage nicht möglich, da Reisepass von Kundin bereits vergeben ist (siehe Fehlermeldung)
Reisepassdaten von Kundin: Osterreichischer Reisepass U2723429 16.01.2018 - 15.01.2028 BH Dornbirn.
Dringender Fall!
Bitte um recherche, WO gespeichert Vielen Dank!'''

# Invented rows, one per description. Each tuple holds the running number (as
# a string), the description, a short description and the resolution notes.
fake_descriptions = [description1, description2, description3]
data = [
    (
        str(row_number),
        description_text,
        f'Short description {row_number}',
        f'Resolution notes for number {row_number}',
    )
    for row_number, description_text in enumerate(fake_descriptions, start=1)
]

# Build the Spark DataFrame from the invented rows, applying the explicit
# schema so that every column is typed as declared rather than inferred.
df = spark.createDataFrame(data, schema=schema)

In [3]:
# Print the input rows; defaults spelled out (up to 20 rows, long cells truncated).
df.show(n=20, truncate=True)

+------+--------------------+-------------------+--------------------+
|Number|         Description|  Short_description|    Resolution_notes|
+------+--------------------+-------------------+--------------------+
|     1|Si richiede la cl...|Short description 1|Resolution notes ...|
|     2|NEW Customer crea...|Short description 2|Resolution notes ...|
|     3|NEU Kundenanlage ...|Short description 3|Resolution notes ...|
+------+--------------------+-------------------+--------------------+



In [4]:
from myprojects.datasets import utils

In [5]:
# Run the project's compute step on the input frame. Judging by the result
# displayed in the following cell, it appends a "*_anonymized" copy of each
# text column -- NOTE(review): confirm against utils.compute itself.
df_ano = utils.compute(df)

In [7]:
# Print the processed rows; defaults spelled out (up to 20 rows, long cells truncated).
df_ano.show(n=20, truncate=True)

+------+--------------------+-------------------+--------------------+----------------------+----------------------------+---------------------------+
|Number|         Description|  Short_description|    Resolution_notes|Description_anonymized|Short_description_anonymized|Resolution_notes_anonymized|
+------+--------------------+-------------------+--------------------+----------------------+----------------------------+---------------------------+
|     1|Si richiede la cl...|Short description 1|Resolution notes ...|  Si richiede la cl...|         Short description 1|       Resolution notes ...|
|     2|NEW Customer crea...|Short description 2|Resolution notes ...|  NEW Customer crea...|         Short description 2|       Resolution notes ...|
|     3|NEU Kundenanlage ...|Short description 3|Resolution notes ...|  <ANONYMIZED> nich...|         Short description 3|       Resolution notes ...|
+------+--------------------+-------------------+--------------------+----------------------+-