# Text Processing:
---

In [1]:
import sparknlp

spark = sparknlp.start()

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2,application_1731014642264_0003,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [2]:
sc._conf.get('spark.executor.memory')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

'9486M'

## Librerias:

In [3]:
# PySpark SQL Modules
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit, when, trim, split, udf, expr
from pyspark.sql.types import StringType, ArrayType
from sparknlp.annotator import *

# PySpark ML Modules
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer, StopWordsRemover

# Other Libraries
import os

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Lectura de Datos
---

In [4]:
# Ruta al archivo JSON
file_path = 's3://bucketspark14/complaints.json'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Cargar archivo JSON en Spark DataFrame
df_tickets = spark.read.json(file_path)

# Desanidar las columnas dentro de _source
df_tickets_flat = df_tickets.select(
    "_id", "_index", "_score",
    col("_source.tags").alias("tags"),
    col("_source.zip_code").alias("zip_code"),
    col("_source.complaint_id").alias("complaint_id"),
    col("_source.issue").alias("issue"),
    col("_source.date_received").alias("date_received"),
    col("_source.state").alias("state"),
    col("_source.consumer_disputed").alias("consumer_disputed"),
    col("_source.product").alias("product"),
    col("_source.company_response").alias("company_response"),
    col("_source.company").alias("company"),
    col("_source.submitted_via").alias("submitted_via"),
    col("_source.date_sent_to_company").alias("date_sent_to_company"),
    col("_source.company_public_response").alias("company_public_response"),
    col("_source.sub_product").alias("sub_product"),
    col("_source.timely").alias("timely"),
    col("_source.complaint_what_happened").alias("complaint_what_happened"),
    col("_source.sub_issue").alias("sub_issue"),
    col("_source.consumer_consent_provided").alias("consumer_consent_provided"),
    "_type"
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Limpieza de Datos
---

In [6]:
# Seleccionar la columnas
df_filtered = df_tickets_flat.select('complaint_what_happened','product','sub_product')

# Concatenar las columnas 'category' y 'sub_category' con un '+' en medio
df_filtered = df_filtered.withColumn('category', F.concat_ws('+', F.col('product'), F.col('sub_product')))

# Eliminar la columna 'sub_category'
df_filtered = df_filtered.drop('product','sub_product')

# Reemplazar cadenas vacías por nulos
df_filtered = df_filtered.withColumn("complaint_what_happened", when(trim(col("complaint_what_happened")) == "", lit(None)).otherwise(col("complaint_what_happened")))        

# Eliminar filas con nulos en columnas especificadas
df_filtered = df_filtered.dropna(subset=["complaint_what_happened", "category"])

df_filtered.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+--------------------+
|complaint_what_happened|            category|
+-----------------------+--------------------+
|   Good morning my n...|Debt collection+C...|
|   I upgraded my XXX...|Credit card or pr...|
|   Chase Card was re...|Credit reporting,...|
|   On XX/XX/2018, wh...|Credit reporting,...|
|   my grand son give...|Checking or savin...|
|   Can you please re...|Credit reporting,...|
|   With out notice J...|Checking or savin...|
|   During the summer...|Vehicle loan or l...|
|   On XXXX XX/XX/201...|Money transfer, v...|
|   I have a Chase cr...|Credit card or pr...|
|   mishandling of th...|Vehicle loan or l...|
|   I have reached ou...|Credit reporting,...|
|   I opened an accou...|Checking or savin...|
|   To whom it may co...|Checking or savin...|
|   My chase amazon c...|Credit card or pr...|
|   I opened the savi...|Checking or savin...|
|   XXXX XXXX a sofa,...|Checking or savin...|
|   My card went miss...|Checking or savin...|
|   Chase sen

In [7]:
# Definir una UDF para aplicar todas las transformaciones en Spark
def clean_text_spark(df, text_column):
    # Convertir el texto a minúsculas
    df = df.withColumn(text_column, F.lower(F.col(text_column)))
    
    # Eliminar texto en corchetes []
    df = df.withColumn(text_column, F.regexp_replace(F.col(text_column), r'\[.*?\]', ''))
    
    # Eliminar puntuación
    df = df.withColumn(text_column, F.regexp_replace(F.col(text_column), r'[^\w\s]', ''))
    
    # Eliminar palabras que contienen números
    df = df.withColumn(text_column, F.regexp_replace(F.col(text_column), r'\b\w*\d\w*\b', ''))
    
    # Eliminar espacios en blanco adicionales
    df = df.withColumn(text_column, F.trim(F.col(text_column)))
    
    return df

# Aplicar la función de limpieza
df_cleanedxx = clean_text_spark(df_filtered, 'complaint_what_happened')

# Mostrar el resultado limpio
df_cleanedxx.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+--------------------+
|complaint_what_happened|            category|
+-----------------------+--------------------+
|   good morning my n...|Debt collection+C...|
|   i upgraded my xxx...|Credit card or pr...|
|   chase card was re...|Credit reporting,...|
|   on  while trying ...|Credit reporting,...|
|   my grand son give...|Checking or savin...|
|   can you please re...|Credit reporting,...|
|   with out notice j...|Checking or savin...|
|   during the summer...|Vehicle loan or l...|
|   on xxxx  i made a...|Money transfer, v...|
|   i have a chase cr...|Credit card or pr...|
|   mishandling of th...|Vehicle loan or l...|
|   i have reached ou...|Credit reporting,...|
|   i opened an accou...|Checking or savin...|
|   to whom it may co...|Checking or savin...|
|   my chase amazon c...|Credit card or pr...|
|   i opened the savi...|Checking or savin...|
|   xxxx xxxx a sofa ...|Checking or savin...|
|   my card went miss...|Checking or savin...|
|   chase sen

In [8]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import re

# Define a UDF to remove "xxxx" and any extra spaces left by the removal
def remove_xxxx(text):
    # Remove "xxxx" and any spaces left behind
    return re.sub(r"\s*xxxx\s*", "", text)

# Register the UDF with Spark
remove_xxxx_udf = udf(remove_xxxx, StringType())

# Apply the UDF to the 'complaint_what_happened' column
df_cleaned = df_cleanedxx.withColumn("complaint_what_happened", remove_xxxx_udf("complaint_what_happened"))

# Verify the column after cleaning
df_cleaned.show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------------+--------------------+
|complaint_what_happened|            category|
+-----------------------+--------------------+
|   good morning my n...|Debt collection+C...|
|   i upgraded mycard...|Credit card or pr...|
|   chase card was re...|Credit reporting,...|
|   on  while trying ...|Credit reporting,...|
|   my grand son give...|Checking or savin...|
|   can you please re...|Credit reporting,...|
|   with out notice j...|Checking or savin...|
|   during the summer...|Vehicle loan or l...|
|   oni made a  payme...|Money transfer, v...|
|   i have a chase cr...|Credit card or pr...|
|   mishandling of th...|Vehicle loan or l...|
|   i have reached ou...|Credit reporting,...|
|   i opened an accou...|Checking or savin...|
|   to whom it may co...|Checking or savin...|
|   my chase amazon c...|Credit card or pr...|
|   i opened the savi...|Checking or savin...|
|   a sofa love seat ...|Checking or savin...|
|   my card went miss...|Checking or savin...|
|   chase sen

## Lemmatization, Stopwords, Pos Tagging, CountVectorizer.
---

In [9]:
from pyspark.ml import Pipeline, Transformer
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType
import sparknlp
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, LemmatizerModel, PerceptronModel, StopWordsCleaner

# Iniciar sesión de Spark con Spark NLP
# spark = sparknlp.start()

# Document Assembler
document_assembler = DocumentAssembler() \
    .setInputCol("complaint_what_happened") \
    .setOutputCol("document")

# Tokenizer
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Stop Words Remover
stop_words_cleaner = StopWordsCleaner() \
    .setInputCols(["token"]) \
    .setOutputCol("cleanTokens") \
    .setCaseSensitive(False)

# POS Tagger BERT
pos_tagger = PerceptronModel.pretrained() \
    .setInputCols(["document", "cleanTokens"]) \
    .setOutputCol("pos")

# Lemmatizer
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(["cleanTokens"]) \
    .setOutputCol("lemma")

# Finisher to extract results from annotations
finisher = Finisher() \
    .setInputCols(["lemma", "pos"]) \
    .setOutputCols(["finished_lemma", "finished_pos"]) \
    .setCleanAnnotations(False)

# Custom Transformer to remove pronouns and extract nouns
class NounExtractor(Transformer):
    def __init__(self, inputColPOS=None, inputColLemma=None, outputCol=None):
        super(NounExtractor, self).__init__()
        self.inputColPOS = inputColPOS
        self.inputColLemma = inputColLemma
        self.outputCol = outputCol

    def _transform(self, dataset):
        def extract_nouns(pos_tags, lemmas):
            #noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
            noun_tags = ['NN']
            return [lemma for pos, lemma in zip(pos_tags, lemmas) if pos in noun_tags]
        
        extract_nouns_udf = udf(extract_nouns, ArrayType(StringType()))
        return dataset.withColumn(self.outputCol, extract_nouns_udf(col(self.inputColPOS), col(self.inputColLemma)))

# Instantiate custom transformer
noun_extractor = NounExtractor(inputColPOS="finished_pos", inputColLemma="finished_lemma", outputCol="noun_tokens")

# Build the pipeline
pipeline = Pipeline(stages=[
    document_assembler,
    tokenizer,
    stop_words_cleaner,
    pos_tagger,
    lemmatizer,
    finisher,
    noun_extractor
])

# Fit and transform the data
pipeline_model = pipeline.fit(df_cleaned)
df_final = pipeline_model.transform(df_cleaned)
df_final.select("noun_tokens", "pos").show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

pos_anc download started this may take some time.
Approximate size to download 3.9 MB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]
+--------------------+--------------------+
|         noun_tokens|                 pos|
+--------------------+--------------------+
|[morning, name, i...|[{pos, 0, 3, JJ, ...|
|[mycard, agent, u...|[{pos, 2, 9, JJ, ...|
|[chase, card, app...|[{pos, 0, 4, NN, ...|
|[book, aticket, o...|[{pos, 10, 15, VB...|
|[son, check, depo...|[{pos, 3, 7, JJ, ...|
|           [inquiry]|[{pos, 8, 13, VB,...|
|[notice, jp, morg...|[{pos, 9, 14, NN,...|
|[summer, experien...|[{pos, 11, 16, NN...|
|[oni, payment, on...|[{pos, 0, 2, NN, ...|
|[chase, credit, c...|[{pos, 9, 13, NN,...|
|[account, chase, ...|[{pos, 0, 10, VBG...|
|[attempt, inquiry...|[{pos, 7, 13, VBD...|
|[account, chase, ...|[{pos, 2, 7, VBD,...|
|[bank, alert, bal...|[{pos, 11, 13, MD...|
|[chase, amazon, c...|[{pos, 3, 7, NN, ...|
|[account, bonus,

## NGRAM COUNTVECTORIZER-IDF

In [10]:
from pyspark.ml.feature import NGram, CountVectorizer, IDF
from pyspark.sql.functions import array_union, col

# Generar bigramas y trigramas a partir de noun_tokens
ngram2 = NGram(n=2, inputCol="noun_tokens", outputCol="bigrams")
ngram3 = NGram(n=3, inputCol="noun_tokens", outputCol="trigrams")

df_bigrams = ngram2.transform(df_final)
df_trigrams = ngram3.transform(df_bigrams)

# Combinar palabras individuales, bigramas y trigramas en una sola columna llamada all_ngrams
df_combined = df_trigrams.withColumn(
    "all_ngrams", array_union(array_union(col("noun_tokens"), col("bigrams")), col("trigrams"))
)

# Aplicar CountVectorizer para obtener la matriz de frecuencias de términos en noun_tokens
count_vectorizer = CountVectorizer(inputCol="noun_tokens", outputCol="tf_features", minDF=2, maxDF=0.95)
cv_model = count_vectorizer.fit(df_final)
df_tf = cv_model.transform(df_final)

# Aplicar IDF para obtener los valores de TF-IDF
idf = IDF(inputCol="tf_features", outputCol="tfidf_features")
idf_model = idf.fit(df_tf)
df_tfidf = idf_model.transform(df_tf)

# Mostrar los resultados
df_tfidf.select("noun_tokens", "tfidf_features").show()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------------+
|         noun_tokens|      tfidf_features|
+--------------------+--------------------+
|[morning, name, i...|(14263,[0,3,10,38...|
|[mycard, agent, u...|(14263,[10,40,75,...|
|[chase, card, app...|(14263,[0,2,4,76,...|
|[book, aticket, o...|(14263,[0,1,2,3,4...|
|[son, check, depo...|(14263,[0,1,3,6,7...|
|           [inquiry]|(14263,[128],[3.2...|
|[notice, jp, morg...|(14263,[0,1,3,4,6...|
|[summer, experien...|(14263,[0,5,9,10,...|
|[oni, payment, on...|(14263,[0,3,5,7,8...|
|[chase, credit, c...|(14263,[0,2,4,16,...|
|[account, chase, ...|(14263,[0,1,139],...|
|[attempt, inquiry...|(14263,[0,1,2,3,1...|
|[account, chase, ...|(14263,[0,1,3,32,...|
|[bank, alert, bal...|(14263,[0,3,7,11,...|
|[chase, amazon, c...|(14263,[0,1,4,11,...|
|[account, bonus, ...|(14263,[1,6,38,45...|
|[sofa, love, seat...|(14263,[3,4,7,26,...|
|[card, didnt, rea...|(14263,[1,3,4,7,3...|
|[chase, email, to...|(14263,[0,1,7,8,1...|
|[purchase, withon...|(14263,[0,

### LDA

In [11]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import NGram, CountVectorizer, IDF, Tokenizer
from pyspark.ml.clustering import LDA
from pyspark.sql.functions import col, array_union, expr, udf
from pyspark.sql.types import IntegerType
import numpy as np

# Crear sesión de Spark (si aún no está creada)
# spark = SparkSession.builder.appName("LDA Example").getOrCreate()

# Aplicar LDA para encontrar patrones latentes en las PQRS
lda = LDA(k=5, seed=1, maxIter=25, featuresCol="tfidf_features")
lda_model = lda.fit(df_tfidf)

# Transformar los datos para asignar cada transacción a un tema
df_transformed = lda_model.transform(df_tfidf)


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
def get_predominant_topic(topic_distribution):
    return int(np.argmax(topic_distribution))

# Registrar la función UDF para obtener el tema predominante
get_predominant_topic_udf = udf(get_predominant_topic, IntegerType())

# Agregar columna para el tema predominante y mantener la distribución de temas
df_results = df_transformed.withColumn("tema_predominante", get_predominant_topic_udf("topicDistribution"))

# Mostrar el tema predominante y la distribución de temas para cada documento
df_results.select("topicDistribution", "tema_predominante").show()

# Obtener los términos predominantes de cada tema
topics = lda_model.describeTopics(10)  # Obtiene los 10 términos principales por tema

# Extraer los términos del vocabulario para interpretar cada tema
vocab = cv_model.vocabulary  # Supone que 'cv_model' es el CountVectorizer ajustado anteriormente

# Mostrar términos predominantes y distribución de temas para cada tema
for topic in range(5):  # Itera sobre el número de temas
    term_indices = topics.where(col("topic") == topic).select("termIndices").head()[0]
    term_weights = topics.where(col("topic") == topic).select("termWeights").head()[0]
    terms = [vocab[idx] for idx in term_indices]
    print(f"Tema {topic}:")
    print("Términos predominantes:", terms)
    print("Pesos de términos:", term_weights)
    print("\n")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----------------+
|   topicDistribution|tema_predominante|
+--------------------+-----------------+
|[0.00262342188206...|                4|
|[0.00241075125210...|                4|
|[0.00730151254814...|                1|
|[0.00118021653137...|                4|
|[0.00193692144151...|                1|
|[0.03911115134350...|                1|
|[7.76745246158478...|                4|
|[0.16822794859553...|                4|
|[4.61600810463970...|                1|
|[0.01468848764267...|                1|
|[0.03256511554981...|                4|
|[0.00206036935529...|                1|
|[0.00366844232673...|                2|
|[0.00345387375760...|                1|
|[9.07194327753448...|                1|
|[0.00303454248669...|                2|
|[0.00362646637784...|                1|
|[0.00978145144809...|                1|
|[0.19425021867640...|                4|
|[0.51561641455677...|                0|
+--------------------+-----------------+
only showing top

In [13]:
from pyspark.sql import functions as F

# Agrupar por `tema_predominante` y contar la cantidad de documentos en cada tema
df_topic_counts = df_results.groupBy("tema_predominante").agg(F.count("*").alias("count"))

# Mostrar la distribución de temas predominantes
df_topic_counts.orderBy("tema_predominante").show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----+
|tema_predominante|count|
+-----------------+-----+
|                0| 2995|
|                1| 8681|
|                2| 1302|
|                3| 2022|
|                4| 6072|
+-----------------+-----+

## Modelado

### Regresion Logistica

In [28]:
## Logistic regression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Assume df_results is the DataFrame containing 'tfidf_features' and 'tema_predominante'
# Split the data into training and test sets
train_data, test_data = df_results.randomSplit([0.7, 0.3], seed=42)

# Create the Logistic Regression model
lr = LogisticRegression(featuresCol='tfidf_features', labelCol='tema_predominante', maxIter=20)

# Fit the model on the training data
lr_model = lr.fit(train_data)

# Make predictions on the test data
predictions = lr_model.transform(test_data)

# Evaluate the model using MulticlassClassificationEvaluator
# Accuracy
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator_accuracy.evaluate(predictions)
print(f"Test Set Accuracy = {accuracy}")

# F1 Score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="f1")
f1 = evaluator_f1.evaluate(predictions)
print(f"Test Set F1 Score = {f1}")

# Recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="weightedRecall")
recall = evaluator_recall.evaluate(predictions)
print(f"Test Set Recall = {recall}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Test Set Accuracy = 0.7203836930455636
Test Set F1 Score = 0.7256315794001083
Test Set Recall = 0.7203836930455635

### Decision Tree

In [17]:
from pyspark.ml.classification import DecisionTreeClassifier

# Crear el modelo Decision Tree
dt = DecisionTreeClassifier(featuresCol='tfidf_features', labelCol='tema_predominante', maxDepth=5, seed=42)

# Entrenar el modelo con los datos de entrenamiento
dt_model = dt.fit(train_data)

# Realizar predicciones en los datos de prueba
dt_predictions = dt_model.transform(test_data)

# Evaluar el modelo usando MulticlassClassificationEvaluator

# Exactitud (Accuracy)
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="accuracy")
dt_accuracy = evaluator_accuracy.evaluate(dt_predictions)
print(f"Exactitud del conjunto de prueba (Decision Tree) = {dt_accuracy}")

# Puntaje F1
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="f1")
dt_f1 = evaluator_f1.evaluate(dt_predictions)
print(f"Puntaje F1 del conjunto de prueba (Decision Tree) = {dt_f1}")

# Recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="weightedRecall")
dt_recall = evaluator_recall.evaluate(dt_predictions)
print(f"Recall del conjunto de prueba (Decision Tree) = {dt_recall}")


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exactitud del conjunto de prueba (Decision Tree) = 0.5162270183852917
Puntaje F1 del conjunto de prueba (Decision Tree) = 0.42286866429605396
Recall del conjunto de prueba (Decision Tree) = 0.5162270183852918

### Random Forest

In [18]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Crear el modelo Random Forest
rf = RandomForestClassifier(featuresCol='tfidf_features', labelCol='tema_predominante', numTrees=50, seed=42)

# Entrenar el modelo con los datos de entrenamiento
rf_model = rf.fit(train_data)

# Realizar predicciones en los datos de prueba
rf_predictions = rf_model.transform(test_data)

# Evaluar el modelo usando MulticlassClassificationEvaluator

# Exactitud (Accuracy)
evaluator_accuracy = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="accuracy")
rf_accuracy = evaluator_accuracy.evaluate(rf_predictions)
print(f"Exactitud del conjunto de prueba (Random Forest) = {rf_accuracy}")

# Puntaje F1
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="f1")
rf_f1 = evaluator_f1.evaluate(rf_predictions)
print(f"Puntaje F1 del conjunto de prueba (Random Forest) = {rf_f1}")

# Recall
evaluator_recall = MulticlassClassificationEvaluator(
    labelCol="tema_predominante", predictionCol="prediction", metricName="weightedRecall")
rf_recall = evaluator_recall.evaluate(rf_predictions)
print(f"Recall del conjunto de prueba (Random Forest) = {rf_recall}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Exactitud del conjunto de prueba (Random Forest) = 0.4545163868904876
Puntaje F1 del conjunto de prueba (Random Forest) = 0.3206838704891863
Recall del conjunto de prueba (Random Forest) = 0.4545163868904876

In [19]:
## saving
# Define la ruta en S3 para almacenar el modelo de Rndom Forest
s3_path_rf_model = "s3://bucketspark14/models/random_forest_final"

# Guardar el modelo de RF en S3
rf_model.save(s3_path_rf_model)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [20]:
## saving
# Define la ruta en S3 para almacenar el modelo de Regresión Logística
s3_path_lr_model = "s3://bucketspark14/models/logistic_regression_model_final"

# Guardar el modelo de Regresión Logística en S3
lr_model.save(s3_path_lr_model)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [21]:
## saving
# Define la ruta en S3 para almacenar el modelo de DT
s3_path_dt_model = "s3://bucketspark14/models/decision_tree_model_final"

# Guardar el modelo de DT en S3
dt_model.save(s3_path_dt_model)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
s3_path_lda = "s3://bucketspark14/models/lda_model"

# Guardar el modelo LDA en S3
lda_model.save(s3_path_lda)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [23]:
s3_path_cv = "s3://bucketspark14/models/count_vectorizer_model"
s3_path_idf = "s3://bucketspark14/models/idf_model"

# Guardar el modelo CountVectorizer en S3
cv_model.save(s3_path_cv)

# Guardar el modelo IDF en S3
idf_model.save(s3_path_idf)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [24]:
s3_path_results = "s3://bucketspark14/results/lda_transformed_data"

# Guardar el DataFrame `df_transformed` en S3 en formato Parquet
df_transformed.write.mode("overwrite").parquet(s3_path_results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [25]:
s3_path_results = "s3://bucketspark14/results/cv_transformed_data"

df_tfidf.write.mode("overwrite").parquet(s3_path_results)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…