In [1]:
# Importaciones

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Model # type: ignore
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Concatenate # type: ignore
from tensorflow.keras.preprocessing.text import Tokenizer # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from sklearn.model_selection import train_test_split
import json


import sys
sys.path.append("../")
import dotenv # type: ignore
dotenv.load_dotenv()
from src import support_bd as bd
from src import support_tsf as tsf

import warnings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
%load_ext autoreload
%autoreload 2

In [2]:
df = bd.select_datos("alumnos")
df = df.drop(columns=["nombre", "apellidos", "email", "telefono"])

In [3]:

# Preparar los codificadores para las columnas categóricas
le_estudios = LabelEncoder()
le_especialidad = LabelEncoder()
le_ciudad = LabelEncoder()
le_sexo = LabelEncoder()

# Ajustar los codificadores con los valores únicos de cada columna
df["estudios"] = le_estudios.fit_transform(df["estudios"])
df["especialidad"] = le_especialidad.fit_transform(df["especialidad"])
df["ciudad"] = le_ciudad.fit_transform(df["ciudad"])
df["sexo"] = le_sexo.fit_transform(df["sexo"])

# Tokenizar y procesar el campo de texto 'motivo_compra'
tokenizer = Tokenizer(num_words=5000)  # Limitar a las 5000 palabras más frecuentes
tokenizer.fit_on_texts(df["motivo_compra"])
df["motivo_compra"] = tokenizer.texts_to_sequences(df["motivo_compra"])

# Rellenar las secuencias para que todas tengan la misma longitud
max_len = 50  # Máxima longitud de las descripciones
df["motivo_compra"] = pad_sequences(df["motivo_compra"], maxlen=max_len).tolist()

# Separar características (X) y etiquetas (y)
X_numerico = df[["edad", "estudios", "especialidad", "sexo", "ciudad"]].values
X_texto = np.array(df["motivo_compra"].tolist())
y = df["comprado"].values

# Escalar los datos numéricos
scaler = StandardScaler()
X_numerico = scaler.fit_transform(X_numerico)

# Dividir en conjunto de entrenamiento y prueba
X_num_train, X_num_test, X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_numerico, X_texto, y, test_size=0.2, random_state=42
)

# Crear el modelo combinado
# Entrada numérica
input_numerico = Input(shape=(X_numerico.shape[1],))
numerico = Dense(16, activation="relu")(input_numerico)

# Entrada de texto
input_texto = Input(shape=(max_len,))
texto = Embedding(input_dim=5000, output_dim=64, input_length=max_len)(input_texto)
texto = LSTM(32)(texto)

# Concatenar entradas
concatenado = Concatenate()([numerico, texto])
denso = Dense(16, activation="relu")(concatenado)
salida = Dense(1, activation="sigmoid")(denso)

# Modelo final
model = Model(inputs=[input_numerico, input_texto], outputs=salida)

# Compilar el modelo
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Entrenar el modelo
history = model.fit(
    [X_num_train, X_text_train], y_train,
    epochs=3, batch_size=128, validation_data=([X_num_test, X_text_test], y_test)
)

# Evaluar el modelo
loss, accuracy = model.evaluate([X_num_test, X_text_test], y_test, verbose=0)
print(f"Precisión del modelo: {accuracy * 100:.2f}%")

Epoch 1/3
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 29ms/step - accuracy: 0.7958 - loss: 0.4503 - val_accuracy: 0.8542 - val_loss: 0.3057
Epoch 2/3
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 36ms/step - accuracy: 0.8537 - loss: 0.2920 - val_accuracy: 0.9169 - val_loss: 0.2025
Epoch 3/3
[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 37ms/step - accuracy: 0.9313 - loss: 0.1833 - val_accuracy: 0.9543 - val_loss: 0.1153
Precisión del modelo: 95.43%


In [4]:
def update_data(row, score):
        
        score_short = np.round(float(score), decimals=2)
        
        data ={ 
            "email" : row['email'], 
            "score" : score_short
        } 
        supabase = bd.init_conection_bd()

        response = (
            supabase.table("leads")
            .update({"score": data['score']})
            .eq("email", data['email'])
            .execute())

In [5]:
df_leads = bd.select_datos("leads")
# Iterar sobre las filas
for index, row in df_leads.iterrows():
    score = tsf.predict(row, le_estudios, le_especialidad, le_ciudad, le_sexo, scaler, tokenizer, max_len, model)
    update_data(row, score)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 121ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 101ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m