In [20]:
# CSV file read into the training DataFrame.
TRAIN_PATH = '../Datos/TRAIN.csv'
# Saved Keras model and the tokenizer JSON loaded by the neural-network helpers.
REDNEURONAL_PATH = '../Red Neuronal/modelo_red_neuronal.keras'
TOKENIZER_PATH = '../Red Neuronal/tokenizer.json'

In [21]:
import pandas as pd
import numpy as np
import re

## Carga de Datos y limpieza

In [None]:
# Read the training CSV; later cells use its 'Frases' and 'Tipo texto' columns.
df = pd.read_csv(TRAIN_PATH)

In [None]:
# Text normalizer that, besides cleaning the markup, also removes "stopwords".
import nltk
nltk.download('stopwords')


def normalize_document(doc, stopwords=nltk.corpus.stopwords.words('spanish')):
    """Clean a single document and return it as a space-joined token string.

    Steps: strip HTML tags, collapse whitespace, lowercase, tokenize with
    NLTK's ``WordPunctTokenizer`` and drop every token found in ``stopwords``.

    Args:
        doc: Text to normalize (a string).
        stopwords: Collection of tokens to discard. Defaults to NLTK's Spanish
            stopword list, evaluated once when the function is defined.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # The texts contain HTML tags, which make tokenization harder.
    without_tags = re.sub(r'<[^>]+>', '', doc)
    # Collapse whitespace runs and line breaks into single spaces, then lowercase.
    flattened = ' '.join(without_tags.split()).lower()
    # Special characters are not stripped here; only stopwords are filtered out.
    tokenizer = nltk.WordPunctTokenizer()
    kept_tokens = []
    for token in tokenizer.tokenize(flattened):
        if token in stopwords:
            continue
        kept_tokens.append(token)
    # Rebuild the document from the tokens that survived the filter.
    return ' '.join(kept_tokens)


# Element-wise wrapper so the normalizer can be applied to a whole array of documents.
normalize_corpus = np.vectorize(normalize_document)

In [3]:
RANDOM_STATE = 1234

from sklearn.model_selection import train_test_split

# Hold out 1% of the rows; the fixed seed keeps the split repeatable.
frase_train, frase_test, tipo_train, tipo_test = train_test_split(
    df['Frases'],
    df['Tipo texto'],
    test_size=0.01,
    random_state=RANDOM_STATE,
)

# Normalize the training sentences and, with the same normalizer, the labels.
# Because the normalizer lowercases its input, the labels end up lowercased.
norm_train_corpus, type_train = (
    normalize_corpus(np.array(column)) for column in (frase_train, tipo_train)
)

## Árbol de decisión

In [15]:
from sklearn import tree
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words representation: fit the vectorizer on the normalized training corpus.
cv = CountVectorizer()
cv_matrix_train = cv.fit_transform(norm_train_corpus)

# Decision tree built with the library's default settings.
# `fit` returns the estimator itself, so the assignment can be chained.
tree_classifier_bolsa = tree.DecisionTreeClassifier().fit(cv_matrix_train, type_train)
tree_classifier_bolsa

In [16]:
# Representación TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of the normalized training corpus.
tv = TfidfVectorizer()
tv_matrix_train = tv.fit_transform(norm_train_corpus)

# Decision tree built with the library's default settings.
# `fit` returns the estimator itself, so the assignment can be chained.
tree_classifier_TFIDF = tree.DecisionTreeClassifier().fit(tv_matrix_train, type_train)
tree_classifier_TFIDF



## Naïve Bayes method

In [17]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words first. This creates and fits a fresh CountVectorizer on the same
# corpus and rebinds `cv`, the name the decision-tree cell also uses.
cv = CountVectorizer()
cv_matrix_train = cv.fit_transform(norm_train_corpus)

# Multinomial Naive Bayes with default settings; `fit` returns the estimator.
mnb_classifier_bolsa = MultinomialNB().fit(cv_matrix_train, type_train)
mnb_classifier_bolsa

In [18]:
# Representación TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Creates and fits a fresh TfidfVectorizer on the same corpus and rebinds `tv`,
# the name the decision-tree cell also uses.
tv = TfidfVectorizer()
tv_matrix_train = tv.fit_transform(norm_train_corpus)

# Multinomial Naive Bayes with default settings; `fit` returns the estimator.
mnb_classifier_TFIDF = MultinomialNB().fit(tv_matrix_train, type_train)
mnb_classifier_TFIDF

## Red Neuronal

In [19]:
def tokenize_data(df):
    """Turn the texts in column ``0`` of ``df`` into padded token-id sequences.

    Loads the tokenizer description stored at ``TOKENIZER_PATH``, rebuilds the
    Keras tokenizer from it, converts every text to a sequence of ids and
    pads/truncates each sequence to 40 entries.

    Args:
        df: DataFrame whose column ``0`` holds the raw texts.

    Returns:
        The array returned by ``pad_sequences``, one row per text.
    """
    from json import load
    from keras.preprocessing.text import tokenizer_from_json
    from keras.utils import pad_sequences

    # Only the JSON read needs the file handle; the rest runs after it is closed.
    with open(TOKENIZER_PATH) as handle:
        tokenizer_config = load(handle)

    tokenizer = tokenizer_from_json(tokenizer_config)
    token_ids = tokenizer.texts_to_sequences(df[0])
    # NOTE(review): 40 is presumably the input length the saved model expects — confirm.
    return pad_sequences(token_ids, maxlen=40)

def neural_network_predict(df):
    """Label each input row as "Humano" or "IA" using the saved Keras model.

    The model stored at ``REDNEURONAL_PATH`` is loaded on every call.

    Args:
        df: Model input; the caller in this notebook passes the padded
            sequences produced by ``tokenize_data``.

    Returns:
        A list with one label per prediction: "Humano" when the model output
        is above 0.5, "IA" otherwise.
    """
    from keras.models import load_model

    model = load_model(REDNEURONAL_PATH)
    scores = model.predict(df)
    labels = []
    for score in scores:
        labels.append("Humano" if score > 0.5 else "IA")
    return labels

## Interfaz

In [14]:
##Aquí hago la interfaz

import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd
import numpy as np
import ttkbootstrap as ttkb

def _classify_with(classifier, vectorizer, texts):
    """Normalize ``texts``, vectorize them and return "IA" or "Humano" for the first one.

    ``vectorizer`` must be the fitted vectorizer whose representation
    ``classifier`` was trained on.
    """
    norm = normalize_corpus(np.array(texts))
    prediction = classifier.predict(vectorizer.transform(norm))
    # Training labels went through the normalizer, so they are lowercase.
    return "IA" if prediction[0] == 'ia' else "Humano"


# Normalize the text typed by the user and classify it with the selected method.
def classify_text():
    """Classify the content of ``text_entry`` with the method chosen in the menu.

    Reads the text widget and ``method_var``, runs the matching classifier and
    shows the verdict ("IA" or "Humano") in a message box. When the text is
    blank, or no valid method has been selected, an informational dialog is
    shown instead and nothing is classified.
    """
    input_text = text_entry.get("1.0", "end-1c")  # Content of the input widget
    if input_text.strip() == "":
        messagebox.showinfo("Error", "Por favor, ingresa un texto para clasificar.")
        return

    # Wrap the text in a DataFrame; column 0 holds the single sentence.
    frase_prueba = pd.DataFrame([input_text])

    # Pick the classifier for the selected method. Each model is paired with the
    # vectorizer of its own representation: count models use `cv`, TF-IDF models
    # use `tv`.
    selected_method = method_var.get()
    if selected_method == "Árbol de clasificación (Bolsa de palabras)":
        result = _classify_with(tree_classifier_bolsa, cv, frase_prueba[0])
    elif selected_method == "Árbol de clasificación (TF-IDF)":
        result = _classify_with(tree_classifier_TFIDF, tv, frase_prueba[0])
    elif selected_method == "Naïve Bayes (Bolsa de palabras)":
        result = _classify_with(mnb_classifier_bolsa, cv, frase_prueba[0])
    elif selected_method == "Naïve Bayes (TF-IDF)":
        result = _classify_with(mnb_classifier_TFIDF, tv, frase_prueba[0])
    elif selected_method == "Red neuronal":
        result = neural_network_predict(tokenize_data(frase_prueba))[0]
    else:
        # The menu still shows its placeholder (or an unknown value): there is
        # nothing to run, so ask the user to choose instead of failing later.
        messagebox.showinfo("Error", "Por favor, selecciona un método de clasificación.")
        return

    # Show the result in a message box.
    messagebox.showinfo("Resultado", f"El texto es clasificado como: {result}")

# Main window, created with a ttkbootstrap theme.
root = ttkb.Window(themename='darkly')
root.title("Clasificador de Texto IA/Humano")

# Multi-line text box where the user types the text to classify.
text_entry = tk.Text(root, height=10, width=100)
text_entry.pack()

# Variable holding the currently selected classification method.
method_var = tk.StringVar(root)
# NOTE(review): the OptionMenu's default argument ("Seleccionar Método") below
# appears to replace this initial value — confirm.
method_var.set("Árbol de clasificación")

# ttk widgets are used instead of plain tk ones so the theme styles apply.
method_menu = ttk.OptionMenu(
    root,
    method_var,
    "Seleccionar Método",
    "Árbol de clasificación (Bolsa de palabras)",
    "Árbol de clasificación (TF-IDF)",
    "Naïve Bayes (Bolsa de palabras)",
    "Naïve Bayes (TF-IDF)",
    "Red neuronal",
)
method_menu.pack()

# Styled button that triggers the classification.
classify_button = ttk.Button(
    root,
    text="Clasificar Texto",
    command=classify_text,
    bootstyle='success',
)
classify_button.pack()

# Run the application.
root.mainloop()

bgerror failed to handle background error.
    Original error: can't invoke "event" command: application has been destroyed
    Error in bgerror: can't invoke "tk" command: application has been destroyed


