In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, LSTM
from gensim import corpora, models
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [2]:
# Cargar los datos
entities_data = pd.read_csv('entities_train.csv', sep='\t')
relations_data = pd.read_csv('relations_train.csv', sep='\t')

In [3]:
# Eliminar filas con valores nulos si es necesario
entities_data = entities_data.dropna()
# Eliminar caracteres especiales y convertir a minúsculas
entities_data['mention'] = entities_data['mention'].apply(lambda x: x.lower())
entities_data['mention'] = entities_data['mention'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

# Tokenización y eliminación de palabras vacías (stopwords)
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

entities_data['mention'] = entities_data['mention'].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anaso\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anaso\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Eliminar filas con valores nulos
relations_data = relations_data.dropna()
# Convertir la columna 'type' a minúsculas
relations_data['type'] = relations_data['type'].str.lower()
# Eliminar caracteres especiales y convertir a minúsculas en entity_1_id y entity_2_id
relations_data['entity_1_id'] = relations_data['entity_1_id'].apply(lambda x: x.lower())
relations_data['entity_2_id'] = relations_data['entity_2_id'].apply(lambda x: x.lower())


In [5]:
# Renombrar las columnas 'type'
entities_data.rename(columns={'type': 'entity_type'}, inplace=True)
relations_data.rename(columns={'type': 'relation_type'}, inplace=True)

# Combinar los datos
combined_data = entities_data.merge(relations_data, on='abstract_id', how='inner')

In [6]:
# Preprocesamiento de texto
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(combined_data['mention'])
le = LabelEncoder()
y = le.fit_transform(combined_data['relation_type'])

In [7]:
# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Algoritmo 1: CNN

In [8]:
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=X.shape[1], output_dim=32))
model_cnn.add(Conv1D(32, 5, activation='tanh'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(y.max()+1, activation='softmax'))
model_cnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_cnn.fit(X_train.toarray(), y_train, epochs=1, batch_size=32)



<keras.callbacks.History at 0x1d280181010>

### Algoritmo 2: Asignación de Dirichlet Latente (LDA)

In [9]:
feature_names = tfidf.get_feature_names_out()
documents = []
for doc in X:
    words = [feature_names[i] for i in doc.indices]
    documents.append(words)
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary)

### Algoritmo 3: RNN y LSTM

In [10]:
model_rnn = Sequential()
model_rnn.add(Embedding(input_dim=X.shape[1], output_dim=32)) 
model_rnn.add(LSTM(32))  
model_rnn.add(Dense(y.max()+1, activation='softmax'))
model_rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_rnn.fit(X_train.toarray(), y_train, epochs=1, batch_size=64) 

 204/1951 [==>...........................] - ETA: 1:49:29 - loss: 1.2269 - accuracy: 0.5280

### Visualizaciones

In [None]:
# Calcular la precisión de cada modelo y generar un gráfico de barras para visualizarla
models = [model_cnn, lda_model, model_rnn]
accuracies = []
for model in models:
    loss, accuracy = model.evaluate(X_test.toarray(), y_test)
    accuracies.append(accuracy)
for i, accuracy in enumerate(accuracies):
    print(f"Accuracy (Model {i+1}):", accuracy)

In [None]:
plt.figure(figsize=(10,5))
plt.bar(range(len(models)), accuracies)
plt.xlabel('Modelo')
plt.ylabel('Precisión')
plt.show()

In [None]:
# Matriz de confusión para el modelo CNN
predictions_cnn = model_cnn.predict(X_test.toarray())
cm_cnn = confusion_matrix(y_test.argmax(axis=1), predictions_cnn.argmax(axis=1))
plt.figure(figsize=(10,7))
sns.heatmap(cm_cnn, annot=True)
plt.title('Confusion Matrix for CNN')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

In [None]:
# Matriz de confusión para el modelo RNN
predictions_rnn = model_rnn.predict(X_test.toarray())
cm_rnn = confusion_matrix(y_test.argmax(axis=1), predictions_rnn.argmax(axis=1))
plt.figure(figsize=(10,7))
sns.heatmap(cm_rnn, annot=True)
plt.title('Confusion Matrix for RNN')
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()