<a href="https://colab.research.google.com/github/rodr1ggoql17/Procesamiento-Lenguaje-Natural/blob/main/Clasificacion_de_texto_con_tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Clasificación Binaria
Es un tipo de problema de aprendizaje supervisado cuyo objetivo es predecir, para cada instancia, una de dos clases posibles.

In [None]:
# Mount Google Drive in the Colab runtime so that files under /content/drive/
# can be read by the cells that follow.
from google.colab import drive
drive.mount("/content/drive/")

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from wordcloud import WordCloud
from nltk.corpus import stopwords

In [None]:
# Download the NLTK 'stopwords' corpus, which stopwords.words('spanish') reads from.
nltk.download('stopwords')

In [None]:
# Spanish stopwords from NLTK, kept as a set.
stops_espanol = set(stopwords.words('spanish')) # store the Spanish stopwords

In [None]:
# Load the reviews spreadsheet into a DataFrame.
# NOTE(review): the path is absolute and lives under the /content/drive/ mount
# point, so it only resolves after the Drive mount cell has run.
df = pd.read_excel('/content/drive/MyDrive/CURSO NLP/data/BBDD.xlsx')

In [None]:
# Number of rows in the loaded dataset.
df.shape[0]

In [None]:
# Display the DataFrame exactly as it was read from the spreadsheet.
df

In [None]:
# Column dtypes of the loaded DataFrame.
df.dtypes

In [None]:
# Preview the first five rows (five is head()'s default).
df.head()

In [None]:
# Text of the review stored under index label 0.
df.loc[0, 'review_es']

In [None]:
# Raw sentiment labels as a NumPy array.
df['sentimiento'].to_numpy()

In [None]:
# Keep only the review text and its sentiment label.
# .copy() makes the result an independent DataFrame, so adding a column to it
# later (the 'polaridad' label) does not raise pandas' SettingWithCopyWarning.
df = df[['review_es', 'sentimiento']].copy()

In [None]:
# Display the DataFrame after narrowing it to the review and sentiment columns.
df

# Etiquetado binario

In [None]:
# Encode the sentiment label as a binary target: 'positivo' -> 1, 'negativo' -> 0.
# Series.map yields NaN for any value missing from the mapping, so check for
# that explicitly instead of letting NaN targets reach model training.
polaridad_codificada = df['sentimiento'].map({'positivo': 1, 'negativo': 0})
if polaridad_codificada.isna().any():
    etiquetas_desconocidas = df.loc[polaridad_codificada.isna(), 'sentimiento'].unique()
    raise ValueError(
        f"Unrecognised values in 'sentimiento': {list(etiquetas_desconocidas)}"
    )
# assign() returns a new frame rather than writing into a possible slice of the
# original, which avoids SettingWithCopyWarning.
df = df.assign(polaridad=polaridad_codificada.astype(int))

In [None]:
# Display the DataFrame with the new binary 'polaridad' column.
df

In [None]:
# Text of the review stored under index label 1.
df.loc[1, 'review_es']

In [None]:
# Histogram of the binary label column, to see how the two classes are balanced.
# The trailing ';' keeps the notebook from echoing plt.hist's return value.
plt.hist(df['polaridad']);

# Wordcloud de las reviews

In [None]:
# Spanish stopword set, passed to WordCloud(stopwords=...) further down.
# NOTE(review): this repeats the identical assignment made right after
# nltk.download('stopwords'); re-running it is harmless.
stops_espanol = set(stopwords.words('spanish'))

In [None]:
# Concatenate every review into one string and build a word cloud with
# WordCloud's default settings (no Spanish stopword list passed).
text = " ".join(df['review_es'])
wordcloud = WordCloud().generate(text)

# Render the cloud on an explicit axes object with the axis frame hidden.
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

In [None]:
# Same word cloud, but with the Spanish stopwords filtered out.
text = " ".join(df['review_es'])
wordcloud = WordCloud(stopwords=stops_espanol).generate(text)

fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

# División del dataset en conjuntos de entrenamiento y prueba

In [None]:
# Split the rows into a training frame and a test frame.
# random_state=42 fixes the shuffle so the split is the same on every run;
# no test_size is passed, so scikit-learn's default proportion applies.
df_train, df_test = train_test_split(df,random_state=42)

In [None]:
# Build TF-IDF features, keeping at most 2000 terms and ignoring Spanish stopwords.
# The vocabulary and IDF weights are learned from the training reviews only.
# The test reviews are then projected onto that same fitted vocabulary with
# transform(). Calling fit_transform() on the test split would refit the
# vectorizer, so the test columns would stand for different words than the ones
# the model is trained on, and test-set statistics would leak into the features.
vectorizer = TfidfVectorizer(stop_words=list(stops_espanol), max_features=2000)
X_train = vectorizer.fit_transform(df_train['review_es'])
X_test = vectorizer.transform(df_test['review_es'])

In [None]:
# Inspect X_train as returned by the vectorizer (before the .toarray() conversion).
X_train

In [None]:
# Inspect X_test as returned by the vectorizer (before the .toarray() conversion).
X_test

In [None]:
# Convert both feature matrices with .toarray() in a single step.
X_train, X_test = X_train.toarray(), X_test.toarray()

In [None]:
# Inspect X_train after the .toarray() conversion.
X_train

In [None]:
# Inspect X_test after the .toarray() conversion.
X_test

In [None]:
# Binary targets for each split, taken from the 'polaridad' column.
Y_train, Y_test = (particion['polaridad'] for particion in (df_train, df_test))

In [None]:
# D is the number of feature columns, used below as the model's input width.
D = X_train.shape[-1]
print(D)

In [None]:
# A single Dense unit on top of the D input features. No activation is given,
# so the layer's output is an unbounded score rather than a probability.
i = Input(shape=(D,))
x = Dense(units=1)(i)
modelo = Model(inputs=i, outputs=x)

In [None]:
# Show the model's layer summary.
modelo.summary()

In [None]:
# Configure training. from_logits=True tells the loss that the model emits raw
# scores (the Dense layer has no activation), not probabilities.
# NOTE(review): the 'accuracy' metric is computed on those same raw scores.
# Confirm which decision threshold Keras applies for it; the prediction cells
# cut at 0, and a 0.5 cut here would make the logged accuracy slightly
# inconsistent with them.
modelo.compile(
    optimizer=Adam(learning_rate=0.01),
    loss=BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

In [None]:
# Train for 100 epochs in batches of 128. The test split is passed as the
# validation data, so it is evaluated after every epoch; the fit() result is
# kept in r.
r = modelo.fit(
    x=X_train,
    y=Y_train,
    batch_size=128,
    epochs=100,
    validation_data=(X_test, Y_test),
)

In [None]:
# Hard class predictions: a raw score above 0 means class 1, otherwise class 0.
# The result is a flat float array of 0.0 / 1.0 values.
P_train = (modelo.predict(X_train) > 0).astype(float).flatten()
P_test = (modelo.predict(X_test) > 0).astype(float).flatten()

In [None]:
# Confusion matrix on the training split; normalize='true' scales each row
# (true class) to sum to 1.
matrix = confusion_matrix(y_true=Y_train, y_pred=P_train, normalize='true')
matrix

In [None]:
# Draw the confusion matrix as an annotated heatmap, labelled with class names
# in the same 0/1 order used for 'polaridad'.
classes = ['negativo', 'positivo']
df_cm = pd.DataFrame(data=matrix, index=classes, columns=classes)
ax = sns.heatmap(df_cm, annot=True, fmt='g')
ax.set(xlabel='Predicción', ylabel='Objetivo');

In [None]:
# Raw model scores for both splits; ROC AUC only depends on their ranking, so
# they are passed to roc_auc_score without thresholding.
Pr_train, Pr_test = modelo.predict(X_train), modelo.predict(X_test)

print("Train AUC: ", roc_auc_score(y_true=Y_train, y_score=Pr_train))
print("Test AUC: ", roc_auc_score(y_true=Y_test, y_score=Pr_test))

In [None]:
# F1 on both splits, computed from the thresholded (0/1) predictions.
print("Train F1: ", f1_score(y_true=Y_train, y_pred=P_train))
print("Test F1: ", f1_score(y_true=Y_test, y_pred=P_test))

# Reseña de ejemplo

In [None]:
# Vectorize a single example review with the already-fitted vectorizer.
# The input is wrapped in a list because transform() takes a collection of
# documents; the result has one row.
review = vectorizer.transform(['menuda mierda']).toarray()

In [None]:
# Score the example review and turn the score into a class (1 = positivo,
# 0 = negativo, as in the 'polaridad' mapping).
# The model has a single output unit, so prediccion has shape (1, 1) and
# np.argmax over it would always return 0 regardless of the score. Threshold
# the score at 0 instead, the same rule used for P_train / P_test.
prediccion = modelo.predict(review)
int(prediccion[0, 0] > 0)