In [1]:
import re
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

from sklearn.svm import SVC
from sklearn.metrics import classification_report

from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import torch

from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_1 = pd.read_csv("./data/frases_sentimientos.csv")
df_2 = pd.read_csv("./data/noemoticon.csv", encoding='latin1', header=None) # sin encoding da error utf-8

In [None]:
# Vamos a trabajar con la mitad del segundo dataset ya que entero le cuesta mucho
#half_size = len(df_2) // 2
# df_2_1 = df_2.iloc[:half_size]

### Análisis de los df

#### Análisis del df_1

- frase: frase compuesta por la función phrases_generator
- sentimiento: sentimiento bueno/malo de la frase previa

In [58]:
df_1.head(3)

Unnamed: 0,frase,sentimiento
0,Este ítem es increíble,positivo
1,El servicio me encantó,positivo
2,El servicio me encantó,positivo


#### Análisis del df_2

- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- ids: The id of the tweet ( 2087)
- date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- flag: The query (lyx). If there is no query, then this value is NO_QUERY.
- user: the user that tweeted (robotickilldozr)
- text: the text of the tweet (Lyx is cool)

In [3]:
# Como no tiene nombre las columnas vamos a ponerselas
df_2.columns = ["target", "ids", "date", "flag", "user", "text"]

df_2.head(3)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


## Preprocesamiento de Datos

En un principio vamos a trabajar con el segundo dataframe (**df_2**) ya que además de estar en inglés, es un dataset
con el que se trabaja NLP

In [4]:
def limpiar_texto(texto):
    texto = re.sub(r'@\w+', '', texto) #Elimina las menciones (@usuario)
    texto = re.sub(r'http\S+|www\.\S+', '', texto) #Elimina las URLs (http/https o www)
    texto = re.sub(r'[^A-Za-z0-9\sáéíóúÁÉÍÓÚñÑ]', '', texto) #Elimina los caracteres especiales, dejando solo letras, números y espacios
    texto = re.sub(r'\s+', ' ', texto).strip() # Elimina los espacios múltiples
    return texto

In [5]:
# Eliminamos los caracteres especiales del df_2, así como las menciones y los links
df_2['clean_text'] = df_2['text'].apply(limpiar_texto)
df_2.head(5)

Unnamed: 0,target,ids,date,flag,user,text,clean_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww thats a bummer You shoulda got David Carr...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to sav...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...


## Tokenización

La tokenización es un proceso fundamental en el procesamiento de lenguaje natural (NLP). Consiste en dividir un texto en unidades más pequeñas, que pueden ser palabras, frases o incluso caracteres, para facilitar su análisis y manipulación.

En el contexto de NLP, los tokens son las unidades que se analizan. Por ejemplo, en el texto "Hola, ¿cómo estás?", los tokens podrían ser ['Hola', ',', '¿', 'cómo', 'estás', '?'].

In [6]:
nltk.download('punkt')

# Aplicar la tokenización a la columna 'texto'
df_2['tokens'] = df_2['clean_text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rukyf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [63]:
# Comprobamos visualmente que ha tokenizado la frase como queríamos
df_2.head(3)

Unnamed: 0,target,ids,date,flag,user,text,clean_text,tokens
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww thats a bummer You shoulda got David Carr...,"[Awww, thats, a, bummer, You, shoulda, got, Da..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, Faceb..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to sav...,"[I, dived, many, times, for, the, ball, Manage..."


## Lematización

La lematización es un proceso en el procesamiento de lenguaje natural (NLP) que consiste en reducir una palabra a su forma base o lema, eliminando sufijos y prefijos, y dejando la raíz de la palabra. Por ejemplo, la lematización convierte "corriendo", "corrió", "correrá" en "correr", que es la forma base (o lema) del verbo.

In [7]:
lemmatizer = WordNetLemmatizer()

def lematizar_texto(texto):
    palabras = texto.split()
    return ' '.join([lemmatizer.lemmatize(palabra, pos='v') for palabra in palabras])

In [8]:
df_2['lematizado'] = df_2['clean_text'].apply(lematizar_texto)

In [66]:
# Comprobamos que ha lematizado como queremos
df_2.head(3)

Unnamed: 0,target,ids,date,flag,user,text,clean_text,tokens,lematizado
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww thats a bummer You shoulda got David Carr...,"[Awww, thats, a, bummer, You, shoulda, got, Da...",Awww thats a bummer You shoulda get David Carr...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, Faceb...",be upset that he cant update his Facebook by t...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to sav...,"[I, dived, many, times, for, the, ball, Manage...",I dive many time for the ball Managed to save ...


## Borrado de 'Stopwords'

El borrado de stopwords es un paso común en el procesamiento de texto para limpiar datos antes de realizar tareas de análisis de texto, como clasificación, análisis de sentimientos, etc. Las stopwords son palabras comunes que no aportan mucho significado en el contexto del análisis de texto, como "el", "la", "y", "de", "a", "en", entre otras. Estas palabras se eliminan para reducir la complejidad del modelo sin perder información relevante.

In [9]:
# Lista de stopwords en el idioma en el que está el dataset (inglés)
stop_words = set(stopwords.words('english'))

In [10]:
def eliminar_stopwords(texto):
    palabras = word_tokenize(texto)
    return ' '.join([palabra for palabra in palabras if palabra.lower() not in stop_words])

In [11]:
df_2['text_no_stopwords'] = df_2['clean_text'].apply(eliminar_stopwords)
df_2.head(3)

Unnamed: 0,target,ids,date,flag,user,text,clean_text,tokens,lematizado,text_no_stopwords
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Awww thats a bummer You shoulda got David Carr...,"[Awww, thats, a, bummer, You, shoulda, got, Da...",Awww thats a bummer You shoulda get David Carr...,Awww thats bummer shoulda got David Carr Third...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,"[is, upset, that, he, cant, update, his, Faceb...",be upset that he cant update his Facebook by t...,upset cant update Facebook texting might cry r...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball Managed to sav...,"[I, dived, many, times, for, the, ball, Manage...",I dive many time for the ball Managed to save ...,dived many times ball Managed save 50 rest go ...


## Entrenamos el modelo (Naive Bayes)

In [12]:
# vectorizamos
vectorizer = TfidfVectorizer()

# Aplicar el vectorizador a los textos preprocesados
X = vectorizer.fit_transform(df_2['lematizado'])

In [13]:
from sklearn.model_selection import train_test_split

# Dividimos el dataset
X_train, X_test, y_train, y_test = train_test_split(X, df_2['target'], test_size=0.3, random_state=42)

# Observamos las dimensiones de los conjuntos de datos
print(X_train.shape, X_test.shape)


(1120000, 434683) (480000, 434683)


In [14]:
# Creamos el modelo Naive Bayes
model = MultinomialNB() # probando a cambiar el hiperparámetro 'alpha' no cambia el resultado

In [15]:
# Entrenar el modelo
model.fit(X_train, y_train)

In [16]:
# Realizamos las predicciones
y_pred = model.predict(X_test)

In [17]:
# Evaluamos el modelo
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77    239361
           4       0.79      0.74      0.76    240639

    accuracy                           0.77    480000
   macro avg       0.77      0.77      0.77    480000
weighted avg       0.77      0.77      0.77    480000



### Utilizando BERT...

In [76]:
model_name = "bert-base-uncased"  # Cambia esto por tu modelo
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
test_phrases = ["Just tried this new coffee shop, and wow, the best latte I've ever had. Totally recommend it.",
                "The coffee was decent, but the service was a bit slow. Not bad overall.",
                "Terrible experience at this coffee shop. Cold coffee, rude staff, and overpriced. Never going back"
                ]

In [78]:
inputs = tokenizer(test_phrases[0], return_tensors="pt", truncation=True, padding=True)

In [79]:
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

In [80]:
predicted_class = torch.argmax(logits, dim=1).item()

In [81]:
probs = torch.nn.functional.softmax(logits, dim=1)
probabilities = probs[0].tolist()

print(f"Categoría predicha: {predicted_class}")
print(f"Probabilidades: {probabilities}")

Categoría predicha: 0
Probabilidades: [0.5514791011810303, 0.44852083921432495]


## Intento de entrenar el modelo (SVM)

In [82]:
# Crear el modelo SVM
svm_model = SVC(kernel='linear')

# Entrenar el modelo
svm_model.fit(X_train, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = svm_model.predict(X_test)

# Evaluar el modelo
print(classification_report(y_test, y_pred))

ValueError: The number of classes has to be greater than one; got 1 class

### Transformador BERT al modelo

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
type(list(df_2["clean_text"]))

list

In [20]:
inputs = tokenizer(list(df_2["clean_text"]), padding=True, truncation=True, return_tensors="pt", max_length=128)


KeyboardInterrupt



In [None]:
labels = torch.tensor(df_2["target"].values)

In [None]:
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)

In [None]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [None]:
from transformers import BertForSequenceClassification, AdamW

# Cargar el modelo preentrenado de BERT para clasificación
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)  # Ajusta `num_labels` según tu tarea

# Mover el modelo a la GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Configurar el optimizador
optimizer = AdamW(model.parameters(), lr=2e-5)

# Entrenamiento del modelo
model.train()
for epoch in range(2):  # Ajusta el número de épocas según tus necesidades
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, attention_mask, labels = batch
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        
    print(f'Epoch {epoch + 1} completed. Loss: {loss.item()}')

IndexError: Target 4 is out of bounds.