# Procesamiento del Lenguaje Natural

Rodrigo S. Cortez Madrigal

<img src="https://pcic.posgrado.unam.mx/wp-content/uploads/Ciencia-e-Ingenieria-de-la-Computacion_color.png" alt="Logo PCIC" width="128" />  

In [1]:
import numpy as np
import pandas as pd
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from plotly import graph_objs as go
from plotly import express as px
from plotly.subplots import make_subplots


## Sentiment Analysis

In [2]:

# Sample comment used by every sentiment model below: it contains positive
# words even though the opinion it expresses is not favourable.
text = (
    "I actually don't think this comment will be classified correctly, "
    "because it has happy words, and I'm happy while writing it, "
    "even if I'm saying something that is not beneficial for the application itself."
)

### VADER

VADER (Valence Aware Dictionary and sEntiment Reasoner) es un modelo de análisis de sentimientos basado en reglas y léxicos, desarrollado por C.J. Hutto y Eric Gilbert en 2014. Es una herramienta orientada específicamente a los medios sociales y diseñada para ser rápida y fácil de usar. VADER no requiere entrenamiento previo; su léxico está construido para el inglés, por lo que los textos en otros idiomas deben traducirse antes de analizarlos.

In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score the sample text with VADER's rule-based analyzer.
analyzer = SentimentIntensityAnalyzer()

score = analyzer.polarity_scores(text)

print(score)

# Title and axis labels make the chart readable on its own; without them
# the axes are shown with the default names "x" and "y".
fig = px.bar(
    x=list(score.keys()),
    y=list(score.values()),
    labels={"x": "VADER score", "y": "Value"},
    title="VADER polarity scores",
)
fig.show()

{'neg': 0.059, 'neu': 0.76, 'pos': 0.181, 'compound': 0.7179}


### TextBlob

In [4]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [5]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Load the 'en_core_web_sm' pipeline and register the "spacytextblob" component on it.
nlp = spacy.load('en_core_web_sm')
# add_pipe is the last expression of the cell, so its return value is shown as the cell output.
nlp.add_pipe("spacytextblob")


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



<spacytextblob.spacytextblob.SpacyTextBlob at 0x30dd59cd0>

In [7]:
doc = nlp(text)

# Read the TextBlob result once and reuse it for the printout and the chart.
blob = doc._.blob

print(f'Polarity: {blob.polarity}')
print(f'Subjectivity: {blob.subjectivity}')

# Title and axis labels make the chart readable on its own.
fig = px.bar(
    x=['Polarity', 'Subjectivity'],
    y=[blob.polarity, blob.subjectivity],
    labels={"x": "TextBlob metric", "y": "Value"},
    title="TextBlob polarity and subjectivity",
)
fig.show()

Polarity: 0.5333333333333333
Subjectivity: 0.7000000000000001


In [8]:
# Show the individual assessments behind the overall scores; in the printed
# output each entry lists the matched words followed by a polarity and a
# subjectivity value.
print(doc._.blob.sentiment_assessments.assessments)

[(['actually'], 0.0, 0.1, None), (['happy'], 0.8, 1.0, None), (['happy'], 0.8, 1.0, None)]


In [9]:
# ['happy'] is the most positive word in the text, with a polarity of 0.8 and a subjectivity of 1.0
# ['actually'] is the most neutral word in the text, with a polarity of 0.0 and a subjectivity of 0.1

### PySentimiento

In [10]:
from pysentimiento import create_analyzer
# Create a sentiment analyzer for English text. This rebinds `analyzer`,
# which previously held the VADER SentimentIntensityAnalyzer.
analyzer = create_analyzer(task="sentiment", lang="en")

In [11]:
# Run the analyzer on the sample text and keep the result for the next cells.
prediction = analyzer.predict(text)

In [12]:
# Display the prediction object (predicted label and class probabilities) as the cell output.
prediction

AnalyzerOutput(output=POS, probas={POS: 0.647, NEU: 0.331, NEG: 0.022})

In [13]:
# Plot the class probabilities from the analyzer's prediction, with a title
# and axis labels so the chart is readable on its own.
fig = px.bar(
    x=list(prediction.probas.keys()),
    y=list(prediction.probas.values()),
    labels={"x": "Sentiment", "y": "Probability"},
    title="pysentimiento class probabilities",
)
fig.show()

### Transformers

- Model: tabularisai/multilingual-sentiment-analysis
- Description: Distilbert-based Multilingual Sentiment Classification Model

In [44]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Checkpoint identifier shared by the tokenizer and the classification model.
model_name = "tabularisai/multilingual-sentiment-analysis"
# Load the tokenizer and the sequence-classification model for that checkpoint;
# predict_sentiment reads both as module-level names.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_sentiment(texts):
    """Classify the sentiment of a batch of texts.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: List of strings to classify.

    Returns:
        A tuple ``(labels, probas)`` where ``labels`` holds the predicted
        sentiment name for each text and ``probas`` holds, for each text, a
        dict mapping every sentiment name to its softmax probability. Both
        lists have one entry per input text, in input order. An empty list
        yields ``([], [])``.
    """
    # Nothing to tokenize for an empty batch.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return [], []

    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # NOTE(review): assumes the model's class indices follow this order —
    # confirm against the checkpoint's id2label mapping.
    sentiment_map = {0: "Very Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Very Positive"}
    sentiment = [sentiment_map[i] for i in probabilities.argmax(dim=-1).tolist()]

    # Build one dict per row. Squeezing the batch dimension only works for a
    # single text; with several texts it would pair label names with whole rows.
    per_text_probas = [
        {sentiment_map[i]: proba for i, proba in enumerate(row)}
        for row in probabilities.tolist()
    ]
    return sentiment, per_text_probas



# Classify the sample text. This rebinds `prediction` to the list of labels
# returned by predict_sentiment; `probas` holds the per-label probabilities.
prediction, probas = predict_sentiment([text])



In [45]:
# Display the predicted label list as the cell output.
prediction

['Positive']

In [46]:
# Display the label-to-probability mapping as the cell output.
probas

[{'Very Negative': 0.10938490927219391,
  'Negative': 0.14041787385940552,
  'Neutral': 0.2572682797908783,
  'Positive': 0.3097192049026489,
  'Very Positive': 0.18320982158184052}]

In [49]:
# Plot the probabilities for the first (and only) classified text, with a
# title and axis labels so the chart is readable on its own.
fig = px.bar(
    x=list(probas[0].keys()),
    y=list(probas[0].values()),
    labels={"x": "Sentiment", "y": "Probability"},
    title="Transformer sentiment probabilities",
)
fig.show()

In [63]:
# Spanish example; the sentence doubles as the chart title so the figure
# shows which input it belongs to.
example_text = "Mi mamá me regañó por no hacer la tarea"
prediction, probas = predict_sentiment([example_text])
print(f"Prediction: {prediction[0]}")
fig = px.bar(
    x=list(probas[0].keys()),
    y=list(probas[0].values()),
    labels={"x": "Sentiment", "y": "Probability"},
    title=example_text,
)
fig.show()

Prediction: Very Negative


In [65]:
# Spanish example; the sentence doubles as the chart title so the figure
# shows which input it belongs to.
example_text = "Me encanta hacer la tarea de Procesamiento de Lenguaje Natural"
prediction, probas = predict_sentiment([example_text])
print(f"Prediction: {prediction[0]}")
fig = px.bar(
    x=list(probas[0].keys()),
    y=list(probas[0].values()),
    labels={"x": "Sentiment", "y": "Probability"},
    title=example_text,
)
fig.show()

Prediction: Positive


In [77]:
# Spanish example; the sentence doubles as the chart title so the figure
# shows which input it belongs to.
example_text = "Reprobé la materia de Procesamiento de Lenguaje Natural :("
prediction, probas = predict_sentiment([example_text])
print(f"Prediction: {prediction[0]}")
fig = px.bar(
    x=list(probas[0].keys()),
    y=list(probas[0].values()),
    labels={"x": "Sentiment", "y": "Probability"},
    title=example_text,
)
fig.show()

Prediction: Neutral
