### Carga Corpus

In [28]:
# Install the Kaggle command-line client (quietly).
! pip3 install -q kaggle

In [29]:
! kaggle datasets download -d arushchillar/disneyland-reviews
! unzip disneyland-reviews.zip

disneyland-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  disneyland-reviews.zip
replace DisneylandReviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [None]:
import pandas as pd

: 

In [34]:
# Load the reviews CSV extracted into the working directory.
# NOTE(review): decoded as ISO-8859-1 (Latin-1), presumably because the file is
# not valid UTF-8 — confirm against the dataset.
df = pd.read_csv("./DisneylandReviews.csv", encoding="ISO-8859-1")

A fines de generar un modelo de Sentiment Analysis, crearemos una variable `target` con el sentimiento de cada review a partir de la columna `Rating`, de la siguiente manera:
1. Positive: Reseñas con puntaje 4 o 5
2. Neutral: Reseñas con puntaje 3
3. Negative: Reseñas con puntaje 1 o 2

In [None]:
# Build the target column
def Target(row):
    """Map a review's rating to a sentiment label.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"Rating"`` (e.g. a DataFrame row); the value
        is converted with ``int()``.

    Returns
    -------
    str
        ``"negative"`` for ratings below 3, ``"neutral"`` for exactly 3 and
        ``"positive"`` for anything above 3.
    """
    rating = int(row["Rating"])
    if rating < 3:
        return "negative"
    if rating == 3:
        return "neutral"
    return "positive"

# Label every review row by row, then print how many reviews fall into each
# sentiment class.
df["target"] = df.apply(Target, axis=1)
print(df.groupby("target")["target"].count())

In [None]:
# Plain Python list with the text of every review (one entry per row of df).
corpus = df['Review_Text'].tolist()

: 

### Vocabulario

Tokenización Bag Of Words: Devuelve frecuencia de aparición de cada palabra en todo el corpus

In [None]:
! pip3 install transformers nltk

In [None]:
import nltk
# Fetch the NLTK 'stopwords' resource.
nltk.download('stopwords')

In [108]:
from transformers import AutoTokenizer
from nltk.corpus import stopwords
from collections import defaultdict

import string

# English stop words to discard, and the GPT-2 tokenizer whose pre-tokenizer
# is used only to split each review into words.
stop_words = set(stopwords.words('english'))
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Both of these are identical for every review, so build them once instead of
# once per loop iteration.
punctuation_table = str.maketrans('', '', string.punctuation)
pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer

# word -> number of occurrences across the whole corpus
word_freqs = defaultdict(int)

for text in corpus:
    # Strip punctuation, lowercase each word once, drop stop words.
    text = text.translate(punctuation_table)
    lowered_words = (word.lower() for word in text.split(" "))
    filtered_text = " ".join(word for word in lowered_words if word not in stop_words)
    words_with_offsets = pre_tokenizer.pre_tokenize_str(filtered_text)
    for word, _ in words_with_offsets:
        # Remove the leading "Ġ" marker the pre-tokenizer puts on words
        # (presumably its encoding of a preceding space) and skip leftovers
        # that become empty.
        word = word.lstrip("Ġ")
        if word != "":
            word_freqs[word] += 1

# The 15 most frequent words with their counts.
print(sorted(word_freqs.items(), key=lambda x: x[1], reverse=True)[:15])

[('park', 44309), ('disney', 35973), ('rides', 34295), ('disneyland', 32622), ('time', 29219), ('day', 28145), ('get', 22963), ('go', 20091), ('one', 19081), ('ride', 17661), ('great', 16305), ('would', 14591), ('kids', 14135), ('food', 14129), ('place', 13367)]


In [None]:
! pip install wordcloud matplotlib

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate every review into one string, one review per line.
text = "\n".join(corpus)

# Build the word cloud from the full text, passing the stop-word set so those
# words are left out, and render it as an image without axes.
wordcloud = WordCloud(stopwords=stop_words).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Byte-Pair Encoding: Amplía el vocabulario con las agrupaciones de tokens más comunes

In [106]:
# Base BPE vocabulary: every distinct character that appears in any counted
# word, in sorted order. A set deduplicates in constant time per character
# instead of scanning the growing list on every membership check.
vocab = sorted({letter for word in word_freqs for letter in word})

print(vocab)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
# Takes the current symbol split of every word, pairs up adjacent symbols and
# computes how often each pair occurs.
def compute_pair_freqs(splits):
    """Count adjacent symbol pairs, weighted by word frequency.

    Parameters
    ----------
    splits : dict
        Maps each word to its current list of symbols.

    Returns
    -------
    collections.defaultdict
        Maps ``(left_symbol, right_symbol)`` to the summed frequency of the
        words containing that adjacent pair. Word frequencies are read from
        the module-level ``word_freqs``.
    """
    counts = defaultdict(int)
    for token, occurrences in word_freqs.items():
        pieces = splits[token]
        # zip over the sequence and its one-step shift yields each adjacent
        # pair; words with fewer than two symbols contribute nothing.
        for left, right in zip(pieces, pieces[1:]):
            counts[(left, right)] += occurrences
    return counts

# Start with every word split into its individual characters.
splits = {word: list(word) for word in word_freqs}
pair_freqs = compute_pair_freqs(splits)

# Preview the first six pairs and their frequencies.
for i, (key, count) in enumerate(pair_freqs.items()):
    print(f"{key}: {count}")
    if i >= 5:
        break

In [None]:
# Show the most frequent pair. On ties max() keeps the first pair encountered;
# the default reproduces the "nothing found" state when there are no pairs.
best_pair, max_freq = max(
    pair_freqs.items(),
    key=lambda item: item[1],
    default=("", None),
)

print(best_pair, max_freq)

In [None]:
# Finds the given pair inside every word's split and fuses it into one symbol.
def merge_pair(a, b, splits):
    """Merge every adjacent ``a``, ``b`` occurrence into the symbol ``a + b``.

    Parameters
    ----------
    a, b : str
        Left and right symbols of the pair to merge (assumed non-empty).
    splits : dict
        Maps each word to its current list of symbols; updated in place for
        every word in the module-level ``word_freqs``.

    Returns
    -------
    dict
        The same ``splits`` object, after the merge.
    """
    fused = a + b
    for word in word_freqs:
        pieces = splits[word]
        if len(pieces) == 1:
            continue

        # Scan left to right, consuming two symbols when they form the pair
        # and one symbol otherwise.
        rebuilt = []
        pos = 0
        last = len(pieces) - 1
        while pos < len(pieces):
            if pos < last and pieces[pos] == a and pieces[pos + 1] == b:
                rebuilt.append(fused)
                pos += 2
            else:
                rebuilt.append(pieces[pos])
                pos += 1
        splits[word] = rebuilt
    return splits

# Apply the ("i", "n") merge to every word's split.
splits = merge_pair("i", "n", splits)

# Preview the first six words with their updated splits.
for i, (key, pieces) in enumerate(splits.items()):
    print(f"{key}: {pieces}")
    if i >= 5:
        break

In [None]:
# Learn BPE merges: repeatedly merge the most frequent adjacent pair and add
# the merged symbol to the vocabulary.
merges = {("i", "n"): "in"}  # the merge already applied to `splits`
vocab.append("in")

vocab_size = 100  # keep merging until the vocabulary reaches this size

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is already a single symbol, so there is nothing left to
        # merge; without this guard the unpacking below fails.
        break
    # max() keeps the first pair on ties, like a strict "<" scan would.
    best_pair, max_freq = max(pair_freqs.items(), key=lambda item: item[1])
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])

print(merges)

In [None]:
# Show the current contents of the vocabulary list.
print(vocab)

TF-IDF: Devuelve el "peso" de cada palabra dentro de cada documento

In [None]:
# Compute a TF-IDF weight for every word of every document.
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1,1): only single words (unigrams) become features.
vect = TfidfVectorizer(ngram_range=(1,1))
# NOTE(review): despite its name, `bow` holds the TF-IDF matrix, not raw counts.
bow = vect.fit_transform(df['Review_Text'])
# Number of entries in the vocabulary learned by the vectorizer.
total_features = len(vect.vocabulary_)

print(bow[0]) # show the word weights of document 0

In [None]:
# Weights of document 0 sorted from highest to lowest. The row is densified
# once instead of indexing the sparse matrix separately for every feature.
print(sorted(bow[0].toarray().ravel(), reverse=True))

In [None]:
# Column index of the highest-weighted word of document 0. argmax replaces an
# equality test against a float literal copied from an earlier output, which
# stops matching as soon as the data or the library version changes.
print(bow[0].argmax())

In [None]:
# Word with the highest TF-IDF weight in document 0, looked up by its computed
# column index rather than a hard-coded one.
vect.get_feature_names_out()[bow[0].argmax()]

In [None]:
# Text of the review with index label 0, for comparison with its top word.
df.loc[0, "Review_Text"]