In [1]:
import sys
import os

# Make the helper modules in ../scripts importable from this notebook.
# os.path.abspath resolves the relative path against the current working
# directory, so the kernel must be started from the notebook's folder.
scripts_dir = os.path.abspath("../scripts")
if scripts_dir not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(scripts_dir)

In [7]:
import numpy as np
import pandas as pd
from dataset_utils import df_to_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the Rest-Mex 2025 training split; the CSV is decoded as UTF-8.
train_df = pd.read_csv(
    "../datasets/Rest-Mex_2025_train.csv",
    encoding="utf-8",
)

In [4]:
# Inspect which columns the training CSV provides.
train_df.columns

Index(['Title', 'Review', 'Polarity', 'Town', 'Region', 'Type'], dtype='object')

In [15]:
# Number of distinct polarity values (missing values are not counted).
train_df["Polarity"].nunique()

5

In [6]:
# Give every town an integer id following the order of value_counts()
# (most frequent town first), then store each row's id in `label_town`.
towns = {
    name: label
    for label, name in enumerate(train_df["Town"].value_counts().index.to_numpy())
}
train_df["label_town"] = train_df["Town"].apply(lambda name: towns[name])

In [7]:
# Shift polarity down by one so the integer class labels start at 0.
train_df["label_polarity"] = (train_df["Polarity"] - 1).astype("int64")

In [8]:
# Give every attraction type an integer id following the order of
# value_counts() (most frequent first), then store it in `label_type`.
types = {
    name: label
    for label, name in enumerate(train_df["Type"].value_counts().index.to_numpy())
}
train_df["label_type"] = train_df["Type"].apply(lambda name: types[name])

In [None]:
# Convert the frame into a dataset holding the review text plus the three
# integer label columns created above.
train_dataset = df_to_dataset(
    train_df,
    columns=["label_town", "label_polarity", "label_type"],
    text_column="Review",
)

In [12]:
# Write the training dataset to disk.
train_dataset.save_to_disk("../datasets/restmex_mtl_v1")

Saving the dataset (1/1 shards): 100%|██████████| 208051/208051 [00:00<00:00, 784212.34 examples/s]


In [4]:
# Load the Rest-Mex 2025 test split (read with pandas' default encoding).
test_df = pd.read_csv(filepath_or_buffer="../datasets/Rest-Mex_2025_test.csv")

In [5]:
# Inspect which columns the test CSV provides.
test_df.columns

Index(['ID', 'Title', 'Review'], dtype='object')

In [8]:
# Convert the test frame into a dataset that keeps each review's ID next to
# its text.
test_dataset = df_to_dataset(
    test_df,
    columns=["ID"],
    text_column="Review",
)

In [9]:
# Write the test dataset to disk.
test_dataset.save_to_disk("../datasets/restmex_mtl_v1_test")

Saving the dataset (1/1 shards): 100%|██████████| 89166/89166 [00:00<00:00, 1147231.11 examples/s]


### Characters

In [11]:
# Peek at the first ten entries of the dataset's 'reviews' column (presumably
# the name df_to_dataset gives to the `Review` text -- confirm in dataset_utils).
train_dataset['reviews'][:10]

['Excelente lugar para comer y pasar una buena noche!!! El servicio es de primera y la comida exquisita!!!',
 'andar mucho, así que un poco difícil para personas con niños pequeños, pero con mucha historia en la zona, y la diversión de aprender un poco de todo, y explorar las ruinas. La playa también era bastante agradable!',
 'Es nuestra cuarta visita a Dreams Tulum, elegimos este hotel para festejar mi cumpleaños ya que en este hotel nos comprometimos y casamos y tenemos un cariño muy especial por este lugar, pero mostramos que cambiaron las cosas. En cuestión de instalaciones sigue perfecto!! La playa muy limpia a pesar del sargazo. Pero en la amabilidad y servicio que los distinguía lo han perdido bastante, los empleados andan corriendo por todos lados, gritando de un lado a otro tratando de organizarse y pasamos varios detalles como por ejemplo mi esposo pidió un juego verde y la mesera le contestó que se parara él que estaba en la esquina porque solo se llevaba el café!! Eso jamá

### Characters

In [None]:
import numpy as np

# Sorted array of the distinct characters found in the raw reviews.
# De-duplicating with a set first avoids materialising one list entry per
# character of the whole corpus, and np.unique (unlike np.unique_values)
# exists in every NumPy release and always returns its result sorted.
characters = np.unique(list({c for review in train_df["Review"] for c in review}))

In [None]:
import re
import unicodedata

# Ordered (pattern, replacement) pairs that repair Spanish letters damaged by
# decoding the text with the wrong codec, and that fold grave accents into the
# acute ones Spanish uses. Two mojibake families appear in the patterns:
# "Ã"/"Â" followed by one more character (UTF-8 bytes read as cp1252/latin-1)
# and lone Cyrillic letters (looks like latin-1 bytes read as cp1251).
# Invisible or easily confused characters are written as escapes so the table
# can be audited. Order matters: every specific "Ã?" pair must be tried before
# the bare "Ã" fallback in the last row.
_MOJIBAKE_FIXES = (
    (r"\u0431|Ã¡|à", "á"),                    # \u0431 = Cyrillic "be"
    (r"Ã©|è|\u0439", "é"),                    # \u0439 = Cyrillic "short i"
    (r"\u0443|Ã³|í³|ò", "ó"),                 # \u0443 = Cyrillic "u"
    (r"\u044a|Ãº|ù", "ú"),                    # \u044a = Cyrillic hard sign
    (r"Ã¼", "ü"),
    (r"Ã\x81|À", "Á"),                        # second byte is the C1 control 0x81
    (r"Ã‰|È", "É"),
    (r"Ã\x8d|Ì", "Í"),                        # second byte is the C1 control 0x8D
    (r"Ã“|Ò", "Ó"),
    (r"Ãš|Ù", "Ú"),
    (r"Ã\u2018", "Ñ"),                        # \u2018 = left single quotation mark
    (r"\u0441|Ã±|a±|í±", "ñ"),                # \u0441 = Cyrillic "es"
    (r"\u0435", "e"),                         # Cyrillic "ie", a look-alike of Latin e
    (r"Â¿|\u0407", "¿"),                      # \u0407 = Cyrillic "Yi"
    (r"éÂ¼", "üe"),
    # "í" is "Ã" + soft hyphen (U+00AD); the hyphen may be preceded by a space
    # or already be damaged into U+FFFD -- assumption taken from the original
    # pattern, TODO confirm against the data.
    (r"\u043d|ì|Ã ?\xad|\xad\ufffd", "í"),    # \u043d = Cyrillic "en"
    # Fallback: an "Ã" that lost its second byte is read as "í". One following
    # space is consumed, as the original did, because the lost soft hyphen is
    # often replaced by a space in the data.
    (r"Ã ?", "í"),
)


def clean_characters(text):
    """Normalize one review to a restricted Spanish character set.

    Steps, in order:

    1. Compose the text to NFC so decomposed accents (``e`` + U+0301) become
       the precomposed letters the whitelist in step 6 knows about.
    2. Repair mojibake with ``_MOJIBAKE_FIXES``. This runs before steps 3-4
       because several patterns contain characters those steps rewrite
       (curly quote, soft hyphen, 0x81).
    3. Normalize quote look-alikes to ``'`` and drop doubled quotes.
    4. Turn soft hyphens, 0x81, ellipsis, underscores and zero-width spaces
       into spaces; collapse runs of dashes and of dots.
    5. Surround punctuation and ``%``/``$`` with spaces.
    6. Drop every character outside the whitelist.
    7. Collapse whitespace, strip, and remove a trailing "más"/"mas"
       (optionally followed by ``.``, ``!`` or ``?``).

    Args:
        text: Raw review text as a ``str``.

    Returns:
        The cleaned text, single-spaced and without leading or trailing
        whitespace.
    """
    # Compose first: the whitelist below only keeps precomposed letters, so a
    # combining accent left for later would simply be deleted.
    text = unicodedata.normalize('NFC', text)

    for pattern, replacement in _MOJIBAKE_FIXES:
        text = re.sub(pattern, replacement, text)

    # Backtick, curly single quotes and the spacing acute accent -> apostrophe.
    text = re.sub(r"`|\u2018|\u2019|\u00b4", "'", text)
    text = re.sub(r"''", "", text)
    # Leftover soft hyphen, 0x81, ellipsis, underscore, zero-width space.
    text = re.sub(r"\xad|\x81|\u2026|_|\u200b", " ", text)
    # Hyphen, em dash and en dash runs become a single hyphen.
    text = re.sub(r"[-\u2014\u2013]+", "-", text)
    text = re.sub(r"\.+", ".", text)

    # NOTE(review): this also trims a final "ss" from ordinary words
    # ("express " -> "expre "); presumably aimed at a dataset artifact -- confirm.
    text = re.sub(r"ss ", " ", text)

    # Pad punctuation with spaces so each mark becomes its own token.
    text = re.sub(r'([\.,¡!¿\?\[\]\(\)%\$])', r' \1 ', text)
    text = re.sub(r"[^A-Za-zÁÉÍÓÚáéíóúüÑñ¿\?!¡,\[\]\(\)\n'0-9-\. %\$]", '', text)

    text = re.sub(r"\s+", " ", text).strip()

    # Runs after padding and trimming so the optional punctuation, now
    # separated from the word by a space, can actually match.
    text = re.sub(r'\b[Mm][áa]s\b\s*[\.\!\?]?\s*$', '', text)

    return text.strip()

In [None]:
# Clean every raw review with the clean_characters function defined in this
# notebook and keep the result next to the original text. (An earlier,
# disabled import suggests dataset_utils may ship its own copy -- TODO confirm.)
train_df["clean_review"] = train_df["Review"].apply(clean_characters)

In [None]:
# Sorted array of the distinct characters that remain after cleaning.
# De-duplicating with a set first keeps memory small, and np.unique (unlike
# np.unique_values) exists in every NumPy release and always sorts its result.
new_characters = np.unique(list({c for review in train_df["clean_review"] for c in review}))
new_characters

In [None]:
# Compare one review before and after cleaning.
# Presumably other row ids worth inspecting: 121199 121196 20450
sample_id = 12560  # not called `id`, which would shadow the builtin for the rest of the session
print(train_df["Review"][sample_id])
print()
print(train_df["clean_review"][sample_id])

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Words per cleaned review. split() without an argument is used so that an
# empty review counts as 0 words; "".split(" ") returns [''] and would count 1.
# For non-empty cleaned text (single-spaced, stripped) both forms agree.
words_size = [len(text.split()) for text in train_df["clean_review"]]
words_mean = np.mean(words_size)
# 95th percentile of the review length.
words_q    = np.quantile(words_size, q=0.95)

In [None]:
# 80th percentile of the number of words per cleaned review.
np.quantile(words_size, 0.8)

In [None]:
# Show the fourth "."-separated segment (index 3) of the review with the most
# words; raises IndexError if that review has fewer than four segments.
train_df["clean_review"].iloc[np.argmax(words_size)].split(".")[3]

In [None]:
# Histogram of review lengths (one bin per word count from 1 to 510) with the
# mean and the `words_q` quantile marked as dashed vertical lines.
fig, ax = plt.subplots()
ax.hist(words_size, bins=np.arange(1, 512, 1))
ax.axvline(
    words_mean,
    color="red",
    linestyle="dashed",
    linewidth=2,
    label=f"Mean: {words_mean:.2f}",
)
ax.axvline(
    words_q,
    color="green",
    linestyle="dashed",
    linewidth=2,
    label=f"Quantile: {words_q:.2f}",
)
ax.legend()
plt.show()

In [None]:
# Scratch check of str.split(" "): the [-1:] slice keeps the last token as a
# one-element list, ['CDMX'].
"Vivo en la majestuosa CDMX".split(" ")[-1:]