# Sentiment Analysis Magical Mexican Towns Training Corpus - Classifier

#### Import libraries

In [22]:
import spacy
import wasabi
import transformers
import joblib

from tqdm import tqdm
tqdm.pandas() 
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

from metrics import RestMexMetrics
metrics = RestMexMetrics()

#### Check GPU

In [3]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


## Load Data and preprocess

In [None]:
data = pd.read_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train.csv')

data['Title'] = data['Title'].astype(str)
data['Review'] = data['Review'].astype(str)
data['Town'] = data['Town'].astype(str)
data['Region'] = data['Region'].astype(str)
data['Type'] = data['Type'].astype(str)
data['Polarity'] = data['Polarity'].astype(int)

## Tokenization

In [5]:
# Download the spanish model
#!python -m spacy download es_dep_news_trf 
#!python -m spacy download es_core_news_sm

In [6]:
spacy.prefer_gpu()
nlp = spacy.load("es_core_news_sm")

# Tokenize the text data

def tokenize_text(text, nlp):
    """
    Tokenize the input text, removing punctuation, stopwords, and converting to lowercase.
    
    Args:
        text (str): The input text to tokenize.
        nlp (spacy.Language): The spaCy language model.
    
    Returns:
        list: A list of processed tokens.
    """
    # Procesar el texto con el modelo de spaCy
    doc = nlp(text)
    
    # Extraer tokens, eliminar puntuación, stopwords y convertir a minúsculas
    tokens = [token.text.lower() for token in doc if not token.is_stop]
    
    return tokens

In [None]:
# Tokenize the Title data with progress bar
data['Title_tokens'] = data['Title'].progress_apply(lambda x: tokenize_text(x, nlp))

# Tokenize the Review data with progress bar
data['Review_tokens'] = data['Review'].progress_apply(lambda x: tokenize_text(x, nlp))

100%|██████████| 208051/208051 [04:09<00:00, 834.39it/s]
100%|██████████| 208051/208051 [14:18<00:00, 242.26it/s]


In [16]:
# Save the tokenized data to a CSV file

data.to_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train_tokenized.csv', index=False, encoding='utf-8')

In [4]:
# Load the tokenized data

data = pd.read_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train_tokenized.csv', encoding='utf-8')

data['Title'] = data['Title'].astype(str)
data['Review'] = data['Review'].astype(str)
data['Town'] = data['Town'].astype(str)
data['Region'] = data['Region'].astype(str)
data['Type'] = data['Type'].astype(str)
data['Polarity'] = data['Polarity'].astype(int)

In [9]:
data

Unnamed: 0,Title,Review,Polarity,Town,Region,Type,Title_tokens,Review_tokens
0,Mi Lugar Favorito!!!!,Excelente lugar para comer y pasar una buena n...,5,Sayulita,Nayarit,Restaurant,"['lugar', 'favorito', '!', '!', '!', '!']","['excelente', 'lugar', 'comer', 'pasar', 'noch..."
1,lugares interesantes para visitar,"andar mucho, así que un poco difícil para pers...",4,Tulum,QuintanaRoo,Attractive,"['lugares', 'interesantes', 'visitar']","['andar', ',', 'difícil', 'personas', 'niños',..."
2,No es el mismo Dreams,"Es nuestra cuarta visita a Dreams Tulum, elegi...",3,Tulum,QuintanaRoo,Hotel,['dreams'],"['cuarta', 'visita', 'dreams', 'tulum', ',', '..."
3,un buen panorama cerca de CancÃºn,"Estando en CancÃºn, fuimos al puerto y tomamos...",4,Isla_Mujeres,QuintanaRoo,Attractive,"['panorama', 'cerca', 'cancãºn']","['estando', 'cancãºn', ',', 'puerto', 'tomamos..."
4,El mejor,Es un lugar antiguo y por eso me encanto tiene...,5,Patzcuaro,Michoacan,Hotel,[],"['lugar', 'antiguo', 'encanto', 'área', 'juego..."
...,...,...,...,...,...,...,...,...
208046,"excelente ambiente, comida y atencion","Excelente Restaurante, comida Mexicana de alto...",5,Tequisquiapan,Queretaro,Restaurant,"['excelente', 'ambiente', ',', 'comida', 'aten...","['excelente', 'restaurante', ',', 'comida', 'm..."
208047,Muy mal servicio,Estuvimos allí como huéspedes de un día a la p...,2,Tulum,QuintanaRoo,Hotel,['servicio'],"['estuvimos', 'huéspedes', 'playa', '.', 'cama..."
208048,Excelente,"Excelente comida, así como la atención y servi...",5,Ixtapan_de_la_Sal,Estado_de_Mexico,Restaurant,['excelente'],"['excelente', 'comida', ',', 'atención', 'serv..."
208049,Visita nocturna,Nos toco visitar este sitio cuando ya había ca...,4,Creel,Chihuahua,Attractive,"['visita', 'nocturna']","['toco', 'visitar', 'sitio', 'caido', 'sol', '..."


In [5]:
# Split the data into training and testing sets

train, test = train_test_split(data, test_size=0.20, random_state=42)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (166440, 8)
Test shape: (41611, 8)


#### GridSearch

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Import classifiers Naive Bayes, Logistic Regression, SVM, Random Forest
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RepeatedKFold


### Type

In [None]:
# Para Type 

X_train = train['Title_tokens'] + train['Review_tokens']
y_train = train['Type']

pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Definimos los parámetros para el GridSearchCV

param_grid = {
    'vectorizer': [
        CountVectorizer(),
        TfidfVectorizer(),
    ],
    'scaler': [None, 'passthrough'],
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC(), RandomForestClassifier()],
    }

# Configuramos RepeatedKFold
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Configuramos el GridSearchCV con RepeatedKFold
# Aqui usamos f1_weighted, que calcula el F1-score para cada clase y toma el promedio ponderado por el número de muestras en cada clase.
type_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
type_search.fit(X_train, y_train)

joblib.dump(type_search.best_estimator_, 'best_model_type.pkl')

print(f"Best parameters for Type: {type_search.best_params_}")
print(f"Best score for Type: {type_search.best_score_}")



{'classifier': SVC(), 'scaler': None, 'vectorizer': TfidfVectorizer()}

In [149]:
# Load best model & predict on the test set

best_type_model = joblib.load('best_model_type.pkl')

In [150]:
# Predict on the validation set
X_test = test['Title_tokens'] + test['Review_tokens']
y_test = test['Type']
y_test_pred = best_type_model.predict(X_test)

In [None]:
report = classification_report(y_test, y_test_pred, target_names=test['Type'].unique(), output_dict=True)
report = pd.DataFrame(report)
f1 = report[['Attractive', 'Hotel', 'Restaurant']].loc['f1-score'].to_dict()

ResT_k = metrics.TypeScore(f1)
ResT_k

### Magic Town

In [None]:
# Para MT 

X_train = train['Region'] + train['Title_tokens'] + train['Review_tokens']
y_train = train['Town']

pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Definimos los parámetros para el GridSearchCV

param_grid = {
    'vectorizer': [
        CountVectorizer(),
        TfidfVectorizer(),
    ],
    'scaler': [None, 'passthrough'],
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC(), RandomForestClassifier()],
    }

# Configuramos RepeatedKFold
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Configuramos el GridSearchCV con RepeatedKFold
# Aqui usamos f1_weighted, que calcula el F1-score para cada clase y toma el promedio ponderado por el número de muestras en cada clase.
town_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
town_search.fit(X_train, y_train)

joblib.dump(town_search.best_estimator_, 'best_model_magictown.pkl')

print(f"Best parameters for Type: {town_search.best_params_}")
print(f"Best score for Type: {town_search.best_score_}")



Best parameters for Type: {'classifier': LogisticRegression(max_iter=1000), 'scaler': None, 'vectorizer': CountVectorizer()}
Best score for Type: 0.871877032192697


In [153]:
best_town_model = joblib.load('best_model_magictown.pkl')

In [154]:
# Predict on the validation set
X_test = test['Region'] + test['Title_tokens'] + test['Review_tokens']
y_test = test['Town']
y_test_pred = best_town_model.predict(X_test)

In [155]:
report = classification_report(y_test, y_test_pred, target_names=test['Town'].unique(), output_dict=True)
report = pd.DataFrame(report)
f1 = report[y_test.unique()].loc['f1-score'].to_dict()

ResMT_k = metrics.TypeScore(f1)

ResMT_k

0.8472366523785156

### Sentiment

In [None]:
# Para MT 

X_train = train['Title_tokens'] + train['Review_tokens']
y_train = train['Polarity']

pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Definimos los parámetros para el GridSearchCV

param_grid = {
    'vectorizer': [
        CountVectorizer(),
        TfidfVectorizer(),
    ],
    'scaler': [None, 'passthrough'],
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC(), RandomForestClassifier()],
    }

# Configuramos RepeatedKFold
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Configuramos el GridSearchCV con RepeatedKFold
# Aqui usamos f1_weighted, que calcula el F1-score para cada clase y toma el promedio ponderado por el número de muestras en cada clase.
polarity_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
polarity_search.fit(X_train, y_train)

joblib.dump(polarity_search.best_estimator_, 'best_model_polarity.pkl')

print(f"Best parameters for Polarity: {polarity_search.best_params_}")
print(f"Best score for Polarity: {polarity_search.best_score_}")



Best parameters for Type: {'classifier': LogisticRegression(max_iter=1000), 'scaler': None, 'vectorizer': TfidfVectorizer()}
Best score for Type: 0.6797838032454482


In [156]:
best_polarity_model = joblib.load('best_model_polarity.pkl')

In [157]:
best_polarity_model

In [None]:
X_test = test['Title_tokens'] + test['Review_tokens']
y_test = test['Polarity']
y_test_pred = best_polarity_model.predict(X_test)

In [159]:
report = classification_report(y_test, y_test_pred, target_names=test['Polarity'].astype(int).unique(), output_dict=True)
report = pd.DataFrame(report)
f1 = report[y_test.unique()].loc['f1-score'].to_dict()

ResP_k = metrics.TypeScore(f1)

ResP_k

0.4767953820830647

In [146]:
report

Unnamed: 0,4,3,5,2,1,accuracy,macro avg,weighted avg
precision,0.64311,0.402667,0.469227,0.485453,0.781035,0.716133,0.556298,0.678669
recall,0.513641,0.136528,0.31154,0.320319,0.928737,0.716133,0.442153,0.716133
f1-score,0.57113,0.203916,0.37446,0.385965,0.848506,0.716133,0.476795,0.686372
support,1063.0,1106.0,3059.0,9272.0,27111.0,0.716133,41611.0,41611.0


In [161]:
print(f"ResP_k: {ResP_k:.4f}")
print(f"ResMT_k: {ResMT_k:.4f}")
print(f"ResT_k: {ResT_k:.4f}")

Sentiment_k = RestMexMetrics.RestMexScore(ResP_k, ResT_k, ResMT_k)
print(f"Sentiment(k): {Sentiment_k:.4f}")

ResP_k: 0.4768
ResMT_k: 0.8472
ResT_k: 0.9563
Sentiment(k): 0.7419
