In [1]:
import spacy
import wasabi
import transformers
import joblib

from tqdm import tqdm
tqdm.pandas() 
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

from metrics import RestMexMetrics
metrics = RestMexMetrics()

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

# Choose the compute backend in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

device = torch.device(_device_name)
print(f"Using device: {device}")

Using device: mps


In [3]:
import os
from pathlib import Path

# Dataset directory. Defaults to the original local checkout, but can be
# redirected with the RESTMEX_DATA_DIR environment variable so the notebook
# runs on another machine without editing this cell.
DATA_DIR = Path(os.environ.get('RESTMEX_DATA_DIR', r'/Users/roicort/GitHub/REST-MEX25/dataset'))

# Load the training file and pin the column dtypes in a single pass:
# the text and label columns become str, Polarity becomes int.
# NOTE: astype(str) turns missing values into the literal string 'nan'.
data = pd.read_csv(DATA_DIR / 'train.csv').astype({
    'Title': str,
    'Review': str,
    'Town': str,
    'Region': str,
    'Type': str,
    'Polarity': int,
})

In [4]:

def _tagged_text(frame):
    """Concatenate Title and Review row-wise, each surrounded by its marker tag."""
    return '<title>' + frame['Title'] + '<title> <review>' + frame['Review'] + '<review>'


# Hold out 15% of the rows for evaluation; the fixed seed keeps the split stable.
train, test = train_test_split(data, test_size=0.15, random_state=42)

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

# Model inputs: one tagged string per review for each split.
X_train, X_test = _tagged_text(train), _tagged_text(test)

Train shape: (176843, 6)
Test shape: (31208, 6)


In [None]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint used to embed the reviews.
model_name = "CarlosRCDev/spanish-gte-multilingual-base"

# Both loaders share trust_remote_code=True, which allows the checkpoint's
# own Python code to be executed locally.
hub_options = dict(trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, **hub_options)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, **hub_options)


In [None]:
# Embed the texts

def embed_text(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The input is tokenized (truncated to 512 tokens), run through the model
    without gradient tracking, and the token vectors of ``last_hidden_state``
    are averaged per sequence.

    Padding positions are excluded from the average via the attention mask,
    so passing a list of texts of different lengths gives each text the same
    vector it would get on its own. For a single string there is no padding
    and the result is the plain mean over tokens.

    Args:
        text: A string, or a list of strings, accepted by ``tokenizer``.

    Returns:
        A NumPy array: the pooled embedding with singleton dimensions
        squeezed out (1-D for a single text).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Run on whatever device the model lives on; the tokenizer always
    # produces CPU tensors.
    model_device = next(model.parameters()).device
    inputs = inputs.to(model_device)

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    if "attention_mask" in inputs:
        # Zero out padded positions and divide by the number of real tokens.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    else:
        pooled = hidden.mean(dim=1)

    # .cpu() is required before .numpy() when the model is on CUDA/MPS.
    embeddings = pooled.squeeze().cpu().numpy()
    return embeddings

# Encode every text one at a time, with a progress bar per split.
X_train_embeddings = [embed_text(text) for text in tqdm(X_train)]
X_test_embeddings = [embed_text(text) for text in tqdm(X_test)]

In [None]:
# Persist both embedding lists to disk so the encoding step can be skipped later
joblib.dump(X_train_embeddings, filename='X_train_embeddings.pkl')
joblib.dump(X_test_embeddings, filename='X_test_embeddings.pkl')

In [5]:
# Restore the cached embeddings written by the save cell.
X_train_embeddings, X_test_embeddings = (
    joblib.load(cache_file)
    for cache_file in ('X_train_embeddings.pkl', 'X_test_embeddings.pkl')
)

In [None]:
# Reduce dimensionality

from sklearn.decomposition import PCA



In [7]:
# Logistic Regression on the embeddings, predicting Polarity

y_train = train['Polarity']
y_test = test['Polarity']

from sklearn.linear_model import LogisticRegression
# NOTE: the solver warning in this cell's saved output reports that the
# iteration limit was reached; scaling the features or raising max_iter are
# the remedies that warning suggests.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_embeddings, y_train)

y_pred = lr.predict(X_test_embeddings)
print("Logistic Regression Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:")
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)

# classification_report pairs target_names positionally with the labels, so
# both are passed explicitly in the same sorted order. Using unique() alone
# yields order of appearance and attaches each F1 score to the wrong class.
polarity_labels = sorted(y_test.unique())
report = classification_report(
    y_test,
    y_pred,
    labels=polarity_labels,
    target_names=polarity_labels,
    output_dict=True,
)
report = pd.DataFrame(report)
# Per-class F1 keyed by polarity value.
f1 = report[polarity_labels].loc['f1-score'].to_dict()

ResP_k = metrics.TypeScore(f1)

ResP_k

Logistic Regression Model
Accuracy: 0.7434632145603691
F1 Score: 0.7223601392307758
Classification Report:
              precision    recall  f1-score   support

           1       0.61      0.61      0.61       794
           2       0.40      0.28      0.33       829
           3       0.55      0.48      0.51      2318
           4       0.54      0.35      0.43      6935
           5       0.81      0.93      0.87     20332

    accuracy                           0.74     31208
   macro avg       0.58      0.53      0.55     31208
weighted avg       0.72      0.74      0.72     31208



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5498756818619603

In [None]:
# Project the training embeddings onto their first two principal components.
# random_state pins the result when PCA picks a randomized solver.
pca = PCA(n_components=2, random_state=42)
X_train_embeddings_reduced = pca.fit_transform(X_train_embeddings)

# Create a DataFrame for the reduced embeddings
df_embeddings_train = pd.DataFrame(X_train_embeddings_reduced, columns=['x', 'y'])
# Read the labels straight from `train`: the global `y_train` is rebound to
# Type and Town by other cells, so relying on it would colour the points by
# whichever target was assigned last.
df_embeddings_train['Polarity'] = train['Polarity'].values

# Plot the reduced embeddings
fig = px.scatter(df_embeddings_train, x='x', y='y', color='Polarity', title="PCA of Train Embeddings")
fig.update_traces(marker=dict(size=5))
fig.update_layout(xaxis_title="PCA 1", yaxis_title="PCA 2")
fig.show()

In [9]:
# Logistic Regression on the embeddings, predicting Type

y_train = train['Type']
y_test = test['Type']

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_embeddings, y_train)

y_pred = lr.predict(X_test_embeddings)
print("Logistic Regression Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:")
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)

# classification_report pairs target_names positionally with the labels, so
# both are passed explicitly in the same sorted order. Using unique() alone
# yields order of appearance and can attach an F1 score to the wrong type.
type_labels = sorted(y_test.unique())
report = classification_report(
    y_test,
    y_pred,
    labels=type_labels,
    target_names=type_labels,
    output_dict=True,
)
report = pd.DataFrame(report)
# Per-class F1 keyed by type name.
f1 = report[type_labels].loc['f1-score'].to_dict()

ResT_k = metrics.TypeScore(f1)
ResT_k

Logistic Regression Model
Accuracy: 0.9591771340681876
F1 Score: 0.9591545517717532
Classification Report:
              precision    recall  f1-score   support

  Attractive       0.96      0.96      0.96     10472
       Hotel       0.95      0.94      0.95      7729
  Restaurant       0.96      0.97      0.96     13007

    accuracy                           0.96     31208
   macro avg       0.96      0.96      0.96     31208
weighted avg       0.96      0.96      0.96     31208



0.9576700132343521

In [27]:
# Gaussian Naive Bayes on embeddings + Region, predicting Town

y_train = train['Town']
y_test = test['Town']

# The features are the embedding dimensions plus one integer-coded Region
# column. The code mapping is defined once from the regions seen in train and
# applied to both splits, so a given region always gets the same code
# (a region absent from train is coded -1).
region_categories = sorted(train['Region'].unique())

X_train_mt = pd.DataFrame(X_train_embeddings)
X_train_mt['Region'] = pd.Categorical(train['Region'].values, categories=region_categories).codes
X_train_mt.columns = X_train_mt.columns.astype(str)

# Build a separate test frame instead of overwriting X_test_embeddings, so the
# cells that use the raw embeddings can still be re-run after this one.
X_test_mt = pd.DataFrame(X_test_embeddings)
X_test_mt['Region'] = pd.Categorical(test['Region'].values, categories=region_categories).codes
X_test_mt.columns = X_test_mt.columns.astype(str)

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB


# The variable keeps the name `lr` so any other cell referring to it still
# works, but the estimator here is Gaussian Naive Bayes.
lr = GaussianNB()
lr.fit(X_train_mt, y_train)

y_pred = lr.predict(X_test_mt)
print("Gaussian Naive Bayes Model")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Classification Report:")
print(classification_report(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)

# classification_report pairs target_names positionally with the labels, so
# both are passed explicitly in the same sorted order. Using unique() alone
# yields order of appearance and attaches each F1 score to the wrong town.
town_labels = sorted(y_test.unique())
report = classification_report(
    y_test,
    y_pred,
    labels=town_labels,
    target_names=town_labels,
    output_dict=True,
)
report = pd.DataFrame(report)
# Per-class F1 keyed by town name.
f1 = report[town_labels].loc['f1-score'].to_dict()

ResMT_k = metrics.TypeScore(f1)

ResMT_k

Logistic Regression Model
Accuracy: 0.6066713663163291
F1 Score: 0.6275209800292063
Classification Report:
                            precision    recall  f1-score   support

                    Ajijic       0.63      0.51      0.56       566
                   Atlixco       0.36      0.67      0.47       207
                   Bacalar       0.23      0.72      0.34      1657
                    Bernal       0.44      0.73      0.55       177
           Chiapa_de_Corzo       0.12      0.65      0.21       143
                   Cholula       0.74      0.50      0.60       425
                  Coatepec       0.41      0.74      0.53        98
                     Creel       1.00      1.00      1.00       274
           Cuatro_Cienegas       0.77      0.74      0.75       133
                 Cuetzalan       0.31      0.31      0.31       153
           Dolores_Hidalgo       1.00      1.00      1.00       126
          Huasca_de_Ocampo       1.00      1.00      1.00       247
        

0.6307106321888962