In [1]:
import joblib
from tqdm import tqdm
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

from torch.utils.data import DataLoader

In [2]:
import sys

# Put the parent directory on the import path so the project-local `utils`
# package resolves. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate '../' entry each time.
if '../' not in sys.path:
    sys.path.append('../')

from utils.metrics import RestMexMetrics
from utils.config import setConfig

# `device` is what the model is later moved to; `device.type` is passed to embed_texts.
device = setConfig()
# Project metrics helper (not used in the cells visible here).
metrics = RestMexMetrics()

Usando MPS: mps
Test correcto: mps


In [3]:
# Load the original and the augmented training sets, then stack them into a
# single frame with a fresh 0..n-1 index.
df = pd.read_csv(r'../data/train/train.csv')
audf = pd.read_csv(r'../data/augmented/train.csv')
data = pd.concat([df, audf], ignore_index=True)

# Fix the dtypes in one pass: free text and categorical labels as str,
# the polarity label as int.
data = data.astype({
    'Title': str,
    'Review': str,
    'Town': str,
    'Region': str,
    'Type': str,
    'Polarity': int,
})

In [4]:
# Build one input string per row: the title and the review, each wrapped in
# its own marker, separated by a single space.
# NOTE(review): the closing markers repeat the opening ones ('<title>...<title>')
# rather than using '</title>' / '</review>'. Left exactly as-is because changing
# the template would change the embeddings - confirm whether this is intended.
title_part = '<title>' + data['Title'] + '<title>'
review_part = '<review>' + data['Review'] + '<review>'
texts = (title_part + ' ' + review_part).tolist()

In [5]:
# Checkpoint used to embed the reviews; tokenizer and model are loaded from the same id.
model_name_or_path = 'Alibaba-NLP/gte-multilingual-base'
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# trust_remote_code=True is passed for the model only (the recorded output shows it
# being initialised as a custom `NewModel` class).
# NOTE(review): this runs code shipped with the checkpoint - consider pinning a revision.
model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True)

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from utils.run import embed_texts

In [7]:
# Move the model to the configured device and embed every text.
# embed_texts is a project helper; the recorded run took about 1.5 hours.
model = model.to(device)
embeddings = embed_texts(texts, model, tokenizer, device.type, batch_size=16, dimension=768)

# One row per text: the embedding itself plus the three target labels.
df_embeddings = pd.DataFrame(columns=['embedding', 'Town', 'Type', 'Polarity'])
df_embeddings['embedding'] = list(embeddings)
for label_column in ('Town', 'Type', 'Polarity'):
    # Labels are copied from `data`, which shares the 0..n-1 index.
    df_embeddings[label_column] = data[label_column]

Generating embeddings: 100%|██████████| 13285/13285 [1:33:31<00:00,  2.37it/s]  


In [8]:
# Persist the embeddings frame so the slow embedding step does not have to be repeated.
joblib.dump(df_embeddings, '../data/train/embeddings.pkl')

['../data/train/embeddings.pkl']

# Train

In [9]:
# Reload the cached embeddings frame (columns: embedding, Town, Type, Polarity).
# Note: this rebinds `data`, which previously held the raw review frame.
# The file is a pickle - only load copies produced by this notebook / a trusted source.
data = joblib.load('../data/train/embeddings.pkl')

In [10]:
# PCA
# Project the embeddings onto their principal components; n_components is
# left at its default, so no explicit truncation is requested here.
embedding_rows = data['embedding'].tolist()

pca = PCA()
X_pca = pca.fit_transform(embedding_rows)

In [11]:
# Plot PCA
# 3-D scatter of the first three principal components, coloured by polarity.
axis_titles = {'x': 'PC1', 'y': 'PC2', 'z': 'PC3'}

fig = px.scatter_3d(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    color=data['Polarity'],
    labels=axis_titles,
    title='PCA of Embeddings',
)

# Small markers keep the dense point cloud readable.
fig.update_traces(marker={'size': 2})

# Same titles applied to the 3-D scene axes (xaxis_title / yaxis_title / zaxis_title).
fig.update_layout(
    scene={f'{axis}axis_title': title for axis, title in axis_titles.items()}
)
fig.show()

In [12]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# NOTE(review): no `stratify=` is passed, so label proportions are not guaranteed to match
# between the two parts. Also, if embeddings.pkl still contains the augmented rows that were
# concatenated before embedding, augmented variants of a review may land on both sides - confirm.
train, test = train_test_split(data, test_size=0.20, random_state=42)

# Town

In [13]:
# Model selection for the Town target.
# Feature matrix: one row per training review, expanded from the stored
# embedding objects (presumably one column per embedding dimension).
X_train = pd.DataFrame(train['embedding'].tolist())
y_train = train['Town']

# Single-step pipeline; the 'classifier' step is a placeholder filled in by the grid.
pipeline = Pipeline(steps=[('classifier', None)])

# Parameters for GridSearchCV: the candidate estimators swapped into the step.
# NOTE(review): MultinomialNB rejects negative feature values; if the embeddings
# contain any, that candidate's fits fail and are scored NaN under GridSearchCV's
# default error_score - confirm whether it should stay in the grid.
candidate_classifiers = [
    MultinomialNB(),
    LogisticRegression(max_iter=1000, solver='lbfgs'),
    SVC(),
]
param_grid = {'classifier': candidate_classifiers}

# RepeatedKFold configuration: 2 folds, a single repetition, fixed seed.
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Grid search driven by the RepeatedKFold splitter.
# f1_weighted computes the F1-score of each class and averages them weighted
# by the number of samples in each class.
town_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=rkf,
    n_jobs=-1,
)
town_search.fit(X_train, y_train)

# Keep the whole fitted search object (not just the best estimator) on disk.
joblib.dump(town_search, '../models/embeddings/town_gridmodel.pkl')

print(f"Best parameters for Town: {town_search.best_params_}")
print(f"Best score for Town: {town_search.best_score_}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 	- Avoid using `tokenizer

Best parameters for Town: {'classifier': SVC()}
Best score for Town: 0.6254863991856394


# Type

In [14]:
# Model selection for the Type target.
# Feature matrix: one row per training review, expanded from the stored embeddings.
X_train = pd.DataFrame(train['embedding'].to_numpy().tolist())
y_train = train['Type']

# Single-step pipeline; the 'classifier' step is a placeholder filled in by the grid.
pipeline = Pipeline([
    ('classifier', None)
])

# Parameters for GridSearchCV: the candidate estimators swapped into the step.
# NOTE(review): MultinomialNB rejects negative feature values; if the embeddings
# contain any, that candidate's fits fail and are scored NaN - confirm.
param_grid = {
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC()],
    }

# RepeatedKFold configuration: 2 folds, a single repetition, fixed seed.
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Grid search driven by the RepeatedKFold splitter.
# f1_weighted computes the F1-score of each class and averages them weighted
# by the number of samples in each class.
# (The variable name `type_seach` is kept unchanged because other cells may reference it.)
type_seach = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
type_seach.fit(X_train, y_train)

# Keep the whole fitted search object on disk.
joblib.dump(type_seach, '../models/embeddings/type_gridmodel.pkl')

# Fix: these lines used to be labelled "Town" (copy-paste from the Town cell),
# which mislabelled the Type results in the notebook output.
print(f"Best parameters for Type: {type_seach.best_params_}")
print(f"Best score for Type: {type_seach.best_score_}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best parameters for Town: {'classifier': SVC()}
Best score for Town: 0.9556547747011523


# Polarity

In [None]:
# Model selection for the Polarity target.
# Feature matrix: one row per training review, expanded from the stored embeddings.
X_train = pd.DataFrame(train['embedding'].to_numpy().tolist())
y_train = train['Polarity']

# Single-step pipeline; the 'classifier' step is a placeholder filled in by the grid.
pipeline = Pipeline([
    ('classifier', None)
])

# Parameters for GridSearchCV: the candidate estimators swapped into the step.
# NOTE(review): MultinomialNB rejects negative feature values; if the embeddings
# contain any, that candidate's fits fail and are scored NaN - confirm.
# NOTE(review): the recorded run of this cell shows an lbfgs convergence warning at
# max_iter=1000; the hyperparameters are left unchanged here so results stay comparable.
param_grid = {
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC()],
    }

# RepeatedKFold configuration: 2 folds, a single repetition, fixed seed.
rkf = RepeatedKFold(n_splits=2, n_repeats=1, random_state=42)

# Grid search driven by the RepeatedKFold splitter.
# f1_weighted computes the F1-score of each class and averages them weighted
# by the number of samples in each class.
polarity_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
polarity_search.fit(X_train, y_train)

# Keep the whole fitted search object on disk.
joblib.dump(polarity_search, '../models/embeddings/polarity_gridmodel.pkl')

# Fix: these lines used to be labelled "Town" (copy-paste from the Town cell),
# which mislabelled the Polarity results in the notebook output.
print(f"Best parameters for Polarity: {polarity_search.best_params_}")
print(f"Best score for Polarity: {polarity_search.best_score_}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best parameters for Town: {'classifier': LogisticRegression(max_iter=1000)}
Best score for Town: 0.7193358069088169



lbfgs failed to converge (status=1):
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression

