# Sentiment Analysis Magical Mexican Towns Training Corpus - Classifier

#### Import libraries

In [23]:
import sys
sys.path.append('../')

import joblib

from tqdm import tqdm
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score

In [24]:
# Project helper whose return value is kept as `device`; the recorded output
# below shows it selected the MPS backend on the authoring machine.
from utils.config import setConfig
device = setConfig()

# Project helper that provides the scoring functions used later in this
# notebook (`TypeScore` per task, `RestMexScore` for the combined score).
from utils.metrics import RestMexMetrics
metrics = RestMexMetrics()

Usando MPS: mps
Tensor de prueba creado en el dispositivo: tensor([1.], device='mps:0') mps


## Load Tokenization

In [25]:
# Read the tokenized training corpus (produced beforehand by the tokenization script).
import ast

data = pd.read_csv(r'../data/train/train_augmented_tokenized.csv', encoding='utf-8')

# Normalize dtypes in a single pass: text/categorical columns as str, the polarity label as int.
data = data.astype({
    'Title': str,
    'Review': str,
    'Town': str,
    'Region': str,
    'Type': str,
    'Polarity': int,
})

# The token columns come back from the CSV as text; literal_eval parses each
# cell back into a Python object (expected to be a list of tokens).
for token_column in ('Title_tokens', 'Review_tokens'):
    data[token_column] = data[token_column].apply(ast.literal_eval)


In [26]:
# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
train, test = train_test_split(data, test_size=0.20, random_state=42)

# Report the size of each split.
for split_name, split_frame in (("Train", train), ("Test", test)):
    print(f"{split_name} shape: {split_frame.shape}")

Train shape: (170040, 8)
Test shape: (42511, 8)


#### GridSearch

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedKFold

In [28]:
# Workaround to bypass scikit-learn's own tokenization: the corpus is already tokenized.

def IdentityTokenizer(tokens):
    """Return ``tokens`` unchanged.

    Passed as ``tokenizer=`` to ``CountVectorizer`` / ``TfidfVectorizer`` so the
    vectorizers consume the pre-tokenized lists as they are.

    Kept as a named module-level function because the fitted searches that
    reference it are persisted with ``joblib.dump``.
    """
    return tokens

### Type

In [29]:
# Grid search for the Type classifier.

# Features: title tokens followed by review tokens (per-row list concatenation).
X_train = train['Title_tokens'] + train['Review_tokens']
y_train = train['Type']

# Placeholder pipeline: the vectorizer and classifier steps are filled in by the grid below.
pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Search space for GridSearchCV.
param_grid = {
    'vectorizer': [
        CountVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
        TfidfVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
    ],
    # Pipeline skips a step set to either None or 'passthrough', so listing both
    # only cross-validated every candidate twice. A single no-op entry keeps the
    # 'scaler' key in best_params_ while halving the number of fits.
    'scaler': [None],
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs'), SVC()],
}

# Repeated K-fold: 5 folds x 2 repeats, seeded for reproducibility.
rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

# Candidates are ranked by f1_weighted: the per-class F1 averaged with weights
# proportional to the number of samples in each class.
type_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
type_search.fit(X_train, y_train)

# Persist the whole fitted search so later cells can reload it without refitting.
joblib.dump(type_search, '../models/baseline/type_gridmodel.pkl')

print(f"Best parameters for Type: {type_search.best_params_}")
print(f"Best score for Type: {type_search.best_score_}")



Best parameters for Type: {'classifier': SVC(), 'scaler': None, 'vectorizer': TfidfVectorizer(lowercase=False, token_pattern=None,
                tokenizer=<function IdentityTokenizer at 0x3313a1760>)}
Best score for Type: 0.9597083066573825


In [30]:
# Reload the persisted Type grid search from disk and keep only its
# refit best pipeline for scoring the test split.
best_type_model = (
    joblib.load('../models/baseline/type_gridmodel.pkl')
    .best_estimator_
)

In [31]:
# Score the held-out test split with the selected Type pipeline.
# Features mirror training: title tokens followed by review tokens.
y_test = test['Type']
X_test = test['Title_tokens'] + test['Review_tokens']
y_test_pred = best_type_model.predict(X_test)

In [32]:
# Per-class metrics for the Type task on the test split.
# classification_report pairs `target_names` with the labels positionally, so
# names taken in order of first appearance (Series.unique()) can be attached to
# the wrong class. Pass the labels explicitly, sorted, and reuse them as names.
type_labels = sorted(y_test.unique())
report = classification_report(
    y_test,
    y_test_pred,
    labels=type_labels,
    target_names=type_labels,
    output_dict=True,
)
report = pd.DataFrame(report)

# Keep only the per-class F1 row for the three Type classes.
f1 = report[['Attractive', 'Hotel', 'Restaurant']].loc['f1-score'].to_dict()

ResT_k = metrics.TypeScore(f1)
ResT_k

0.9601384542051462

### Magic Town

In [33]:
# Grid search for the Magic Town classifier.

# The region is added as a single extra token in front of each document's tokens.
Regions = train['Region'].apply(lambda x: [str(x)])

X_train =  Regions + train['Title_tokens'] + train['Review_tokens']
y_train = train['Town']

# Placeholder pipeline: the vectorizer and classifier steps are filled in by the grid below.
pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Search space for GridSearchCV.
param_grid = {
    'vectorizer': [
        CountVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
        TfidfVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
    ],
    # Pipeline skips a step set to either None or 'passthrough', so listing both
    # only cross-validated every candidate twice. A single no-op entry keeps the
    # 'scaler' key in best_params_ while halving the number of fits.
    'scaler': [None],
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs')],
}

# Repeated K-fold: 5 folds x 2 repeats, seeded for reproducibility.
rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

# Candidates are ranked by f1_weighted: the per-class F1 averaged with weights
# proportional to the number of samples in each class.
town_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
town_search.fit(X_train, y_train)

# Persist the whole fitted search so later cells can reload it without refitting.
joblib.dump(town_search, '../models/baseline/magictown_gridmodel.pkl')

# These messages previously said "Type" although they report the Town search.
print(f"Best parameters for Town: {town_search.best_params_}")
print(f"Best score for Town: {town_search.best_score_}")

Best parameters for Type: {'classifier': LogisticRegression(max_iter=1000), 'scaler': None, 'vectorizer': CountVectorizer(lowercase=False, token_pattern=None,
                tokenizer=<function IdentityTokenizer at 0x3313a1760>)}
Best score for Type: 0.8791903531932806


In [34]:
# Reload the persisted Magic Town grid search and keep only its refit best pipeline.
best_town_model = (
    joblib.load('../models/baseline/magictown_gridmodel.pkl')
    .best_estimator_
)

In [35]:
# Score the held-out test split with the selected Magic Town pipeline.
# Features mirror training: the region as one leading token, then title and review tokens.
y_test = test['Town']
region_tokens = test['Region'].apply(lambda x: [str(x)])
X_test = region_tokens + test['Title_tokens'] + test['Review_tokens']
y_test_pred = best_town_model.predict(X_test)

In [36]:
# Per-class metrics for the Magic Town task on the test split.
# classification_report pairs `target_names` with the labels positionally, so
# names taken in order of first appearance (Series.unique()) can be attached to
# the wrong town. Pass the labels explicitly, sorted, and reuse them as names.
town_labels = sorted(y_test.unique())
report = classification_report(
    y_test,
    y_test_pred,
    labels=town_labels,
    target_names=town_labels,
    output_dict=True,
)
report = pd.DataFrame(report)

# Keep only the per-class F1 row for the towns present in the test split.
f1 = report[y_test.unique()].loc['f1-score'].to_dict()

# NOTE(review): the Magic Town score is computed with `TypeScore`, as in the
# original cell; confirm against utils.metrics that this is intended.
ResMT_k = metrics.TypeScore(f1)

ResMT_k

0.8489041099507204

### Sentiment

In [37]:
# Grid search for the Polarity (sentiment) classifier.

# Features: title tokens followed by review tokens (per-row list concatenation).
X_train = train['Title_tokens'] + train['Review_tokens']
y_train = train['Polarity']

# Placeholder pipeline: the vectorizer and classifier steps are filled in by the grid below.
pipeline = Pipeline([
    ('vectorizer', None),
    ('scaler', None),
    ('classifier', None)
])

# Search space for GridSearchCV.
param_grid = {
    'vectorizer': [
        CountVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
        TfidfVectorizer(tokenizer=IdentityTokenizer, lowercase=False, token_pattern=None),
    ],
    # Pipeline skips a step set to either None or 'passthrough', so listing both
    # only cross-validated every candidate twice. A single no-op entry keeps the
    # 'scaler' key in best_params_ while halving the number of fits.
    'scaler': [None],
    # NOTE(review): the recorded run printed lbfgs "iterations reached limit"
    # warnings for this search, so some LogisticRegression fits did not converge
    # at max_iter=1000; consider a higher max_iter or a different solver.
    'classifier': [MultinomialNB(), LogisticRegression(max_iter=1000, solver='lbfgs')],
}

# Repeated K-fold: 5 folds x 2 repeats, seeded for reproducibility.
rkf = RepeatedKFold(n_splits=5, n_repeats=2, random_state=42)

# Candidates are ranked by f1_weighted: the per-class F1 averaged with weights
# proportional to the number of samples in each class.
polarity_search = GridSearchCV(pipeline, param_grid, cv=rkf, scoring='f1_weighted', n_jobs=-1)
polarity_search.fit(X_train, y_train)

# Persist the whole fitted search so later cells can reload it without refitting.
joblib.dump(polarity_search, '../models/baseline/polarity_gridmodel.pkl')

print(f"Best parameters for Polarity: {polarity_search.best_params_}")
print(f"Best score for Polarity: {polarity_search.best_score_}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters for Polarity: {'classifier': LogisticRegression(max_iter=1000), 'scaler': None, 'vectorizer': TfidfVectorizer(lowercase=False, token_pattern=None,
                tokenizer=<function IdentityTokenizer at 0x3313a1760>)}
Best score for Polarity: 0.6825952007811742


In [38]:
# Reload the persisted Polarity grid search and keep only its refit best pipeline.
best_polarity_model = (
    joblib.load('../models/baseline/polarity_gridmodel.pkl')
    .best_estimator_
)

In [39]:
# Score the held-out test split with the selected Polarity pipeline.
# Features mirror training: title tokens followed by review tokens.
y_test = test['Polarity']
X_test = test['Title_tokens'] + test['Review_tokens']
y_test_pred = best_polarity_model.predict(X_test)

In [40]:
# Per-class metrics for the Polarity task on the test split.
# classification_report pairs `target_names` with the labels positionally, so
# names taken in order of first appearance (Series.unique()) can be attached to
# the wrong polarity. Pass the labels explicitly, sorted, and reuse them as
# names; this also keeps the report's class keys as integers.
polarity_labels = sorted(y_test.astype(int).unique())
report = classification_report(
    y_test,
    y_test_pred,
    labels=polarity_labels,
    target_names=polarity_labels,
    output_dict=True,
)
report = pd.DataFrame(report)

# Keep only the per-class F1 row for the polarity values present in the test split.
f1 = report[y_test.unique()].loc['f1-score'].to_dict()

# NOTE(review): the Polarity score is computed with `TypeScore`, as in the
# original cell; confirm against utils.metrics that this is intended.
ResP_k = metrics.TypeScore(f1)

ResP_k

0.494278594707443

In [41]:
# Print the three per-task scores, then the combined score.
for score_name, score_value in (("ResP_k", ResP_k), ("ResMT_k", ResMT_k), ("ResT_k", ResT_k)):
    print(f"{score_name}: {score_value:.4f}")

# NOTE(review): the recorded output is consistent with
# (2*ResP + ResT + 3*ResMT) / 6; confirm against utils.metrics.
Sentiment_k = RestMexMetrics.RestMexScore(ResP_k, ResT_k, ResMT_k)
print(f"Sentiment(k): {Sentiment_k:.4f}")

ResP_k: 0.4943
ResMT_k: 0.8489
ResT_k: 0.9601
Sentiment(k): 0.7492
