# Sentiment Analysis Magical Mexican Towns Training Corpus - Classifier

#### Import libraries

In [1]:
import spacy
import wasabi

import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report

from tqdm import tqdm
tqdm.pandas() 

#### Check GPU

In [98]:
import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


## Load Data and preprocess

In [99]:
df = pd.read_csv(r'/Users/roicort/GitHub/REST-MEX25/dataset/train.csv')

df['Title'] = df['Title'].astype(str)
df['Review'] = df['Review'].astype(str)
df['Town'] = df['Town'].astype(str)
df['Region'] = df['Region'].astype(str)
df['Type'] = df['Type'].astype(str)
df['Polarity'] = df['Polarity'].astype(int)

## Tokenization

In [100]:
# Download the spanish model
#!python -m spacy download es_dep_news_trf 
#!python -m spacy download es_core_news_sm

In [101]:
spacy.prefer_gpu()
nlp = spacy.load("es_core_news_sm")

# Tokenize the text data

def tokenize_text(text, nlp):
    """
    Tokenize the input text, removing punctuation, stopwords, and converting to lowercase.
    
    Args:
        text (str): The input text to tokenize.
        nlp (spacy.Language): The spaCy language model.
    
    Returns:
        list: A list of processed tokens.
    """
    # Procesar el texto con el modelo de spaCy
    doc = nlp(text)
    
    # Extraer tokens, eliminar puntuación, stopwords y convertir a minúsculas
    tokens = [token.text.lower() for token in doc if not token.is_stop]
    
    return tokens

In [102]:
# Tokenize the Title data with progress bar
df['Title_tokens'] = df['Title'].progress_apply(lambda x: tokenize_text(x, nlp))

# Tokenize the Review data with progress bar
df['Review_tokens'] = df['Review'].progress_apply(lambda x: tokenize_text(x, nlp))

100%|██████████| 208051/208051 [04:08<00:00, 837.46it/s]
100%|██████████| 208051/208051 [14:14<00:00, 243.42it/s]


In [14]:
# Split the data into training and testing sets

train, test = train_test_split(df, test_size=0.25, random_state=42)
val, test = train_test_split(test, test_size=0.4, random_state=42)

print(f"Train shape: {train.shape}")
print(f"Validation shape: {val.shape}")
print(f"Test shape: {test.shape}")

Train shape: (156038, 8)
Validation shape: (31207, 8)
Test shape: (20806, 8)


#### Basline

In [74]:
def baseline(origin, target, train, val, test, vectorizer, model, scaler=False):

    tokens_str = [' '.join(tokens) for tokens in train[origin]]
    vectorizer.fit(tokens_str)

    tokens_train = vectorizer.transform(tokens_str)
    tokens_val = vectorizer.transform([' '.join(tokens) for tokens in val[origin]])
    tokens_test = vectorizer.transform([' '.join(tokens) for tokens in test[origin]])

    if scaler:
        tokens_train = scaler.fit_transform(tokens_train)
        tokens_val = scaler.transform(tokens_val)
        tokens_test = scaler.transform(tokens_test)

    model.fit(tokens_train, train[target])

    train_preds = model.predict(tokens_train)
    val_preds = model.predict(tokens_val)
    test_preds = model.predict(tokens_test)

    train_acc = accuracy_score(train[target], train_preds)
    val_acc = accuracy_score(val[target], val_preds)
    test_acc = accuracy_score(test[target], test_preds)

    result = {
        'Origin': origin,
        'Target': target,
        'Model': model.__class__.__name__,
        'Scaler': scaler.__class__.__name__ if scaler else None,
        'Vectorizer': vectorizer.__class__.__name__,
        'Train Accuracy': train_acc,
        'Validation Accuracy': val_acc,
        'Test Accuracy': test_acc
    }

    wasabi.msg.info(f"Done training {model.__class__.__name__} model with {vectorizer.__class__.__name__} vectorizer and {scaler.__class__.__name__ if scaler else 'no'} scaler.")

    return result

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

grid = []

In [76]:
origin = 'Title_tokens'

target = 'Type'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

target = 'Town'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

target = 'Polarity'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m


In [77]:
origin = 'Review_tokens'

target = 'Type'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

target = 'Town'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

target = 'Polarity'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)
experiment = baseline(origin, target, train, val, test, vectorizer=CountVectorizer(), model=MultinomialNB())
grid.append(experiment)

[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and no scaler.[0m
[38;5;4mℹ Done training MultinomialNB model with CountVectorizer vectorizer and
no scaler.[0m


In [78]:
gridf = pd.DataFrame(grid)

In [None]:
# Select the best model for each target

gridf.sort_values('Test Accuracy', ascending=True).groupby('Target').first().reset_index()

In [81]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler

origin = 'Review_tokens'

target = 'Type'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    scaler=MaxAbsScaler(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)

target = 'Town'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    scaler=MaxAbsScaler(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)

target = 'Polarity'
experiment = baseline(
    origin, target, train, val, test,
    vectorizer=CountVectorizer(),
    scaler=MaxAbsScaler(),
    model=LogisticRegression(max_iter=1000,  solver='lbfgs')
)
grid.append(experiment)

[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and MaxAbsScaler scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and MaxAbsScaler scaler.[0m
[38;5;4mℹ Done training LogisticRegression model with CountVectorizer
vectorizer and MaxAbsScaler scaler.[0m


In [82]:
gridf = pd.DataFrame(grid)

In [90]:
gridf

Unnamed: 0,Origin,Target,Model,Scaler,Vectorizer,Train Accuracy,Validation Accuracy,Test Accuracy
0,Title_tokens,Type,LogisticRegression,,CountVectorizer,0.793698,0.758996,0.758916
1,Title_tokens,Type,MultinomialNB,,CountVectorizer,0.779586,0.756529,0.754398
2,Title_tokens,Town,LogisticRegression,,CountVectorizer,0.418469,0.359118,0.358022
3,Title_tokens,Town,MultinomialNB,,CountVectorizer,0.388168,0.345115,0.344276
4,Title_tokens,Polarity,LogisticRegression,,CountVectorizer,0.725272,0.679751,0.690089
5,Title_tokens,Polarity,MultinomialNB,,CountVectorizer,0.716652,0.674496,0.684033
6,Review_tokens,Type,LogisticRegression,,CountVectorizer,0.983901,0.948601,0.946217
7,Review_tokens,Type,MultinomialNB,,CountVectorizer,0.942187,0.936937,0.936701
8,Review_tokens,Town,LogisticRegression,,CountVectorizer,0.904824,0.669914,0.667211
9,Review_tokens,Town,MultinomialNB,,CountVectorizer,0.601373,0.534848,0.535759


In [94]:
gridf.sort_values('Test Accuracy', ascending=False).groupby('Target').first()

Unnamed: 0_level_0,Origin,Model,Scaler,Vectorizer,Train Accuracy,Validation Accuracy,Test Accuracy
Target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Polarity,Review_tokens,LogisticRegression,MaxAbsScaler,CountVectorizer,0.852517,0.685295,0.690234
Town,Review_tokens,LogisticRegression,MaxAbsScaler,CountVectorizer,0.904824,0.669914,0.667211
Type,Review_tokens,LogisticRegression,MaxAbsScaler,CountVectorizer,0.97755,0.948313,0.946698
