### IMPORTACIONES

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

### cargar los datos

In [6]:
# Load the dataset from parquet. Later cells read its 'embedding',
# 'language' and 'sentiment' columns.
df = pd.read_parquet(path='../data_process/vectores/EmbeddingsFinal.parquet')

In [7]:
# Stack the per-row values of the 'embedding' column into a single array,
# one entry per row along the first axis.
X_embeddings = np.stack(df['embedding'].values, axis=0)

### ONE HOT ENCODING

In [None]:
# One-hot encode the 'language' column as a dense array (sparse_output=False).
# handle_unknown='ignore' makes transform() emit an all-zero row for a language
# that was not present during fit instead of raising an error. This encoder is
# persisted together with the model at the end of the notebook, so it has to
# cope with inputs it has never seen.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_language = encoder.fit_transform(df[['language']])

### combinando features

In [9]:
# Build the feature matrix: embedding columns followed by the one-hot
# language columns, joined along the column axis.
X = np.concatenate((X_embeddings, X_language), axis=1)

### target

In [10]:
# Encode the 'sentiment' labels as integer class ids. The fitted encoder is
# kept in `le` so its classes_ can be used to name the classes later on.
le = LabelEncoder().fit(df['sentiment'])
y = le.transform(df['sentiment'])

### dividiendo la data

In [12]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

### ENTRENAMIENTO

In [13]:
# XGBoost classifier using the histogram tree method, two parallel jobs and a
# fixed seed; fitted on the training split.
model = XGBClassifier(random_state=42, n_jobs=2, tree_method='hist')
model.fit(X=X_train, y=y_train)

### sacamos los datos que nos interesan

In [15]:
# Predict on the held-out split and print the per-class report, labelling
# each class with its original sentiment name from the label encoder.
y_pred = model.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=le.classes_,
    )
)


              precision    recall  f1-score   support

    contento       0.69      0.76      0.72      6794
     enojado       0.61      0.66      0.63      7173
insatisfecho       0.43      0.40      0.41      6566
     neutral       0.46      0.35      0.40      5219
    positivo       0.49      0.51      0.50      5612

    accuracy                           0.55     31364
   macro avg       0.53      0.54      0.53     31364
weighted avg       0.54      0.55      0.54     31364



## guardar modelo

In [16]:
import joblib


In [17]:
# Persist the fitted objects together in a single file:
#   model          - the trained XGBClassifier
#   label_encoder  - the LabelEncoder fitted on 'sentiment' (class id -> name)
#   onehot_encoder - the OneHotEncoder fitted on the 'language' column
joblib.dump(
    value=dict(model=model, label_encoder=le, onehot_encoder=encoder),
    filename='modelo_multilingue.pkl',
)

['modelo_multilingue.pkl']