In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sentence_transformers import SentenceTransformer
from scipy.sparse import hstack, csr_matrix
import numpy as np

import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import normalize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
file_path = './data/cuantifications/sample2.csv'
data = pd.read_csv(file_path, delimiter=',')

In [3]:
data.head()

Unnamed: 0,IDDECINVTIPOPRODUCTOESPERADO,ANIO,VOLUMEN,PAGINAS,DOI,CUARTIL,CORPUS
0,25,2025,23,11,revistas.espol.edu.ec/index.php/matematica,0,generalization zariouh’s property gaz local sp...
1,24,2025,25,16,10.1186/s12870-025-06196-4,1,exploring benefits amf colonization improving ...
2,24,2025,15,15,I 10.3389/fpls.2024.1500894,1,inoculation micromonospora sp enhances carbohy...
3,24,2024,24,18,10.1186/s12870-024-05423-8,1,mitigating cold stress rice study genotype per...
4,24,2025,2025,8,10.1155/ijmm/5191108,3,characterization affinir primal topological sp...


#### Embeddings

##### TF-IDF

In [4]:
# --- Método A: TF-IDF ---
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),
                                   min_df=2,
                                   max_features=40000,
                                   sublinear_tf=True)
X_text_tfidf = tfidf_vectorizer.fit_transform(data['CORPUS'])

print("TF-IDF shape:", X_text_tfidf.shape)

TF-IDF shape: (333, 6015)


##### paraphrase-multilingual-MiniLM-L12-v2

In [5]:
# Asegurarnos que los datos son válidos
def check_and_clean_corpus(df, column_name):
    # Verificar si la columna existe
    if column_name not in df.columns:
        raise ValueError(f"La columna {column_name} no existe en el DataFrame")

    # Convertir a string y limpiar nulos
    df[column_name] = df[column_name].fillna('')
    df[column_name] = df[column_name].astype(str)

    # Verificar que no hay valores vacíos
    print(f"\nNúmero de valores vacíos: {df[column_name].eq('').sum()}")
    print(f"Número total de registros: {len(df)}")

    return df[column_name].values

In [6]:
# --- Método B: Sentence Transformers ---

# Aplicar la limpieza
try:
    corpus_texts = check_and_clean_corpus(data, 'CORPUS')

    # Cargar modelo y generar embeddings en batches
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

    # Procesar en batches para evitar problemas de memoria
    batch_size = 32
    X_text_embeddings = []

    for i in range(0, len(corpus_texts), batch_size):
        batch = corpus_texts[i:i + batch_size]
        embeddings = model.encode(batch, show_progress_bar=True) # Recibe una lista, en este caso value
        X_text_embeddings.append(embeddings)

    # Concatenar todos los batches
    X_text_embeddings = np.vstack(X_text_embeddings)

    print("\nShape de los embeddings:", X_text_embeddings.shape)

except Exception as e:
    print(f"\nError encontrado: {str(e)}")
    print("\nPrimeros 5 registros del DataFrame:")
    print(data.head())


Número de valores vacíos: 0
Número total de registros: 333


Batches: 100%|██████████| 1/1 [00:01<00:00,  1.97s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.24s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
Batches: 100%|██████████| 1/1 [00:00<00:00,  1.87it/s]


Shape de los embeddings: (333, 384)





##### LDA

In [7]:
documents = data['CORPUS']

In [8]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saraujo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
# 2. Preprocesamiento del texto
def preprocess_text(text):
    stop_words = set(stopwords.words('spanish'))  # Eliminar palabras vacías en español
    lemmatizer = WordNetLemmatizer()
    
    # Tokenización
    tokens = word_tokenize(text.lower())
    # Filtrar palabras no deseadas
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    
    return " ".join(filtered_tokens)

preprocessed_documents = [preprocess_text(doc) for doc in documents]

In [10]:
# 3. Vectorización del texto
vectorizer = CountVectorizer()
document_term_matrix = vectorizer.fit_transform(preprocessed_documents)

In [11]:
# 4. Aplicar LDA con alpha y beta
# alpha (doc_topic_prior) controla la dispersión de los tópicos/temas en los documentos
# beta (topic_word_prior) controla la dispersión de las palabras en los tópicos/temas
lda = LatentDirichletAllocation(
    n_components=7,          # Número de tópicos/temas
    doc_topic_prior=0.5,     # Alpha: Mayor valor = documentos con múltiples tópicos/temas
    topic_word_prior=0.05,   # Beta: Mayor valor = temas con palabras más diversas
    random_state=42
)
lda.fit(document_term_matrix)

In [12]:
# 5. Mostrar los tópicos generados
def display_topics(model, feature_names, num_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Tópico {topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]))
        print("\n")

num_top_words = 5
feature_names = vectorizer.get_feature_names_out()
display_topics(lda, feature_names, num_top_words)

Tópico 1:
energy system model study water


Tópico 2:
control desarrollo property analysis study


Tópico 3:
student study method result learning


Tópico 4:
water study system ecuador community


Tópico 5:
study model result system analysis


Tópico 6:
study satisfaction service customer analysis


Tópico 7:
learning study model student research




In [13]:
# 6. Probabilidad de los tópicos en cada documento
topic_distribution = lda.transform(document_term_matrix)
topic_distribution = normalize(topic_distribution, norm='l1', axis=1)

for idx, probs in enumerate(topic_distribution):
    print(f"Documento {idx + 1}:")
    for topic_idx, prob in enumerate(probs):
        print(f"  - Tópico {topic_idx + 1}: {prob:.2f}")

Documento 1:
  - Tópico 1: 0.01
  - Tópico 2: 0.88
  - Tópico 3: 0.01
  - Tópico 4: 0.01
  - Tópico 5: 0.01
  - Tópico 6: 0.01
  - Tópico 7: 0.08
Documento 2:
  - Tópico 1: 0.00
  - Tópico 2: 0.00
  - Tópico 3: 0.00
  - Tópico 4: 0.00
  - Tópico 5: 0.00
  - Tópico 6: 0.98
  - Tópico 7: 0.00
Documento 3:
  - Tópico 1: 0.00
  - Tópico 2: 0.00
  - Tópico 3: 0.00
  - Tópico 4: 0.00
  - Tópico 5: 0.00
  - Tópico 6: 0.00
  - Tópico 7: 0.98
Documento 4:
  - Tópico 1: 0.03
  - Tópico 2: 0.00
  - Tópico 3: 0.00
  - Tópico 4: 0.96
  - Tópico 5: 0.00
  - Tópico 6: 0.00
  - Tópico 7: 0.00
Documento 5:
  - Tópico 1: 0.01
  - Tópico 2: 0.02
  - Tópico 3: 0.01
  - Tópico 4: 0.01
  - Tópico 5: 0.01
  - Tópico 6: 0.92
  - Tópico 7: 0.01
Documento 6:
  - Tópico 1: 0.01
  - Tópico 2: 0.01
  - Tópico 3: 0.01
  - Tópico 4: 0.01
  - Tópico 5: 0.01
  - Tópico 6: 0.01
  - Tópico 7: 0.97
Documento 7:
  - Tópico 1: 0.15
  - Tópico 2: 0.60
  - Tópico 3: 0.01
  - Tópico 4: 0.01
  - Tópico 5: 0.01
  - Tópico 6: 0.

In [14]:
topic_distribution

array([[0.01098822, 0.87813245, 0.00866587, ..., 0.00838266, 0.01026701,
        0.07573288],
       [0.00294631, 0.00327067, 0.00289669, ..., 0.00324594, 0.98091112,
        0.00354488],
       [0.00265614, 0.00327114, 0.00273237, ..., 0.00310263, 0.00378854,
        0.98062908],
       ...,
       [0.00457249, 0.00535804, 0.00538235, ..., 0.00460249, 0.97073853,
        0.00462112],
       [0.00603889, 0.00741255, 0.0074103 , ..., 0.00606527, 0.06054066,
        0.00615574],
       [0.00380951, 0.00471403, 0.97582261, ..., 0.00395489, 0.00399743,
        0.00383802]])

#### Columns

In [15]:
numeric_cols = ["ANIO", "VOLUMEN", "CUARTIL", "PAGINAS"]
text_numeric_cols = ["CORPUS", "ANIO", "VOLUMEN", "CUARTIL", "PAGINAS"]

#### Data clean

In [16]:
data_clean = data[numeric_cols]


In [17]:
data_clean

Unnamed: 0,ANIO,VOLUMEN,CUARTIL,PAGINAS
0,2025,23,0,11
1,2025,25,1,16
2,2025,15,1,15
3,2024,24,1,18
4,2025,2025,3,8
...,...,...,...,...
328,2024,7,1,9
329,2024,1,0,6
330,2022,24,2,24
331,2023,14,2,25


In [18]:
data.shape, data.keys()

((333, 7),
 Index(['IDDECINVTIPOPRODUCTOESPERADO', 'ANIO', 'VOLUMEN', 'PAGINAS', 'DOI',
        'CUARTIL', 'CORPUS'],
       dtype='object'))

##### Estandarizacion y unión

In [19]:
# --- Z-score (StandardScaler) ---
scaler_z = StandardScaler()
df_zscore = data.copy() # toda la data
df_zscore[numeric_cols] = scaler_z.fit_transform(data_clean) # solo las columnas que se necesitam

print("Numeric scaled shape:", df_zscore.shape)

Numeric scaled shape: (333, 7)


In [20]:
# --- MinMaxScaler (0-1) ---
scaler_mm = MinMaxScaler()
df_minmax = data.copy() # toda la data
df_minmax[numeric_cols] = scaler_mm.fit_transform(data_clean) # solo las columnas que se necesitam
df_minmax.shape

(333, 7)

##### Columnas a unir con el embedding

In [21]:
# type_standar = "zscore"
# X_numeric_scaled = df_zscore[numeric_cols] # data[numeric_cols]
type_standar = "minmax"
X_numeric_scaled = df_minmax[numeric_cols]

##### Union con TF-IDF

In [22]:
# Convertir X_numeric_scaled a matriz dispersa con tipo float64
X_numeric_sparse = csr_matrix(X_numeric_scaled.astype(np.float64))

# Asegurarnos que X_text_tfidf es float64
X_text_tfidf = X_text_tfidf.astype(np.float64)

In [23]:
print("Shape de X_text_tfidf:", X_text_tfidf.shape)
print("Shape de X_text_embeddings:", X_text_embeddings.shape)
print("Shape de X_numeric_scaled:", X_numeric_scaled.shape)


Shape de X_text_tfidf: (333, 6015)
Shape de X_text_embeddings: (333, 384)
Shape de X_numeric_scaled: (333, 4)


In [24]:
# A -----------------------------------------------------------------------
# Ahora intentar el hstack
try:
    # Verificar shapes antes de combinar
    print("Shape de X_text_tfidf:", X_text_tfidf.shape)
    print("Shape de X_numeric_sparse:", X_numeric_sparse.shape)

    # Realizar la combinación
    X_final_tfidf = hstack([X_text_tfidf, X_numeric_sparse])
    print("Shape final:", X_final_tfidf.shape)

    # Verificar que no hay valores nulos o infinitos
    print("Tiene valores infinitos:", np.isinf(X_final_tfidf.data).any())
    print("Tiene valores NaN:", np.isnan(X_final_tfidf.data).any())

except Exception as e:
    print("Error:", str(e))

    # Información adicional para diagnóstico
    print("\nTipos de datos:")
    print("X_text_tfidf dtype:", X_text_tfidf.dtype)
    print("X_numeric_scaled dtype:", X_numeric_scaled.dtype)

Shape de X_text_tfidf: (333, 6015)
Shape de X_numeric_sparse: (333, 4)
Shape final: (333, 6019)
Tiene valores infinitos: False
Tiene valores NaN: False


##### Union con paraphrase-multilingual-MiniLM-L12-v2

In [25]:
try:
    # Convertir X_numeric_scaled a array denso si no lo es ya
    X_numeric_dense = X_numeric_scaled if isinstance(X_numeric_scaled, np.ndarray) else X_numeric_scaled.to_numpy()

    # Asegurar que ambos sean arrays numpy
    X_text_embeddings = np.array(X_text_embeddings, dtype=np.float64)
    X_numeric_dense = np.array(X_numeric_dense, dtype=np.float64)

    # Combinar horizontalmente usando np.hstack o np.concatenate
    X_final_embeddings = np.concatenate([X_text_embeddings, X_numeric_dense], axis=1)

    print("Shape final:", X_final_embeddings.shape)

except Exception as e:
    print("Error:", str(e))

    # Información de diagnóstico
    print("\nTipos de datos:")
    print("X_text_embeddings dtype:", X_text_embeddings.dtype)

    # Imprimir el tipo de dato de cada columna en X_numeric_scaled
    if isinstance(X_numeric_scaled, pd.DataFrame):
        for col in X_numeric_scaled.columns:
            print(f"X_numeric_scaled['{col}'] dtype:", X_numeric_scaled[col].dtype)
    else:
        print("X_numeric_scaled no es un DataFrame")
        print("Tipo de X_numeric_scaled:", type(X_numeric_scaled))

Shape final: (333, 388)


### Ver resultados de embeddings y estandarizacion 

##### Embeddings

In [26]:
print("Shape de X_final_tfidf:", X_final_tfidf.shape, type(X_final_tfidf))
print("Shape de X_final_embeddings:", X_final_embeddings.shape, type(X_final_embeddings))

Shape de X_final_tfidf: (333, 6019) <class 'scipy.sparse._csr.csr_matrix'>
Shape de X_final_embeddings: (333, 388) <class 'numpy.ndarray'>


In [27]:
# X_final_tfidf.toarray()[0]

In [28]:
# X_final_embeddings[0]

##### Columnas de estandarizacion

In [29]:
data_final = df_minmax[text_numeric_cols]
# data_final = df_zscore[text_numeric_cols]

# Datos originales numericos
# data_final = data[text_numeric_cols]

In [30]:
data_final.head()

Unnamed: 0,CORPUS,ANIO,VOLUMEN,CUARTIL,PAGINAS
0,generalization zariouh’s property gaz local sp...,1.0,9e-06,0.0,0.113402
1,exploring benefits amf colonization improving ...,1.0,1e-05,0.25,0.164948
2,inoculation micromonospora sp enhances carbohy...,1.0,6e-06,0.25,0.154639
3,mitigating cold stress rice study genotype per...,0.888889,1e-05,0.25,0.185567
4,characterization affinir primal topological sp...,1.0,0.00081,0.75,0.082474


### Procesamiento

* Codificación de etiquetas de las características categóricas = lo que se hizo en CUARTIL
* Escalado de las características mediante el escalador estándar = la estandarizacion con df_zscore y df_minmax
* Lematizacion y condificacion de Titulo, resumen y keys => X_final_tfidf y X_final_embeddings

In [31]:
#Obtener variables categóricas
s = (data.dtypes == 'object')
object_cols = list(s[s].index)

print("Variables categóricas del dataset:", object_cols)

Variables categóricas del dataset: ['DOI', 'CORPUS']


#### To Dataframe

In [32]:
print("Shape de X_final_tfidf:", X_final_tfidf.shape, type(X_final_tfidf))
print("Shape de X_final_embeddings:", X_final_embeddings.shape, type(X_final_embeddings))

Shape de X_final_tfidf: (333, 6019) <class 'scipy.sparse._csr.csr_matrix'>
Shape de X_final_embeddings: (333, 388) <class 'numpy.ndarray'>


In [33]:
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
X_final_tfidf.shape[1] == len(tfidf_feature_names) + 3

# Nombres de columnas
all_columns = list(tfidf_feature_names) + ['ANIO', 'VOLUMEN', 'CUARTIL', 'PAGINAS']

# Convertir a DataFrame (primero a denso si es pequeño)
X_final_tfidf_dense = X_final_tfidf.toarray()
df_tfidf = pd.DataFrame(X_final_tfidf_dense, columns=all_columns)

In [34]:
df_tfidf.head()

Unnamed: 0,004,005,01,010,012,042,05,057,08,091,...,él significant,él study,él tabla,él él,ón,ús,ANIO,VOLUMEN,CUARTIL,PAGINAS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,9e-06,0.0,0.113402
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1e-05,0.25,0.164948
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,6e-06,0.25,0.154639
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,1e-05,0.25,0.185567
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.00081,0.75,0.082474


In [35]:
# Nombres para las columnas de embedding
embedding_dim = X_final_embeddings.shape[1] - 4
embedding_columns = [f'embed_{i}' for i in range(embedding_dim)]
all_columns = embedding_columns + ['ANIO', 'VOLUMEN', 'CUARTIL', 'PAGINAS']

# Convertir a DataFrame
df_embeddings = pd.DataFrame(X_final_embeddings, columns=all_columns)

In [36]:
df_embeddings.head()

Unnamed: 0,embed_0,embed_1,embed_2,embed_3,embed_4,embed_5,embed_6,embed_7,embed_8,embed_9,...,embed_378,embed_379,embed_380,embed_381,embed_382,embed_383,ANIO,VOLUMEN,CUARTIL,PAGINAS
0,-0.22988,0.249529,0.081523,0.202636,-0.193608,-0.030638,0.188579,-0.098298,-0.218437,-0.08878,...,-0.042838,-0.232117,-0.262647,0.127149,-0.106181,-0.089396,1.0,9e-06,0.0,0.113402
1,-0.012822,0.14934,0.024315,-0.009801,0.088035,-0.033907,0.11128,-0.171367,-0.193791,0.28999,...,0.178573,-0.004228,-0.047453,-0.051901,0.115226,0.034473,1.0,1e-05,0.25,0.164948
2,-0.055708,0.190835,-0.065515,0.225034,0.369172,0.113739,0.287937,-0.156514,-0.296518,0.024632,...,0.300612,-0.066641,-0.076587,-0.150014,0.088423,-0.016509,1.0,6e-06,0.25,0.154639
3,0.137942,0.176798,-0.03231,0.311785,0.261316,0.186766,0.018803,-0.187183,-0.103112,0.044219,...,0.363133,0.012749,-0.090156,-0.0919,-0.104825,0.144255,0.888889,1e-05,0.25,0.185567
4,-0.118311,-0.016865,0.039658,0.219906,-0.250331,0.054814,-0.052862,0.115424,-0.129113,-0.247627,...,-0.027158,-0.341358,-0.035781,0.166203,-0.286782,0.02461,1.0,0.00081,0.75,0.082474


#### Save

In [37]:
# Guardar el DataFrame en un archivo CSV
df_tfidf.to_csv('data/embeddingstrain/df_tfidf_' + type_standar + '.csv', index=False)
df_embeddings.to_csv('data/embeddingstrain/df_embeddings_' + type_standar + '.csv', index=False)