# Extracción de características TF-IDF

Primero importamos todas las librerías necesarias

In [1]:
import pandas as pd
import numpy as np
import re
import string
import spacy
import gensim

pd.options.display.max_colwidth = None


Creamos un pequeño cuerpo de textos de ejemplo *(CORPUS)*

In [2]:
# Toy corpus: six short Spanish sentences, each one treated as a document.
corpus = ['El cielo es azul y bonito',
          'Me encanta el cielo azul, pero no el cielo plomizo',
          'Bonito cielo hacía ese día',
          'Hoy he desayunado huevos con jamón y tostadas',
          'Juan odia las tostadas y los huevos con jamón',
          'las tostadas de jamón están muy buenas']

## Limpieza del texto
Definimos una función simple de limpieza y normalización del texto y la aplicamos a nuestro corpus.

In [3]:
# Load the spaCy pipeline "es_core_news_sm"; the normalization functions call it to tokenize text.
nlp = spacy.load("es_core_news_sm")
def normalizar_doc(doc):
    """Normalize a text into a space-separated string of lowercase tokens.

    The text is tokenized with the module-level ``nlp`` pipeline. A token is
    kept only when its text is longer than 3 characters and it is neither
    whitespace nor punctuation. No stopword list is consulted: short words
    are dropped by the length filter alone.

    Args:
        doc: Raw text to normalize.

    Returns:
        The lowercased surviving tokens joined by single spaces.
    """
    kept = []
    for tok in nlp(doc):
        # Skip short tokens, whitespace and punctuation.
        if len(tok.text) <= 3 or tok.is_space or tok.is_punct:
            continue
        kept.append(tok.lower_)
    return ' '.join(kept)

In [4]:
# Normalize every document of the corpus and display the resulting strings.
norm_corpus = [normalizar_doc(texto) for texto in corpus]
norm_corpus

['cielo azul bonito',
 'encanta cielo azul pero cielo plomizo',
 'bonito cielo hacía',
 'desayunado huevos jamón tostadas',
 'juan odia tostadas huevos jamón',
 'tostadas jamón están buenas']

# Librería `scikit-learn`

## Modelo TF-IDF
Este modelo pondera la frecuencia de aparición de cada término en un documento (TF) por la inversa del número de documentos en los que aparece (IDF).

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the normalized corpus without row normalization
# (norm=None), then show the shape of the resulting document-term matrix.
tv = TfidfVectorizer(norm=None, use_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
tv_matrix.shape

(6, 15)

In [6]:
# The result is also a sparse matrix
tv_matrix

<6x15 sparse matrix of type '<class 'numpy.float64'>'
	with 24 stored elements in Compressed Sparse Row format>

In [7]:
# The fitted vectorizer exposes the same attributes as CountVectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it and returns a NumPy array of terms.
tv.get_feature_names_out()

['azul',
 'bonito',
 'buenas',
 'cielo',
 'desayunado',
 'encanta',
 'están',
 'hacía',
 'huevos',
 'jamón',
 'juan',
 'odia',
 'pero',
 'plomizo',
 'tostadas']

In [8]:
# Mapping from each term to its column index in the matrix.
tv.vocabulary_

{'cielo': 3,
 'azul': 0,
 'bonito': 1,
 'encanta': 5,
 'pero': 12,
 'plomizo': 13,
 'hacía': 7,
 'desayunado': 4,
 'huevos': 8,
 'jamón': 9,
 'tostadas': 14,
 'juan': 10,
 'odia': 11,
 'están': 6,
 'buenas': 2}

In [9]:
# Build the dense matrix from the fitted vectorizer instead of calling
# .toarray() on tv_matrix itself: that reassignment made the cell fail when
# run a second time, because a NumPy array has no .toarray() method.
tv_matrix = tv.transform(norm_corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab = tv.get_feature_names_out()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,1.85,1.85,0.0,1.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.85,0.0,0.0,3.12,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,2.25,2.25,0.0
2,0.0,1.85,0.0,1.56,0.0,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.25,0.0,0.0,0.0,1.85,1.56,0.0,0.0,0.0,0.0,1.56
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.85,1.56,2.25,2.25,0.0,0.0,1.56
5,0.0,0.0,2.25,0.0,0.0,0.0,2.25,0.0,0.0,1.56,0.0,0.0,0.0,0.0,1.56


In [10]:
# Weight of each term (its idf(t) value)
tv.idf_

array([1.84729786, 1.84729786, 2.25276297, 1.55961579, 2.25276297,
       2.25276297, 2.25276297, 2.25276297, 1.84729786, 1.55961579,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 1.55961579])

In [11]:
# Document frequency of each term: number of rows (documents) with a positive weight.
df = (tv_matrix > 0).sum(axis=0)
df

array([2, 2, 1, 3, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 3])

In [12]:
# Document frequency and IDF weight of each term.
# The loop variables have their own names so they no longer shadow the
# module-level `df` array that this same expression iterates over.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
[f"{term} ({doc_freq}): {idf:.2f}"
 for term, idf, doc_freq in zip(tv.get_feature_names_out(), tv.idf_, df)]

['azul (2): 1.85',
 'bonito (2): 1.85',
 'buenas (1): 2.25',
 'cielo (3): 1.56',
 'desayunado (1): 2.25',
 'encanta (1): 2.25',
 'están (1): 2.25',
 'hacía (1): 2.25',
 'huevos (2): 1.85',
 'jamón (3): 1.56',
 'juan (1): 2.25',
 'odia (1): 2.25',
 'pero (1): 2.25',
 'plomizo (1): 2.25',
 'tostadas (3): 1.56']

In [13]:
# The TF-IDF matrix is the BoW (raw count) matrix multiplied by the IDF weights.
from sklearn.feature_extraction.text import CountVectorizer

# Raw term counts of the same normalized corpus, as a dense array.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(norm_corpus).toarray()
cv_matrix

array([[1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0],
       [0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1],
       [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]])

In [14]:
# Multiply each term's count column by that term's IDF weight; the table
# matches the unnormalized TF-IDF matrix shown earlier.
pd.DataFrame(np.round(cv_matrix*tv.idf_, 2), columns=vocab)

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,1.85,1.85,0.0,1.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.85,0.0,0.0,3.12,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,2.25,2.25,0.0
2,0.0,1.85,0.0,1.56,0.0,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.25,0.0,0.0,0.0,1.85,1.56,0.0,0.0,0.0,0.0,1.56
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.85,1.56,2.25,2.25,0.0,0.0,1.56
5,0.0,0.0,2.25,0.0,0.0,0.0,2.25,0.0,0.0,1.56,0.0,0.0,0.0,0.0,1.56


Cálculo de los pesos IDF

In [15]:
# Smoothed formula: idf(t) = log[(1 + n) / (1 + df(t))] + 1, where n is the number of documents.
# The result matches the values stored in tv.idf_.
n = tv_matrix.shape[0]
np.log((n+1)/(1+df))+1

array([1.84729786, 1.84729786, 2.25276297, 1.55961579, 2.25276297,
       2.25276297, 2.25276297, 2.25276297, 1.84729786, 1.55961579,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 1.55961579])

In [16]:
# Standard formula, without the +1 smoothing terms inside the logarithm:
# idf(t) = log[n / df(t)] + 1
np.log(n/(df))+1

array([2.09861229, 2.09861229, 2.79175947, 1.69314718, 2.79175947,
       2.79175947, 2.79175947, 2.79175947, 2.09861229, 1.69314718,
       2.79175947, 2.79175947, 2.79175947, 2.79175947, 1.69314718])

Si normalizamos, los valores tf-idf de cada documento se dividen por su norma: 'l2' (raíz cuadrada de la suma de cuadrados) o 'l1' (suma de valores absolutos).

In [17]:
# Same model, but with every document vector scaled to unit L2 norm.
tv_l2 = TfidfVectorizer(norm='l2', use_idf=True)
tv_matrix_l2 = tv_l2.fit_transform(norm_corpus).toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
pd.DataFrame(np.round(tv_matrix_l2, 2), columns=tv_l2.get_feature_names_out())

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,0.61,0.61,0.0,0.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.35,0.0,0.0,0.59,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0,0.42,0.42,0.0
2,0.0,0.56,0.0,0.47,0.0,0.0,0.0,0.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.62,0.0,0.0,0.0,0.51,0.43,0.0,0.0,0.0,0.0,0.43
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.43,0.36,0.52,0.52,0.0,0.0,0.36
5,0.0,0.0,0.58,0.0,0.0,0.0,0.58,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.4


In [18]:
# Unnormalized matrix again, for comparison with the L2-normalized one.
# Columns come from `tv`, the vectorizer that produced tv_matrix (the original
# borrowed them from tv_l2), via get_feature_names_out(), since
# get_feature_names() was removed in scikit-learn 1.2.
pd.DataFrame(np.round(tv_matrix, 2), columns=tv.get_feature_names_out())

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,1.85,1.85,0.0,1.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.85,0.0,0.0,3.12,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,2.25,2.25,0.0
2,0.0,1.85,0.0,1.56,0.0,0.0,0.0,2.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.25,0.0,0.0,0.0,1.85,1.56,0.0,0.0,0.0,0.0,1.56
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.85,1.56,2.25,2.25,0.0,0.0,1.56
5,0.0,0.0,2.25,0.0,0.0,0.0,2.25,0.0,0.0,1.56,0.0,0.0,0.0,0.0,1.56


In [19]:
np.sqrt(np.sum(tv_matrix_l2**2, axis=1)) # every row is normalized to one ('L2' norm)

array([1., 1., 1., 1., 1., 1.])

In [20]:
np.sqrt(np.sum(tv_matrix**2, axis=1)) # L2 norm of each document without normalization

array([3.04260089, 5.32606215, 3.3045199 , 3.65407351, 4.29269079,
       3.87487868])

## Cálculo de la matriz en nuevos documentos
Al calcular la matriz TF-IDF para el nuevo corpus, el peso de cada término (IDF) no se modifica: se reutilizan los pesos aprendidos con el corpus original.

In [21]:
# Unseen documents: normalize them the same way, then transform them with the
# vectorizer that was already fitted (no re-fitting).
nuevo_corpus = ['El Cielo amenaza lluvia', 'Pedro desayuna tostadas de jamón con tomate']
norm_nuevo_corpus = [normalizar_doc(texto) for texto in nuevo_corpus]
new_matrix = tv.transform(norm_nuevo_corpus).toarray()
pd.DataFrame(np.round(new_matrix, 2), columns=vocab)

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,0.0,0.0,0.0,1.56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.56,0.0,0.0,0.0,0.0,1.56


In [22]:
# Same new documents through the L2-normalized vectorizer.
# Columns come from tv_l2 itself rather than from `vocab` (which belongs to
# `tv`), so the table stays correct even if the two vectorizers ever differ.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
new_matrix = tv_l2.transform(norm_nuevo_corpus).toarray()
pd.DataFrame(np.round(new_matrix, 2), columns=tv_l2.get_feature_names_out())

Unnamed: 0,azul,bonito,buenas,cielo,desayunado,encanta,están,hacía,huevos,jamón,juan,odia,pero,plomizo,tostadas
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.71,0.0,0.0,0.0,0.0,0.71


### Modelo n-gramas
Con el vectorizador `TfidfVectorizer` también podemos especificar el rango de n-gramas (`ngram_range`) y la frecuencia mínima de documentos (`min_df`).

In [23]:
# Unigrams and bigrams, keeping only n-grams with document frequency >= 2 (min_df=2).
bv = TfidfVectorizer(ngram_range=(1,2), min_df=2, norm=None)
bv_matrix = bv.fit_transform(norm_corpus)

bv_matrix = bv_matrix.toarray()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
vocab_bigram = bv.get_feature_names_out()
pd.DataFrame(np.round(bv_matrix, 2), columns=vocab_bigram)

Unnamed: 0,azul,bonito,cielo,cielo azul,huevos,huevos jamón,jamón,tostadas
0,1.85,1.85,1.56,1.85,0.0,0.0,0.0,0.0
1,1.85,0.0,3.12,1.85,0.0,0.0,0.0,0.0
2,0.0,1.85,1.56,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.85,1.85,1.56,1.56
4,0.0,0.0,0.0,0.0,1.85,1.85,1.56,1.56
5,0.0,0.0,0.0,0.0,0.0,0.0,1.56,1.56


# Librería `Gensim`
Para trabajar con la librería `Gensim` es necesario transformar los documentos en una lista de tokens.

In [24]:
def normalizar_doc_tokenize(doc):
    """Normalize a text into a list of lowercase tokens.

    The text is tokenized with the module-level ``nlp`` pipeline. A token is
    kept only when its text is longer than 3 characters and it is neither
    whitespace nor punctuation. No stopword list is consulted.

    Args:
        doc: Raw text to normalize.

    Returns:
        A list with the lowercased surviving tokens, in document order.
    """
    kept = []
    for tok in nlp(doc):
        # Skip short tokens, whitespace and punctuation.
        if len(tok.text) <= 3 or tok.is_space or tok.is_punct:
            continue
        kept.append(tok.lower_)
    return kept

Convertimos nuestros textos de ejemplo en listas de tokens y visualizamos el resultado:

In [25]:
# Tokenize every document of the corpus and display the token lists.
tokenized_corpus = [normalizar_doc_tokenize(texto) for texto in corpus]
tokenized_corpus

[['cielo', 'azul', 'bonito'],
 ['encanta', 'cielo', 'azul', 'pero', 'cielo', 'plomizo'],
 ['bonito', 'cielo', 'hacía'],
 ['desayunado', 'huevos', 'jamón', 'tostadas'],
 ['juan', 'odia', 'tostadas', 'huevos', 'jamón'],
 ['tostadas', 'jamón', 'están', 'buenas']]

In [26]:
# Show the string-normalized corpus again, for comparison with the token lists.
norm_corpus

['cielo azul bonito',
 'encanta cielo azul pero cielo plomizo',
 'bonito cielo hacía',
 'desayunado huevos jamón tostadas',
 'juan odia tostadas huevos jamón',
 'tostadas jamón están buenas']

## Modelo BoW
Antes de calcular el TF-IDF hay que construir el modelo BoW:

In [27]:
from gensim.corpora import Dictionary

# Build the gensim Dictionary from the tokenized corpus; tokens are later referenced by integer id.
diccionario = Dictionary(tokenized_corpus)

Primero aprendemos las palabras y luego generamos la matriz sobre el `corpus` que queramos:

In [28]:
# Convert every tokenized document into its bag-of-words representation.
mapped_corpus = list(map(diccionario.doc2bow, tokenized_corpus))

In [29]:
# Each document is a list of (token id, count) pairs.
mapped_corpus

[[(0, 1), (1, 1), (2, 1)],
 [(0, 1), (2, 2), (3, 1), (4, 1), (5, 1)],
 [(1, 1), (2, 1), (6, 1)],
 [(7, 1), (8, 1), (9, 1), (10, 1)],
 [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1)],
 [(9, 1), (10, 1), (13, 1), (14, 1)]]

In [30]:
# Print token and count for the second document (index 1).
for token_id, count in mapped_corpus[1]:
    token = diccionario[token_id]
    print(f"{token}: {count}")

azul: 1
cielo: 2
encanta: 1
pero: 1
plomizo: 1


In [31]:
# Document frequency of each token
for token_id, n_docs in diccionario.dfs.items():
    print(f"{diccionario[token_id]}: {n_docs}")

cielo: 3
azul: 2
bonito: 2
encanta: 1
pero: 1
plomizo: 1
hacía: 1
desayunado: 1
huevos: 2
jamón: 3
tostadas: 3
juan: 1
odia: 1
están: 1
buenas: 1


In [32]:
# Collection frequency (total number of occurrences) of each token
for token_id, n_occurrences in diccionario.cfs.items():
    print(f"{diccionario[token_id]}: {n_occurrences}")

cielo: 4
azul: 2
bonito: 2
encanta: 1
pero: 1
plomizo: 1
hacía: 1
desayunado: 1
huevos: 2
jamón: 3
tostadas: 3
juan: 1
odia: 1
están: 1
buenas: 1


## Modelo TF-IDF
Hay que hacer una transformación sobre la matriz BoW

In [33]:
from gensim.models import TfidfModel

# Fit the TF-IDF model on the BoW corpus, then apply it to that same corpus.
tfidf = TfidfModel(mapped_corpus)
corpus_tfidf = tfidf[mapped_corpus]

De nuevo, la librería `gensim` genera por cada documento una lista de tuplas (ID,frecuencia) donde ahora la frecuencia está normalizada por la inversa de la frecuencia de documentos que contienen el término:

In [34]:
# Displaying the transformed corpus only shows a wrapper object, not the weights.
corpus_tfidf

<gensim.interfaces.TransformedCorpus at 0x7fd818ccd910>

El modelo devuelve un *iterable*

In [35]:
# Indexing yields the (token id, weight) pairs of one document, here the second.
corpus_tfidf[1]

[(0, 0.30755279409405734),
 (2, 0.38808841717595355),
 (3, 0.5015970026820341),
 (4, 0.5015970026820341),
 (5, 0.5015970026820341)]

In [36]:
# TF-IDF weight of each token in the second document (index 1).
for token_id, weight in corpus_tfidf[1]:
    token = diccionario[token_id]
    print(f"{token}: {weight:.2f}")

azul: 0.31
cielo: 0.39
encanta: 0.50
pero: 0.50
plomizo: 0.50


In [37]:
# TF-IDF weight of each token in the first document (index 0).
for token_id, weight in corpus_tfidf[0]:
    token = diccionario[token_id]
    print(f"{token}: {weight:.2f}")

azul: 0.65
bonito: 0.65
cielo: 0.41


## Aplicación de los modelos a nuevos textos
Para aplicar un modelo BoW o TF-IDF a un nuevo documento hay que utilizar los modelos ya entrenados en `gensim` sobre el corpus original. Hay que calcular el BoW del nuevo corpus y sobre éste su TF-IDF

In [38]:
# Tokenize the new documents and map them to BoW with the existing dictionary.
tokenized_nuevo_corpus = list(map(normalizar_doc_tokenize, nuevo_corpus))

mapped_nuevo_corpus = list(map(diccionario.doc2bow, tokenized_nuevo_corpus))

mapped_nuevo_corpus

[[(2, 1)], [(9, 1), (10, 1)]]

### Modelo TF-IDF

In [39]:
# Apply the already fitted TF-IDF model to the BoW of the new documents.
nuevo_corpus_tfidf = tfidf[mapped_nuevo_corpus]
nuevo_corpus_tfidf

<gensim.interfaces.TransformedCorpus at 0x7fd818d440a0>

In [40]:
# Materialize the transformed corpus to see the weights of every new document.
list(nuevo_corpus_tfidf)

[[(2, 1.0)], [(9, 0.7071067811865475), (10, 0.7071067811865475)]]

In [41]:
# Whole pipeline in a single step: tokenize -> BoW -> TF-IDF
list(tfidf[(diccionario.doc2bow(normalizar_doc_tokenize(texto)) for texto in nuevo_corpus)])

[[(2, 1.0)], [(9, 0.7071067811865475), (10, 0.7071067811865475)]]