## Coding Exercise #0510
Ejercicio basado en el material del Máster en Big Data Analytics de Lluís F. Hurtado (lhurtado at dsic.upv.es)

In [40]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances
# Fetch the NLTK stop-word lists (reported as already up-to-date when present locally).
# NOTE(review): `stopwords` is imported above but not used in the cells visible here — confirm it is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# Funciones auxiliares

from prettytable import PrettyTable

def pinta_matriz_dispersa(M, nombre_col=None, pre=2):
    """Render a sparse matrix as a PrettyTable, one table row per matrix row.

    Non-zero entries are shown rounded to ``pre`` decimals; every other
    position is shown as ``'-'``.

    Args:
        M: 2-D sparse matrix exposing ``shape``, ``getrow`` and ``nonzero``
            (e.g. the matrix returned by a vectorizer's ``fit_transform``).
        nombre_col: Optional column names, one per matrix column. Any iterable
            is accepted (list, tuple, NumPy array). When ``None`` the table is
            built without a header row.
        pre: Number of decimals used when rounding the displayed values.

    Returns:
        The populated ``PrettyTable``; ``print`` it to display the matrix.
    """
    filas, columnas = M.shape

    # Identity check instead of `!= None`: with a NumPy array of names the
    # inequality is evaluated element-wise and cannot be used as a flag.
    header = nombre_col is not None
    if header:
        # Normalise to a plain list so arrays/tuples behave like lists.
        nombre_col = list(nombre_col)

    pt = PrettyTable(nombre_col, header=header)
    for fila in range(filas):
        vf = M.getrow(fila)
        _, cind = vf.nonzero()
        # Set membership avoids rescanning the index array for every column.
        presentes = {int(c) for c in cind}
        pt.add_row([round(vf[0, c], pre) if c in presentes else '-'
                    for c in range(columnas)])
    return pt


### 1. TF IDF representation:

#### 1.1. Create a TF IDF matrix:

In [37]:
# Toy corpus: three short documents, one string per document.
my_docs = [
    "learning intelligence machine learning statistics",
    "machine classification learning performance",
    "machine classification machine learning machine performance",
]

In [38]:
# Minimal pre-processing: lowercase every document.
my_docs = [doc.lower() for doc in my_docs]

### TF usando CountVectorizer o TfidfVectorizer

In [None]:
# Compare several term-frequency (TF) weightings on the same corpus.
# Each entry pairs the heading to print with the vectorizer that produces it;
# a single loop replaces five copies of the fit / vocabulary / print steps.
tf_variants = [
    ('\nContadores binarios',
     CountVectorizer(binary=True)),
    ('\nContadores',
     CountVectorizer()),
    ('\nTF sin normalizar (equivalente a Contadores)',
     TfidfVectorizer(norm=None, use_idf=False)),
    ("\nTF normalizado 'l1' (Contadores normalizados para sumar 1): TF(t,d)= frec(t,d) / |d|",
     TfidfVectorizer(norm='l1', use_idf=False)),
    ("\nTF normalizado 'l2' (Contadores normalizados para que el módulo del vector sea 1)",
     TfidfVectorizer(norm='l2', use_idf=False)),
]

# `vec`, `X` and `voca` are deliberately reused as loop names so that, once the
# loop ends, they hold the last variant exactly as the unrolled code left them.
for titulo, vec in tf_variants:
    print(titulo)
    X = vec.fit_transform(my_docs)
    voca = list(vec.get_feature_names_out())
    print(pinta_matriz_dispersa(X, voca, 4))

## tf-idf (Term frequency – Inverse document frequency)

$$
\color{Blue}{
tf(t,d) = \text{número de veces que el token $t$ está en el documento $d$}}
$$


$$
\color{Red}{
idf(t,D) = \left\{\begin{matrix}
{\text{no suavizado} : ln \left ( \frac{|D|}{|\{d\in D: t \in d\}|} \right ) }
\\
{\text{suavizado}  : ln \left ( \frac{|D|+1}{ | \{d\in D: t \in d\} |+1} \right )}
\end{matrix}\right.}
$$

$$ tf\text{-}idf = \color{Blue}{tf} \cdot (\color{Red}{idf} + 1) $$



TfidfVectorizer() arguments: <br>
- *max_features* : maximum number of features (distinct words). <br>
- *min_df* : The minimum DF. Integer value means count and real number (0~1) means proportion. <br>
- *max_df* : The maximum DF. Integer value means count and real number (0~1) means proportion. Helps to filter out the stop words. <br>


In [51]:
# Compare TF-IDF weightings (idf smoothing on/off, different normalizations)
# on the same corpus. Each entry pairs the heading to print with the
# vectorizer that produces it; a single loop replaces five copies of the
# fit / vocabulary / print steps.
tfidf_variants = [
    ("\nTF-IDF sin suavizar",
     TfidfVectorizer(norm=None, smooth_idf=False)),
    ("\nTF-IDF con idf suavizado",
     TfidfVectorizer(norm=None, smooth_idf=True)),
    ("\nTF-IDF: con idf suavizado y normalización 'l1'",
     TfidfVectorizer(norm='l1', smooth_idf=True)),
    ("\nTF-IDF: con idf suavizado y normalización 'l2'(opción por defecto)",
     TfidfVectorizer()),
    ("\nTF-IDF: sin suavizado y normalización 'l1'",
     TfidfVectorizer(norm='l1', smooth_idf=False)),
]

# `vec`, `X` and `voca` are deliberately reused as loop names so that, once the
# loop ends, they hold the last variant exactly as the unrolled code left them.
for titulo, vec in tfidf_variants:
    print(titulo)
    X = vec.fit_transform(my_docs)
    voca = list(vec.get_feature_names_out())
    print(pinta_matriz_dispersa(X, voca, 4))


TF-IDF sin suavizar
+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|       -        |    2.0986    |   2.0    |   1.0   |      -      |   2.0986   |
|     1.4055     |      -       |   1.0    |   1.0   |    1.4055   |     -      |
|     1.4055     |      -       |   1.0    |   3.0   |    1.4055   |     -      |
+----------------+--------------+----------+---------+-------------+------------+

TF-IDF con idf suavizado
+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|       -        |    1.6931    |   2.0    |   1.0   |      -      |   1.6931   |
|     1.2877     |      -       |   1.0    |   1.0 

In [52]:
# Re-fit with smoothed idf and 'l1' normalization. Going by the execution
# counts, the `vec`, `X` and `voca` set here are the ones the following cells
# (feature names, cosine similarity, queries) operate on.
print("\nTF-IDF: con idf suavizado y normalización 'l1'")
vec = TfidfVectorizer(smooth_idf=True, norm='l1')
X = vec.fit_transform(my_docs)
voca = vec.get_feature_names_out().tolist()
print(pinta_matriz_dispersa(X, voca, pre=4))


TF-IDF: con idf suavizado y normalización 'l1'
+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|       -        |    0.2651    |  0.3132  |  0.1566 |      -      |   0.2651   |
|     0.2814     |      -       |  0.2186  |  0.2186 |    0.2814   |     -      |
|     0.1958     |      -       |  0.1521  |  0.4562 |    0.1958   |     -      |
+----------------+--------------+----------+---------+-------------+------------+


In [56]:
# Show the vocabulary learned by the fitted vectorizer.
feature_names = vec.get_feature_names_out()
print(feature_names)

['classification' 'intelligence' 'learning' 'machine' 'performance'
 'statistics']


#### 1.2. Calculate the cosine similarity:

In [59]:
# Cosine similarity matrix, obtained as 1 minus the cosine distance
# (the distance is rounded to 3 decimals before subtracting).
1 - np.round(pairwise_distances(X, metric="cosine"), decimals=3)

array([[1.   , 0.397, 0.418],
       [0.397, 1.   , 0.87 ],
       [0.418, 0.87 , 1.   ]])

In [60]:
# Pairwise cosine similarity between the rows of X, rounded to 3 decimals.
np.round(cosine_similarity(X), decimals=3)

array([[1.   , 0.397, 0.418],
       [0.397, 1.   , 0.87 ],
       [0.418, 0.87 , 1.   ]])

In [69]:
# Project a query onto the already-fitted vectorizer and show its weights.
Q1 = ['machine intelligence']
Y = vec.transform(Q1)
print(pinta_matriz_dispersa(Y, voca, pre=4))

+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|       -        |    0.6287    |    -     |  0.3713 |      -      |     -      |
+----------------+--------------+----------+---------+-------------+------------+


In [70]:
# Shapes of the query matrix and the document matrix, as a pair.
(Y.shape, X.shape)

((1, 6), (3, 6))

In [71]:
# Cosine similarity between the query row(s) in Y and every row of X.
cosine_similarity(Y,X)

array([[0.60020875, 0.2205588 , 0.41807806]])

In [72]:
# Second query: show its weights, then its cosine similarity to each row of X.
Q2 = ['machine learning']
Y = vec.transform(Q2)
print(pinta_matriz_dispersa(Y, voca, pre=4))
cosine_similarity(Y, X)

+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|       -        |      -       |   0.5    |   0.5   |      -      |     -      |
+----------------+--------------+----------+---------+-------------+------------+


array([[0.6474939 , 0.61335554, 0.77509337]])

In [73]:
# Third query: show its weights, then its cosine similarity to each row of X.
Q3 = ['machine learning classification']
Y = vec.transform(Q3)
print(pinta_matriz_dispersa(Y, voca, pre=4))
cosine_similarity(Y, X)

+----------------+--------------+----------+---------+-------------+------------+
| classification | intelligence | learning | machine | performance | statistics |
+----------------+--------------+----------+---------+-------------+------------+
|     0.3917     |      -       |  0.3042  |  0.3042 |      -      |     -      |
+----------------+--------------+----------+---------+-------------+------------+


array([[0.47876391, 0.82951944, 0.81068529]])