# Distância cosseno

Referência: https://www.machinelearningplus.com/nlp/cosine-similarity/

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

In [2]:
# Toy corpus: four short sentences used as the documents to compare.
docs = [
    'I like football',
    'John likes football',
    'Mike likes basketball',
    'My house is beatiful',
]
docs

['I like football',
 'John likes football',
 'Mike likes basketball',
 'My house is beatiful']

## Count Vectorizer

Convert a collection of text documents to a matrix of token counts.

In [3]:
# Bag-of-words model: English stop words are dropped, then fit_transform
# learns the vocabulary from `docs` and returns the document-term count
# matrix (one row per document, one column per vocabulary term).
count_vectorizer = CountVectorizer(stop_words='english')
sparse_matrix = count_vectorizer.fit_transform(docs)

In [4]:
# Show the learned vocabulary; its order matches the columns of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns a NumPy array, so
# .tolist() keeps the printed output a plain Python list of strings.
print(count_vectorizer.get_feature_names_out().tolist())

['basketball', 'beatiful', 'football', 'house', 'john', 'like', 'likes', 'mike']


In [5]:
# Expand the sparse count matrix into a regular NumPy array so every
# entry (including zeros) is visible when the cell displays it.
dense_matrix = sparse_matrix.toarray()
dense_matrix

array([[0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 1, 1],
       [0, 1, 0, 1, 0, 0, 0, 0]], dtype=int64)

## Distância cosseno

In [6]:
# Pairwise cosine distance (1 - cosine similarity) between the count vectors.
# With the second argument omitted, the rows of dense_matrix are compared
# against themselves, yielding a symmetric documents-by-documents matrix.
cosine_distances(dense_matrix)

array([[0.        , 0.59175171, 1.        , 1.        ],
       [0.59175171, 0.        , 0.66666667, 1.        ],
       [1.        , 0.66666667, 0.        , 1.        ],
       [1.        , 1.        , 1.        , 0.        ]])

## Similaridade cosseno

In [7]:
# Pairwise cosine similarity between the count vectors. With the second
# argument omitted, the rows of dense_matrix are compared against themselves,
# so the diagonal holds each document's similarity with itself.
cosine_similarity(dense_matrix)

array([[1.        , 0.40824829, 0.        , 0.        ],
       [0.40824829, 1.        , 0.33333333, 0.        ],
       [0.        , 0.33333333, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

## TF-IDF - Term Frequency / Inverse Document Frequency

Convert a collection of raw documents to a matrix of TF-IDF features.


$ \mbox{tfidf}_{t,d,D} = \mbox{tf}_{t,d} \times \mbox{idf}_{t,D} $


Where t denotes the terms; d denotes each document; D denotes the collection of documents.


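$ \mbox{idf}_{t,D} = \log \frac{|D|}{\mbox{df}_t} $

Where $\mbox{df}_t$ is the number of documents in D that contain the term t. A common sublinear variant dampens the term frequency by using $1 + \log \mbox{tf}_{t,d}$ in place of $\mbox{tf}_{t,d}$. Note that scikit-learn's `TfidfVectorizer` defaults differ slightly: it uses the smoothed $\mbox{idf}_{t,D} = \ln \frac{1 + |D|}{1 + \mbox{df}_t} + 1$ and then L2-normalizes each document vector.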


In [8]:
# TF-IDF model over the same corpus: English stop words are dropped, then
# fit_transform learns the vocabulary and idf weights from `docs` and
# returns the weighted document-term matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_sparse_matrix = tfidf_vectorizer.fit_transform(docs)

In [9]:
# Sanity check on the matrix dimensions: (number of documents, vocabulary size).
tfidf_sparse_matrix.shape

(4, 8)

In [10]:
# Show the vocabulary learned by the TF-IDF vectorizer; its order matches the
# columns of the TF-IDF matrix. This section is about tfidf_vectorizer, so it
# is queried here rather than count_vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the replacement, and .tolist() keeps the printed output a plain list.
print(tfidf_vectorizer.get_feature_names_out().tolist())

['basketball', 'beatiful', 'football', 'house', 'john', 'like', 'likes', 'mike']


In [11]:
# Expand the sparse TF-IDF matrix into a regular NumPy array; the display is
# rounded to 4 decimals for readability (the stored array keeps full precision).
tfidf_dense_matrix = tfidf_sparse_matrix.toarray()
tfidf_dense_matrix.round(4)

array([[0.    , 0.    , 0.6191, 0.    , 0.    , 0.7853, 0.    , 0.    ],
       [0.    , 0.    , 0.5264, 0.    , 0.6677, 0.    , 0.5264, 0.    ],
       [0.6176, 0.    , 0.    , 0.    , 0.    , 0.    , 0.4869, 0.6176],
       [0.    , 0.7071, 0.    , 0.7071, 0.    , 0.    , 0.    , 0.    ]])

In [12]:
# Pairwise cosine distance between the TF-IDF vectors. With the second
# argument omitted, the rows of tfidf_dense_matrix are compared against
# themselves, yielding a symmetric documents-by-documents matrix.
cosine_distances(tfidf_dense_matrix)

array([[0.        , 0.67408645, 1.        , 1.        ],
       [0.67408645, 0.        , 0.74367516, 1.        ],
       [1.        , 0.74367516, 0.        , 1.        ],
       [1.        , 1.        , 1.        , 0.        ]])