<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Vectorización


In [1]:
import numpy as np
import math

In [2]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [3]:
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
def vocabulary(corpus):
    words = set()
    for sentence in corpus:
        for word in sentence.split():
            words.add(word)
            
    return list(words)

In [5]:
vocabulary(corpus)

['martes', 'dia', 'el', 'de', 'muchas', 'que', 'hoy', 'gracias', 'es']

### 2- OneHot encoding
Data una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [6]:
def oneHotEncoding(corpus):
    words = vocabulary(corpus)
    fmatrix = np.zeros((len(corpus), len(words)))

    for i, sentence in enumerate(corpus):
        for word in sentence.split():
            fmatrix[i][words.index(word)] = 1

    return fmatrix

In [7]:
oneHotEncoding(corpus)

array([[0., 1., 0., 0., 0., 1., 1., 0., 1.],
       [1., 1., 1., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.]])

### 3- Vectores de frecuencia
Data una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [8]:
def frequency_matrix(corpus):
    words = vocabulary(corpus)
    fmatrix = np.zeros((len(corpus), len(words)))

    for i, sentence in enumerate(corpus):
        for word in sentence.split():
            fmatrix[i][words.index(word)] += 1

    return fmatrix

In [9]:
frequency_matrix(corpus)

array([[0., 1., 0., 0., 0., 1., 1., 0., 1.],
       [2., 1., 1., 1., 0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1., 0., 0., 1., 0.]])

### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [10]:
def document_frequency(corpus, word):
    frequency = 0

    for i, sentence in enumerate(corpus):
        for w in sentence.split():
            if word == w:
                frequency += 1
                break

    return frequency

In [11]:
document_frequency(corpus, 'dia')

2

In [12]:
def tf_idf_matrix(corpus):
    words = vocabulary(corpus)
    f_matrix = frequency_matrix(corpus)
    documents_count = len(corpus)
    matrix = np.zeros((documents_count, len(words)))

    for i, sentence in enumerate(corpus):
        for word in sentence.split():
            j = words.index(word)
            tf = f_matrix[i][j]
            idf = math.log10(documents_count / document_frequency(corpus, word))
            matrix[i][j] = tf * idf

    return matrix

In [13]:
tf_idf_matrix(corpus)

array([[0.        , 0.17609126, 0.        , 0.        , 0.        ,
        0.47712125, 0.17609126, 0.        , 0.17609126],
       [0.35218252, 0.17609126, 0.47712125, 0.47712125, 0.        ,
        0.        , 0.17609126, 0.        , 0.17609126],
       [0.17609126, 0.        , 0.        , 0.        , 0.47712125,
        0.        , 0.        , 0.47712125, 0.        ]])

### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [14]:
def similarity_documents(corpus, index, method='tf_idf'):
    documents = list()
    
    if method == 'tf_idf':
        matrix = tf_idf_matrix(corpus)
        
    elif method == 'freq':
        matrix = frequency_matrix(corpus)
        
    elif method == 'oneHotEncoder':
        matrix = oneHotEncoding(corpus)
        
    document = matrix[index]

    for i, j in enumerate(matrix):
        if i == index:
            continue
        similarity = cosine_similarity(document, j)
        documents.append({'similarity': similarity, 'Sentence': corpus[i]})
            
    return sorted(documents, key=lambda x: x['similarity'], reverse=True)   

In [15]:
similarity_documents(corpus, 2, 'tf_idf')

[{'similarity': 0.10845711727883083,
  'Sentence': 'martes el dia de hoy es martes'},
 {'similarity': 0.0, 'Sentence': 'que dia es hoy'}]

In [16]:
similarity_documents(corpus, 2, 'freq')

[{'similarity': 0.3849001794597505,
  'Sentence': 'martes el dia de hoy es martes'},
 {'similarity': 0.0, 'Sentence': 'que dia es hoy'}]

In [17]:
similarity_documents(corpus, 2, 'oneHotEncoder')

[{'similarity': 0.23570226039551587,
  'Sentence': 'martes el dia de hoy es martes'},
 {'similarity': 0.0, 'Sentence': 'que dia es hoy'}]