<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Vectorización


In [2]:
import numpy as np

In [52]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between the vectors ``a`` and ``b``.

    Parameters
    ----------
    a, b : array_like
        Numeric vectors of the same length.

    Returns
    -------
    float
        ``dot(a, b) / (norm(a) * norm(b))``. The cosine is undefined when
        either vector has zero norm; ``0.0`` is returned in that case rather
        than dividing by zero (which would yield ``nan`` and a
        ``RuntimeWarning``).
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # At least one vector is all zeros: there is no direction to compare.
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [4]:
# Toy corpus used throughout the notebook: one string per document.
corpus = np.array(
    [
        'que dia es hoy',
        'martes el dia de hoy es martes',
        'martes muchas gracias',
    ]
)

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [27]:
# Assumes the documents contain no punctuation that would need handling.
# Gather every distinct lower-cased, space-separated term of the corpus.
vocab = set()
for document in corpus:
    vocab.update(document.lower().split(' '))

vocab


{'de', 'dia', 'el', 'es', 'gracias', 'hoy', 'martes', 'muchas', 'que'}

### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [28]:
# Freeze an ordering of the vocabulary so each term maps to one column.
vocab_list = list(vocab)
print(vocab_list)

# One row per document: 1 where the vocabulary term occurs in it, 0 otherwise.
one_hot_encoding = []
for document in corpus:
    tokens = document.lower().split(' ')
    one_hot_encoding.append([int(term in tokens) for term in vocab_list])

one_hot_encoding

    


['de', 'dia', 'martes', 'el', 'que', 'muchas', 'es', 'gracias', 'hoy']


[[0, 1, 0, 0, 1, 0, 1, 0, 1],
 [1, 1, 1, 1, 0, 0, 1, 0, 1],
 [0, 0, 1, 0, 0, 1, 0, 1, 0]]

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [32]:
# Freeze an ordering of the vocabulary so each term maps to one column.
vocab_list = list(vocab)
print(vocab_list)

# One row per document: how many times each vocabulary term occurs in it.
frequency_vector = []
for document in corpus:
    tokens = document.lower().split(' ')
    frequency_vector.append([tokens.count(term) for term in vocab_list])

frequency_vector

    

['de', 'dia', 'martes', 'el', 'que', 'muchas', 'es', 'gracias', 'hoy']


[[0, 1, 0, 0, 1, 0, 1, 0, 1],
 [1, 1, 2, 1, 0, 0, 1, 0, 1],
 [0, 0, 1, 0, 0, 1, 0, 1, 0]]

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [38]:
# Freeze an ordering of the vocabulary so each term maps to one column.
vocab_list = list(vocab)
print(vocab_list)

# Split every document once; both the IDF and the TF passes reuse the tokens.
tokenized = [document.lower().split(' ') for document in corpus]

# IDF of a term: log(number of documents / number of documents containing it).
idf_vector = []
for term in vocab_list:
    containing_docs = sum(term in tokens for tokens in tokenized)
    idf_vector.append(np.log(len(corpus) / containing_docs))

print(idf_vector)

# TF-IDF row of a document: raw term count scaled by the term's IDF.
tf_idf = []
for tokens in tokenized:
    tf_idf.append(
        [tokens.count(term) * idf for term, idf in zip(vocab_list, idf_vector)]
    )

tf_idf



['de', 'dia', 'martes', 'el', 'que', 'muchas', 'es', 'gracias', 'hoy']
[1.0986122886681096, 0.4054651081081644, 0.4054651081081644, 1.0986122886681096, 1.0986122886681096, 1.0986122886681096, 0.4054651081081644, 1.0986122886681096, 0.4054651081081644]


[[0.0,
  0.4054651081081644,
  0.0,
  0.0,
  1.0986122886681096,
  0.0,
  0.4054651081081644,
  0.0,
  0.4054651081081644],
 [1.0986122886681096,
  0.4054651081081644,
  0.8109302162163288,
  1.0986122886681096,
  0.0,
  0.0,
  0.4054651081081644,
  0.0,
  0.4054651081081644],
 [0.0,
  0.0,
  0.4054651081081644,
  0.0,
  0.0,
  1.0986122886681096,
  0.0,
  1.0986122886681096,
  0.0]]

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por su similitud coseno con ese documento

In [59]:
def sort_by_tf_idf_similarity(corpus, index):
    """Order the documents of ``corpus`` by TF-IDF cosine similarity to one of them.

    Each document is lower-cased and split on single spaces, turned into a
    TF-IDF vector (raw term count times ``log(N / document frequency)``) and
    compared with the vector of ``corpus[index]`` through
    ``cosine_similarity``.

    The vocabulary is derived from the ``corpus`` argument itself, so the
    function works for any corpus and does not depend on notebook-level state.

    Parameters
    ----------
    corpus : sequence of str
        The documents to rank.
    index : int
        Position in ``corpus`` of the reference document.

    Returns
    -------
    list
        The documents of ``corpus`` in ascending order of similarity (least
        similar first); ties are broken by position in ``corpus``.

    Raises
    ------
    IndexError
        If ``index`` is out of range for a non-empty ``corpus``.
    """
    tokenized = [text.lower().split(' ') for text in corpus]

    # Sorted vocabulary: a deterministic column order, unlike raw set iteration.
    vocab_list = sorted({word for words in tokenized for word in words})
    column = {word: i for i, word in enumerate(vocab_list)}

    # Document frequency: in how many documents each term appears at least once.
    doc_freq = [0] * len(vocab_list)
    for words in tokenized:
        for word in set(words):
            doc_freq[column[word]] += 1

    # Every vocabulary term comes from the corpus, so doc_freq is never zero.
    idf_vector = [np.log(len(tokenized) / freq) for freq in doc_freq]

    tf_idf = []
    for words in tokenized:
        counts = [0] * len(vocab_list)
        for word in words:
            counts[column[word]] += 1
        tf_idf.append([tf * idf for tf, idf in zip(counts, idf_vector)])

    # Similarity of every document (the reference one included) to the reference.
    values = [
        (i, cosine_similarity(np.asarray(tf_idf[index]), np.asarray(vector)))
        for i, vector in enumerate(tf_idf)
    ]

    # A structured array lets np.sort order by similarity and fall back to the
    # document position for ties.
    dtype = [('document', int), ('similarity', float)]
    ranking = np.sort(np.array(values, dtype=dtype), order='similarity')

    return [corpus[i] for i, _ in ranking]

# Order the corpus by similarity to the document at index 1.
sort_by_tf_idf_similarity(corpus, 1)




['martes muchas gracias', 'que dia es hoy', 'martes el dia de hoy es martes']