<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Word2vect

## Alumno: Fux, Santiago Javier (CEIA-6ta Cohorte)
### Fecha: 2023-03-08


In [1]:
import numpy as np

In [2]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors *a* and *b*.

    Args:
        a: 1-D sequence or array of numbers.
        b: 1-D sequence or array of numbers, same length as *a*.

    Returns:
        dot(a, b) / (||a|| * ||b||). When either vector has zero norm the
        ratio is undefined (0/0), so 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # An all-zero vector has no direction; treat it as "no similarity"
        # rather than propagating NaN and a RuntimeWarning to the caller.
        return 0.0
    return np.dot(a, b) / norm_product

### Datos

In [3]:
# Toy corpus used by every example below: one string per document,
# with terms separated by single spaces.
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
def get_terms(docs):
  """Build the vocabulary of a corpus: every distinct term, without repeats.

  Each document is tokenized by splitting on single spaces. Within one
  document the distinct tokens are visited in sorted order, and a term is
  appended to the vocabulary the first time it is seen across the corpus.
  The resulting order therefore is: sorted terms of the first document,
  then the sorted new terms of the second document, and so on.

  Args:
    docs: iterable of strings, one per document.

  Returns:
    List of unique terms, in the order described above.
  """
  term_list = []
  # Set mirror of term_list: O(1) membership instead of scanning the list
  # for every candidate term.
  seen = set()

  for doc in docs:
    # Tokens are separated by a single space; set() drops repeats inside the
    # document and sorted() fixes the order in which new terms are added.
    for term in sorted(set(doc.split(' '))):
      if term not in seen:
        seen.add(term)
        term_list.append(term)
  return term_list



### 2- OneHot encoding
Dada una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [5]:
def get_onehot_enc(docs):
  """Encode each document as a binary term-presence vector.

  The columns follow the vocabulary returned by get_terms(docs). A cell is 1
  when the term occurs at least once in the document and 0 otherwise.

  Args:
    docs: sequence of strings, one per document, terms separated by single
      spaces.

  Returns:
    List of lists of ints, one row per document and one column per
    vocabulary term.
  """
  term_list = get_terms(docs)
  # Map each term to its column once, instead of a linear list.index() scan
  # (guarded by a catch-all except) for every token.
  column_of = {term: col for col, term in enumerate(term_list)}

  res = []
  for doc in docs:
    row = [0] * len(term_list)
    for term in doc.split(' '):
      col = column_of.get(term)
      # Tokens missing from the vocabulary are skipped explicitly; any other
      # error is no longer silently swallowed.
      if col is not None:
        row[col] = 1
    res.append(row)
  return res

In [6]:
# Smoke test: one-hot encode the sample corpus (one row per document).
get_onehot_enc(corpus)


[[1, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 1, 1, 0, 1, 1, 1, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 1, 1]]

### 3- Vectores de frecuencia
Dada una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [7]:
def get_freq_enc(docs):
  """Encode each document as a vector of raw term counts.

  The columns follow the vocabulary returned by get_terms(docs). Each cell
  holds the number of times the term occurs in the document.

  Args:
    docs: sequence of strings, one per document, terms separated by single
      spaces.

  Returns:
    List of lists of ints, one row per document and one column per
    vocabulary term.
  """
  term_list = get_terms(docs)
  # Map each term to its column once, instead of a linear list.index() scan
  # (guarded by a catch-all except) for every token.
  column_of = {term: col for col, term in enumerate(term_list)}

  res = []
  for doc in docs:
    row = [0] * len(term_list)
    for term in doc.split(' '):
      col = column_of.get(term)
      # Tokens missing from the vocabulary are skipped explicitly; any other
      # error is no longer silently swallowed.
      if col is not None:
        row[col] += 1
    res.append(row)
  return res

In [8]:
# Smoke test: term-frequency encode the sample corpus (one row per document).
get_freq_enc(corpus)

[[1, 1, 1, 1, 0, 0, 0, 0, 0],
 [1, 1, 1, 0, 1, 1, 2, 0, 0],
 [0, 0, 0, 0, 0, 0, 1, 1, 1]]

### 4- TF-IDF
Dada una lista de textos, devolver una matriz con la representación TF-IDF

In [9]:
def get_idf(docs, term_list):
  """Compute the inverse document frequency of every vocabulary term.

  For each term, IDF = log10(N / df), where N is the number of documents and
  df is the number of documents containing the term (column sums of the
  one-hot encoding).

  Args:
    docs: sequence of strings, one per document.
    term_list: kept for interface compatibility; it is not read here. The
      vocabulary is rebuilt from docs inside get_onehot_enc.

  Returns:
    List with one IDF value per one-hot column. An empty corpus yields an
    empty list.
  """
  n_docs = len(docs)
  if n_docs == 0:
    # Summing an empty encoding yields a scalar, which cannot be iterated.
    return []

  doc_freq = np.sum(get_onehot_enc(docs), axis=0)
  # Keep one entry per column: filtering out zero counts would shorten the
  # result and misalign it with the vocabulary. A zero count maps to 0.0 so
  # the term simply contributes nothing.
  return [np.log10(n_docs / df) if df > 0 else 0.0 for df in doc_freq]

def get_tfidf_enc(docs):
  """Encode each document as a TF-IDF vector.

  Multiplies the raw term counts from get_freq_enc by the per-term IDF
  weights from get_idf; the IDF list is broadcast across the rows.

  Args:
    docs: sequence of strings, one per document.

  Returns:
    numpy array with one row per document and one column per vocabulary
    term.
  """
  term_counts = np.array(get_freq_enc(docs))
  vocabulary = get_terms(docs)
  idf_weights = get_idf(docs, vocabulary)
  return term_counts * idf_weights

In [10]:
# Smoke test: TF-IDF encode the sample corpus and display the resulting matrix.
res = get_tfidf_enc(corpus)
res

array([[0.17609126, 0.17609126, 0.17609126, 0.47712125, 0.        ,
        0.        , 0.        , 0.        , 0.        ],
       [0.17609126, 0.17609126, 0.17609126, 0.        , 0.47712125,
        0.47712125, 0.35218252, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.17609126, 0.47712125, 0.47712125]])

### 5 - Comparación de documentos
Realizar una función que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [13]:
def get_cosines_similarity(docs, enc_list, idx):
  """Score every document against document *idx* by cosine similarity.

  Args:
    docs: the corpus; only used to enumerate document positions.
    enc_list: encoded documents, indexable by document position.
    idx: position of the reference document.

  Returns:
    List of similarity scores sorted from highest to lowest. Only the scores
    are returned, so the link between a score and its document is not kept.
  """
  scores = [
    cosine_similarity(enc_list[idx], enc_list[position])
    for position, _ in enumerate(docs)
  ]
  return sorted(scores, reverse=True)

def compare_docs(docs, idx):
  """Print how similar document *idx* is to the corpus under each encoding.

  Builds the one-hot, frequency and TF-IDF encodings of the corpus, computes
  the sorted cosine similarities against document *idx* for each of them and
  prints one line per encoding.

  Args:
    docs: sequence of strings, one per document.
    idx: position of the reference document.

  Returns:
    None; the report is written to standard output.
  """
  # Build all three encodings first, then score them, then print, so nothing
  # is printed unless every step succeeded.
  encoders = (get_onehot_enc, get_freq_enc, get_tfidf_enc)
  encodings = [encode(docs) for encode in encoders]
  onehot_scores, freq_scores, tfidf_scores = [
    get_cosines_similarity(docs, encoding, idx) for encoding in encodings
  ]

  print(f'---DOC {idx}---')
  print(f'-->One Hot Encoding: {onehot_scores}')
  print(f'-->Freq Encoding: {freq_scores}')
  print(f'-->TFIDF Encoding: {tfidf_scores}\n')

In [14]:

# Print the similarity report using each of the three documents as reference.
compare_docs(corpus, 0)
compare_docs(corpus, 1)
compare_docs(corpus, 2)


---DOC 0---
-->One Hot Encoding: [1.0, 0.6123724356957946, 0.0]
-->Freq Encoding: [1.0, 0.5, 0.0]
-->TFIDF Encoding: [0.9999999999999998, 0.20034190268098703, 0.0]

---DOC 1---
-->One Hot Encoding: [1.0000000000000002, 0.6123724356957946, 0.23570226039551587]
-->Freq Encoding: [1.0, 0.5, 0.3849001794597505]
-->TFIDF Encoding: [1.0, 0.20034190268098703, 0.10845711727883083]

---DOC 2---
-->One Hot Encoding: [1.0000000000000002, 0.23570226039551587, 0.0]
-->Freq Encoding: [1.0000000000000002, 0.3849001794597505, 0.0]
-->TFIDF Encoding: [0.9999999999999999, 0.10845711727883083, 0.0]

