<img src="https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/logoFIUBA.jpg" width="500" align="center">


# Procesamiento de lenguaje natural
## Word2vect


In [1]:
import numpy as np
import pandas as pd
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * (np.linalg.norm(b)))

### Datos

In [3]:
corpus = np.array(['que dia es hoy', 'martes el dia de hoy es martes', 'martes muchas gracias'])

Documento 1 --> que dia es hoy \
Documento 2 --> martes el dia de hoy es martes \
Documento 3 --> martes muchas gracias

### 1 - Obtener el vocabulario del corpus (los términos utilizados)
- Cada documento transformarlo en una lista de términos
- Armar un vector de términos no repetidos de todos los documentos

In [4]:
doc1 = corpus[0]
voc1 = doc1.split(" ")
doc2 = corpus[1]
voc2 = doc2.split(" ")
doc3 = corpus[2]
voc3 = doc3.split(" ")

In [5]:
bow = voc1 + voc2 + voc3
bow = set(bow)

### 2- OneHot encoding
Data una lista de textos, devolver una matriz con la representación oneHotEncoding de estos

In [6]:
#Uno los documentos en una matriz/Dataframe
doc1=pd.DataFrame({"doc1": voc1}).T
doc2=pd.DataFrame({"doc2": voc2}).T
doc3=pd.DataFrame({"doc3": voc3}).T
corp = pd.concat([doc1,doc2,doc3])
corpus = corp.to_numpy()  #obtengo matriz con documentos
rows, columns = corpus.shape
OHE_array = np.zeros((rows,len(bow)))
bow=pd.DataFrame(bow)
bow = bow.to_numpy()

In [7]:
def One_Hot_Encoder (corpus):
  for word in range(len(bow)):
    for i in range(rows):
      for j in range(len(corpus[i])):
        if corpus[i,j] == bow[word]:
          OHE_array[i,word] = 1
 # OHE_array.squeeze()
  #OHE_Data = pd.DataFrame(OHE_array)
 # OHE_Data.columns = bow
  #return OHE_Data

In [8]:
One_Hot_Encoder(corpus)
OHE_array.squeeze()
OHE_Data = pd.DataFrame(OHE_array)
OHE_Data.columns=bow
OHE_Data.head()

Unnamed: 0,"(dia,)","(es,)","(hoy,)","(martes,)","(muchas,)","(de,)","(gracias,)","(que,)","(el,)"
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0


### 3- Vectores de frecuencia
Data una lista de textos, devolver una matriz con la representación de frecuencia de estos

In [25]:
def TF(corpus,bow,rows):
    for word in range(len(bow)):
        for i in range(rows):
            for j in range(len(corpus[i])):
                if corpus[i,j] == bow[word]:
                    TF_array[i,word] = TF_array[i,word] + 1
    #TF_Data = pd.DataFrame(TF_array)
    #TF_Data.columns = bow
    return TF_array#, TF_Data

In [26]:
rows, columns = corpus.shape
TF_array = np.zeros((rows, len(bow)))
TF(corpus, bow, rows)

array([[1., 1., 1., 0., 0., 0., 0., 1., 0.],
       [1., 1., 1., 2., 0., 1., 0., 0., 1.],
       [0., 0., 0., 1., 1., 0., 1., 0., 0.]])

### 4- TF-IDF
Data una lista de textos, devolver una matriz con la representacion TFIDF

In [81]:
# Log N/DF
# N = cantidad de documentos en el corpus
# DF = cantida de documentos con la palabra n en el corpus
def TFIDF(corpus, TF_array, rows):
    TF_Data = pd.DataFrame(TF_array)
    DF = OHE_Data.sum(axis=0)
    IDF = np.log10(rows/DF)
    IDF = np.array(IDF)
    TF_array = TF_Data.to_numpy()
   # TF_IDF = TF_array * IDF
   # TF_IDF = pd.DataFrame(TF_IDF)
   # TF_IDF.columns= bow
    return TF_array * IDF,  np.log10(rows/DF)

In [82]:
TF_IDF, IDF = TFIDF(corpus,TF_array,rows)
TF_IDF = pd.DataFrame(TF_IDF)
TF_IDF.columns = bow
IDF = np.array(IDF)
#TF_array = TF_Data.to_numpy()
TF_IDF

Unnamed: 0,"(dia,)","(es,)","(hoy,)","(martes,)","(muchas,)","(de,)","(gracias,)","(que,)","(el,)"
0,0.176091,0.176091,0.176091,0.0,0.0,0.0,0.0,0.477121,0.0
1,0.176091,0.176091,0.176091,0.352183,0.0,0.477121,0.0,0.0,0.477121
2,0.0,0.0,0.0,0.176091,0.477121,0.0,0.477121,0.0,0.0


### 5 - Comparación de documentos
Realizar una funcion que reciba el corpus y el índice de un documento y devuelva los documentos ordenados por la similitud coseno

In [97]:
def comparacion(corpus, indice, rows):
    TF_IDF, IDF= TFIDF(corpus, TF_array, rows)
    ref = np.zeros(len(corpus))
    for i in range(len(corpus)):
        resultado = cosine_similarity(TF_IDF[indice, :], TF_IDF[i, :])
        print("Similtud coseno doc{} contra doc{} es: {}".format(indice, i, resultado))
    #A = ref.argsort()[::-1][:len(corpus)]
    return ref.argsort()[::-1][:len(corpus)]

In [100]:
result = comparacion(corpus, indice = 1, rows = 3)

Similtud coseno doc1 contra doc0 es: 0.2003419026809871
Similtud coseno doc1 contra doc1 es: 1.0
Similtud coseno doc1 contra doc2 es: 0.10845711727883083


In [101]:
print("La similitud cos() para los documentos pertenecientes al corpus es: ", result)

La similitud cos() para los documentos pertenecientes al corpus es:  [2 1 0]
