### Algoritmos en PYSPARK BOW.hs

#### - bagofwords
#### - tf
#### - idf
#### - tfidf
#### - ngrams

#### - BAGOFWORDS

In [23]:
def bagofwords(Doc):
    """Tokenize a document into lowercase words longer than two characters.

    The text is stripped of surrounding whitespace, lowercased and split on
    runs of whitespace; tokens of length 1 or 2 are discarded.

    Args:
        Doc (str): Raw text of a single document.

    Returns:
        list[str]: The surviving tokens, in their original order.
    """
    words = Doc.strip().lower().split()
    return [word for word in words if len(word) > 2]

In [28]:
# Demo corpus used to exercise bagofwords.
# NOTE(review): there is no comma after the first string literal, so Python
# concatenates it with the second one. The list therefore holds 2 documents
# and the merged token 'aaplstudio' shows up in the results. Confirm whether
# a comma (three documents) or a trailing space (two documents) was intended.
Corpus=["apple want commute easy amp accord new patent battery aapl"
"studio outlet computer need battery future",
        "need help apple iphone6 amp iphone6plus checkitout"]
# `sc` is not defined in this cell; presumably the SparkContext supplied by
# the notebook session. The corpus is distributed over 6 partitions.
Texto=sc.parallelize(Corpus,6)
# Tokenize every document.
Texto=Texto.map(bagofwords)
# Fetch the first 2 tokenized documents for inspection.
Texto.take(2)

[['apple',
  'want',
  'commute',
  'easy',
  'amp',
  'accord',
  'new',
  'patent',
  'battery',
  'aaplstudio',
  'outlet',
  'computer',
  'need',
  'battery',
  'future'],
 ['need', 'help', 'apple', 'iphone6', 'amp', 'iphone6plus', 'checkitout']]

#### - TF

In [13]:
import math
def TF(Doc):
    """Compute a log-scaled relative term frequency for each distinct token.

    For a token that occurs ``c`` times in a document of ``N`` tokens the
    weight is ``1 + log10(c / N)``. The weight is negative whenever
    ``c / N`` is below 0.1.

    Args:
        Doc (list[str]): Tokens of a single document, e.g. the output of
            ``bagofwords``.

    Returns:
        list[tuple[str, float]]: One ``(token, weight)`` pair per distinct
        token, ordered by first appearance in ``Doc``. An empty document
        yields an empty list.
    """
    # Imported locally so the function does not depend on a file-level import.
    from collections import Counter

    total = len(Doc)
    # A single pass counts every token; calling Doc.count() once per distinct
    # token would rescan the whole document each time.
    counts = Counter(Doc)
    return [(word, 1 + math.log10(count / total))
            for word, count in counts.items()]

In [29]:
# Demo corpus used to exercise TF.
# NOTE(review): there is no comma after the first string literal, so Python
# concatenates it with the second one. The list therefore holds 2 documents
# and the merged token 'aaplstudio' shows up in the results. Confirm whether
# a comma (three documents) or a trailing space (two documents) was intended.
Corpus=["apple want commute easy amp accord new patent battery aapl"
"studio outlet computer need battery future",
        "need help apple iphone6 amp iphone6plus checkitout"]
# `sc` is not defined in this cell; presumably the SparkContext supplied by
# the notebook session. The corpus is distributed over 4 partitions.
Texto=sc.parallelize(Corpus,4)
# Tokenize every document.
Texto=Texto.map(bagofwords)
# Compute the (word, tf) pairs of each tokenized document.
TextoTF=Texto.map(TF)
# Fetch the TF lists of the first 2 documents for inspection.
TextoTF.take(2)

[[('outlet', -0.17609125905568135),
  ('amp', -0.17609125905568135),
  ('aaplstudio', -0.17609125905568135),
  ('new', -0.17609125905568135),
  ('accord', -0.17609125905568135),
  ('patent', -0.17609125905568135),
  ('computer', -0.17609125905568135),
  ('need', -0.17609125905568135),
  ('commute', -0.17609125905568135),
  ('battery', 0.1249387366082999),
  ('want', -0.17609125905568135),
  ('future', -0.17609125905568135),
  ('apple', -0.17609125905568135),
  ('easy', -0.17609125905568135)],
 [('amp', 0.15490195998574308),
  ('checkitout', 0.15490195998574308),
  ('iphone6plus', 0.15490195998574308),
  ('need', 0.15490195998574308),
  ('help', 0.15490195998574308),
  ('apple', 0.15490195998574308),
  ('iphone6', 0.15490195998574308)]]

#### - IDF

In [15]:
import math
def IDF(Doc,corpus):
    """Compute the inverse document frequency of every word found in ``Doc``.

    For each distinct word the number of documents of ``corpus`` that contain
    it (``df``) is counted, and the value ``log10(1 + total / df)`` is
    produced, where ``total`` is the number of documents in ``corpus``.

    Args:
        Doc: RDD whose elements are lists of ``(word, tf)`` pairs, one list
            per document (the result of mapping ``TF``).
        corpus: RDD whose elements are the token lists of the documents
            (the result of mapping ``bagofwords``).

    Returns:
        RDD of ``(word, idf)`` pairs, one per distinct word.
    """
    # Distinct vocabulary taken from the first component of every TF pair.
    vocabulary = (Doc.flatMap(lambda entries: entries)
                     .map(lambda entry: entry[0])
                     .distinct())

    # Number of documents, and a collected copy of them that the function
    # below captures in its closure.
    n_docs = corpus.count()
    documents = corpus.collect()

    def presence_pairs(word):
        # One (word, 0/1) pair per document: 1 when the document has the word.
        return [(word, 1 if word in document else 0) for document in documents]

    def to_idf(pair):
        word, doc_freq = pair
        return (word, math.log10(1 + n_docs / doc_freq))

    doc_frequency = (vocabulary.flatMap(presence_pairs)
                               .groupByKey()
                               .map(lambda grouped: (grouped[0], sum(grouped[1]))))
    return doc_frequency.map(to_idf)

In [30]:
# Demo corpus used to exercise IDF.
# NOTE(review): there is no comma after the first string literal, so Python
# concatenates it with the second one. The list therefore holds 2 documents
# and the merged token 'aaplstudio' shows up in the results. Confirm whether
# a comma (three documents) or a trailing space (two documents) was intended.
Corpus=["apple want commute easy amp accord new patent battery aapl"
"studio outlet computer need battery future",
        "need help apple iphone6 amp iphone6plus checkitout"]
# `sc` is not defined in this cell; presumably the SparkContext supplied by
# the notebook session. The corpus is distributed over 4 partitions.
Texto=sc.parallelize(Corpus,4)
# Tokenize every document.
Texto=Texto.map(bagofwords)
# Compute the (word, tf) pairs of each tokenized document.
TextoTF=Texto.map(TF)
# IDF needs both the TF pairs (for the vocabulary) and the tokenized corpus.
TextoIDF=IDF(TextoTF,Texto)
# Bring every (word, idf) pair back for inspection.
TextoIDF.collect()

[('aaplstudio', 0.47712125471966244),
 ('accord', 0.47712125471966244),
 ('commute', 0.47712125471966244),
 ('iphone6', 0.47712125471966244),
 ('outlet', 0.47712125471966244),
 ('need', 0.3010299956639812),
 ('new', 0.47712125471966244),
 ('battery', 0.47712125471966244),
 ('checkitout', 0.47712125471966244),
 ('iphone6plus', 0.47712125471966244),
 ('help', 0.47712125471966244),
 ('amp', 0.3010299956639812),
 ('patent', 0.47712125471966244),
 ('computer', 0.47712125471966244),
 ('want', 0.47712125471966244),
 ('future', 0.47712125471966244),
 ('apple', 0.3010299956639812),
 ('easy', 0.47712125471966244)]

#### - TFIDF

In [17]:
def RecIDF(word,corpusIDF):
    """Look up the IDF value of a word in a list of ``(word, idf)`` pairs.

    Args:
        word (str): Word to search for.
        corpusIDF (list[tuple[str, float]]): ``(word, idf)`` pairs, e.g. the
            collected result of ``IDF``.

    Returns:
        float: The IDF stored in the first pair whose word equals ``word``.

    Raises:
        IndexError: If no pair matches ``word``.
    """
    # Stop at the first match instead of filtering the whole list.
    for entry in corpusIDF:
        if entry[0] == word:
            return entry[1]
    # IndexError is what a failed lookup has always raised here; the message
    # now names the word that was not found.
    raise IndexError("word not found in IDF corpus: %r" % (word,))

In [18]:
import math

def TFIDF(TextoTF,TextoIDF):
    """Multiply every TF weight by the IDF of its word.

    Args:
        TextoTF: RDD whose elements are lists of ``(word, tf)`` pairs, one
            list per document.
        TextoIDF: RDD of ``(word, idf)`` pairs covering every word that
            appears in ``TextoTF``.

    Returns:
        RDD with the same layout as ``TextoTF`` whose pairs are
        ``(word, tf * idf)``.
    """
    # Build the lookup table once so each word costs a dictionary access
    # instead of a scan over the whole IDF list. setdefault keeps the first
    # value if a word were to appear more than once.
    idf_by_word = {}
    for pair in TextoIDF.collect():
        idf_by_word.setdefault(pair[0], pair[1])

    def weigh(document):
        # Scale each (word, tf) pair of one document by the word's IDF.
        return [(pair[0], pair[1] * idf_by_word[pair[0]]) for pair in document]

    return TextoTF.map(weigh)

In [31]:
# Demo corpus used to exercise TFIDF.
# NOTE(review): there is no comma after the first string literal, so Python
# concatenates it with the second one. The list therefore holds 2 documents
# and the merged token 'aaplstudio' shows up in the results. Confirm whether
# a comma (three documents) or a trailing space (two documents) was intended.
Corpus=["apple want commute easy amp accord new patent battery aapl"
"studio outlet computer need battery future",
        "need help apple iphone6 amp iphone6plus checkitout"]
# `sc` is not defined in this cell; presumably the SparkContext supplied by
# the notebook session. The corpus is distributed over 4 partitions.
Texto=sc.parallelize(Corpus,4)
# Tokenize every document.
Texto=Texto.map(bagofwords)
# Compute the (word, tf) pairs of each tokenized document.
TextoTF=Texto.map(TF)
# Compute the (word, idf) pairs of the corpus vocabulary.
TextoIDF=IDF(TextoTF,Texto)
# Combine both into per-document (word, tf*idf) pairs.
TextoTFIDF=TFIDF(TextoTF,TextoIDF)
# Bring every document's TF-IDF list back for inspection.
TextoTFIDF.collect()

[[('outlet', -0.0840168824658118),
  ('amp', -0.05300875094999675),
  ('aaplstudio', -0.0840168824658118),
  ('new', -0.0840168824658118),
  ('accord', -0.0840168824658118),
  ('patent', -0.0840168824658118),
  ('computer', -0.0840168824658118),
  ('need', -0.05300875094999675),
  ('commute', -0.0840168824658118),
  ('battery', 0.05961092677364147),
  ('want', -0.0840168824658118),
  ('future', -0.0840168824658118),
  ('apple', -0.05300875094999675),
  ('easy', -0.0840168824658118)],
 [('amp', 0.046630136342850424),
  ('checkitout', 0.07390701750693268),
  ('iphone6plus', 0.07390701750693268),
  ('need', 0.046630136342850424),
  ('help', 0.07390701750693268),
  ('apple', 0.046630136342850424),
  ('iphone6', 0.07390701750693268)]]

#### - nGrams

In [20]:
def taken(n,tokens):
    """Join the first ``n`` elements of a sequence into one string.

    Works on a plain Python sequence (not a Spark RDD). Every element is
    converted with ``str`` before joining.

    Args:
        n (int): Number of leading elements to take.
        tokens (list): Sequence of elements.

    Returns:
        str: The first ``n`` elements separated by single spaces, or ``""``
        when the sequence has fewer than ``n`` elements.
    """
    # Guard clause: not enough elements to build the requested group.
    if len(tokens) < n:
        return ""
    as_text = [str(token) for token in tokens]
    return " ".join(as_text[0:n])
def tails(tokens):
    """Build the list of successive suffixes of a sequence.

    Args:
        tokens (list): Sequence of elements.

    Returns:
        list: ``tokens`` itself followed by ``tokens[1:]``, ``tokens[2:]``
        and so on down to the last single-element suffix. An empty input
        yields a list holding just that empty input.
    """
    # The first entry is the input object itself, not a copy.
    suffixes = [tokens]
    suffixes.extend(tokens[start:] for start in range(1, len(tokens)))
    return suffixes
def nGrams(n,tokens):
    """Generate the word n-grams of a whitespace-separated text.

    Args:
        n (int): Number of words per n-gram.
        tokens (str): Text to split into words on runs of whitespace.

    Returns:
        list[str]: Every run of ``n`` consecutive words, in order of
        appearance, each joined by single spaces. The list is empty when
        the text has fewer than ``n`` words or when ``n`` is not positive.
    """
    words = tokens.split()
    if n <= 0:
        return []
    # Slide a window of n words over the text. When there are fewer than n
    # words the range is empty, so no partial n-gram is produced.
    return [" ".join(words[start:start + n])
            for start in range(len(words) - n + 1)]

In [33]:
# Demo corpus used to exercise nGrams.
# NOTE(review): there is no comma after the first string literal, so Python
# concatenates it with the second one. The list therefore holds 2 documents
# and the merged token 'aaplstudio' shows up in the results. Confirm whether
# a comma (three documents) or a trailing space (two documents) was intended.
Corpus=["apple want commute easy amp accord new patent battery aapl"
"studio outlet computer need battery future",
        "need help apple iphone6 amp iphone6plus checkitout"]
# `sc` is not defined in this cell; presumably the SparkContext supplied by
# the notebook session. The raw strings are distributed over 4 partitions;
# nGrams does its own splitting, so bagofwords is not applied here.
Corpus=sc.parallelize(Corpus,4)
# Compute bigrams
Bigrama=Corpus.map(lambda x:nGrams(2,x))
print('Bigrama',Bigrama.take(5))
print()
# Compute trigrams
Trigramas=Corpus.map(lambda x:nGrams(3,x))
print("Trigramas",Trigramas.take(5))
print()
# Compute tetragrams (4-grams)
Tetragramas=Corpus.map(lambda x:nGrams(4,x))
print("Tetragramas",Tetragramas.take(5))
print()

Bigrama [['apple want', 'want commute', 'commute easy', 'easy amp', 'amp accord', 'accord new', 'new patent', 'patent battery', 'battery aaplstudio', 'aaplstudio outlet', 'outlet computer', 'computer need', 'need battery', 'battery future'], ['need help', 'help apple', 'apple iphone6', 'iphone6 amp', 'amp iphone6plus', 'iphone6plus checkitout']]

Trigramas [['apple want commute', 'want commute easy', 'commute easy amp', 'easy amp accord', 'amp accord new', 'accord new patent', 'new patent battery', 'patent battery aaplstudio', 'battery aaplstudio outlet', 'aaplstudio outlet computer', 'outlet computer need', 'computer need battery', 'need battery future'], ['need help apple', 'help apple iphone6', 'apple iphone6 amp', 'iphone6 amp iphone6plus', 'amp iphone6plus checkitout']]

Tetragramas [['apple want commute easy', 'want commute easy amp', 'commute easy amp accord', 'easy amp accord new', 'amp accord new patent', 'accord new patent battery', 'new patent battery aaplstudio', 'patent ba