In [17]:
import RAKE
import operator
import pandas as pd
import random
import os
from os.path import join
from rake_nltk import Rake

In [18]:
# Texto de ejemplo
text = """Google quietly rolled out a new way for Android users to listen 
to podcasts and subscribe to shows they like, and it already works on 
your phone. Podcast production company Pacific Content got the exclusive 
on it.This text is taken from Google news."""

text2 = """spaCy is an open-source software library for advanced natural language processing,
written in the programming languages Python and Cython. The library is published under the MIT license
and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""

In [19]:
# Usamos RAKE
stop_dir = "Stoplist"

rake_obj = RAKE.Rake(stop_dir)

# Extract keywords
key_rake = rake_obj.run(text)
print(key_rake)

[('podcast production company pacific content', 25.0), ('google quietly rolled', 8.5), ('google news', 4.5), ('android users', 4.0), ('listen', 1.0), ('podcasts', 1.0), ('subscribe', 1.0), ('shows', 1.0), ('works', 1.0), ('phone', 1.0), ('exclusive', 1.0), ('text', 1.0)]


In [20]:
# Convertir a DataFrame para despliegue
df = pd.DataFrame(key_rake, columns=['Frase','Score'])
display(df)

Unnamed: 0,Frase,Score
0,podcast production company pacific content,25.0
1,google quietly rolled,8.5
2,google news,4.5
3,android users,4.0
4,listen,1.0
5,podcasts,1.0
6,subscribe,1.0
7,shows,1.0
8,works,1.0
9,phone,1.0


In [21]:

# Ahora usamos la otra librería
rake = Rake()

# Extraemos las palabras clave
rake.extract_keywords_from_text(text)
keywords = rake.get_ranked_phrases_with_scores()
keywords

[(36.0, 'podcast production company pacific content got'),
 (8.5, 'google quietly rolled'),
 (4.5, 'google news'),
 (4.0, 'new way'),
 (4.0, 'android users'),
 (4.0, 'already works'),
 (1.0, 'text'),
 (1.0, 'taken'),
 (1.0, 'subscribe'),
 (1.0, 'shows'),
 (1.0, 'podcasts'),
 (1.0, 'phone'),
 (1.0, 'listen'),
 (1.0, 'like'),
 (1.0, 'exclusive')]

In [22]:
# para que se parezca a la otra tabla, invertimos las tuplas y
# creamos el DataFrame
keywords = [tuple(reversed(keyword)) for keyword in keywords]


In [23]:
# Creamos un nuevo DataFrame
df1 = pd.DataFrame(keywords, columns=['Frase','Score'])
display(df)

Unnamed: 0,Frase,Score
0,podcast production company pacific content,25.0
1,google quietly rolled,8.5
2,google news,4.5
3,android users,4.0
4,listen,1.0
5,podcasts,1.0
6,subscribe,1.0
7,shows,1.0
8,works,1.0
9,phone,1.0


# Extraemos palabras clave de nuestro corpus

In [24]:
MY_HOME = os.path.expanduser('~')
TEXTOS_DIR = join(MY_HOME, 'NLP','textos')
# generamos números aleatorios para leer nuestros archivos
indices = [random.randint(0,283) for _ in range(0,3)]

In [25]:
# leemos los archivos como strings
documentos = []
for i in range(0,3):
    file = f'article{indices[i]}.txt'
    filename = join(TEXTOS_DIR, file)
    with open(filename) as f:
        texto = f.read()
        documentos.append(texto)

In [26]:
# Extraemos las palabras clave de cada documento
# Primero usamos RAKE
for i in range(len(documentos)):
    keywords = rake_obj.run(documentos[i])
    print(f"Processing document {i}")
    df = pd.DataFrame(keywords,columns=['Frase','Score'])
    print(df.head(20))
    print("\n\n")
    

Processing document 0
                                              Frase      Score
0   libertarian porn-merchant defending free speech  36.000000
1               third-rate las vegas club performer  34.000000
2                          middle-class jewish home  16.000000
3       syntactically-challenged immigrant engineer  16.000000
4                    regular blue-collar eccentrics  16.000000
5                      staid costume movies amadeus  14.500000
6                                song-and-dance man  14.333333
7                                  fritz lang spent   9.000000
8                              russian tanks rolled   9.000000
9                      anti-authoritarian outsiders   9.000000
10                                     made ed wood   9.000000
11                            fruity english accent   9.000000
12                             booked carnegie hall   9.000000
13                   chameleon confidence trickster   9.000000
14                     constantly

In [27]:
#Ahora usamos rake_nltk

for i in range(len(documentos)):
    rake.extract_keywords_from_text(documentos[i])
    keywords = rake.get_ranked_phrases_with_scores()
    keywords = [tuple(reversed(keyword)) for keyword in keywords]
    print(f"Processing document {i}")
    df = pd.DataFrame(keywords,columns=['Frase','Score'])
    print(df.head(20))
    print("\n\n")

Processing document 0
                                            Frase      Score
0   radio city music hall christmas show complete  47.000000
1           fritz lang spent two years travelling  32.500000
2                oddball comedian whose true aims  24.500000
3                   rate las vegas club performer  23.500000
4                       dance man ,' says kaufman  17.595238
5                  merchant defending free speech  16.000000
6                    least good revisiting europe  16.000000
7                      whose tony clifton persona  15.333333
8                        totally meshugga ,' andy  15.333333
9                    staid costume movies amadeus  15.000000
10                           good deal less funny  14.500000
11                      brilliant ,' says shapiro  14.333333
12                           people v larry flynt  13.166667
13                 though pursuing similar themes  13.000000
14                     best known outside america  13.000000
15