In [3]:
# Importamos data de titulos de noticias de los últimos 15 años obtenidos de Kaggle
import pandas as pd

data = pd.read_csv('abcnews-date-text.csv', error_bad_lines=False);
data_text = data[['headline_text']]

#Agrega una columna adicional como index basado en los indexes de la misma estructura
data_text['index'] = data_text.index
documents = data_text


print(len(documents))
print(documents[:5])



1103665
                                       headline_text  index
0  aba decides against community broadcasting lic...      0
1     act fire witnesses must be aware of defamation      1
2     a g calls for infrastructure protection summit      2
3           air nz staff in aust strike for pay rise      3
4      air nz strike to affect australian travellers      4


In [10]:
# Preprocesamiento de datos

#Ejecutaremos los siguientes pasos:

# 1. Tokenization: Parte el texto en sentencias y las sentencias en palabras. Las palabras se ponen en minuscula
#                  y se remueve la puntuación
# 2. Palabras que tienen menos de 3 caracteres son removidos.
# 3. Se eliminan todas las palabras de parada
# 4. Las palabras son lematizadas: Las palabras en tercera persona son cambiadas a primera persona 
#                                  y los verbos en pasado y futuro son cambiado a presente
# 5. Las palabras se derivan: Las palabras se reducen en su forma raiz

# Loading gensim and nltk libraries

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer, PorterStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to D:\Datos de
[nltk_data]     Usuarios\s80338\Application Data\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Se escribe una funcion para ejecutar la lematización y preprocesamiento en el conjunto de datos

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [16]:
# seleccionamos un documento para visualizar luego del preprocesamiento
stemmer = PorterStemmer()

doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['rain', 'helps', 'dampen', 'bushfires']


 tokenized and lemmatized document: 
['rain', 'help', 'dampen', 'bushfir']


In [18]:
processed_docs = documents['headline_text'].map(preprocess)
processed_docs[:10]

print(processed_docs[:5])

0        [decid, commun, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object


In [20]:
# Bag of Words on the Data set
# Crear un diccionaro basado en 'processed_docs' que contiene 
# el número de veces que una palabra aparece en los datos de entrenamiento
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 20:
        break

0 broadcast
1 commun
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect
10 summit
11 aust
12 rise
13 staff
14 strike
15 affect
16 australian
17 travel
18 ambiti
19 jump
20 olsson


In [21]:
# Gensim filter_extremes

#  Filtra los tokens que aparecen en
# 1. menos de 15 documentos (número absoluto) o 
# 2. más que 0.5 documentos (fracción del tamaño del cuerpo total, no es número absoluto)
# 3. Después de los dos pasos anteriores, mantenga solo los primeros 100000 tokens más frecuentes.

dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [22]:
# Gensim doc2bow
# Para cada documento creamos un diccionario que reporte cuantas palabras
# and cuantas veces estas palabras aparecen. Guarde esto en 'bow_corpus',
# entonces checkee el documento seleccionado previamente.

bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(76, 1), (112, 1), (483, 1), (4021, 1)]

In [23]:
# Previsualizar Bag of Words para nuestra muestra documentos procesados
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 76 ("bushfir") appears 1 time.
Word 112 ("help") appears 1 time.
Word 483 ("rain") appears 1 time.
Word 4021 ("dampen") appears 1 time.


In [24]:
# TF-IDF
# Crear el objeto de modelo tf-idf usando models.TfidModel en 'bow_corpus'
# y guardarlo en 'tdidf', luego aplicar tranformación al cuerpo entero y
# llamarlo 'corpus_tfidf'. Finalmente previsualizamos los scores TF-IDF
# para nuestro primer documento

from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.5903602896750699),
 (1, 0.38524510107363613),
 (2, 0.4974556071174764),
 (3, 0.5055678583740412)]


In [25]:
# Running LDA using Bag of Words
# Entrenar nuestro modelo lda usando gensim.models.LdaMulticore y 
# guardarlo en 'lda_model'

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

In [26]:
# Para cada tópico, exploraremos las palabras que aparecen en ese tópico y su peso relativo.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))
    
# Puedes distinguir los diferentes tópicos usando las palabras en cada tópico y sus pesos correspondientes?

Topic: 0 
Words: 0.031*"queensland" + 0.020*"countri" + 0.018*"hospit" + 0.017*"tasmanian" + 0.017*"hour" + 0.015*"work" + 0.014*"children" + 0.013*"citi" + 0.011*"prison" + 0.010*"young"
Topic: 1 
Words: 0.029*"australia" + 0.023*"world" + 0.017*"market" + 0.012*"record" + 0.012*"share" + 0.011*"victoria" + 0.010*"industri" + 0.009*"melbourn" + 0.009*"australian" + 0.009*"port"
Topic: 2 
Words: 0.035*"trump" + 0.022*"kill" + 0.022*"north" + 0.020*"coast" + 0.016*"china" + 0.015*"attack" + 0.015*"west" + 0.014*"adelaid" + 0.013*"price" + 0.013*"gold"
Topic: 3 
Words: 0.027*"govern" + 0.016*"plan" + 0.014*"rural" + 0.013*"say" + 0.013*"council" + 0.012*"indigen" + 0.012*"turnbul" + 0.012*"water" + 0.011*"chang" + 0.011*"commun"
Topic: 4 
Words: 0.017*"tasmania" + 0.015*"time" + 0.015*"fight" + 0.014*"lose" + 0.011*"john" + 0.010*"life" + 0.009*"premier" + 0.009*"make" + 0.009*"unit" + 0.008*"michael"
Topic: 5 
Words: 0.027*"elect" + 0.023*"south" + 0.016*"open" + 0.015*"australia" + 0.0

In [27]:
# Running LDA using TF-IDF
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
    
# Nuevamente, puedes distinguir los diferentes tópicos usando las palabras en cada tópico y sus pesos correspondientes?

Topic: 0 Word: 0.016*"countri" + 0.015*"hour" + 0.012*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"budget" + 0.006*"say" + 0.005*"indigen" + 0.005*"violenc" + 0.005*"abbott"
Topic: 1 Word: 0.008*"royal" + 0.006*"climat" + 0.006*"festiv" + 0.006*"dollar" + 0.006*"commiss" + 0.006*"asylum" + 0.006*"dairi" + 0.005*"coal" + 0.005*"toni" + 0.005*"seeker"
Topic: 2 Word: 0.009*"live" + 0.008*"farm" + 0.007*"marriag" + 0.007*"busi" + 0.006*"council" + 0.006*"plan" + 0.005*"water" + 0.005*"andrew" + 0.005*"social" + 0.004*"updat"
Topic: 3 Word: 0.012*"drum" + 0.011*"interview" + 0.007*"australia" + 0.007*"septemb" + 0.007*"novemb" + 0.007*"august" + 0.006*"stori" + 0.006*"june" + 0.006*"zealand" + 0.006*"polic"
Topic: 4 Word: 0.007*"friday" + 0.007*"peter" + 0.007*"korea" + 0.006*"search" + 0.005*"north" + 0.005*"wrap" + 0.005*"pacif" + 0.005*"kid" + 0.004*"miss" + 0.004*"paul"
Topic: 5 Word: 0.007*"kill" + 0.006*"syria" + 0.006*"bomb" + 0.005*"decemb" + 0.005*"attack" + 0.005*"capit" + 0.

In [28]:
# Evaluación del desempeño clasificando un documento de muestra usando 
# el modelo LDA Bag of Words
# Checkeare donde nuestro documento de texto sería clasificado

processed_docs[4310]

['rain', 'help', 'dampen', 'bushfir']

In [29]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))
    
# Nuestro documento de prueba tiene la mayor probabilidad de ser parte del tema que 
# nuestro modelo asignó, que es la clasificación precisa.


Score: 0.41999998688697815	 
Topic: 0.022*"hous" + 0.020*"die" + 0.016*"home" + 0.014*"crash" + 0.014*"break" + 0.013*"perth" + 0.013*"take" + 0.011*"lead" + 0.011*"near" + 0.009*"star"

Score: 0.4199969172477722	 
Topic: 0.027*"govern" + 0.016*"plan" + 0.014*"rural" + 0.013*"say" + 0.013*"council" + 0.012*"indigen" + 0.012*"turnbul" + 0.012*"water" + 0.011*"chang" + 0.011*"commun"

Score: 0.020001854747533798	 
Topic: 0.017*"brisban" + 0.016*"famili" + 0.014*"child" + 0.013*"report" + 0.012*"jail" + 0.012*"abus" + 0.011*"sentenc" + 0.011*"show" + 0.010*"victim" + 0.010*"releas"

Score: 0.02000102587044239	 
Topic: 0.034*"polic" + 0.018*"health" + 0.018*"interview" + 0.017*"miss" + 0.014*"investig" + 0.013*"sydney" + 0.013*"concern" + 0.011*"search" + 0.011*"servic" + 0.011*"call"

Score: 0.020000219345092773	 
Topic: 0.035*"trump" + 0.022*"kill" + 0.022*"north" + 0.020*"coast" + 0.016*"china" + 0.015*"attack" + 0.015*"west" + 0.014*"adelaid" + 0.013*"price" + 0.013*"gold"

Score: 0.0

In [30]:
# Evaluación del desempeño clasificando un documento de muestra utilizando el modelo LDA TF-IDF.
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))
    
# Nuestro documento de prueba tiene la mayor probabilidad de ser parte del tema que nuestro 
# modelo asignó, que es la clasificación precisa.


Score: 0.5668457746505737	 
Topic: 0.019*"rural" + 0.013*"news" + 0.010*"queensland" + 0.009*"weather" + 0.008*"john" + 0.007*"nation" + 0.006*"bushfir" + 0.006*"monday" + 0.006*"flood" + 0.005*"east"

Score: 0.27314287424087524	 
Topic: 0.014*"murder" + 0.014*"charg" + 0.010*"market" + 0.010*"court" + 0.008*"turnbul" + 0.008*"assault" + 0.008*"guilti" + 0.008*"child" + 0.007*"sentenc" + 0.007*"jail"

Score: 0.020002830773591995	 
Topic: 0.016*"countri" + 0.015*"hour" + 0.012*"govern" + 0.008*"health" + 0.007*"fund" + 0.006*"budget" + 0.006*"say" + 0.005*"indigen" + 0.005*"violenc" + 0.005*"abbott"

Score: 0.02000211551785469	 
Topic: 0.009*"live" + 0.008*"farm" + 0.007*"marriag" + 0.007*"busi" + 0.006*"council" + 0.006*"plan" + 0.005*"water" + 0.005*"andrew" + 0.005*"social" + 0.004*"updat"

Score: 0.02000155858695507	 
Topic: 0.008*"royal" + 0.006*"climat" + 0.006*"festiv" + 0.006*"dollar" + 0.006*"commiss" + 0.006*"asylum" + 0.006*"dairi" + 0.005*"coal" + 0.005*"toni" + 0.005*"seek

In [31]:
# Testing model on unseen document
unseen_document = 'How a Pentagon deal became an identity crisis for Google'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.34999993443489075	 Topic: 0.032*"charg" + 0.029*"court" + 0.023*"murder" + 0.023*"polic" + 0.019*"face"
Score: 0.34998103976249695	 Topic: 0.027*"elect" + 0.023*"south" + 0.016*"open" + 0.015*"australia" + 0.013*"final"
Score: 0.18333333730697632	 Topic: 0.017*"brisban" + 0.016*"famili" + 0.014*"child" + 0.013*"report" + 0.012*"jail"
Score: 0.0166804026812315	 Topic: 0.035*"trump" + 0.022*"kill" + 0.022*"north" + 0.020*"coast" + 0.016*"china"
Score: 0.016671961173415184	 Topic: 0.027*"govern" + 0.016*"plan" + 0.014*"rural" + 0.013*"say" + 0.013*"council"
Score: 0.01666666753590107	 Topic: 0.031*"queensland" + 0.020*"countri" + 0.018*"hospit" + 0.017*"tasmanian" + 0.017*"hour"
Score: 0.01666666753590107	 Topic: 0.029*"australia" + 0.023*"world" + 0.017*"market" + 0.012*"record" + 0.012*"share"
Score: 0.01666666753590107	 Topic: 0.017*"tasmania" + 0.015*"time" + 0.015*"fight" + 0.014*"lose" + 0.011*"john"
Score: 0.01666666753590107	 Topic: 0.022*"hous" + 0.020*"die" + 0.016*"hom