In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import boto3
from sklearn.feature_extraction.text import CountVectorizer
from smart_open import smart_open
from pyspark.sql import SparkSession

nltk.download('stopwords')
nltk.download('punkt')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer


In [2]:
stop = stopwords.words('english')
stemmer = SnowballStemmer("english")

In [3]:
spark=SparkSession.builder.appName('proyecto4bigdata').getOrCreate()

In [4]:
dataframePySpark=spark.read.csv('/FileStore/tables/articles1.csv',inferSchema=True,header=True)
dataframePySpark = dataframePySpark.withColumnRenamed("id", "id_book").withColumnRenamed("_c0", "id")

In [5]:
pandas_dataframe = dataframePySpark.select("*").toPandas()

In [6]:
pandas_dataframe['title'] = [re.sub(r"[^a-zA-Z0-9]+",' ', str(x)) for x in pandas_dataframe['title']]
pandas_dataframe['content'] = [re.sub(r"[^a-zA-Z0-9]+",' ', str(x)) for x in pandas_dataframe['content']]
pandas_dataframe['author'] = [re.sub(r"[^a-zA-Z0-9]+",' ', str(x)) for x in pandas_dataframe['author']]

In [7]:
pandas_dataframe.head()

Unnamed: 0,id,id_book,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON Congressional Republicans have a ne...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,After the bullet shells get counted the blood ...
2,2,17285,Tyrus Wong Bambi Artist Thwarted by Racial Bia...,New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,When Walt Disney s Bambi opened in 1942 critic...
3,3,17286,Among Deaths in 2016 a Heavy Toll in Pop Music...,New York Times,William McDonald,2017-04-10,2017.0,4.0,,Death may be the great equalizer but it isn t ...
4,4,17287,Kim Jong un Says North Korea Is Preparing to T...,New York Times,Choe Sang Hun,2017-01-02,2017.0,1.0,,SEOUL South Korea North Korea s leader Kim sai...


In [8]:
pandas_dataframe['title'] = pandas_dataframe['title'].map(lambda x: ''.join(str(e) + ' ' for e in x))
pandas_dataframe['content'] = pandas_dataframe['content'].map(lambda x:''.join(str(e)+ ' ' for e in x))
pandas_dataframe['author'] = pandas_dataframe['author'].map(lambda x: ''.join(str(e)+ ' ' for e in x))

In [9]:
pandas_dataframe['title'] = pandas_dataframe.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
pandas_dataframe['content'] = pandas_dataframe.apply(lambda row: nltk.word_tokenize(row['content']), axis=1)
pandas_dataframe['author'] = pandas_dataframe.apply(lambda row: nltk.word_tokenize(row['author']), axis=1)

In [10]:
pandas_dataframe['title'] = pandas_dataframe['title'].apply(lambda x: [item for item in x if item not in stop])
pandas_dataframe['content'] = pandas_dataframe['content'].apply(lambda x: [item for item in x if item not in stop])
pandas_dataframe['author'] = pandas_dataframe['author'].apply(lambda x: [item for item in x if item not in stop])

In [11]:
pandas_dataframe.head()

Unnamed: 0,id,id_book,title,publication,author,date,year,month,url,content
0,0,17283,hous republican fret about win their health ca...,New York Times,carl huls,2016-12-31,2016.0,12.0,,washington congression republican new fear com...
1,1,17284,rift between offic resid kill persist south br...,New York Times,benjamin mueller al baker,2017-06-19,2017.0,6.0,,after bullet shell get count blood dri votiv c...
2,2,17285,tyrus wong bambi artist thwart racial bias die...,New York Times,margalit fox,2017-01-06,2017.0,1.0,,when walt disney bambi open 1942 critic prais ...
3,3,17286,among death 2016 heavi toll pop music the new ...,New York Times,william mcdonald,2017-04-10,2017.0,4.0,,death may great equal necessarili evenhand of ...
4,4,17287,kim jong un say north korea is prepar test lon...,New York Times,choe sang hun,2017-01-02,2017.0,1.0,,seoul south korea north korea leader kim said ...


In [12]:
pandas_dataframe['title'] = pandas_dataframe['title'].apply(lambda x: [stemmer.stem(y) for y in x])
pandas_dataframe['content'] = pandas_dataframe['content'].apply(lambda x: [stemmer.stem(y) for y in x]) 
pandas_dataframe['author'] = pandas_dataframe['author'].apply(lambda x: [stemmer.stem(y) for y in x]) 
pandas_dataframe.head()

Unnamed: 0,id,id_book,title,publication,author,date,year,month,url,content
0,0,17283,"[hous, republican, fret, about, win, their, he...",New York Times,"[carl, huls]",2016-12-31,2016.0,12.0,,"[washington, congression, republican, new, fea..."
1,1,17284,"[rift, between, offic, resid, kill, persist, s...",New York Times,"[benjamin, mueller, al, baker]",2017-06-19,2017.0,6.0,,"[after, bullet, shell, get, count, blood, dri,..."
2,2,17285,"[tyrus, wong, bambi, artist, thwart, racial, b...",New York Times,"[margalit, fox]",2017-01-06,2017.0,1.0,,"[when, walt, disney, bambi, open, 1942, critic..."
3,3,17286,"[among, death, 2016, heavi, toll, pop, music, ...",New York Times,"[william, mcdonald]",2017-04-10,2017.0,4.0,,"[death, may, great, equal, necessarili, evenha..."
4,4,17287,"[kim, jong, un, say, north, korea, is, prepar,...",New York Times,"[choe, sang, hun]",2017-01-02,2017.0,1.0,,"[seoul, south, korea, north, korea, leader, ki..."


In [13]:
count_vectorizer = CountVectorizer(stop_words='english')

In [14]:
count_data = count_vectorizer.fit_transform(pandas_dataframe['content'])

In [15]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    figure ,ax = plt.subplots()
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    ax.set_xticks(x_pos, words)
    ax.set_xticklabels(words)
    ax.set_xlabel('words')
    ax.set_ylabel('counts')
    display(figure)

In [16]:
plot_10_most_common_words(count_data, count_vectorizer)

In [17]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 5
number_words = 10
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [18]:
textosToken = pandas_dataframe.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)

In [19]:
import gensim.corpora as corpora

id2word = corpora.Dictionary(textosToken)
corpus = [id2word.doc2bow(text) for text in textosToken]

In [20]:
from gensim import models

In [21]:
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [22]:
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook

topic_weights = []
for i, row_list in enumerate(lda_model[corpus]):
    topic_weights.append([w for i, w in row_list[0]])

# Array of topic weights    
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)

# Plot the Topic Clusters using Bokeh
output_notebook()
n_topics = 4
mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
              plot_width=900, plot_height=700)
plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
show(plot)

In [23]:
from pyLDAvis import gensim
pyLDAvis.enable_notebook()
vis = gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis