# Text  Visualization

In [53]:
import pandas as pd
import numpy as np
import gensim, spacy
import matplotlib.pyplot as plt
from gensim.utils import simple_preprocess
import nltk
#NLTK Stop words
from nltk.corpus import stopwords
import altair as alt

In [54]:
df = pd.read_csv('VoxData.csv', header=0).dropna()

In [55]:
print(df.shape)

(6903, 8)


In [56]:
df.head()

Unnamed: 0,title,author,category,published_date,updated_on,slug,blurb,body
0,Every year of a prison term makes a couple 32 ...,Dara Lind,Criminal Justice,2014-05-29 12:30:05,2014-05-29 12:30:07,http://www.vox.com/2014/5/29/5756646/every-yea...,But even a short jail stay can strain a marria...,A new study by criminologists Sonja Siennick a...
1,Making sense of Donald Trump,John Patty,Mischiefs of Faction,2016-01-12 19:50:08,2016-01-12 19:50:09,http://www.vox.com/mischiefs-of-faction/2016/1...,Social science predicted that it can't predict...,The current fight for the GOP presidential nom...
2,Acting white: the most insidious myth about bl...,JenÃ©e Desmond-Harris,Race in America,2015-03-04 13:40:02,2015-05-04 02:51:51,http://www.vox.com/2015/3/4/8138739/acting-whi...,This popular theory about how African-American...,You've probably heard it before: too many blac...
3,Hillary Clintonâ€™s pitch: Tim Kaine will be t...,Dylan Matthews,Hillary Clinton,2016-07-23 21:23:13,2016-07-25 15:56:38,http://www.vox.com/2016/7/23/12263516/tim-kain...,He's not Tom Perez or Cory Booker. But...,"To many on the left, Tim Kaine’s selection as ..."
4,"Democratic debate 2015: start time, schedule, ...",Andrew Prokop,Debates,2015-11-13 16:20:02,2015-11-14 23:47:28,http://www.vox.com/2015/11/13/9728432/democrat...,The three remaining candidates will debate in ...,The horrific attacks in Paris will loom large ...


# Text with metadata

In [57]:
''' distribution of authors'''


' distribution of authors'

In [58]:
''' distribution of categories '''

categories = df.groupby(['category']).size().reset_index(name='count')

alt.Char(categories).mark_bar().encode(
    x = alt.X()
)

AttributeError: module 'altair' has no attribute 'Char'

In [None]:
df[df['category']=='Politics & Policy']['body'].iloc[0]

# Processing for Filtering and Machine Learning Models

## Tokenization

In [None]:
''' we focus on the category of culture here '''
processed = df[df['category']=='Politics & Policy'].reset_index(drop=True)

In [None]:
processed

In [None]:
''' first trial of tokenization using simple_preprocess '''
data_words = gensim.utils.simple_preprocess(processed['body'][0])

In [None]:
data_words

In [None]:
data_word_list = [simple_preprocess(sentence) for sentence in processed['body']]

In [None]:
print("length of data_word_list: " , len(data_word_list))
print("length of data_word_list[0]: " , len(data_word_list[0]))

In [None]:
nltk.download('stopwords')

In [None]:

stop_words = stopwords.words('english')
stop_words.extend(['com', 'from', 'subject', 're', 'edu', 'use', 'not', 'would', 
                   'say', 'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 
                   'done', 'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 
                   'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 
                   'run', 'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come'])


In [None]:
data_words = [[word for word in doc if word not in stop_words] for doc in data_word_list]

In [None]:
print("length of data_words: " , len(data_words))
print("length of data_words[0]: " , len(data_words[0]))

## Stemming

In [None]:
#Stemming using porter Stemming Algorithm
from gensim.parsing.porter import PorterStemmer
p = PorterStemmer()

data_ready = []

for text in data_words:
    data_stemmed = p.stem_documents(text)
    data_ready.append(data_stemmed)
# data_ready

In [None]:
len(data_ready)

## Lemmatization

In [None]:
'''
too slow, do not run here
'''


'''
# Initialize spacy 'en' model, keeping only tagger component needed for lemmatization
nlp = spacy.load('en', disable=['parser', 'ner'])
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']
for sent in data_words:
    # Parse the sentence using the loaded 'en' model object `nlp`. Extract the lemma for each token and join
    doc = nlp(" ".join(sent)) 
    data_ready.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
# remove stopwords once more after lemmatization
data_ready = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_ready]

'''

## Construct TF-IDF

In [None]:
import gensim.corpora as corpora
from gensim.sklearn_api import TfIdfTransformer

In [None]:
# Create Dictionary
id2word = corpora.Dictionary(data_ready)

model = TfIdfTransformer(dictionary=id2word)

In [None]:
# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

num_docs = id2word.num_docs
num_terms = len(id2word.keys())

In [None]:
for doc in corpus[:1]:
    print([[id, id2word[id], freq] for id, freq in doc])

In [None]:
tfidf_corpus = model.fit_transform(corpus)

In [None]:
tfidf_corpus[0]

In [None]:
# construct an array of tf-idf vectors
from gensim.matutils import corpus2dense, corpus2csc

corpus_tfidf_dense = corpus2dense(tfidf_corpus, num_terms, num_docs)

In [None]:
corpus_tfidf_dense.shape

In [None]:
X = corpus_tfidf_dense[corpus_tfidf_dense.max(axis=1) > 0.1]
X.shape

## Clustering & Projection

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans


In [None]:
# result_pca = PCA(n_components=2).fit_transform(X.T)
result_tsne = TSNE(n_components=2, perplexity=10).fit_transform(X.T)

tsne_df = pd.DataFrame(data=result_tsne, columns=['x','y'])



In [None]:
words = []
for id in id2word.keys():
    words.append(id2word[id])

In [None]:
mat = pd.DataFrame(data=corpus_tfidf_dense, index=words)
mat = mat[mat.max(axis=1) > 0.1]

wordtfidf = pd.DataFrame(data=mat.values.T, columns=mat.index)
wordtfidf.head()

In [None]:
mat.max(axis=1).shape

## Trend

In [None]:
''' distribution of authors'''
author_count = processed.groupby('author').size().reset_index(name='count')

alt.Chart(author_count).mark_bar().encode(
    x = alt.X('author:N',  sort='-y'),
    y = alt.Y('count:Q'),
)

In [None]:
author_count = author_count.sort_values(by='count', ascending=False)

In [None]:
wordtfidf['author'] = processed['author']

In [None]:
author_key_words = []

''' key words for top 10 authors'''
for author in author_count['author'][:10]:
    # get the sum tf-idf for each word, do sum() across rows for each column
    group_df = wordtfidf[wordtfidf['author'] == author].mean(axis=0)
    # sort the tf-idf values
    to_sort = [{'freq': group_df[x], 'word': x} for x in group_df.index]
    sorted(to_sort, key=lambda d: d['freq'], reverse=True)
    # add the words to the list
    for i in range(10):
        author_key_words.append([author, to_sort[i]['word'], to_sort[i]['freq']])
        
author_keyword_df = pd.DataFrame(data=group_key_words, columns=['author', 'keyword', 'tfidf'])

In [None]:
author_keyword_df