In [26]:
import numpy as np
import pandas as pd

from sklearn.decomposition import TruncatedSVD, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim import corpora, models, similarities, matutils

import preprocessing
import vectorize

In [2]:
# Load the pre-cleaned tweet corpus through the project-local `preprocessing` module.
# NOTE(review): `subset=10000` presumably caps how many tweets are read — confirm in
# preprocessing.get_clean_tweets; the length check that follows reports 9997, not 10000.
documents = preprocessing.get_clean_tweets(subset=10000)

In [3]:
# Sanity check: how many documents the loader actually returned.
len(documents)

9997

In [5]:
# Count-based representation produced by the project-local `vectorize` module.
# `cv` is what TruncatedSVD and NMF are fit on further down, and `cv_names`
# labels the columns of the topic-word tables.
# NOTE(review): presumably a document-term count matrix plus its vocabulary —
# confirm against vectorize.get_count_vectorized.
cv, cv_names = vectorize.get_count_vectorized(documents)

In [6]:
# TF-IDF representation from the same project-local `vectorize` module.
# NOTE(review): in the cells visible here only its shape is inspected; the LSA
# and NMF models are fit on `cv`, not `tfidf` — confirm that is intended.
tfidf, tfidf_names = vectorize.get_tfidf_vectorized(documents)

In [17]:
# Shape check on the TF-IDF matrix (presumably documents x vocabulary terms — TODO confirm).
tfidf.shape

(9997, 10788)

In [10]:
# Latent Semantic Analysis: project the count matrix onto two SVD components.
# TruncatedSVD's default solver is randomized, so random_state is pinned to make
# the components (and the variance ratios displayed below) repeatable across
# kernel restarts.
lsa = TruncatedSVD(n_components=2, random_state=42)
lsa_topic = lsa.fit_transform(cv)
# Fraction of the total variance captured by each component.
lsa.explained_variance_ratio_

array([0.0694311 , 0.03757325])

In [13]:
# Topic-word table for LSA: one row per fitted component, one column per entry
# of `cv_names`. Row labels are derived from the fitted model instead of being
# hard-coded, so this cell keeps working if the number of components changes.
topic_word = pd.DataFrame(
    lsa.components_,
    index=['component_{}'.format(k) for k in range(1, len(lsa.components_) + 1)],
    columns=cv_names,
)

In [14]:
# Display the LSA topic-word table (pandas elides the middle columns when rendering).
topic_word

Unnamed: 0,abaixo,abandoned,abandons,abd,abdsc,abeg,abet,abiding,abilities,ability,...,zellerbach,zero,zerotrust,zeus,zhengzhou,zines,zion,zodiacal,zuckermouth,zulu
component_1,4.246475e-09,7.784227e-07,8.973645e-08,1e-06,3.6e-05,7.563343e-08,2.5e-05,4e-06,3.966989e-07,2.7e-05,...,2.3e-05,3.1e-05,3e-06,4e-06,8.481597e-07,6.432944e-07,1.4e-05,2e-06,7.685274e-08,2e-06
component_2,3.021631e-08,3.732165e-06,7.309567e-07,8e-06,0.000318,6.9718e-07,4.4e-05,5.2e-05,2.006524e-06,5.2e-05,...,1e-06,0.000129,3.9e-05,6.8e-05,2.918478e-06,3.766078e-06,0.000248,5.9e-05,4.79119e-07,5e-06


In [18]:
# Two-component non-negative matrix factorization of the same count matrix.
# random_state is pinned so that any randomness in initialization or solving
# does not change the result between kernel restarts.
nmf = NMF(n_components=2, random_state=42)
# Per-document weights on each of the two components.
nmf_topic = nmf.fit_transform(cv)

In [19]:
# Topic-word table for NMF: one row per fitted component, one column per entry
# of `cv_names`. Row labels are derived from the fitted model rather than
# hard-coded, so the cell survives a change in the number of components.
# NOTE: this rebinds `topic_word`, replacing the LSA table built earlier.
topic_word = pd.DataFrame(
    nmf.components_,
    index=['component_{}'.format(k) for k in range(1, len(nmf.components_) + 1)],
    columns=cv_names,
)

In [20]:
# Display the NMF topic-word table (pandas elides the middle columns when rendering).
topic_word

Unnamed: 0,abaixo,abandoned,abandons,abd,abdsc,abeg,abet,abiding,abilities,ability,...,zellerbach,zero,zerotrust,zeus,zhengzhou,zines,zion,zodiacal,zuckermouth,zulu
component_1,1.878844e-08,5e-06,3.83651e-07,8e-06,0.000122,2.871732e-07,0.000221,0.0,3e-06,0.000233,...,0.000221,0.000233,0.0,0.0,6e-06,4e-06,0.0,0.0,4.357295e-07,1.3e-05
component_2,2.536121e-07,3.1e-05,6.193973e-06,6.9e-05,0.002733,5.910041e-06,0.000378,0.000447,1.7e-05,0.000446,...,1.4e-05,0.001102,0.000333,0.000581,2.5e-05,3.1e-05,0.002104,0.000505,4.096941e-06,3.7e-05


In [21]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted features of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Must expose ``components_``; it is iterated row by row and each row
        must support ``argsort()`` (e.g. a 2-D NumPy array).
    feature_names : sequence
        Indexed with the integer positions returned by ``argsort()``.
    no_top_words : int
        How many features to list per topic, highest weight first.
        Values <= 0 list nothing.
    topic_names : sequence, optional
        Labels for the topics, by position. When it is ``None``, too short
        for a given topic, or holds a falsy entry at that position, the
        numeric header ``Topic  <i>`` is printed instead.

    Returns
    -------
    None
        Output is written to stdout: a header line and a comma-separated
        feature list per topic.
    """
    # Clamp so a negative count cannot turn into an unrelated slice.
    top_n = max(no_top_words, 0)

    for topic_idx, topic in enumerate(model.components_):
        # Bounds-check before indexing so a short label list falls back to the
        # numeric header instead of raising IndexError.
        has_label = (
            topic_names is not None
            and topic_idx < len(topic_names)
            and bool(topic_names[topic_idx])
        )
        if has_label:
            print("\nTopic: '", topic_names[topic_idx], "'")
        else:
            print("\nTopic ", topic_idx)

        # argsort() is ascending; reverse it and keep the first top_n positions.
        top_positions = topic.argsort()[::-1][:top_n]
        print(", ".join([feature_names[pos] for pos in top_positions]))

In [22]:
# Top five terms for each LSA component.
display_topics(model=lsa, feature_names=cv_names, no_top_words=5)


Topic  0
people, venezuela, need, aid, life

Topic  1
pay, think, ve, tax, fund


In [23]:
# Top ten terms for each NMF component.
display_topics(model=nmf, feature_names=cv_names, no_top_words=10)


Topic  0
people, venezuela, need, aid, life, children, day, maduro, nicol, humanitarian

Topic  1
pay, think, ve, tax, fund, billboards, mercers, avoidance, offshore, papers


In [24]:
# Document-topic table: one row per document, one column per NMF component.
# Column labels are derived from the width of `nmf_topic` rather than being
# hard-coded, so the cell keeps working if the number of components changes.
pd.DataFrame(
    nmf_topic,
    columns=['component_{}'.format(k) for k in range(1, nmf_topic.shape[1] + 1)],
)

Unnamed: 0,component_1,component_2
0,0.000145,0.001145
1,0.024702,0.020163
2,0.000242,0.000866
3,0.000032,0.001132
4,0.000427,0.001281
5,0.000071,0.000316
6,0.021728,0.000412
7,0.000385,0.002470
8,0.000036,0.012331
9,0.000017,0.000326
