In [1]:
import glob
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [3]:
data = pd.concat(map(pd.read_csv, glob.glob(os.path.join('../data/raw/', "*.csv"))))
data.head()

Unnamed: 0.1,Unnamed: 0,id_str,created_at,retweets,description,text,screen_name,user_created_at,followers
0,0,1295497809664319489,Mon Aug 17 23:08:41 +0000 2020,0,,RT @MeidasTouch: Thanks to your support we wer...,pamelabond111,Fri Sep 09 18:09:39 +0000 2016,339
1,0,1295497809626447872,Mon Aug 17 23:08:41 +0000 2020,0,写真撮る人 31♂/既婚 📷α7RⅢ,@shakeikurararan おはようございますー！！🙌☀️,photo_coco_,Mon Dec 03 04:27:12 +0000 2018,455
2,0,1295497809647374336,Mon Aug 17 23:08:41 +0000 2020,0,#BlueWave🌊\n#voteBlue2020 🌊\n#theResistance \n...,RT @RBReich: The next time Trump brags about d...,LisaJarrett6,Tue Nov 21 07:19:30 +0000 2017,1120
3,0,1295497809643237376,Mon Aug 17 23:08:41 +0000 2020,0,#JAEPIL: omg ily. ┆ { fan account } 🌌☕ she/her...,RT @smolpiri: ʰᵘʰᵉᵘᵐʰᵉᵘᵐ ^-^ https://t.co/Yk2U...,ZER0UKHAE,Sat May 19 07:52:57 +0000 2018,908
4,0,1295497809647591424,Mon Aug 17 23:08:41 +0000 2020,0,"CREER ES CREAR. No vemos el mundo como es, vem...",RT @ulichaparro12: Pueden llamarnos como quier...,amirahitt,Mon May 10 19:51:34 +0000 2010,2375


In [4]:
documents = data['description'].astype(str).to_list()
print(documents)



In [5]:
no_features = 1000

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

In [6]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [7]:
no_topics = 10

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA

lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

In [8]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}")
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 3
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)

Topic 0
nan 𝚘𝚏 frase
Topic 1
fan account bts
Topic 2
https www net
Topic 3
la en el
Topic 4
ig sc writer
Topic 5
love rihunclewillie jesus
Topic 6
com http gmail
Topic 7
just don twitter
Topic 8
que não se
Topic 9
insta snap god
Topic 0
la que en
Topic 1
just stan bts
Topic 2
nan com https
Topic 3
ig love ela
Topic 4
ll insta girl
Topic 5
world like 20
Topic 6
fan account loves
Topic 7
mom wife god
Topic 8
love don trump
Topic 9
twitter art blacklivesmatter
