In [52]:
import pandas as pd
import numpy as np

import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
import spacy

from textblob import TextBlob, Word

import re

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

Edited version of AOC to make it more organized

In [53]:
pd.set_option('display.max_colwidth', 400)  # or 199

Now, preprocessing (using same code as for kickstarter as baseline, come back here to tweak later)

Look into https://pypi.org/project/tweet-preprocessor/ for tweet processing later!

In [54]:
nlp = spacy.load('en', disable=['parser', 'ner'])
# nlp = spacy.load('en')

In [55]:
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    #lemmatize with Spacy
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    
    return text

In [60]:
tweet_stopwords = stopwords.words('english') + \
    ['rt', 'https', 'http', 'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 
     'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another', 'completely', 'thanks', 'way', 
     'find', 'used', 'thing', '2019', 'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new', 
     'day', 'days', 'captain', 'marvel', 'mcu', 'captainmarvel', 'pron']

In [57]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [58]:
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_0.pickle', 'rb') as f:
    df_0 = pickle.load(f)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_1.pickle', 'rb') as f:
    df_1 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_2.pickle', 'rb') as f:
    df_2 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_3.pickle', 'rb') as f:
    df_3 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_4.pickle', 'rb') as f:
    df_4 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_5.pickle', 'rb') as f:
    df_5 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_6.pickle', 'rb') as f:
    df_6 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_7.pickle', 'rb') as f:
    df_7 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_11.pickle', 'rb') as f:
    df_11 = pickle.load(f)
    
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/df_15.pickle', 'rb') as f:
    df_15 = pickle.load(f)

In [59]:
df_0['final_text'] = np.where((df_0['retweet_text'].isnull() == False), df_0['retweet_text'], df_0['main_text'])
df_1['final_text'] = np.where((df_1['retweet_text'].isnull() == False), df_1['retweet_text'], df_1['main_text'])
df_2['final_text'] = np.where((df_2['retweet_text'].isnull() == False), df_2['retweet_text'], df_2['main_text'])
df_3['final_text'] = np.where((df_3['retweet_text'].isnull() == False), df_3['retweet_text'], df_3['main_text'])
df_4['final_text'] = np.where((df_4['retweet_text'].isnull() == False), df_4['retweet_text'], df_4['main_text'])
df_5['final_text'] = np.where((df_5['retweet_text'].isnull() == False), df_5['retweet_text'], df_5['main_text'])
df_6['final_text'] = np.where((df_6['retweet_text'].isnull() == False), df_6['retweet_text'], df_6['main_text'])
df_7['final_text'] = np.where((df_7['retweet_text'].isnull() == False), df_7['retweet_text'], df_7['main_text'])
df_11['final_text'] = np.where((df_11['retweet_text'].isnull() == False), df_11['retweet_text'], df_11['main_text'])
df_15['final_text'] = np.where((df_15['retweet_text'].isnull() == False), df_15['retweet_text'], df_15['main_text'])

RUN ALL BELOW ONCE PREV IS FINISHED!

In [61]:
df_0['tweet_processed'] = df_0['final_text'].apply(lambda x:pre_process(x))
df_1['tweet_processed'] = df_1['final_text'].apply(lambda x:pre_process(x))
df_2['tweet_processed'] = df_2['final_text'].apply(lambda x:pre_process(x))
df_3['tweet_processed'] = df_3['final_text'].apply(lambda x:pre_process(x))
df_4['tweet_processed'] = df_4['final_text'].apply(lambda x:pre_process(x))
df_5['tweet_processed'] = df_5['final_text'].apply(lambda x:pre_process(x))
df_6['tweet_processed'] = df_6['final_text'].apply(lambda x:pre_process(x))
df_7['tweet_processed'] = df_7['final_text'].apply(lambda x:pre_process(x))
df_11['tweet_processed'] = df_11['final_text'].apply(lambda x:pre_process(x))
df_15['tweet_processed'] = df_15['final_text'].apply(lambda x:pre_process(x))

In [62]:
with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_0_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_0, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_1_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_1, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_2_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_2, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_3_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_3, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_4_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_4, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_5_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_5, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_6_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_6, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_7_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_7, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_11_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_11, to_write)

with open('/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text/df_15_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_15, to_write)


In [63]:
X_0 = df_1['tweet_processed']
X_1 = df_1['tweet_processed']
X_2 = df_2['tweet_processed']
X_3 = df_3['tweet_processed']
X_4 = df_4['tweet_processed']
X_5 = df_5['tweet_processed']
X_6 = df_6['tweet_processed']
X_7 = df_7['tweet_processed']
X_11 = df_11['tweet_processed']
X_15 = df_15['tweet_processed']


In [64]:
tfidf_0 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_1 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_2 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_3 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_4 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_5 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_6 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_7 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_11 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_15 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))


Community_0

In [65]:
bag_of_words_0 = tfidf_0.fit_transform(X_0)
lsa_0 = TruncatedSVD(20)
doc_topic_0 = lsa_0.fit_transform(bag_of_words_0)
lsa_0.explained_variance_ratio_

array([0.28967975, 0.06303829, 0.07126341, 0.06829032, 0.04576414,
       0.03643453, 0.01060932, 0.00745032, 0.0059875 , 0.00447338,
       0.00461396, 0.00426893, 0.00413396, 0.00368238, 0.00301009,
       0.00298623, 0.00297964, 0.00271909, 0.00250638, 0.00252132])

In [66]:
display_topics(lsa_0, tfidf_0.get_feature_names(), 10)


Topic  0
co fvumqtm, fvumqtm, fvumqtm hj, hj, without context, without, context co, context, co, thor

Topic  1
widow, black widow, black, thor, widow train, thor watch, co ydfekkjae, train pistol, ydfekkjae, pistol co

Topic  2
want, want home, bitch niggas, want war, utcom, niggas want, home want, ftbp, co ftbp, ftbp utcom

Topic  3
facility endgame, co lczkximxrh, lczkximxrh, show avenger, endgame co, show, endgame, avenger facility, facility, avenger

Topic  4
co ovkuamt, ovkuamt xk, ovkuamt, thor battlefield, xk, battlefield co, battlefield, thor, co, thor co

Topic  5
fire, fire gun, widow fire, co vrq, facility thor, gun avenger, vrq, gun, thor co, black widow

Topic  6
business co, co jvb, avenger distract, distract thanos, tw, jvb, jvb tw, handle business, thor handle, thanos wait

Topic  7
mf bitch, mf, bitch, bad bitch, bad, really bitch, damn actually, drop international, actually drop, mf flex

Topic  8
worth, worth watch, worth good, watch, worth hype, go, good, worth av

Community_1

In [67]:
bag_of_words_1 = tfidf_1.fit_transform(X_1)
lsa_1 = TruncatedSVD(20)
doc_topic_1 = lsa_1.fit_transform(bag_of_words_1)
lsa_1.explained_variance_ratio_

array([0.28967975, 0.06303829, 0.07126341, 0.06829032, 0.04576414,
       0.03643453, 0.01060932, 0.00745032, 0.00598745, 0.00447332,
       0.00461379, 0.00426922, 0.00413456, 0.00368372, 0.00302664,
       0.00299894, 0.00297355, 0.00267876, 0.0025251 , 0.00254885])

In [68]:
display_topics(lsa_1, tfidf_1.get_feature_names(), 10)


Topic  0
co fvumqtm, fvumqtm, fvumqtm hj, hj, without context, without, context co, context, co, thor

Topic  1
widow, black widow, black, thor, widow train, thor watch, co ydfekkjae, train pistol, ydfekkjae, pistol co

Topic  2
want, want home, bitch niggas, want war, utcom, niggas want, home want, ftbp, co ftbp, ftbp utcom

Topic  3
facility endgame, co lczkximxrh, lczkximxrh, show avenger, endgame co, show, endgame, avenger facility, facility, avenger

Topic  4
co ovkuamt, ovkuamt xk, ovkuamt, thor battlefield, xk, battlefield co, battlefield, thor, co, thor co

Topic  5
fire, fire gun, widow fire, co vrq, facility thor, gun avenger, vrq, gun, thor co, black widow

Topic  6
jvb tw, avenger distract, distract thanos, tw, thor handle, jvb, co jvb, business co, handle business, thanos wait

Topic  7
mf bitch, mf, bitch, bad bitch, bad, really bitch, drop international, damn actually, mf flex, actually drop

Topic  8
worth, worth watch, worth good, watch, worth hype, go, jbendana give,

Community_2

In [69]:
bag_of_words_2 = tfidf_2.fit_transform(X_2)
lsa_2 = TruncatedSVD(20)
doc_topic_2 = lsa_2.fit_transform(bag_of_words_2)
lsa_2.explained_variance_ratio_

array([0.04040905, 0.0221644 , 0.01081053, 0.01120958, 0.00828544,
       0.00769537, 0.00721827, 0.00574295, 0.00620663, 0.00578497,
       0.00592726, 0.00588957, 0.00564478, 0.00555213, 0.00538718,
       0.00532296, 0.00503983, 0.00489027, 0.00436428, 0.00413235])

In [70]:
display_topics(lsa_2, tfidf_2.get_feature_names(), 10)


Topic  0
weekend higherfurtherfaster, open weekend, higherfurtherfaster co, higherfurtherfaster, fan, open, weekend, brielarson pop, night surprise, theater saturday

Topic  1
ultimate experience, popcorn soda, soda, help fan, fan popcorn, ultimate, popcorn, help, experience, experience brielarson

Topic  2
studio, co, ticket, ticket co, ulr, co ulr, theater ticket, theater, ulr lzx, lzx co

Topic  3
lee co, stan lee, lee, stan, change, intro, image stan, intro image, ivh, co ivh

Topic  4
sveuvf, wpy sveuvf, boom congrat, co wpy, well deserve, wpy, brielarson well, boom, deserve co, congrat brielarson

Topic  5
uylpevb, co uylpevb, intro roll, roll co, roll, intro, co, avengersendgame, intro co, avengersendgame theater

Topic  6
avengersendgame, avengersendgame theater, studio avengersendgame, theater april, april co, april, whatev watch, trailer studio, brand trailer, watch brand

Topic  7
co uugwmtwqt, uugwmtwqt, proud co, proud, co, context, context co, avengersendgame co, thor, t

Community_3

In [71]:
bag_of_words_3 = tfidf_3.fit_transform(X_3)
lsa_3 = TruncatedSVD(20)
doc_topic_3 = lsa_3.fit_transform(bag_of_words_3)
lsa_3.explained_variance_ratio_

array([0.12655283, 0.09293576, 0.09037782, 0.08112643, 0.07417496,
       0.06547991, 0.04233742, 0.03303307, 0.02231412, 0.0148271 ,
       0.01324264, 0.01187566, 0.01167761, 0.01036296, 0.00945058,
       0.00795511, 0.00739407, 0.00586535, 0.00546535, 0.00538644])

In [72]:
display_topics(lsa_3, tfidf_3.get_feature_names(), 10)


Topic  0
co wjebx, yxxpsg co, co yxxpsg, wjebx, wjebx qp, yxxpsg, qp, captainmarvelxboxsweepstake nopurchnec, xbox captainmarvelxboxsweepstake, march rule

Topic  1
pop, follow originalfunko, originalfunko chance, originalfunko, chance win, win, follow, funkowomenofpower, chance, funkowomenofpower internationalwomensday

Topic  2
box, box close, order mcc, mcc box, mcc, box order, win collector, close march, corp box, collector corp

Topic  3
lwzcakbfsy, co lwzcakbfsy, win walmart, walmart exclusive, walmart, exclusive pop, pop co, exclusive, wtrgxd, co kne

Topic  4
co kne, wtrgxd, kne wtrgxd, kne, pop goosethecat, cat pop, win goose, goosethecat co, goosethecat, goose cat

Topic  5
uxvbxgpunv, co uxvbxgpunv, win chase, pop funkowomenofpower, chase pop, chase, funkowomenofpower internationalwomensday, internationalwomensday co, internationalwomensday, funkowomenofpower

Topic  6
nkyuxukvwf, co nkyuxukvwf, exclusive korath, korath pop, pop funkoeccc, win eccc, funkoeccc co, eccc exclu

Community_4

In [73]:
bag_of_words_4 = tfidf_4.fit_transform(X_4)
lsa_4 = TruncatedSVD(20)
doc_topic_4 = lsa_4.fit_transform(bag_of_words_4)
lsa_4.explained_variance_ratio_

array([0.04360452, 0.04029493, 0.02579917, 0.01700832, 0.01770867,
       0.01649673, 0.01333475, 0.01209863, 0.01132071, 0.00974927,
       0.00907375, 0.00920448, 0.00729515, 0.00730978, 0.00709057,
       0.00603571, 0.00564932, 0.00547563, 0.00539442, 0.00541835])

In [74]:
display_topics(lsa_4, tfidf_4.get_feature_names(), 10)


Topic  0
brielarson, model draw, xd enjoy, draw xd, since star, credit sailor, moon role, ohfhak, co ohfhak, brielarson credit

Topic  1
ykreclow, co ykreclow, brielarson president, president co, president, brielarson, co, hvuzhv co, hvuzhv, motherfuck love

Topic  2
war, ragnarok historical, operation homecoming, war overpopulation, terrorism ragnarok, military operation, historical revisionism, homecoming unemployment, man sponsor, scarce resource

Topic  3
hvuzhv, xwhbaqfvw, co xwhbaqfvw, co hvuzhv, motherfuck love, hvuzhv co, motherfuck, love samuel, jackson co, samuel jackson

Topic  4
hijab marvelstudios, co xdlktuc, marvelstudios badass, decide design, xdlktuc, design hijab, hero high, badass hero, hijab, faster co

Topic  5
reference, movie, spend reference, therealstanlee shout, friend therealstanlee, universe survive, shout part, lifetime spend, mess lifetime, marvelstudio friend

Topic  6
hashtag, click hashtag, cat background, trend hashtag, hashtag welcome, um korea, back

Community_5

In [75]:
bag_of_words_5 = tfidf_5.fit_transform(X_5)
lsa_5 = TruncatedSVD(20)
doc_topic_5 = lsa_5.fit_transform(bag_of_words_5)
lsa_5.explained_variance_ratio_

array([0.05224814, 0.06405837, 0.05646166, 0.04148708, 0.04522894,
       0.03533295, 0.02914876, 0.03245864, 0.03206032, 0.0315328 ,
       0.03169025, 0.02287117, 0.02199024, 0.01752033, 0.01726561,
       0.01293848, 0.00937351, 0.00947109, 0.00927093, 0.00842558])

In [76]:
display_topics(lsa_5, tfidf_5.get_feature_names(), 10)


Topic  0
thor, thor never, never aim, rws, aim, aim head, head avengersendgame, flinch thor, co rws, flinch

Topic  1
co ybavku, ybavku, scar co, scar, co, logo co, inspire logo, co rnuixscvsh, rnuixscvsh, mavel inspire

Topic  2
iron man, iron, man, avenger iron, avenger, man incredible, incredible hulk, hulk, incredible, america first

Topic  3
rnuixscvsh, mavel, mavel inspire, inspire logo, co rnuixscvsh, logo co, inspire, logo, summarize, py eh

Topic  4
summarize, summarize video, py eh, co py, eh yk, py, eh, yk, video co, video

Topic  5
jb, paradigm, hall movie, gsc paradigm, gsc, hall, seat, paradigm mall, time seat, place gsc

Topic  6
loyal, loyal bestie, co xrlaxmo, xrlaxmo, bestie co, bestie, co, leave credit, still people, co pyqrf

Topic  7
leave credit, pyqrf rvkp, co pyqrf, rvkp, still people, already st, pyqrf, movie yet, yet still, people leave

Topic  8
never nick, bct, fury singing, mmmi, mmmi bct, co mmmi, ring co, singing ring, singing, nick fury

Topic  9
kiky, 

Community_6

In [77]:
bag_of_words_6 = tfidf_6.fit_transform(X_6)
lsa_6 = TruncatedSVD(20)
doc_topic_6 = lsa_6.fit_transform(bag_of_words_6)
lsa_6.explained_variance_ratio_

array([0.03100691, 0.01792091, 0.01688233, 0.01516062, 0.01164227,
       0.01140538, 0.0110262 , 0.01123518, 0.00997816, 0.00919846,
       0.00808808, 0.00724714, 0.00679672, 0.0059257 , 0.00535652,
       0.00504265, 0.00457967, 0.00460596, 0.00432974, 0.00417823])

In [78]:
display_topics(lsa_6, tfidf_6.get_feature_names(), 10)


Topic  0
woman immediately, son come, room seem, kind bigotry, police win, celebrate utterly, win tolerate, patronizing counterproductive, counterproductive celebrate, tolerate kind

Topic  1
df ivjzzu, opening monologue, review perfection, ivjzzu, monologue rlm, monologue, rlm review, perfection co, co df, rlm

Topic  2
troop, thank, name thank, troop real, superhero troop, midnight tonight, back join, add name, hi samuel, live american

Topic  3
critic, movie, co pzyk, opinion fit, pzyk, pzyk uwcws, afraid hate, lie happy, life critic, internationalwomensday female

Topic  4
nuke, pic, audience, review, pic left, pic right, rat order, rating pic, right minute, rottentomatoe mass

Topic  5
woman co, base real, exploit supreme, life exploit, supreme leader, jong un, leader kim, popular western, kim jong, un thinly

Topic  6
plot, hole many, give poorly, plot plot, write jumpy, go regret, contrivance, plot contrivance, regret movie, many plot

Topic  7
trump, planet, supporter complici

Community_7

In [79]:
bag_of_words_7 = tfidf_7.fit_transform(X_7)
lsa_7 = TruncatedSVD(20)
doc_topic_7 = lsa_7.fit_transform(bag_of_words_7)
lsa_7.explained_variance_ratio_

array([0.15482183, 0.05758228, 0.04712704, 0.03127667, 0.03007945,
       0.03032132, 0.02429275, 0.02356071, 0.02161187, 0.01809549,
       0.01654174, 0.0148685 , 0.01147911, 0.01005909, 0.00853592,
       0.00805964, 0.00781883, 0.00722712, 0.00695422, 0.00671655])

In [80]:
display_topics(lsa_7, tfidf_7.get_feature_names(), 10)


Topic  0
really, want boring, boring bland, cat really, bland sam, jackson cat, really want, sam jackson, sam, boring

Topic  1
go avenger, theater ashamed, endgame time, total badass, love total, damn excited, badass post, dawg op, excited go, op love

Topic  2
good, woman good, unpopular opinion, unpopular, opinion wonder, wonder woman, wonder, opinion, woman, popular

Topic  3
movie, tl dr, tl, magic, landmark magic, also sjw, leftish, decent shrug, female action, anything also

Topic  4
kceihtpkop, sneezing co, co kceihtpkop, sneezing, co, bar, fury, awesome, bar bud, fury converse

Topic  5
awesome, fantastic, eat healthy, home cook, meal absolutely, love awesome, session fun, good vibe, fun stream, cook meal

Topic  6
trailer time, time hour, ultimate duo, thor ultimate, duo watch, dawg thor, watch trailer, duo, hour, ultimate

Topic  7
bar, fury, hawkeye recreate, co fqgd, actual bar, yes actual, bar nick, bar bud, obviously co, fqgd sl

Topic  8
fantastic, good, meal, stream, 

Community_11

In [81]:
bag_of_words_11 = tfidf_11.fit_transform(X_11)
lsa_11 = TruncatedSVD(20)
doc_topic_11 = lsa_11.fit_transform(bag_of_words_11)
lsa_11.explained_variance_ratio_

array([0.12003292, 0.11778732, 0.09167505, 0.04478375, 0.02790912,
       0.02099542, 0.0185694 , 0.01806148, 0.01352641, 0.01236449,
       0.01105531, 0.01105033, 0.01030782, 0.00991043, 0.00930064,
       0.00778772, 0.00748445, 0.00621288, 0.00607933, 0.00589165])

In [82]:
display_topics(lsa_11, tfidf_11.get_feature_names(), 10)


Topic  0
parent, move parent, basement, female next, next move, parent basement, next, move, female, movie

Topic  1
sweat, favorite part, uhhh marveling, uhhh, sweat uhhh, love favorite, date love, part sweat, marveling, favorite

Topic  2
bad idk, whether go, neckbeard bad, pound, go review, pound dude, good hand, bunch pound, dude neckbeard, hand bunch

Topic  3
movie without, watch superhero, neckbeard boycott, dust burp, boycott nice, without cheeto, actually glad, burp, nice watch, glad neckbeard

Topic  4
whip wildly, head whip, wildly lady, whip, fun head, lady hear, wildly, head, hear, lady

Topic  5
weirdo thank, multi billion, weirdo, world embarrassing, medium review, slop, slop pretend, letter medium, corporate slop, corporate

Topic  6
aadmi, umbrella academy, everyday matlab, everyday, chupi, chupi badla, drop show, date everyday, open netflix, doll

Topic  7
ask, bohemian rhapsody, headphone, bohemian, rhapsody, ask bohemian, international, woman, international woman, 

Community_15

In [83]:
bag_of_words_15 = tfidf_15.fit_transform(X_15)
lsa_15 = TruncatedSVD(20)
doc_topic_15 = lsa_15.fit_transform(bag_of_words_15)
lsa_15.explained_variance_ratio_

array([0.44614817, 0.01694105, 0.01099905, 0.0109376 , 0.00900884,
       0.00605395, 0.00552942, 0.00491899, 0.0044471 , 0.00441569,
       0.00399101, 0.00372945, 0.00337048, 0.00308458, 0.00290979,
       0.00276216, 0.0027453 , 0.00256277, 0.00252182, 0.00235015])

In [84]:
display_topics(lsa_15, tfidf_15.get_feature_names(), 10)


Topic  0
lczhmmq, lczhmmq yu, yu, duetting, duetting ring, ring something, jackson duetting, something co, co lczhmmq, larson samuel

Topic  1
context co, context, without context, without, co fvumqtm, fvumqtm hj, fvumqtm, hj, co, spoiler

Topic  2
higherfurtherfaster co, weekend higherfurtherfaster, open weekend, higherfurtherfaster, open, fan, weekend, brielarson, theater, surprise fan

Topic  3
nine inch, nail, inch, nail shirt, inch nail, nine, shirt leather, motorcycle wear, pandering interest, pandering

Topic  4
larson slander, co otrizk, absolutely brie, timeline co, otrizk asd, slander timeline, asd, otrizk, slander, timeline

Topic  5
originalfunko, follow originalfunko, originalfunko chance, chance win, chance, follow, win, pop, funkowomenofpower, funkowomenofpower internationalwomensday

Topic  6
cool, really, time, go, watch, buy, help, really show, date th, hey boyfriend

Topic  7
go, watch, good, go watch, want, want go, wanna, feel, cinema, still

Topic  8
box, box off

Now I have my top words in each LSA object w/ 20 topics each. I'm going to pickle each model and then use a new file to:
 - Find which tweets score the highest in each topic to get examples
 - Name communities
 - Sentiment analysis (maybe weekend)

In [85]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_0.pickle', 'wb') as to_write:
    pickle.dump(lsa_0, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_1.pickle', 'wb') as to_write:
    pickle.dump(lsa_1, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_2.pickle', 'wb') as to_write:
    pickle.dump(lsa_2, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_3.pickle', 'wb') as to_write:
    pickle.dump(lsa_3, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_4.pickle', 'wb') as to_write:
    pickle.dump(lsa_4, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_5.pickle', 'wb') as to_write:
    pickle.dump(lsa_5, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_6.pickle', 'wb') as to_write:
    pickle.dump(lsa_6, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_7.pickle', 'wb') as to_write:
    pickle.dump(lsa_7, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_11.pickle', 'wb') as to_write:
    pickle.dump(lsa_11, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel/lsa_15.pickle', 'wb') as to_write:
    pickle.dump(lsa_15, to_write)
      

NMF Models?

In [86]:
nmf_0 = NMF(20)
doc_topic_NMF_0 = nmf_0.fit_transform(bag_of_words_0)
print(nmf_0.reconstruction_err_)

display_topics(nmf_0, tfidf_0.get_feature_names(), 10)

106.26531036149292

Topic  0
co fvumqtm, fvumqtm, fvumqtm hj, hj, without context, without, context co, context, co, context pt

Topic  1
ydfekkjae, co ydfekkjae, thor watch, train pistol, widow train, pistol co, watch black, pistol, train, watch

Topic  2
want, want home, ftbp utcom, utcom, want war, co ftbp, home want, ftbp, niggas want, bitch niggas

Topic  3
lczkximxrh, co lczkximxrh, facility endgame, show avenger, endgame co, show, endgame, avenger facility, facility, avenger

Topic  4
thor battlefield, co ovkuamt, ovkuamt xk, ovkuamt, xk, battlefield co, battlefield, thor, co, thanos thor

Topic  5
gun avenger, fire gun, facility thor, widow fire, vrq, co vrq, gun, fire, thor co, avenger facility

Topic  6
business co, tw, thor handle, avenger distract, jvb tw, distract thanos, handle business, jvb, co jvb, wait thor

Topic  7
mf bitch, mf, bitch, bad bitch, bad, really bitch, actually drop, drop international, woman mf, damn actually

Topic  8
worth, worth watch, worth good, wo

In [87]:
nmf_1 = NMF(20)
doc_topic_NMF_1 = nmf_1.fit_transform(bag_of_words_1)
print(nmf_1.reconstruction_err_)

display_topics(nmf_1, tfidf_1.get_feature_names(), 10)

106.26530968104078

Topic  0
co fvumqtm, fvumqtm, fvumqtm hj, hj, without context, without, context co, context, co, context pt

Topic  1
ydfekkjae, co ydfekkjae, thor watch, train pistol, widow train, pistol co, watch black, pistol, train, watch

Topic  2
want, want home, ftbp utcom, utcom, want war, co ftbp, home want, ftbp, niggas want, bitch niggas

Topic  3
lczkximxrh, co lczkximxrh, facility endgame, show avenger, endgame co, show, endgame, avenger facility, facility, avenger

Topic  4
thor battlefield, co ovkuamt, ovkuamt xk, ovkuamt, xk, battlefield co, battlefield, thor, co, thanos thor

Topic  5
widow fire, facility thor, fire gun, vrq, co vrq, gun avenger, gun, fire, thor co, avenger facility

Topic  6
business co, tw, thor handle, avenger distract, jvb tw, distract thanos, handle business, jvb, co jvb, wait thor

Topic  7
mf bitch, mf, bitch, bad bitch, bad, really bitch, woman mf, damn actually, actually drop, mf flex

Topic  8
worth, worth watch, worth good, worth hype, w

In [88]:
nmf_2 = NMF(20)
doc_topic_NMF_2 = nmf_2.fit_transform(bag_of_words_2)
print(nmf_2.reconstruction_err_)

display_topics(nmf_2, tfidf_2.get_feature_names(), 10)

568.2097616940998

Topic  0
htohktois, co htohktois, pop theater, brielarson pop, night surprise, theater saturday, saturday night, fan open, surprise fan, saturday

Topic  1
ultimate experience, popcorn soda, soda, help fan, fan popcorn, ultimate, popcorn, help, experience, experience brielarson

Topic  2
studio play, play theater, play, theater ticket, moment studio, wait moment, ticket co, ticket, studio, moment

Topic  3
image stan, intro image, ivh ok, ivh, co ivh, change intro, image, ok, lee co, intro

Topic  4
sveuvf, wpy sveuvf, boom congrat, co wpy, well deserve, wpy, brielarson well, boom, deserve co, congrat brielarson

Topic  5
uylpevb, co uylpevb, intro roll, roll co, roll, intro, co, intro co, logo intro, rcx

Topic  6
avengersendgame theater, studio avengersendgame, theater april, april co, april, whatev watch, trailer studio, brand trailer, watch brand, ncamd jspp

Topic  7
uugwmtwqt, co uugwmtwqt, proud co, proud, co, brielarson, congratulation, brielarson proud, powe

In [89]:
nmf_3 = NMF(20)
doc_topic_NMF_3 = nmf_3.fit_transform(bag_of_words_3)
print(nmf_3.reconstruction_err_)

display_topics(nmf_3, tfidf_3.get_feature_names(), 10)

177.25178740857922

Topic  0
yxxpsg co, yxxpsg, co wjebx, co yxxpsg, wjebx qp, wjebx, qp, captainmarvelxboxsweepstake nopurchnec, xbox captainmarvelxboxsweepstake, captainmarvelxboxsweepstake

Topic  1
mkgxe, co mkgxe, funkowomenofpower co, motorcycle pop, danver motorcycle, win carol, pop ride, ride funkowomenofpower, motorcycle, ride

Topic  2
box, sh vfhn, co qvshhlrzh, vfhn co, vfhn, qvshhlrzh, co sh, sh, order mcc, box close

Topic  3
lwzcakbfsy, co lwzcakbfsy, win walmart, walmart exclusive, walmart, exclusive pop, pop co, exclusive, pop, originalfunko chance

Topic  4
kne wtrgxd, wtrgxd, co kne, kne, pop goosethecat, cat pop, win goose, goosethecat co, goosethecat, goose cat

Topic  5
uxvbxgpunv, co uxvbxgpunv, pop funkowomenofpower, win chase, chase pop, chase, funkowomenofpower internationalwomensday, internationalwomensday co, internationalwomensday, funkowomenofpower

Topic  6
co nkyuxukvwf, nkyuxukvwf, korath pop, pop funkoeccc, exclusive korath, win eccc, funkoeccc co, ecc

In [90]:
nmf_4 = NMF(20)
doc_topic_NMF_4 = nmf_4.fit_transform(bag_of_words_4)
print(nmf_4.reconstruction_err_)

display_topics(nmf_4, tfidf_4.get_feature_names(), 10)

420.9617840379525

Topic  0
model draw, draw xd, xd enjoy, since star, credit sailor, moon role, ohfhak, co ohfhak, brielarson credit, star brielarson

Topic  1
co ykreclow, ykreclow, brielarson president, president co, president, brielarson, co, brielarson co, co pjfmhvhnzc, pjfmhvhnzc

Topic  2
war, overpopulation scarce, operation homecoming, scarce resource, ragnarok historical, revisionism winter, terrorism ragnarok, war overpopulation, military operation, unemployment infinity

Topic  3
hvuzhv, xwhbaqfvw, motherfuck love, co hvuzhv, co xwhbaqfvw, hvuzhv co, motherfuck, love samuel, jackson co, samuel jackson

Topic  4
hijab marvelstudios, design hijab, xdlktuc, co xdlktuc, marvelstudios badass, decide design, hero high, badass hero, hijab, faster co

Topic  5
reference, movie, shout part, marvelstudio friend, spend reference, therealstanlee shout, mess lifetime, friend therealstanlee, universe survive, lifetime spend

Topic  6
hashtag, cat background, click hashtag, korea trend, 

In [91]:
nmf_5 = NMF(20)
doc_topic_NMF_5 = nmf_5.fit_transform(bag_of_words_5)
print(nmf_5.reconstruction_err_)

display_topics(nmf_5, tfidf_5.get_feature_names(), 10)

110.3096285597803

Topic  0
head avengersendgame, rws, aim, co rws, never aim, thor never, flinch thor, aim head, flinch, head

Topic  1
ybavku, co ybavku, scar co, scar, co, goosethecat, goosethecat co, brielarson, meow, zpumpa

Topic  2
avenger iron, iron, iron man, man, avenger, thor, hulk ironman, order america, movie chronological, thor dark

Topic  3
rnuixscvsh, inspire logo, mavel, mavel inspire, co rnuixscvsh, logo co, inspire, logo, co, pivn

Topic  4
co py, summarize, eh yk, py eh, summarize video, py, eh, yk, video co, video

Topic  5
jb, gsc paradigm, paradigm, hall movie, gsc, hall, seat, paradigm mall, time seat, mall jb

Topic  6
co xrlaxmo, loyal bestie, loyal, bestie co, xrlaxmo, bestie, co, goosethecat, goosethecat co, brielarson

Topic  7
leave credit, already st, rvkp, pyqrf, pyqrf rvkp, still people, co pyqrf, movie yet, yet still, people leave

Topic  8
fury singing, mmmi, co mmmi, never nick, mmmi bct, bct, ring co, singing ring, singing, nick fury

Topic  9
kiky

In [92]:
nmf_6 = NMF(20)
doc_topic_NMF_6 = nmf_6.fit_transform(bag_of_words_6)
print(nmf_6.reconstruction_err_)

display_topics(nmf_6, tfidf_6.get_feature_names(), 10)

234.55316232497648

Topic  0
woman immediately, room seem, win tolerate, patronizing counterproductive, seem patronizing, kind bigotry, son come, tolerate kind, celebrate utterly, counterproductive celebrate

Topic  1
monologue, monologue rlm, ivjzzu, df ivjzzu, review perfection, rlm review, opening monologue, perfection co, co df, rlm

Topic  2
troop, thank, live american, join thank, thank card, name thank, troop real, hi samuel, back join, troop add

Topic  3
critic, movie, afraid hate, pzyk, critic tailor, fit social, co pzyk, lie happy, internationalwomensday female, tailor

Topic  4
nuke, pic, audience, review, morning pic, pic right, abysmally, co lhjglexldv, mass nuke, right minute

Topic  5
un thinly, leader kim, caucasian woman, base real, jong, jong un, western film, disguise caucasian, thinly disguise, exploit supreme

Topic  6
plot, go regret, hole many, give poorly, write jumpy, plot plot, contrivance, plot contrivance, regret movie, many plot

Topic  7
trump, supporter 

In [93]:
nmf_7 = NMF(20)
doc_topic_NMF_7 = nmf_7.fit_transform(bag_of_words_7)
print(nmf_7.reconstruction_err_)

display_topics(nmf_7, tfidf_7.get_feature_names(), 10)

54.656090065763465

Topic  0
really, boring bland, want boring, jackson cat, bland sam, cat really, really want, sam jackson, sam, boring

Topic  1
op love, damn excited, time theater, badass post, total badass, endgame time, dawg op, scene damn, love total, excited go

Topic  2
unpopular opinion, unpopular, opinion wonder, opinion, woman good, wonder woman, wonder, woman, good, woman gt

Topic  3
movie, tl dr, tl, leftish, magic, movie half, landmark, half decent, leftish trash, propaganda brainwashing

Topic  4
sneezing, co kceihtpkop, kceihtpkop, sneezing co, co, studio, context, context co, co uugwmtwqt, proud co

Topic  5
awesome, awesome fact, jpyo jyppa, jpyo, jyppa, co jpyo, awesome co, awesome end, wait, fact

Topic  6
ultimate duo, thor ultimate, dawg thor, duo watch, time hour, trailer time, watch trailer, duo, hour, trailer

Topic  7
bar, fury, fqgd, hawkeye recreate, fury converse, bar bud, sl, bar nick, converse, yes actual

Topic  8
fantastic, old, afternoon, lan start, 

In [94]:
nmf_11 = NMF(20)
doc_topic_NMF_11 = nmf_11.fit_transform(bag_of_words_11)
print(nmf_11.reconstruction_err_)

display_topics(nmf_11, tfidf_11.get_feature_names(), 10)

25.67483560150584

Topic  0
parent basement, move parent, female next, basement, next move, parent, next, move, female, movie

Topic  1
date love, love favorite, favorite part, uhhh, marveling, sweat, sweat uhhh, part sweat, uhhh marveling, favorite

Topic  2
pound, dude neckbeard, bunch pound, whether go, bad idk, good hand, neckbeard bad, pound dude, go review, hand bunch

Topic  3
without cheeto, burp, glad neckbeard, neckbeard boycott, actually glad, boycott nice, dust burp, watch superhero, movie without, nice watch

Topic  4
whip, wildly lady, fun head, lady hear, head whip, whip wildly, wildly, head, hear, lady

Topic  5
embarrassing nightmare, corporate slop, nightmare everyone, letter medium, nightmare, thank world, social justice, social, weirdo thank, world embarrassing

Topic  6
aadmi, kaam na, kare kuch, keep drop, badla, badla open, kuch, kuch bus, everyday matlab, everyday

Topic  7
ask, bohemian, rhapsody, headphone, ask bohemian, bohemian rhapsody, international, woman

In [95]:
nmf_15 = NMF(20)
doc_topic_NMF_15 = nmf_15.fit_transform(bag_of_words_15)
print(nmf_15.reconstruction_err_)

display_topics(nmf_15, tfidf_15.get_feature_names(), 10)

46.841353650513895

Topic  0
something co, lczhmmq yu, lczhmmq, duetting ring, duetting, yu, jackson duetting, ring something, co lczhmmq, larson samuel

Topic  1
context co, context, without context, hj, co fvumqtm, fvumqtm, fvumqtm hj, without, co, spoiler

Topic  2
surprise fan, saturday night, theater saturday, night surprise, fan open, pop theater, brielarson pop, surprise, saturday, weekend higherfurtherfaster

Topic  3
nine inch, inch, inch nail, nail shirt, nail, nine, offensive pandering, answer deeply, sue scene, garbage play

Topic  4
absolutely brie, co otrizk, larson slander, otrizk asd, asd, otrizk, slander timeline, timeline co, slander, timeline

Topic  5
originalfunko, originalfunko chance, follow originalfunko, chance win, chance, follow, win, pop, funkowomenofpower, funkowomenofpower internationalwomensday

Topic  6
cool, time, really show, time also, also thank, boyfriend cool, mkgjpsyofn co, th time, nsfokb, cool buy

Topic  7
go, wanna, go watch, want go, go go, w

In [96]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_0.pickle', 'wb') as to_write:
    pickle.dump(nmf_0, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_1.pickle', 'wb') as to_write:
    pickle.dump(nmf_1, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_2.pickle', 'wb') as to_write:
    pickle.dump(nmf_2, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_3.pickle', 'wb') as to_write:
    pickle.dump(nmf_3, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_4.pickle', 'wb') as to_write:
    pickle.dump(nmf_4, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_5.pickle', 'wb') as to_write:
    pickle.dump(nmf_5, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_6.pickle', 'wb') as to_write:
    pickle.dump(nmf_6, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_7.pickle', 'wb') as to_write:
    pickle.dump(nmf_7, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_11.pickle', 'wb') as to_write:
    pickle.dump(nmf_11, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel/nmf_15.pickle', 'wb') as to_write:
    pickle.dump(nmf_15, to_write)


    

Running Topic Modeling on ALL text data, both NMF and LSA, on various topic counts, to feed into DBScan / KMeans for cluster analysis

In [97]:
# ## Creating list of the processed text files to merge, so it's all in the same format and can re-use code
# dfs_to_merge = [df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_26, df_40]

In [98]:
# total = 0

# for df in dfs_to_merge:
#     print(df.shape)
#     total += df.shape[0]
    
# print(total)

In [99]:
# df_all_preprocessed = pd.concat(dfs_to_merge)

In [100]:
# df_all_preprocessed.shape

In [101]:
# df_all = df_all_preprocessed.copy()

In [102]:
# df_all.head()

In [103]:
# with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'wb') as to_write:
#     pickle.dump(df_all, to_write)

In [104]:
# with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'rb') as f:
#     df_all = pickle.load(f)

So the concat worked, the # of rows equals the number of all the rows added up (just to be safe)

Now I'll run a few different LSAs and NMFs on the total data, and move on to clustering

# Preprocess Text - for both LSA and NMF

In [105]:
# # Can come back to tweek this if I notice anything
# tweet_stopwords = stopwords.words('english') + \
#     ['rt', 'https', 'http', 'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 
#      'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another', 'completely', 'thanks', 'way', 
#      'find', 'used', 'thing', '2019', 'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new', 
#      'day', 'days', 'aoc', 'alexandria', 'ocasio', 'cortez', 'ocasio-cortez', 'pron']

In [106]:
# tfidf_all = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))

In [107]:
# X_all = df_all['tweet_processed']

In [108]:
# bag_of_words_all = tfidf_all.fit_transform(X_all)

In [109]:
# type(bag_of_words_all)

# LSA on all text
*10, 15, 20, 25 topics*

In [110]:
# # LSA on 10 topics
# lsa_all_10 = TruncatedSVD(10)
# lsa_all_10.fit_transform(bag_of_words_all)
# lsa_all_10.explained_variance_ratio_

In [111]:
# display_topics(lsa_all_10, tfidf_all.get_feature_names(), 10)

In [112]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_10.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_10, to_write)

---------

In [113]:
# # LSA on 15 topics
# lsa_all_15 = TruncatedSVD(15)
# lsa_all_15.fit_transform(bag_of_words_all)
# lsa_all_15.explained_variance_ratio_

In [114]:
# display_topics(lsa_all_15, tfidf_all.get_feature_names(), 10)

In [115]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_15.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_15, to_write)

---------

In [116]:
# # LSA on 20 topics
# lsa_all_20 = TruncatedSVD(20)
# lsa_all_20.fit_transform(bag_of_words_all)
# lsa_all_20.explained_variance_ratio_

In [117]:
# display_topics(lsa_all_20, tfidf_all.get_feature_names(), 10)

In [118]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_20.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_20, to_write)

---------

In [119]:
# # LSA on 25 topics
# lsa_all_25 = TruncatedSVD(25)
# lsa_all_25.fit_transform(bag_of_words_all)
# lsa_all_25.explained_variance_ratio_

In [120]:
# display_topics(lsa_all_25, tfidf_all.get_feature_names(), 10)

In [121]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_25.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_25, to_write)

# NMF on all text
*10, 15, 20, 25 topics*

In [122]:
# # NMF on 10 topics
# nmf_all_10 = NMF(10)
# nmf_all_10.fit_transform(bag_of_words_all)
# print(nmf_all_10.reconstruction_err_)

In [123]:
# display_topics(nmf_all_10, tfidf_all.get_feature_names(), 10)

In [124]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_10.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_10, to_write)

---------

In [125]:
# # NMF on 15 topics
# nmf_all_15 = NMF(15)
# nmf_all_15.fit_transform(bag_of_words_all)
# print(nmf_all_15.reconstruction_err_)

In [126]:
# display_topics(nmf_all_15, tfidf_all.get_feature_names(), 10)

In [127]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_15.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_15, to_write)

---------

In [128]:
# # NMF on 20 topics
# nmf_all_20 = NMF(20)
# nmf_all_20.fit_transform(bag_of_words_all)
# print(nmf_all_20.reconstruction_err_)

In [129]:
# display_topics(nmf_all_20, tfidf_all.get_feature_names(), 10)

In [130]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_20.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_20, to_write)

---------

In [131]:
# # NMF on 25 topics
# nmf_all_25 = NMF(25)
# nmf_all_25.fit_transform(bag_of_words_all)
# print(nmf_all_25.reconstruction_err_)

# ## setting a variable to the fit transformed

In [132]:
# nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all))
# nmf_model_25_df.shape

In [133]:
# print(nmf_all_25.reconstruction_err_)
# nmf_model_25_df.head()

NOTE TO SELF - if the above works, I'll need to make DFs of all versions again (which will take around another hour) :(

Then I'll concat them with index and screen name, so I can assign values back to the tweeters otherwise. One annoying thing is I didn't pass in tweet ID so I'll have to think about that too

In [134]:
# display_topics(nmf_all_25, tfidf_all.get_feature_names(), 10)

In [135]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_25.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_25, to_write)

Now I will create DF's where I append this info to a dataframe

then try dbscan with these

In [136]:
# df_topic_base = df_all.reset_index()
# df_topic_base.head()

crap - now I know why I saved the fit_transform(bag_of_words_ objects previously, that's what I need to use to create the df. For now I'm seeing if I can just call that as the "data" and see if that works. Otherwise I will need to set them all equal to variables

In [137]:
# df_topic_base.shape

In [138]:
# lsa_all_10 = TruncatedSVD(10)

# lsa_model_10_df = pd.DataFrame(data=lsa_all_10.fit_transform(bag_of_words_all))
# lsa_model_10_df.shape

In [139]:
# lsa_all_15 = TruncatedSVD(15)

# lsa_model_15_df = pd.DataFrame(data=lsa_all_15.fit_transform(bag_of_words_all))
# lsa_model_15_df.shape

In [140]:
# lsa_all_20 = TruncatedSVD(20)

# lsa_model_20_df = pd.DataFrame(data=lsa_all_20.fit_transform(bag_of_words_all))
# lsa_model_20_df.shape

In [141]:
# lsa_all_25 = TruncatedSVD(25)

# lsa_model_25_df = pd.DataFrame(data=lsa_all_25.fit_transform(bag_of_words_all))
# lsa_model_25_df.shape

In [142]:
# df_lsa_model_10 = pd.merge(df_topic_base, lsa_model_10_df, left_index=True, right_index=True)
# df_lsa_model_15 = pd.merge(df_topic_base, lsa_model_15_df, left_index=True, right_index=True)
# df_lsa_model_20 = pd.merge(df_topic_base, lsa_model_20_df, left_index=True, right_index=True)
# df_lsa_model_25 = pd.merge(df_topic_base, lsa_model_25_df, left_index=True, right_index=True)

---------

In [143]:
# nmf_all_10 = NMF(10)

# nmf_model_10_df = pd.DataFrame(data=nmf_all_10.fit_transform(bag_of_words_all))
# nmf_model_10_df.shape

In [144]:
# nmf_all_15 = NMF(15)

# nmf_model_15_df = pd.DataFrame(data=nmf_all_15.fit_transform(bag_of_words_all))
# nmf_model_15_df.shape

In [145]:
# nmf_all_20 = NMF(20)

# nmf_model_20_df = pd.DataFrame(data=nmf_all_20.fit_transform(bag_of_words_all))
# nmf_model_20_df.shape

In [146]:
# # nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all)) ALREADY DONE!
# nmf_model_25_df.shape

In [147]:
# df_nmf_model_10 = pd.merge(df_topic_base, nmf_model_10_df, left_index=True, right_index=True)
# df_nmf_model_15 = pd.merge(df_topic_base, nmf_model_15_df, left_index=True, right_index=True)
# df_nmf_model_20 = pd.merge(df_topic_base, nmf_model_20_df, left_index=True, right_index=True)
# df_nmf_model_25 = pd.merge(df_topic_base, nmf_model_25_df, left_index=True, right_index=True)


In [148]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_10.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_10, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_15.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_15, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_20.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_20, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_25.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_25, to_write)
    

# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_10.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_10, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_15.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_15, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_20.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_20, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_25.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_25, to_write)
    
    

    
