In [1]:
import pandas as pd
import numpy as np

import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
import spacy

from textblob import TextBlob, Word

import re

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
pd.set_option('display.max_colwidth', 400)  # or 199

I'm going to analyze the first community first, then functionize it and do the rest

In [11]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_0.pickle', 'rb') as f:
    df_0 = pickle.load(f)

In [13]:
df_0.tail()

Unnamed: 0,screen_name,followers_count,modularity_class,main_text,retweet_text,tweet_date,PageRank,rt_count
1321317,turkytomj,6183,0.0,"@ShadowsOfLiars @provdcrumblover @AOC And also why did she inject lies about her background, if she didn't want her background to be part of her ""Picture"".",,Thu Mar 07 21:29:09 +0000 2019,7.379573e-07,0
1508601,Dtruth9233,1882,0.0,RT @rising_serpent: Alexandria Ocasio-Cortez's Chief of Staff Ran Slush Fund - Funneled Over $1 Million in Campaign Donations to His Own Co…,"Alexandria Ocasio-Cortez's Chief of Staff Ran Slush Fund - Funneled Over $1 Million in Campaign Donations to His Own Companies.\nTwo PACs founded by AOC's top aide, Saikat Chakrabarti funneled over $ 1 million into two of his own private companies. https://t.co/4xgeskr941",Thu Mar 07 02:07:52 +0000 2019,7.357909e-07,140
340125,Qwarrior8,128,0.0,"RT @rising_serpent: You miss the whole point Ms Ocasio-Cortex. Wait staff are the antithesis of socialism. They get a low base pay, working…","You miss the whole point Ms Ocasio-Cortex. Wait staff are the antithesis of socialism. They get a low base pay, working mainly for tips, the most stringent pay-for-performance in the workforce. They don't ask for handouts.\nPeople aren't mocking, they're asking for an improvement. https://t.co/YXZn56wKvT",Mon Mar 11 05:47:49 +0000 2019,7.357909e-07,64
362073,kwitbelyakin,216,0.0,"RT @rising_serpent: You miss the whole point Ms Ocasio-Cortex. Wait staff are the antithesis of socialism. They get a low base pay, working…","You miss the whole point Ms Ocasio-Cortex. Wait staff are the antithesis of socialism. They get a low base pay, working mainly for tips, the most stringent pay-for-performance in the workforce. They don't ask for handouts.\nPeople aren't mocking, they're asking for an improvement. https://t.co/YXZn56wKvT",Mon Mar 11 03:44:09 +0000 2019,7.357909e-07,64
865336,kwitbelyakin,216,0.0,@davidwebbshow @SpeakerPelosi @RashidaTlaib @RepAOC @AOC @IlhanMN @TheDemocrats @RollingStone @foxnation They'd have to pay me to read that issue.,,Sat Mar 09 14:02:45 +0000 2019,7.357909e-07,0


first - if tweet is retweet, make that original tweet, so we have the full text:

In [15]:
df_0['final_text'] = np.where((df_0['retweet_text'].isnull() == False), df_0['retweet_text'], df_0['main_text'])

Now, preprocessing (using same code as for kickstarter as baseline, come back here to tweak later)

Look into https://pypi.org/project/tweet-preprocessor/ for tweet processing later!

In [41]:
nlp = spacy.load('en', disable=['parser', 'ner'])
# nlp = spacy.load('en')

In [38]:
def pre_process(text):
    
    # lowercase
    text=text.lower()
    
    #remove tags
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)

    # remove special characters and digits
    text=re.sub("(\\d|\\W)+"," ",text)
    
    #lemmatize with Spacy
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc])
    
    return text

In [57]:
tweet_stopwords = stopwords.words('english') + \
    ['rt', 'https', 'http', 'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 
     'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another', 'completely', 'thanks', 'way', 
     'find', 'used', 'thing', '2019', 'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new', 
     'day', 'days', 'aoc', 'alexandria', 'ocasio', 'cortez', 'ocasio-cortez', 'pron']

In [48]:
tweet_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [44]:
df_0['tweet_processed'] = df_0['final_text'].apply(lambda x:pre_process(x))

In [45]:
df_0.head()

Unnamed: 0,screen_name,followers_count,modularity_class,main_text,retweet_text,tweet_date,PageRank,rt_count,final_text,tweet_processed
405812,DavidJHarrisJr,75631,0.0,So @aoc answered a casting call... and now she’s in Congress! @RealJamesWoods and @realDonaldTrump have to see this!!! https://t.co/HBUW78XqAo,,Mon Mar 11 01:16:04 +0000 2019,0.004242,6936,So @aoc answered a casting call... and now she’s in Congress! @RealJamesWoods and @realDonaldTrump have to see this!!! https://t.co/HBUW78XqAo,so aoc answer a cast call and now -PRON- s in congress realjameswood and realdonaldtrump have to see this https t co hbuw xqao
712255,DavidJHarrisJr,75652,0.0,So @aoc wants to raise our taxes and she hasn’t even paid her own! https://t.co/W1m1CCnN3Y,,Sun Mar 10 03:49:44 +0000 2019,0.004242,1127,So @aoc wants to raise our taxes and she hasn’t even paid her own! https://t.co/W1m1CCnN3Y,so aoc want to raise -PRON- tax and -PRON- hasn t even pay -PRON- own https t co w m ccnn y
465115,DavidJHarrisJr,75638,0.0,Now @aoc says America is garbage? \nWho elected this woman!!! https://t.co/O7hsaAK9jH,,Sun Mar 10 22:30:39 +0000 2019,0.004242,1056,Now @aoc says America is garbage? \nWho elected this woman!!! https://t.co/O7hsaAK9jH,now aoc say america be garbage who elect this woman https t co o hsaak jh
62994,DavidJHarrisJr,75620,0.0,Former FEC Commissioner says there’s more than enough evidence to launch a criminal investigation into @aoc! https://t.co/XPL6LhrZFD,,Tue Mar 12 01:25:16 +0000 2019,0.004242,997,Former FEC Commissioner says there’s more than enough evidence to launch a criminal investigation into @aoc! https://t.co/XPL6LhrZFD,former fec commissioner say there s more than enough evidence to launch a criminal investigation into aoc https t co xpl lhrzfd
1205837,DavidJHarrisJr,75690,0.0,Hey AOC! Guess what? Climate change and global warming IS a hoax!!!! https://t.co/OwY68EMVMm,,Fri Mar 08 05:02:07 +0000 2019,0.004242,838,Hey AOC! Guess what? Climate change and global warming IS a hoax!!!! https://t.co/OwY68EMVMm,hey aoc guess what climate change and global warming be a hoax https t co owy emvmm


In [47]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_0_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_0, to_write)

In [88]:
tfidf_0 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))

In [89]:
X_0 = df_0['tweet_processed']

In [90]:
bag_of_words_0 = tfidf_0.fit_transform(X_0)

In [91]:
# feature_names_0 = tfidf_0.get_feature_names()
# pd.DataFrame(bag_of_words_0.toarray(), columns = feature_names)

In [92]:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa_0 = TruncatedSVD(20)
doc_topic_0 = lsa_0.fit_transform(bag_of_words_0)
lsa_0.explained_variance_ratio_

array([0.05214969, 0.02292908, 0.01694097, 0.00850004, 0.00859578,
       0.00884221, 0.00846455, 0.00850187, 0.00648632, 0.00695484,
       0.0061623 , 0.00595626, 0.00578482, 0.00576008, 0.00550388,
       0.00545221, 0.00517115, 0.00506406, 0.00501193, 0.00489234])

In [10]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [94]:
display_topics(lsa_0, tfidf_0.get_feature_names(), 10)


Topic  0
congress realjameswood, co hbuw, xqao, hbuw, hbuw xqao, call congress, realdonaldtrump co, realjameswood realdonaldtrump, answer cast, realjameswood

Topic  1
liberal mcclatchy, sabate full, immigrant felix, cuban immigrant, mcclatchy, felix sabate, sabate, cuban, felix, immigrant

Topic  2
claim, mail pile, crew postal, visit film, worker claim, district claim, business claim, claim visit, pile month, claim represent

Topic  3
finally expose, former grand, hateful racist, hat respect, left exactly, ilhanmn finally, expose left, wizard ku, celebrate party, party black

Topic  4
garbage, congresswoman, america garbage, america, garbage congresswoman, citizen force, seriously call, hellhole praise, congresswoman garbage, force eat

Topic  5
hawaii, green deal, green, crazy, deal, crazy crazy, hawaii island, deal crazy, eliminate air, hirono hawaii

Topic  6
cnn msnbc, msnbc, ignore fec, blackout cnn, medium blackout, msnbc ignore, co kxadajq, kxadajq, kxadajq jl, jl

Topic  7
b

Do the same as above, and consider:
 - use nmf
 - look into tweet specific preprocessing
 - change # of topics
 - etc...

But first, I'm going to process every community:

In [70]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_1.pickle', 'rb') as f:
    df_1 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_2.pickle', 'rb') as f:
    df_2 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_3.pickle', 'rb') as f:
    df_3 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_4.pickle', 'rb') as f:
    df_4 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_5.pickle', 'rb') as f:
    df_5 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_6.pickle', 'rb') as f:
    df_6 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_7.pickle', 'rb') as f:
    df_7 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_8.pickle', 'rb') as f:
    df_8 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_9.pickle', 'rb') as f:
    df_9 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_10.pickle', 'rb') as f:
    df_10 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_11.pickle', 'rb') as f:
    df_11 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_26.pickle', 'rb') as f:
    df_26 = pickle.load(f)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_40.pickle', 'rb') as f:
    df_40 = pickle.load(f)
    

    


In [72]:
df_1['final_text'] = np.where((df_1['retweet_text'].isnull() == False), df_1['retweet_text'], df_1['main_text'])
df_2['final_text'] = np.where((df_2['retweet_text'].isnull() == False), df_2['retweet_text'], df_2['main_text'])
df_3['final_text'] = np.where((df_3['retweet_text'].isnull() == False), df_3['retweet_text'], df_3['main_text'])
df_4['final_text'] = np.where((df_4['retweet_text'].isnull() == False), df_4['retweet_text'], df_4['main_text'])
df_5['final_text'] = np.where((df_5['retweet_text'].isnull() == False), df_5['retweet_text'], df_5['main_text'])
df_6['final_text'] = np.where((df_6['retweet_text'].isnull() == False), df_6['retweet_text'], df_6['main_text'])
df_7['final_text'] = np.where((df_7['retweet_text'].isnull() == False), df_7['retweet_text'], df_7['main_text'])
df_8['final_text'] = np.where((df_8['retweet_text'].isnull() == False), df_8['retweet_text'], df_8['main_text'])
df_9['final_text'] = np.where((df_9['retweet_text'].isnull() == False), df_9['retweet_text'], df_9['main_text'])
df_10['final_text'] = np.where((df_10['retweet_text'].isnull() == False), df_10['retweet_text'], df_10['main_text'])
df_11['final_text'] = np.where((df_11['retweet_text'].isnull() == False), df_11['retweet_text'], df_11['main_text'])
df_26['final_text'] = np.where((df_26['retweet_text'].isnull() == False), df_26['retweet_text'], df_26['main_text'])
df_40['final_text'] = np.where((df_40['retweet_text'].isnull() == False), df_40['retweet_text'], df_40['main_text'])



In [73]:
df_1['tweet_processed'] = df_1['final_text'].apply(lambda x:pre_process(x))
df_2['tweet_processed'] = df_2['final_text'].apply(lambda x:pre_process(x))
df_3['tweet_processed'] = df_3['final_text'].apply(lambda x:pre_process(x))
df_4['tweet_processed'] = df_4['final_text'].apply(lambda x:pre_process(x))
df_5['tweet_processed'] = df_5['final_text'].apply(lambda x:pre_process(x))
df_6['tweet_processed'] = df_6['final_text'].apply(lambda x:pre_process(x))
df_7['tweet_processed'] = df_7['final_text'].apply(lambda x:pre_process(x))
df_8['tweet_processed'] = df_8['final_text'].apply(lambda x:pre_process(x))
df_9['tweet_processed'] = df_9['final_text'].apply(lambda x:pre_process(x))
df_10['tweet_processed'] = df_10['final_text'].apply(lambda x:pre_process(x))
df_11['tweet_processed'] = df_11['final_text'].apply(lambda x:pre_process(x))
df_26['tweet_processed'] = df_26['final_text'].apply(lambda x:pre_process(x))
df_40['tweet_processed'] = df_40['final_text'].apply(lambda x:pre_process(x))









In [74]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_1_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_1, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_2_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_2, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_3_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_3, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_4_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_4, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_5_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_5, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_6_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_6, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_7_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_7, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_8_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_8, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_9_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_9, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_10_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_10, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_11_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_11, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_26_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_26, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/df_40_preprocess.pickle', 'wb') as to_write:
    pickle.dump(df_40, to_write)
    


In [76]:
X_1 = df_1['tweet_processed']
X_2 = df_2['tweet_processed']
X_3 = df_3['tweet_processed']
X_4 = df_4['tweet_processed']
X_5 = df_5['tweet_processed']
X_6 = df_6['tweet_processed']
X_7 = df_7['tweet_processed']
X_8 = df_8['tweet_processed']
X_9 = df_9['tweet_processed']
X_10 = df_10['tweet_processed']
X_11 = df_11['tweet_processed']
X_26 = df_26['tweet_processed']
X_40 = df_40['tweet_processed']



In [87]:
tfidf_1 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_2 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_3 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_4 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_5 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_6 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_7 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_8 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_9 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_10 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_11 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_26 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))
tfidf_40 = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))


Community_1

In [95]:
bag_of_words_1 = tfidf_1.fit_transform(X_1)
lsa_1 = TruncatedSVD(20)
doc_topic_1 = lsa_1.fit_transform(bag_of_words_1)
lsa_1.explained_variance_ratio_

array([0.06577197, 0.06232332, 0.04064375, 0.02427183, 0.02345344,
       0.01326354, 0.01123048, 0.01143955, 0.00867047, 0.00869339,
       0.00841455, 0.00784053, 0.00800741, 0.00777426, 0.00754838,
       0.00718744, 0.0070491 , 0.00656743, 0.00648086, 0.00605249])

In [96]:
display_topics(lsa_1, tfidf_1.get_feature_names(), 10)


Topic  0
irredeemable system, system, capitalism irredeemable, irredeemable, capitalism, blame income, inequality co, system blame, call capitalism, blame

Topic  1
value, tweet worker, essentially restatement, create essentially, pay less, essentially, restatement, worker pay, less value, value create

Topic  2
co rdl, rdl, rdl cgkhqj, cgkhqj, system co, blast, blast capitalism, irredeemable system, capitalism irredeemable, irredeemable

Topic  3
live society, society job, job leave, leave die, excited, specter, haunt specter, specter automate, die, core

Topic  4
institution, people valid, eliminate personal, thus, thus must, close absolute, property, must give, abuse people, jump institution

Topic  5
abjw, lxo, co lxo, lxo abjw, income inequality, call capitalism, inequality co, system blame, blame income, blame

Topic  6
sxsw capitalism, socialist image, lean, lean socialist, co juikvax, image sxsw, juikvax, irredeemable co, image, socialist

Topic  7
robot, bill gate, gate, bill

Community_2

In [97]:
bag_of_words_2 = tfidf_2.fit_transform(X_2)
lsa_2 = TruncatedSVD(20)
doc_topic_2 = lsa_2.fit_transform(bag_of_words_2)
lsa_2.explained_variance_ratio_

array([0.45382045, 0.05495502, 0.03683469, 0.03287456, 0.02701459,
       0.02375505, 0.01134734, 0.00814445, 0.00690917, 0.00674604,
       0.00486508, 0.00446107, 0.00410206, 0.00371391, 0.00330003,
       0.00325492, 0.00293964, 0.00269977, 0.00245108, 0.00206534])

In [98]:
display_topics(lsa_2, tfidf_2.get_feature_names(), 10)


Topic  0
service, long, lot life, require work, require, long long, job lot, lesson learn, customer service, work service

Topic  1
skill libs, nice job, leadership skill, cvfc, cvfc kjzum, libs nice, cookie teach, libs, teach little, co cvfc

Topic  2
file bogus, phij, group spamm, misinformation machine, allege, co gi, allege untrue, complaint fox, folk co, conspiracy theory

Topic  3
gdjsh, part co, gdjsh dk, co gdjsh, dk, part, co, rest co, povmemdnlz, mint rest

Topic  4
live society, core, core problem, die core, society job, problem co, leave die, job leave, society, excite reason

Topic  5
povmemdnlz, mint rest, rest co, co povmemdnlz, thin mint, mint, thin, rest, co, come

Topic  6
give job, entirely, ivanka, mtvszv, mtvszv sco, whose, heiress, whose career, sco, heiress whose

Topic  7
co gcv, cannnnn co, cannnnn, ff woman, woman cannnnn, gcv, ff, woman, co, international

Topic  8
osborne, george osborne, george, politic george, osborne co, politic, co ktmlcikb, ktmlcikb, c

Community_3

In [99]:
bag_of_words_3 = tfidf_3.fit_transform(X_3)
lsa_3 = TruncatedSVD(20)
doc_topic_3 = lsa_3.fit_transform(bag_of_words_3)
lsa_3.explained_variance_ratio_

array([0.03999392, 0.02212447, 0.0194341 , 0.01317997, 0.01189089,
       0.01164648, 0.01008555, 0.00990178, 0.00828502, 0.00778276,
       0.00801122, 0.00808432, 0.00740344, 0.00713834, 0.00716819,
       0.00678276, 0.00630425, 0.0060036 , 0.00594291, 0.00565552])

In [100]:
display_topics(lsa_3, tfidf_3.get_feature_names(), 10)


Topic  0
compilation co, sxsw good, cxy, cxy goe, compilation, goe, iw cxy, genius sxsw, good compilation, iw

Topic  1
explode fyi, fyi, challenger explode, explode, challenger, girlscout challenger, uzbk, fyi co, co uzbk, girlscout

Topic  2
fly, eat, sxsw atx, fart taco, atx, atx co, taco speak, airplane eat, eat cow, speak sxsw

Topic  3
beef, try town, fly austin, beef try, town fly, town, austin, fly, try, eat beef

Topic  4
trillion, cost, green deal, green, deal, dollar tell, almost trillion, wall cost, trillion expensive, year die

Topic  5
claim, worker claim, month box, business claim, never boyfriend, postal, district claim, mail pile, pile month, pile

Topic  6
faught, mlk pro, faught democrat, life republican, kkk next, republican faught, run kkk, pro life, democrat run, kkk

Topic  7
right corporation, scar right, corporation government, obvious never, irredeemable rep, government obvious, co xfv, xfv, pick dictionary, never pick

Topic  8
hillary clinton, clinton, hill

Community_4

In [101]:
bag_of_words_4 = tfidf_4.fit_transform(X_4)
lsa_4 = TruncatedSVD(20)
doc_topic_4 = lsa_4.fit_transform(bag_of_words_4)
lsa_4.explained_variance_ratio_

array([0.03498395, 0.022398  , 0.01901534, 0.01770204, 0.01575723,
       0.01281881, 0.01271955, 0.01133546, 0.00950088, 0.0101677 ,
       0.01001013, 0.00881374, 0.00837975, 0.00796869, 0.0074926 ,
       0.00696977, 0.00673222, 0.00654947, 0.00649253, 0.00627482])

In [102]:
display_topics(lsa_4, tfidf_4.get_feature_names(), 10)


Topic  0
claim, box fraud, pile month, claim mail, business claim, claim visit, crew postal, worker claim, month box, never boyfriend

Topic  1
hawaii, crazy, green deal, green, deal, democrat senator, hawaii anyone, oppose green, hawaii island, interesting democrat

Topic  2
ilhanmn finally, hat respect, folk former, celebrate party, party black, stupid jews, klan celebrate, wizard ku, nothing folk, former grand

Topic  3
garbage, congresswoman garbage, praise venezuela, hellhole praise, wow seriously, socialist hellhole, garbage congresswoman, eat socialist, seriously call, garbage citizen

Topic  4
instead, uber, subway, everything, spend, demise instead, lyft instead, subway fly, slow demise, believe world

Topic  5
boy, necessary boy, co xkadfguk, xkadfguk, away boy, scouts away, boy co, boy scouts, necessary, scouts

Topic  6
fec complaint, fec, complaint, cnn msnbc, msnbc, blackout cnn, ignore fec, medium blackout, msnbc ignore, co kxadajq

Topic  7
cnn msnbc, msnbc, blackout c

Community_5

In [103]:
bag_of_words_5 = tfidf_5.fit_transform(X_5)
lsa_5 = TruncatedSVD(20)
doc_topic_5 = lsa_5.fit_transform(bag_of_words_5)
lsa_5.explained_variance_ratio_

array([0.01163665, 0.01392898, 0.01324152, 0.01230021, 0.01241668,
       0.00944719, 0.0097324 , 0.00862834, 0.00876726, 0.00854226,
       0.00774749, 0.00779549, 0.0076388 , 0.00697391, 0.00667956,
       0.00647239, 0.00639239, 0.00602114, 0.00562742, 0.00523464])

In [104]:
display_topics(lsa_5, tfidf_5.get_feature_names(), 10)


Topic  0
american, palestinian, uh, class american, class, israel, claim, working class, working, co

Topic  1
uh, mess bronx, clapback game, woman without, jh xatvtcm, though borough, without roasted, roasted act, perfect clapback, xatvtcm

Topic  2
claim, class american, class, claim mail, business claim, worker claim, visit film, box fraud, apartment local, crew postal

Topic  3
anti, anti semitism, semitism, condemn, resolution, sham resolution, sham, design, vote, rep

Topic  4
class american, class, american, working class, working, reagan, ronald reagan, ronald, president ronald, president

Topic  5
garbage, america, state america, united state, united, america garbage, state, great, tell firsthand, family immigrate

Topic  6
bigotry ilhanmn, floor design, put sham, resolution floor, semitic hate, design protect, outrage party, party put, hate bigotry, protect anti

Topic  7
people, harris, multiple job, work multiple, low people, rate low, unemployment rate, kamala harris, peo

Community_6

In [105]:
bag_of_words_6 = tfidf_6.fit_transform(X_6)
lsa_6 = TruncatedSVD(20)
doc_topic_6 = lsa_6.fit_transform(bag_of_words_6)
lsa_6.explained_variance_ratio_

array([0.05626058, 0.05251768, 0.01988479, 0.01333021, 0.01301637,
       0.01165898, 0.00978425, 0.00903689, 0.00868336, 0.00872859,
       0.00796586, 0.0074651 , 0.00665062, 0.00660595, 0.00567826,
       0.00589073, 0.00571472, 0.00548576, 0.00539443, 0.00503219])

In [106]:
display_topics(lsa_6, tfidf_6.get_feature_names(), 10)


Topic  0
cvfc kjzum, cvfc, co cvfc, kjzum, libs nice, skill libs, teach little, cookie teach, nice job, job co

Topic  1
untrue scandal, allege untrue, case conspiracy, theory run, run conservative, group spamm, report allege, scandal misinformation, spamm file, spamm

Topic  2
crumble road, froth rage, past abandon, screech monkey, monkey tell, road past, radio screech, storefront low, right deplorable, crappy car

Topic  3
hispanic, female, year old, young, bartender, ny, attack, old, year, female hispanic

Topic  4
dem, reclaim language, storm dem, decade reclaim, willing past, reason political, leader simple, dem willing, equivocation, hesitation equivocation

Topic  5
attack, praise, praise sander, praise attack, biden give, space dem, promote politician, attack biden, politician candidate, praise warren

Topic  6
moderate, naive visionary, visionary tinker, moderate spot, spot moderate, edge solve, moderate naive, critique moderate, contract manage, time rewrite

Topic  7
addres

Community_7

In [107]:
bag_of_words_7 = tfidf_7.fit_transform(X_7)
lsa_7 = TruncatedSVD(20)
doc_topic_7 = lsa_7.fit_transform(bag_of_words_7)
lsa_7.explained_variance_ratio_

array([0.06827266, 0.05724163, 0.06072061, 0.02227897, 0.01582008,
       0.01678693, 0.01686263, 0.01508106, 0.01514805, 0.01469623,
       0.01342586, 0.01278529, 0.01158334, 0.00923727, 0.00841481,
       0.00715446, 0.00713883, 0.00559877, 0.0054351 , 0.00534099])

In [108]:
display_topics(lsa_7, tfidf_7.get_feature_names(), 10)


Topic  0
excited live, reason excited, specter automate, excite reason, work excite, society job, haunt specter, specter, haunt, automate work

Topic  1
address, co tsbac, tsbac, change together, together work, divide address, inequality address, fear divide, address income, work sxsw

Topic  2
moderate, time rewrite, critique moderate, visionary tinker, edge solve, contract manage, moderate spot, spot moderate, naive visionary, problem democracy

Topic  3
policy idea, idea, policy, popular instead, cynical big, bcwtml, focus strategy, strategy substance, coverage green, big policy

Topic  4
co fexqdex, fexqdex, tell use, gun co, use gun, please tell, gun, please, tell, use

Topic  5
file bogus, complaint fox, theory run, untrue scandal, gi phij, phij, phij jo, group spamm, spamm file, case conspiracy

Topic  6
first, bid united, endorsement ilhanmn, support announce, election bid, first endorsement, state congress, solidarity support, announce first, rashidatlaib election

Topic  7
s

Community_8

In [109]:
bag_of_words_8 = tfidf_8.fit_transform(X_8)
lsa_8 = TruncatedSVD(20)
doc_topic_8 = lsa_8.fit_transform(bag_of_words_8)
lsa_8.explained_variance_ratio_

array([0.02460699, 0.02165547, 0.01920328, 0.01406421, 0.0104501 ,
       0.008127  , 0.00795469, 0.00735274, 0.00714922, 0.00713512,
       0.00716786, 0.00632214, 0.00653472, 0.00631148, 0.00602179,
       0.00581059, 0.00576865, 0.005268  , 0.00507904, 0.00470967])

In [110]:
display_topics(lsa_8, tfidf_8.get_feature_names(), 10)


Topic  0
est fox, gnd climate, alarmism general, pm est, general celebrateco, carlson tonight, est, news discuss, discuss gnd, tucker carlson

Topic  1
right pressing, part ilhan, face year, invasion border, pressing issue, distract right, year invasion, bad part, country face, issue country

Topic  2
amplicity joshuadstewart, einandererblog seanflanderhijn, brunopresent kstafford, amplicity, prez, prez xlrec, awayyumi einandererblog, einandererblog, libertyisalady, watt prez

Topic  3
fuel, fossil fuel, fossil, food, plan grow, grow food, twit plan, bring mass, horse fossil, food billion

Topic  4
claim, claim visit, business claim, postal worker, postal, local business, district claim, live district, month box, neighbor never

Topic  5
immigration demographic, legal immigration, demographic change, demographic, legal, immigration, change, crazy, ilhan omar, ilhan

Topic  6
crazy, today care, ummm crazy, ummm, country invade, care country, crazy ilhan, invade, omar today, care

Topic

Community_9

In [111]:
bag_of_words_9 = tfidf_9.fit_transform(X_9)
lsa_9 = TruncatedSVD(20)
doc_topic_9 = lsa_9.fit_transform(bag_of_words_9)
lsa_9.explained_variance_ratio_

array([0.01111566, 0.00632566, 0.00552998, 0.00664922, 0.00629983,
       0.00616465, 0.00609082, 0.00599648, 0.00504169, 0.00527193,
       0.00513906, 0.00511007, 0.00481633, 0.00477485, 0.00455726,
       0.00454241, 0.00442555, 0.00422344, 0.00414108, 0.00409862])

In [112]:
display_topics(lsa_9, tfidf_9.get_feature_names(), 10)


Topic  0
claim, pile month, business claim, film news, visit film, month box, claim visit, box fraud, never boyfriend, crew postal

Topic  1
green deal, green, deal, hawaii, crazy, oppose green, hirono hawaii, senator mazie, hawaii anyone, hawaii island

Topic  2
hawaii, green deal, crazy, green, deal, oppose green, hirono hawaii, hawaii anyone, senator mazie, hawaii island

Topic  3
complaint, slap, third ethic, slap third, week co, third, ethic, week, complaint week, ethic complaint

Topic  4
black, left exactly, ilhanmn finally, hat respect, exactly hateful, folk former, celebrate party, respect ilhanmn, klan celebrate, party black

Topic  5
instead, subway, uber, spend, use amtrak, everything slow, thousand uber, lyft instead, demise instead, subway fly

Topic  6
answer cast, cast call, cast, answer, congress, congress realjameswood, xqao, hbuw xqao, hbuw, co hbuw

Topic  7
fec, cnn msnbc, msnbc ignore, ignore fec, blackout cnn, medium blackout, kxadajq, co kxadajq, kxadajq jl, ms

Community_10

In [113]:
bag_of_words_10 = tfidf_10.fit_transform(X_10)
lsa_10 = TruncatedSVD(20)
doc_topic_10 = lsa_10.fit_transform(bag_of_words_10)
lsa_10.explained_variance_ratio_

array([0.06282411, 0.05881977, 0.05120739, 0.01739165, 0.01243666,
       0.01123294, 0.00957861, 0.00837087, 0.00632518, 0.00571643,
       0.0055428 , 0.00545918, 0.00443376, 0.0039585 , 0.00389428,
       0.00315766, 0.00323669, 0.00320314, 0.00306162, 0.00291234])

In [114]:
display_topics(lsa_10, tfidf_10.get_feature_names(), 10)


Topic  0
venezuela socialism, watch co, venezuela, watch, refuse, wow berniesander, dictator wow, bad watch, berniesander win, afraid venezuela

Topic  1
aid enter, disgusting conspiracy, enter country, motive venezuela, america motive, peddle disgusting, lie humanitarian, break venezuela, protest break, question america

Topic  2
invite far, nthd, ilhanomar come, co nthd, country watch, leave dem, dem ilhanomar, poq, nthd poq, jguaido invite

Topic  3
sjnrbexcuz, co sjnrbexcuz, tolerate, responsible pain, maduro responsible, suffering starvation, pain suffering, starvation million, desperate co, allow aid

Topic  4
democratic, socialist, apologist alone, socialist banner, spci oiy, less lose, spci, socialist incapable, lose democratic, simply become

Topic  5
suffer, sake million, anti conspiracy, help suffer, suffer amid, suffer educate, million suffer, theory help, prefer peddle, educate sake

Topic  6
american, egos co, political egos, nothing people, ignorance never, love democra

Community_11

In [115]:
bag_of_words_11 = tfidf_11.fit_transform(X_11)
lsa_11 = TruncatedSVD(20)
doc_topic_11 = lsa_11.fit_transform(bag_of_words_11)
lsa_11.explained_variance_ratio_

array([0.04386625, 0.01308346, 0.00874388, 0.00727753, 0.00706349,
       0.00364769, 0.00446262, 0.00456757, 0.00418456, 0.00405006,
       0.00358519, 0.00367563, 0.00305483, 0.00334501, 0.00315185,
       0.00308597, 0.00295256, 0.00279017, 0.00267708, 0.00261978])

In [116]:
display_topics(lsa_11, tfidf_11.get_feature_names(), 10)


Topic  0
buy apple, america ewarren, job dem, instead amazon, platform stop, bq, bq onfr, love life, ewarren sure, amazon basic

Topic  1
libertarian, basically uniformly, silicon, liberal silicon, libertarian tech, end basically, silicon valley, mainly reaction, valley always, tech year

Topic  2
love, trump election, love guarantee, election team, team love, riav, co riav, guarantee trump, audience love, sure audience

Topic  3
mjtheprophet, whackamolepro, mjtheprophet whackamolepro, tenor, reneefiredup, gabbifromtexas, gabbifromtexas tenor, rick, kokshaj, reneefiredup gabbifromtexas

Topic  4
low tax, low, tax, socialist party, tax state, etc next, stop low, tether geography, understand founder, nv tx

Topic  5
mitchellvii, garbage, america, america garbage, boss, country, call, go, garbage nice, someone

Topic  6
boss, green, deal, green deal, crazy, hawaii, senator, crazy crazy, eliminate, air travel

Topic  7
deal, green, green deal, crazy, hawaii, senator, crazy crazy, eliminat

Community_26

In [117]:
bag_of_words_26 = tfidf_26.fit_transform(X_26)
lsa_26 = TruncatedSVD(20)
doc_topic_26 = lsa_26.fit_transform(bag_of_words_26)
lsa_26.explained_variance_ratio_

array([0.33439272, 0.06267686, 0.0585087 , 0.0339684 , 0.03117037,
       0.02224041, 0.01826589, 0.01102561, 0.01049816, 0.0102095 ,
       0.01004498, 0.00967537, 0.00964054, 0.00905694, 0.00815444,
       0.0074211 , 0.00639365, 0.00627589, 0.00518671, 0.00498069])

In [118]:
display_topics(lsa_26, tfidf_26.get_feature_names(), 10)


Topic  0
austin, valley good, co bnzfvhffrf, austin hit, taco exist, exist austin, hour south, south austin, exist, hit valley

Topic  1
freeway san, austin freeway, freeway, taco austin, antonio, san, san antonio, taco, good, good taco

Topic  2
joshtheflanagan torchystacos, joshtheflanagan, dwnldblcntnt joshtheflanagan, dwnldblcntnt, torchystacos, truck speak, wrong good, torchystacos wrong, taco come, come food

Topic  3
torchys tbh, block everyone, everyone torchys, block, tbh, torchys, everyone, long, service, listen

Topic  4
torchy, highly misinform, city town, taco good, taco city, misinform, town highly, torchystacos torchy, town, torchy taco

Topic  5
pastor riverside, rosita al, al, al pastor, rosita, riverside, pastor, valentina, arandina riverside, arandina

Topic  6
always torchy, torchystacos always, always, torchy, torchystacos, confirm, co hqdvbxgs, hqdvbxgs, confirm co, torchystacos confirm

Topic  7
moderate, time, tinker, systemic problem, wyk, decline, xhd, xhd wy

Community_40

In [119]:
bag_of_words_40 = tfidf_40.fit_transform(X_40)
lsa_40 = TruncatedSVD(20)
doc_topic_40 = lsa_40.fit_transform(bag_of_words_40)
lsa_40.explained_variance_ratio_

array([0.40984087, 0.27893032, 0.05467952, 0.01490836, 0.01074136,
       0.00928683, 0.00597119, 0.00574506, 0.0054923 , 0.00505121,
       0.00435609, 0.00385205, 0.00373314, 0.00364253, 0.00358847,
       0.00351047, 0.00347804, 0.00332365, 0.00297532, 0.00290224])

In [120]:
display_topics(lsa_40, tfidf_40.get_feature_names(), 10)


Topic  0
talk powerhouse, sheinspiresme ladygaga, demi lovato, twitter talk, powerhouse, lovato kamalaharris, mention woman, lovato, taylorswift demi, woman twitter

Topic  1
brain, brain co, co ufxyb, ufxyb ny, ufxyb, woman brain, love woman, ny, love, co

Topic  2
call san, vksygxrqxv, place mile, antonio co, place, co vksygxrqxv, mile south, south call, mile, antonio

Topic  3
gop problem, problem keep, qwv, lose fan, local state, local, rank, rank local, leader speak, keep woman

Topic  4
address, climate, climate change, fear, work sxsw, fear divide, change together, address income, together, together work

Topic  5
cvfc kjzum, nice job, job co, boycott, boycott cookie, co cvfc, cookie, cookie teach, nice, skill

Topic  6
service, long, job, lot, work, green, green deal, hear, lesson, deal

Topic  7
green, green deal, hear, deal, co wab, watch read, wab, wab hka, surround, effect politic

Topic  8
specter, live society, core problem, excite, excite reason, excited, specter automa

Now I have my top words in each LSA object w/ 20 topics each. I'm going to pickle each model and then use a new file to:
 - Find which tweets score the highest in each topic to get examples
 - Name communities
 - Sentiment analysis (maybe weekend)

In [121]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_0.pickle', 'wb') as to_write:
    pickle.dump(lsa_0, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_1.pickle', 'wb') as to_write:
    pickle.dump(lsa_1, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_2.pickle', 'wb') as to_write:
    pickle.dump(lsa_2, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_3.pickle', 'wb') as to_write:
    pickle.dump(lsa_3, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_4.pickle', 'wb') as to_write:
    pickle.dump(lsa_4, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_5.pickle', 'wb') as to_write:
    pickle.dump(lsa_5, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_6.pickle', 'wb') as to_write:
    pickle.dump(lsa_6, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_7.pickle', 'wb') as to_write:
    pickle.dump(lsa_7, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_8.pickle', 'wb') as to_write:
    pickle.dump(lsa_8, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_9.pickle', 'wb') as to_write:
    pickle.dump(lsa_9, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_10.pickle', 'wb') as to_write:
    pickle.dump(lsa_10, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_11.pickle', 'wb') as to_write:
    pickle.dump(lsa_11, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_26.pickle', 'wb') as to_write:
    pickle.dump(lsa_26, to_write)

with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_40.pickle', 'wb') as to_write:
    pickle.dump(lsa_40, to_write)
    
    

    

NMF Models?

In [127]:
nmf_0 = NMF(20)
doc_topic_NMF_0 = nmf_0.fit_transform(bag_of_words_0)
print(nmf_0.reconstruction_err_)

display_topics(nmf_0, tfidf_0.get_feature_names(), 10)

166.24833610965854

Topic  0
xqao, hbuw, congress realjameswood, hbuw xqao, co hbuw, call congress, realdonaldtrump co, realjameswood realdonaldtrump, answer cast, realjameswood

Topic  1
socialist revolution, border socialism, observer liberal, observer, push open, break millionaire, democrat push, ad charlotte, socialism usa, charlotte

Topic  2
claim, mail pile, box fraud, pile month, claim represent, news crew, business claim, postal, crew postal, never boyfriend

Topic  3
finally expose, bigot co, wizard ku, qwek omk, omk, celebrate party, grand wizard, ku klux, co qwek, folk former

Topic  4
garbage, praise venezuela, seriously call, citizen force, eat socialist, socialist hellhole, wow seriously, force eat, hellhole praise, garbage citizen

Topic  5
hawaii, green deal, green, deal, crazy, crazy crazy, hawaii island, democrat senator, hawaii anyone, hirono hawaii

Topic  6
cnn msnbc, msnbc, medium blackout, ignore fec, blackout cnn, msnbc ignore, kxadajq jl, kxadajq, co kxadajq, 

In [128]:
nmf_1 = NMF(20)
doc_topic_NMF_1 = nmf_1.fit_transform(bag_of_words_1)
print(nmf_1.reconstruction_err_)

display_topics(nmf_1, tfidf_1.get_feature_names(), 10)

30.50907628177441

Topic  0
co koufenr, koufenr wg, koufenr, wg, inequality co, blame income, system blame, call capitalism, blame, income inequality

Topic  1
value, essentially restatement, create essentially, essentially, restatement, pay less, tweet worker, worker pay, value create, less value

Topic  2
co rdl, rdl, cgkhqj, rdl cgkhqj, system co, blast, blast capitalism, irredeemable system, system, capitalism irredeemable

Topic  3
live society, society job, job leave, leave die, excited, specter, haunt specter, specter automate, die, core

Topic  4
institution, valid thus, abuse people, world view, must give, eliminate personal, view jump, valid, give institution, absolute

Topic  5
lxo abjw, co lxo, abjw, lxo, blame income, inequality co, system blame, call capitalism, blame, income inequality

Topic  6
image sxsw, lean, lean socialist, co juikvax, sxsw capitalism, socialist image, juikvax, irredeemable co, image, socialist

Topic  7
bill gate, gate, bill, robot co, robot, idea 

In [129]:
nmf_2 = NMF(20)
doc_topic_NMF_2 = nmf_2.fit_transform(bag_of_words_2)
print(nmf_2.reconstruction_err_)

display_topics(nmf_2, tfidf_2.get_feature_names(), 10)

27.392889192902118

Topic  0
service, long, time everyone, service job, co fzvfdsnue, life lesson, customer, customer service, require work, long long

Topic  1
libs nice, nice job, teach little, co cvfc, leadership skill, cookie teach, kjzum, cvfc, cvfc kjzum, girl leadership

Topic  2
machine work, file bogus, work folk, folk co, spamm file, conservative group, jo, untrue scandal, untrue, news report

Topic  3
gdjsh, dk, part co, gdjsh dk, co gdjsh, part, co, power, apparatus, repeat essentially

Topic  4
core problem, core, society job, problem co, die core, live society, leave die, job leave, society, specter automate

Topic  5
rest co, mint rest, povmemdnlz, co povmemdnlz, thin mint, mint, thin, rest, co, come

Topic  6
give job, trump famous, mtvszv sco, whose, whose career, father co, famous heiress, famous, want work, sco

Topic  7
ff woman, ff, gcv, co gcv, woman cannnnn, cannnnn co, cannnnn, woman, co, woman work

Topic  8
osborne, george, george osborne, politic george, osbo

In [130]:
nmf_3 = NMF(20)
doc_topic_NMF_3 = nmf_3.fit_transform(bag_of_words_3)
print(nmf_3.reconstruction_err_)

display_topics(nmf_3, tfidf_3.get_feature_names(), 10)

84.95870203102425

Topic  0
good compilation, sxsw good, genius sxsw, cxy, compilation co, goe, compilation, cxy goe, iw cxy, co iw

Topic  1
uzbk, co uzbk, fyi co, explode fyi, fyi, challenger explode, explode, challenger, co, tweet co

Topic  2
airplane eat, taco speak, fart taco, atx co, atx, sxsw atx, eat cow, speak sxsw, shroyer, co dsakis

Topic  3
town fly, try town, fly austin, beef try, town, austin, beef, try, fly, try illegally

Topic  4
trillion, cost, god information, dollar tell, tell american, year die, expensive also, cost almost, people border, die god

Topic  5
claim, mail pile, postal worker, claim visit, news crew, pile, visit film, boyfriend apartment, pile month, represent neighbor

Topic  6
mlk pro, faught democrat, republican faught, kkk next, life republican, run kkk, faught, pro life, democrat run, kkk

Topic  7
scar right, corporation government, right corporation, irredeemable rep, obvious never, pick dictionary, rep scar, never pick, dictionary co, xfv

Top

In [131]:
nmf_4 = NMF(20)
doc_topic_NMF_4 = nmf_4.fit_transform(bag_of_words_4)
print(nmf_4.reconstruction_err_)

display_topics(nmf_4, tfidf_4.get_feature_names(), 10)

411.3003443674469

Topic  0
claim, apartment local, month box, never boyfriend, pile month, worker claim, claim mail, news crew, film news, business claim

Topic  1
hawaii, crazy, green deal, green, deal, democrat senator, hawaii anyone, oppose green, hawaii island, interesting democrat

Topic  2
celebrate party, folk former, hat respect, ilhanmn finally, klan celebrate, wizard ku, stupid jews, party black, nothing folk, former grand

Topic  3
garbage, congresswoman garbage, hellhole praise, praise venezuela, socialist hellhole, wow seriously, garbage congresswoman, eat socialist, seriously call, garbage citizen

Topic  4
instead, slow demise, lyft instead, demise instead, subway fly, believe world, nyc instead, use amtrak, thousand uber, fly dc

Topic  5
boy, necessary boy, co xkadfguk, xkadfguk, away boy, scouts away, boy co, boy scouts, necessary, scouts

Topic  6
list fec, possible coordination, coordination want, uber air, yet spend, travel mom, ton uber, complaint funnel, car yet

In [132]:
nmf_5 = NMF(20)
doc_topic_NMF_5 = nmf_5.fit_transform(bag_of_words_5)
print(nmf_5.reconstruction_err_)

display_topics(nmf_5, tfidf_5.get_feature_names(), 10)

441.7224629591867

Topic  0
palestinian, israel, palestinian faction, faction hama, palestinian side, choose palestinian, tlaib grow, pay terrorist, side pay, number democrat

Topic  1
uh, though borough, mess bronx, roasted act, jh xatvtcm, without roasted, perfect clapback, clapback game, people mess, borough perfect

Topic  2
class american, class, american, working class, working, work class, white, screw, screw work, pit white

Topic  3
condemn, anti semitism, semitism, anti, design obscure, repeatedly help, obscure anti, semitism ashamed, force democratic, condemn condemn

Topic  4
claim, worker claim, visit film, crew postal, apartment local, claim mail, represent neighbor, claim visit, business claim, postal worker

Topic  5
state america, united state, united, state, america, family immigrate, tell firsthand, firsthand united, garbage great, immigrate united

Topic  6
bigotry ilhanmn, floor design, put sham, resolution floor, design protect, outrage party, party put, semitic h

In [133]:
nmf_6 = NMF(20)
doc_topic_NMF_6 = nmf_6.fit_transform(bag_of_words_6)
print(nmf_6.reconstruction_err_)

display_topics(nmf_6, tfidf_6.get_feature_names(), 10)

372.53672672594075

Topic  0
co cvfc, cvfc, cvfc kjzum, kjzum, libs nice, skill libs, teach little, cookie teach, nice job, job co

Topic  1
allege untrue, untrue scandal, case conspiracy, run conservative, theory run, group spamm, report allege, scandal misinformation, spamm file, spamm

Topic  2
froth rage, radio screech, screech monkey, drive crappy, rage listen, right deplorable, deplorable drive, past abandon, abandon storefront, road past

Topic  3
hispanic, female, year old, young, ny, bartender, old, year, ask female, hispanic year

Topic  4
dem, leader simple, reason political, willing past, decade reclaim, reclaim language, storm dem, dem willing, equivocation, justice without

Topic  5
attack, praise, praise sander, attack harris, space dem, biden give, harris praise, candidate refrain, give space, politician candidate

Topic  6
moderate, edge solve, naive visionary, visionary tinker, spot moderate, moderate spot, moderate naive, critique moderate, contract manage, time rewr

In [134]:
nmf_7 = NMF(20)
doc_topic_NMF_7 = nmf_7.fit_transform(bag_of_words_7)
print(nmf_7.reconstruction_err_)

display_topics(nmf_7, tfidf_7.get_feature_names(), 10)

284.9696798236513

Topic  0
excited live, reason excited, specter automate, excite reason, work excite, society job, haunt specter, specter, haunt, automate work

Topic  1
address, co tsbac, tsbac, change together, together work, divide address, inequality address, fear divide, address income, work sxsw

Topic  2
moderate, time rewrite, critique moderate, edge solve, visionary tinker, contract manage, moderate spot, spot moderate, naive visionary, problem democracy

Topic  3
policy idea, idea, policy, substance asking, co bcwtml, coverage green, strategy substance, cynical big, bcwtml, big policy

Topic  4
fexqdex, co fexqdex, tell use, gun co, use gun, please tell, gun, please, tell, use

Topic  5
file bogus, complaint fox, theory run, news report, spamm file, case conspiracy, allege untrue, gi phij, phij, phij jo

Topic  6
rashidatlaib election, solidarity support, bid united, announce first, support announce, state congress, endorsement ilhanmn, first endorsement, election bid, amou

In [135]:
nmf_8 = NMF(20)
doc_topic_NMF_8 = nmf_8.fit_transform(bag_of_words_8)
print(nmf_8.reconstruction_err_)

display_topics(nmf_8, tfidf_8.get_feature_names(), 10)

81.36324654457356

Topic  0
general celebrateco, carlson tonight, gnd climate, est fox, news discuss, alarmism general, pm est, est, discuss gnd, tucker carlson

Topic  1
part ilhan, bad part, face year, country face, invasion border, right pressing, distract right, pressing issue, year invasion, issue country

Topic  2
joshuadstewart, tigerzntl, brandondaly, davidyoung, awayyumi einandererblog, brunopresent kstafford, jamesabbott, prez, jamesabbott amplicity, awayyumi

Topic  3
fuel, fossil fuel, fossil, food, twit plan, plan grow, grow food, horse fossil, world cut, fuel ban

Topic  4
claim, business claim, mail pile, postal, postal worker, never boyfriend, film news, crew postal, live district, boyfriend apartment

Topic  5
immigration demographic, legal immigration, demographic change, demographic, legal, immigration, change, conservative ilhan, yryw, ignore problem

Topic  6
green deal, green, deal, hawaii, crazy, anyone oppose, crazy crazy, mazie hirono, interesting democrat, eli

In [136]:
nmf_9 = NMF(20)
doc_topic_NMF_9 = nmf_9.fit_transform(bag_of_words_9)
print(nmf_9.reconstruction_err_)

display_topics(nmf_9, tfidf_9.get_feature_names(), 10)

613.8853818750036

Topic  0
claim, crew postal, visit film, pile month, represent neighbor, box fraud, month box, claim visit, film news, business claim

Topic  1
hawaii, green deal, green, deal, crazy, oppose green, hirono hawaii, senator mazie, hawaii anyone, hawaii island

Topic  2
garbage, america, america garbage, garbage congresswoman, garbage citizen, praise venezuela, hellhole praise, congresswoman garbage, force eat, socialist hellhole

Topic  3
slap, third ethic, slap third, week co, third, ethic, week, complaint week, ethic complaint, vsuvkqrxxm

Topic  4
ilhanmn finally, left exactly, hat respect, celebrate party, klan celebrate, respect ilhanmn, folk former, exactly hateful, party black, wizard ku

Topic  5
instead, demise instead, use amtrak, everything slow, thousand uber, lyft instead, subway fly, slow demise, fly dc, dc nyc

Topic  6
answer cast, cast call, cast, answer, congress, congress realjameswood, hbuw, hbuw xqao, co hbuw, xqao

Topic  7
cnn msnbc, blackout cnn,

In [137]:
nmf_10 = NMF(20)
doc_topic_NMF_10 = nmf_10.fit_transform(bag_of_words_10)
print(nmf_10.reconstruction_err_)

display_topics(nmf_10, tfidf_10.get_feature_names(), 10)

96.2330594208418

Topic  0
maduro afraid, berniesander win, dictator wow, afraid venezuela, wow berniesander, bad watch, call nicolasmaduro, win denounce, nicolasmaduro dictator, look bad

Topic  1
venezuela, wrong protest, crisis wrong, aid enter, theory co, co ryjrvcabih, america motive, disgusting conspiracy, suggest potus, break venezuela

Topic  2
invite far, ilhanomar come, co nthd, leave dem, nthd, dem ilhanomar, nthd poq, country watch, poq, jguaido invite

Topic  3
suffering starvation, maduro responsible, responsible pain, pain suffering, starvation million, throw point, ilhanmn tolerate, tolerate maduro, million allow, ignorance ilhanmn

Topic  4
democratic, socialist, co spci, ignoramus pitiful, spci, spci oiy, simply become, incapable condemn, oiy, shine simply

Topic  5
suffer, electricity co, help suffer, anti conspiracy, theory help, suffer amid, suffer educate, ignorant prefer, sake million, amid food

Topic  6
american, tolerate excuse, nothing people, bully ilhanmn, 

In [138]:
nmf_11 = NMF(20)
doc_topic_NMF_11 = nmf_11.fit_transform(bag_of_words_11)
print(nmf_11.reconstruction_err_)

display_topics(nmf_11, tfidf_11.get_feature_names(), 10)

44.25405245694043

Topic  0
life delightful, lose platform, bq, bq onfr, worry america, delightful, onfr, america ewarren, stop company, delightful co

Topic  1
libertarian, start come, seem start, libertarian tech, tech year, movement represent, always lot, liberal silicon, valley always, come mainly

Topic  2
love, sure audience, guarantee trump, election team, audience love, riav, co riav, love guarantee, team love, trump election

Topic  3
mjtheprophet whackamolepro, mjtheprophet, whackamolepro, tenor, reneefiredup, gabbifromtexas, gabbifromtexas tenor, rick, kokshaj, reneefiredup gabbifromtexas

Topic  4
low tax, low, tax, socialist party, tax state, entrepreneur already, tax county, washington etc, geography, geography co

Topic  5
mitchellvii, mitchellvii garbage, garbage, talk, mitchellvii kinda, kinda, mitchellvii talk, mitchellvii trump, mitchellvii go, leave

Topic  6
boss, charge, rly, boss rly, serve, woman, call boss, boss right, boss serve, charliekirk

Topic  7
green, d

In [139]:
nmf_26 = NMF(20)
doc_topic_NMF_26 = nmf_26.fit_transform(bag_of_words_26)
print(nmf_26.reconstruction_err_)

display_topics(nmf_26, tfidf_26.get_feature_names(), 10)

16.32050975192742

Topic  0
austin, taco exist, valley good, south austin, hit valley, hour south, exist austin, bnzfvhffrf, exist, austin hit

Topic  1
austin freeway, freeway san, freeway, taco austin, san antonio, san, antonio, austin, good taco, good

Topic  2
taco come, truck speak, come food, torchystacos wrong, wrong good, food truck, speak spanish, speak, wrong, spanish

Topic  3
tbh, everyone torchys, block, block everyone, torchys tbh, torchys, everyone, white, donvad, torchystacos exactly

Topic  4
city town, highly misinform, misinform, taco good, taco city, town highly, torchystacos torchy, pacobaggin dwnldblcntnt, town, pacobaggin

Topic  5
pastor riverside, rosita al, al pastor, al, riverside, rosita, pastor, arandina riverside, arandina, east

Topic  6
always torchy, torchystacos always, always, torchy, torchystacos, ppl, tell good, eww ppl, ppl tell, eww

Topic  7
moderate, decline, moderate spot, co xhd, xhd wyk, xhd, critique moderate, manage, wyk, manage decline

To

In [140]:
nmf_40 = NMF(20)
doc_topic_NMF_40 = nmf_40.fit_transform(bag_of_words_40)
print(nmf_40.reconstruction_err_)

display_topics(nmf_40, tfidf_40.get_feature_names(), 10)

15.168867255006226

Topic  0
twitter talk, powerhouse sheinspiresme, woman twitter, demi lovato, taylorswift demi, lovato, lovato kamalaharris, mention woman, sheinspiresme ladygaga, powerhouse

Topic  1
brain co, co ufxyb, ufxyb, ufxyb ny, woman brain, brain, love woman, ny, love, co

Topic  2
mile, south call, co vksygxrqxv, vksygxrqxv, place mile, call san, place, antonio co, mile south, antonio

Topic  3
co spqien, national party, qwv, problem keep, wonder gop, woman voter, gop problem, fan co, people wonder, voter rank

Topic  4
address, climate change, fear, climate, divide, work sxsw, change together, fear divide, income inequality, co tsbac

Topic  5
cvfc kjzum, skill, boycott cookie, nice job, nice, job co, cvfc, little girl, cookie, skill libs

Topic  6
service, long, bartender long, learn co, everyone require, service customer, service job, lot life, customer, long long

Topic  7
green deal, green, hear, deal, wab, read news, lot green, well hear, watch read, watch

Topic  8

In [141]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_0.pickle', 'wb') as to_write:
    pickle.dump(nmf_0, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_1.pickle', 'wb') as to_write:
    pickle.dump(nmf_1, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_2.pickle', 'wb') as to_write:
    pickle.dump(nmf_2, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_3.pickle', 'wb') as to_write:
    pickle.dump(nmf_3, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_4.pickle', 'wb') as to_write:
    pickle.dump(nmf_4, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_5.pickle', 'wb') as to_write:
    pickle.dump(nmf_5, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_6.pickle', 'wb') as to_write:
    pickle.dump(nmf_6, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_7.pickle', 'wb') as to_write:
    pickle.dump(nmf_7, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_8.pickle', 'wb') as to_write:
    pickle.dump(nmf_8, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_9.pickle', 'wb') as to_write:
    pickle.dump(nmf_9, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_10.pickle', 'wb') as to_write:
    pickle.dump(nmf_10, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_11.pickle', 'wb') as to_write:
    pickle.dump(nmf_11, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_26.pickle', 'wb') as to_write:
    pickle.dump(nmf_26, to_write)

with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_40.pickle', 'wb') as to_write:
    pickle.dump(nmf_40, to_write)
    

Running Topic Modeling on ALL text data, both NMF and LSA, on various topic counts, to feed into DBScan / KMeans for cluster analysis

In [146]:
## Creating list of the processed text files to merge, so it's all in the same format and can re-use code
dfs_to_merge = [df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_26, df_40]

In [149]:
total = 0

for df in dfs_to_merge:
    print(df.shape)
    total += df.shape[0]
    
print(total)

(34997, 10)
(1446, 10)
(8345, 10)
(9346, 10)
(224793, 10)
(237746, 10)
(190998, 10)
(135612, 10)
(8192, 10)
(425343, 10)
(13157, 10)
(2270, 10)
(1141, 10)
(2628, 10)
1296014


In [150]:
df_all_preprocessed = pd.concat(dfs_to_merge)

In [151]:
df_all_preprocessed.shape

(1296014, 10)

In [153]:
df_all = df_all_preprocessed.copy()

In [157]:
df_all.head()

Unnamed: 0,screen_name,followers_count,modularity_class,main_text,retweet_text,tweet_date,PageRank,rt_count,final_text,tweet_processed
405812,DavidJHarrisJr,75631,0.0,So @aoc answered a casting call... and now she’s in Congress! @RealJamesWoods and @realDonaldTrump have to see this!!! https://t.co/HBUW78XqAo,,Mon Mar 11 01:16:04 +0000 2019,0.004242,6936,So @aoc answered a casting call... and now she’s in Congress! @RealJamesWoods and @realDonaldTrump have to see this!!! https://t.co/HBUW78XqAo,so aoc answer a cast call and now -PRON- s in congress realjameswood and realdonaldtrump have to see this https t co hbuw xqao
712255,DavidJHarrisJr,75652,0.0,So @aoc wants to raise our taxes and she hasn’t even paid her own! https://t.co/W1m1CCnN3Y,,Sun Mar 10 03:49:44 +0000 2019,0.004242,1127,So @aoc wants to raise our taxes and she hasn’t even paid her own! https://t.co/W1m1CCnN3Y,so aoc want to raise -PRON- tax and -PRON- hasn t even pay -PRON- own https t co w m ccnn y
465115,DavidJHarrisJr,75638,0.0,Now @aoc says America is garbage? \nWho elected this woman!!! https://t.co/O7hsaAK9jH,,Sun Mar 10 22:30:39 +0000 2019,0.004242,1056,Now @aoc says America is garbage? \nWho elected this woman!!! https://t.co/O7hsaAK9jH,now aoc say america be garbage who elect this woman https t co o hsaak jh
62994,DavidJHarrisJr,75620,0.0,Former FEC Commissioner says there’s more than enough evidence to launch a criminal investigation into @aoc! https://t.co/XPL6LhrZFD,,Tue Mar 12 01:25:16 +0000 2019,0.004242,997,Former FEC Commissioner says there’s more than enough evidence to launch a criminal investigation into @aoc! https://t.co/XPL6LhrZFD,former fec commissioner say there s more than enough evidence to launch a criminal investigation into aoc https t co xpl lhrzfd
1205837,DavidJHarrisJr,75690,0.0,Hey AOC! Guess what? Climate change and global warming IS a hoax!!!! https://t.co/OwY68EMVMm,,Fri Mar 08 05:02:07 +0000 2019,0.004242,838,Hey AOC! Guess what? Climate change and global warming IS a hoax!!!! https://t.co/OwY68EMVMm,hey aoc guess what climate change and global warming be a hoax https t co owy emvmm


In [165]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'wb') as to_write:
    pickle.dump(df_all, to_write)

In [2]:
with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'rb') as f:
    df_all = pickle.load(f)

So the concat worked, the # of rows equals the number of all the rows added up (just to be safe)

Now I'll run a few different LSAs and NMFs on the total data, and move on to clustering

# Preprocess Text - for both LSA and NMF

In [3]:
# Can come back to tweek this if I notice anything
tweet_stopwords = stopwords.words('english') + \
    ['rt', 'https', 'http', 'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 
     'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another', 'completely', 'thanks', 'way', 
     'find', 'used', 'thing', '2019', 'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new', 
     'day', 'days', 'aoc', 'alexandria', 'ocasio', 'cortez', 'ocasio-cortez', 'pron']

In [4]:
tfidf_all = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))

In [5]:
X_all = df_all['tweet_processed']

In [6]:
bag_of_words_all = tfidf_all.fit_transform(X_all)

In [7]:
type(bag_of_words_all)

scipy.sparse.csr.csr_matrix

# LSA on all text
*10, 15, 20, 25 topics*

In [170]:
# LSA on 10 topics
lsa_all_10 = TruncatedSVD(10)
lsa_all_10.fit_transform(bag_of_words_all)
lsa_all_10.explained_variance_ratio_

array([0.01284424, 0.01030946, 0.00990273, 0.00920476, 0.00827643,
       0.00781187, 0.00793363, 0.0072784 , 0.00735234, 0.00721214])

In [175]:
display_topics(lsa_all_10, tfidf_all.get_feature_names(), 10)


Topic  0
claim, box fraud, visit film, represent neighbor, crew postal, film news, never boyfriend, claim visit, apartment local, claim mail

Topic  1
skill libs, libs nice, cookie teach, co cvfc, kjzum, cvfc, cvfc kjzum, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, scandal misinformation, complaint fox, report allege

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, climate change, inequality address, divide address, change together, together work, address income, tsbac, co tsbac, work sxsw

Topic  6
moderate, time rewrite, edge solve, visionary tinker, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic

In [176]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_10.pickle', 'wb') as to_write:
    pickle.dump(lsa_all_10, to_write)

---------

In [187]:
# LSA on 15 topics
lsa_all_15 = TruncatedSVD(15)
lsa_all_15.fit_transform(bag_of_words_all)
lsa_all_15.explained_variance_ratio_

array([0.01284409, 0.01030946, 0.00990181, 0.00920577, 0.00827817,
       0.00780601, 0.00793854, 0.00728814, 0.00735668, 0.00721322,
       0.00614608, 0.00581037, 0.00529221, 0.00522468, 0.00525943])

In [188]:
display_topics(lsa_all_15, tfidf_all.get_feature_names(), 10)


Topic  0
claim, claim visit, visit film, crew postal, film news, represent neighbor, apartment local, never boyfriend, box fraud, claim mail

Topic  1
skill libs, libs nice, cookie teach, cvfc kjzum, cvfc, co cvfc, kjzum, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, scandal misinformation, complaint fox, report allege

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, climate change, inequality address, divide address, change together, together work, address income, co tsbac, tsbac, work sxsw

Topic  6
moderate, time rewrite, visionary tinker, edge solve, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic

In [189]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_15.pickle', 'wb') as to_write:
    pickle.dump(lsa_all_15, to_write)

---------

In [180]:
# LSA on 20 topics
lsa_all_20 = TruncatedSVD(20)
lsa_all_20.fit_transform(bag_of_words_all)
lsa_all_20.explained_variance_ratio_

array([0.01284416, 0.01031006, 0.00990201, 0.0092044 , 0.00827566,
       0.00780765, 0.00793947, 0.00729166, 0.00735414, 0.00721279,
       0.00615355, 0.00581353, 0.00530054, 0.0052364 , 0.0052897 ,
       0.00525525, 0.00489424, 0.00446149, 0.00414687, 0.00423378])

In [181]:
display_topics(lsa_all_20, tfidf_all.get_feature_names(), 10)


Topic  0
claim, film news, box fraud, crew postal, visit film, claim visit, apartment local, represent neighbor, never boyfriend, claim mail

Topic  1
skill libs, libs nice, cookie teach, cvfc kjzum, co cvfc, cvfc, kjzum, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, scandal misinformation, complaint fox, report allege

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, climate change, inequality address, divide address, change together, together work, address income, co tsbac, tsbac, work sxsw

Topic  6
moderate, time rewrite, edge solve, visionary tinker, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic

In [182]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_20.pickle', 'wb') as to_write:
    pickle.dump(lsa_all_20, to_write)

---------

In [8]:
# LSA on 25 topics
lsa_all_25 = TruncatedSVD(25)
lsa_all_25.fit_transform(bag_of_words_all)
lsa_all_25.explained_variance_ratio_

array([0.01284428, 0.01031   , 0.00990254, 0.0092052 , 0.0082777 ,
       0.00780795, 0.00793927, 0.00729185, 0.00735446, 0.00721273,
       0.00615261, 0.00580615, 0.00530464, 0.00523566, 0.00529585,
       0.00525649, 0.00492341, 0.0044789 , 0.00419832, 0.00428453,
       0.00406109, 0.00396976, 0.00393049, 0.00365576, 0.00348358])

In [11]:
display_topics(lsa_all_25, tfidf_all.get_feature_names(), 10)


Topic  0
claim, box fraud, apartment local, represent neighbor, crew postal, never boyfriend, claim visit, film news, visit film, claim mail

Topic  1
skill libs, libs nice, cookie teach, co cvfc, kjzum, cvfc, cvfc kjzum, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, scandal misinformation, complaint fox, report allege

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, climate change, inequality address, divide address, change together, together work, address income, co tsbac, tsbac, work sxsw

Topic  6
moderate, time rewrite, visionary tinker, edge solve, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic

In [12]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_25.pickle', 'wb') as to_write:
    pickle.dump(lsa_all_25, to_write)

# NMF on all text
*10, 15, 20, 25 topics*

In [13]:
# NMF on 10 topics
nmf_all_10 = NMF(10)
nmf_all_10.fit_transform(bag_of_words_all)
print(nmf_all_10.reconstruction_err_)

1085.5968911147454


In [14]:
display_topics(nmf_all_10, tfidf_all.get_feature_names(), 10)


Topic  0
claim, crew postal, film news, visit film, represent neighbor, claim visit, apartment local, box fraud, never boyfriend, claim mail

Topic  1
skill libs, libs nice, cookie teach, cvfc, cvfc kjzum, kjzum, co cvfc, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, news report, complaint fox, scandal misinformation

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, inequality address, divide address, change together, together work, address income, co tsbac, tsbac, work sxsw, fear divide

Topic  6
moderate, time rewrite, visionary tinker, edge solve, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic  7
i

In [15]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_10.pickle', 'wb') as to_write:
    pickle.dump(nmf_all_10, to_write)

---------

In [16]:
# NMF on 15 topics
nmf_all_15 = NMF(15)
nmf_all_15.fit_transform(bag_of_words_all)
print(nmf_all_15.reconstruction_err_)

1068.5427287389832


In [17]:
display_topics(nmf_all_15, tfidf_all.get_feature_names(), 10)


Topic  0
claim, claim visit, crew postal, apartment local, box fraud, visit film, represent neighbor, film news, never boyfriend, claim mail

Topic  1
skill libs, libs nice, cookie teach, kjzum, co cvfc, cvfc kjzum, cvfc, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, news report, scandal misinformation, complaint fox

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, inequality address, divide address, change together, together work, address income, co tsbac, tsbac, work sxsw, fear divide

Topic  6
moderate, time rewrite, visionary tinker, edge solve, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic  7
i

In [18]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_15.pickle', 'wb') as to_write:
    pickle.dump(nmf_all_15, to_write)

---------

In [19]:
# NMF on 20 topics
nmf_all_20 = NMF(20)
nmf_all_20.fit_transform(bag_of_words_all)
print(nmf_all_20.reconstruction_err_)

1054.2875702490992


In [20]:
display_topics(nmf_all_20, tfidf_all.get_feature_names(), 10)


Topic  0
claim, film news, apartment local, crew postal, never boyfriend, box fraud, represent neighbor, visit film, claim visit, claim mail

Topic  1
skill libs, libs nice, cookie teach, co cvfc, kjzum, cvfc kjzum, cvfc, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, news report, scandal misinformation, complaint fox

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, crazy green, oppose green, senator mazie, hirono hawaii

Topic  5
address, inequality address, divide address, change together, together work, address income, tsbac, co tsbac, work sxsw, fear divide

Topic  6
moderate, time rewrite, visionary tinker, edge solve, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic  7
i

In [21]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_20.pickle', 'wb') as to_write:
    pickle.dump(nmf_all_20, to_write)

---------

In [22]:
# NMF on 25 topics
nmf_all_25 = NMF(25)
nmf_all_25.fit_transform(bag_of_words_all)
print(nmf_all_25.reconstruction_err_)

## setting a variable to the fit transformed

1042.4274847932734


In [28]:
nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all))
nmf_model_25_df.shape

(1296014, 25)

In [29]:
print(nmf_all_25.reconstruction_err_)
nmf_model_25_df.head()

1042.4277109200966


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.112873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.000341,3e-05,0.000488,0.0,0.00041,0.000302,0.000239,0.000329,0.0,...,0.000617,0.000773,0.000503,0.000775,0.0,0.000674,0.003253,0.010097,0.000656,0.0
2,0.0,0.000201,0.000123,0.000141,0.0,0.000175,0.000109,0.0,0.0,0.010703,...,1.7e-05,0.0,0.000183,0.000109,4.3e-05,0.0,0.015331,0.0,0.010695,0.0
3,0.0,0.000227,0.0,0.000168,0.0,0.000199,0.000243,0.002161,0.000307,0.0,...,0.000441,0.000516,0.000245,0.000574,5.7e-05,3.1e-05,0.000271,0.003239,0.000439,0.0
4,0.0,0.000223,7.3e-05,7.2e-05,0.0,0.00742,0.000188,0.000149,9.2e-05,0.0,...,0.000427,0.000664,0.000307,0.000473,0.0,0.0,0.000351,2e-05,0.000372,0.0


NOTE TO SELF - if the above works, I'll need to make DFs of all versions again (which will take around another hour) :(

Then I'll concat them with index and screen name, so I can assign values back to the tweeters otherwise. One annoying thing is I didn't pass in tweet ID so I'll have to think about that too

In [23]:
display_topics(nmf_all_25, tfidf_all.get_feature_names(), 10)


Topic  0
claim, apartment local, crew postal, never boyfriend, claim visit, film news, visit film, box fraud, represent neighbor, claim mail

Topic  1
skill libs, libs nice, cookie teach, cvfc, cvfc kjzum, co cvfc, kjzum, boycott cookie, nice job, teach little

Topic  2
file bogus, case conspiracy, allege untrue, spamm file, group spamm, untrue scandal, theory run, news report, scandal misinformation, complaint fox

Topic  3
society job, live society, job leave, leave die, automate work, automate, die core, core problem, excited live, reason excited

Topic  4
hawaii, crazy, green deal, green, deal, hawaii anyone, oppose green, crazy green, senator mazie, hirono hawaii

Topic  5
address, inequality address, divide address, change together, together work, address income, tsbac, co tsbac, work sxsw, fear divide

Topic  6
moderate, time rewrite, edge solve, visionary tinker, naive visionary, moderate naive, moderate spot, problem democracy, democracy economy, critique moderate

Topic  7
i

In [24]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_25.pickle', 'wb') as to_write:
    pickle.dump(nmf_all_25, to_write)

Now I will create DF's where I append this info to a dataframe

then try dbscan with these

In [31]:
df_topic_base = df_all.reset_index()
df_topic_base.head()

Unnamed: 0,index,screen_name,followers_count,modularity_class,main_text,retweet_text,tweet_date,PageRank,rt_count,final_text,tweet_processed
0,405812,DavidJHarrisJr,75631,0.0,So @aoc answered a casting call... and now she...,,Mon Mar 11 01:16:04 +0000 2019,0.004242,6936,So @aoc answered a casting call... and now she...,so aoc answer a cast call and now -PRON- s in ...
1,712255,DavidJHarrisJr,75652,0.0,So @aoc wants to raise our taxes and she hasn’...,,Sun Mar 10 03:49:44 +0000 2019,0.004242,1127,So @aoc wants to raise our taxes and she hasn’...,so aoc want to raise -PRON- tax and -PRON- has...
2,465115,DavidJHarrisJr,75638,0.0,Now @aoc says America is garbage? \nWho electe...,,Sun Mar 10 22:30:39 +0000 2019,0.004242,1056,Now @aoc says America is garbage? \nWho electe...,now aoc say america be garbage who elect this ...
3,62994,DavidJHarrisJr,75620,0.0,Former FEC Commissioner says there’s more than...,,Tue Mar 12 01:25:16 +0000 2019,0.004242,997,Former FEC Commissioner says there’s more than...,former fec commissioner say there s more than ...
4,1205837,DavidJHarrisJr,75690,0.0,Hey AOC! Guess what? Climate change and global...,,Fri Mar 08 05:02:07 +0000 2019,0.004242,838,Hey AOC! Guess what? Climate change and global...,hey aoc guess what climate change and global w...


crap - now I know why I saved the fit_transform(bag_of_words_ objects previously, that's what I need to use to create the df. For now I'm seeing if I can just call that as the "data" and see if that works. Otherwise I will need to set them all equal to variables

In [32]:
df_topic_base.shape

(1296014, 11)

In [34]:
lsa_all_10 = TruncatedSVD(10)

lsa_model_10_df = pd.DataFrame(data=lsa_all_10.fit_transform(bag_of_words_all))
lsa_model_10_df.shape

(1296014, 10)

In [36]:
lsa_all_15 = TruncatedSVD(15)

lsa_model_15_df = pd.DataFrame(data=lsa_all_15.fit_transform(bag_of_words_all))
lsa_model_15_df.shape

(1296014, 15)

In [37]:
lsa_all_20 = TruncatedSVD(20)

lsa_model_20_df = pd.DataFrame(data=lsa_all_20.fit_transform(bag_of_words_all))
lsa_model_20_df.shape

(1296014, 20)

In [38]:
lsa_all_25 = TruncatedSVD(25)

lsa_model_25_df = pd.DataFrame(data=lsa_all_25.fit_transform(bag_of_words_all))
lsa_model_25_df.shape

(1296014, 25)

In [39]:
df_lsa_model_10 = pd.merge(df_topic_base, lsa_model_10_df, left_index=True, right_index=True)
df_lsa_model_15 = pd.merge(df_topic_base, lsa_model_15_df, left_index=True, right_index=True)
df_lsa_model_20 = pd.merge(df_topic_base, lsa_model_20_df, left_index=True, right_index=True)
df_lsa_model_25 = pd.merge(df_topic_base, lsa_model_25_df, left_index=True, right_index=True)

---------

In [40]:
nmf_all_10 = NMF(10)

nmf_model_10_df = pd.DataFrame(data=nmf_all_10.fit_transform(bag_of_words_all))
nmf_model_10_df.shape

(1296014, 10)

In [41]:
nmf_all_15 = NMF(15)

nmf_model_15_df = pd.DataFrame(data=nmf_all_15.fit_transform(bag_of_words_all))
nmf_model_15_df.shape

(1296014, 15)

In [42]:
nmf_all_20 = NMF(20)

nmf_model_20_df = pd.DataFrame(data=nmf_all_20.fit_transform(bag_of_words_all))
nmf_model_20_df.shape

(1296014, 20)

In [None]:
# nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all)) ALREADY DONE!
nmf_model_25_df.shape

In [43]:
df_nmf_model_10 = pd.merge(df_topic_base, nmf_model_10_df, left_index=True, right_index=True)
df_nmf_model_15 = pd.merge(df_topic_base, nmf_model_15_df, left_index=True, right_index=True)
df_nmf_model_20 = pd.merge(df_topic_base, nmf_model_20_df, left_index=True, right_index=True)
df_nmf_model_25 = pd.merge(df_topic_base, nmf_model_25_df, left_index=True, right_index=True)


In [44]:
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_10.pickle', 'wb') as to_write:
    pickle.dump(df_lsa_model_10, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_15.pickle', 'wb') as to_write:
    pickle.dump(df_lsa_model_15, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_20.pickle', 'wb') as to_write:
    pickle.dump(df_lsa_model_20, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_25.pickle', 'wb') as to_write:
    pickle.dump(df_lsa_model_25, to_write)
    

with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_10.pickle', 'wb') as to_write:
    pickle.dump(df_nmf_model_10, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_15.pickle', 'wb') as to_write:
    pickle.dump(df_nmf_model_15, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_20.pickle', 'wb') as to_write:
    pickle.dump(df_nmf_model_20, to_write)
    
with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_25.pickle', 'wb') as to_write:
    pickle.dump(df_nmf_model_25, to_write)
    
    

    
