In [52]:
import pandas as pd
import numpy as np

import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import nltk
import spacy

from textblob import TextBlob, Word

import re

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

This is an edited version of the AOC topic-modeling notebook, reorganized to be easier to follow.

In [53]:
# Show up to 400 characters per column so tweet text is not truncated (199 is an alternative).
pd.options.display.max_colwidth = 400

Now, preprocessing (using the same code as for the Kickstarter project as a baseline; come back here to tweak it later).

Look into https://pypi.org/project/tweet-preprocessor/ for tweet processing later!

In [54]:
# Load spaCy's English pipeline with the dependency parser and named-entity
# recognizer disabled; pre_process() only reads token.lemma_ from the result.
# NOTE(review): the 'en' shortcut name is presumably a spaCy 2.x model link and
# is not available in spaCy 3+ (use 'en_core_web_sm' there) — TODO confirm the
# installed spaCy version before re-running.
nlp = spacy.load('en', disable=['parser', 'ner'])
# Alternative that keeps the full pipeline enabled:
# nlp = spacy.load('en')

In [55]:
def pre_process(text):
    """Normalize one tweet and return it as a space-joined string of lemmas.

    Steps, in order:
      1. lowercase the text;
      2. replace every match of the tag pattern with a placeholder;
      3. collapse each run of digits / non-word characters into one space;
      4. run the module-level spaCy pipeline ``nlp`` and join the token lemmas.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Lemmas separated by single spaces.
    """
    cleaned = text.lower()

    # NOTE(review): the pattern and its replacement are written with HTML-escaped
    # angle brackets. Step 3 strips '&' and ';' from the placeholder, which leaves
    # the tokens "lt gt" in the text — confirm this is intended and not an
    # artifact of how the notebook was exported.
    cleaned = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", cleaned)

    # Digits and any non-word characters become a single separating space.
    cleaned = re.sub("(\\d|\\W)+", " ", cleaned)

    # Lemmatize with spaCy and stitch the lemmas back into one string.
    lemmas = [token.lemma_ for token in nlp(cleaned)]
    return " ".join(lemmas)

In [56]:
# Stop words handed to every TfidfVectorizer below: NLTK's English list
# followed by project-specific additions (order kept as originally written).
tweet_stopwords = stopwords.words('english') + [
    # Twitter / URL boilerplate
    'rt', 'https', 'http', 'amp', 'via',
    # filler words
    'one', 'around', 'would', 'let', 'could', 'going', 'like',
    'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another',
    'completely', 'thanks', 'way', 'find', 'used', 'thing', '2019',
    'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new',
    'day', 'days',
    # subject keywords (presumably the terms the tweets were collected on — TODO confirm)
    'captain', 'marvel', 'mcu',
    # presumably what is left of spaCy's pronoun lemma after pre_process — TODO confirm
    'pron',
]

In [57]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted decomposition exposing ``components_`` (one row of term
        weights per topic).
    feature_names : sequence
        Term for each column of ``components_``.
    no_top_words : int
        How many terms to list per topic, heaviest first.
    topic_names : sequence, optional
        Per-topic labels. A missing or falsy label falls back to the
        numeric topic index in the heading.
    """
    for topic_idx, weights in enumerate(model.components_):
        label = topic_names[topic_idx] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", topic_idx)

        # Column indices sorted by descending weight, truncated to the top N.
        ranked = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in ranked]
        print(", ".join(top_terms))

In [58]:
# Load the pre-NLP DataFrame that was pickled for each community.
# NOTE: pickle.load can execute arbitrary code — only open files you produced yourself.
_PRE_NLP_DIR = '/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp'


def _load_community_df(community_id):
    """Unpickle and return ``df_<community_id>.pickle`` from ``_PRE_NLP_DIR``."""
    with open('{}/df_{}.pickle'.format(_PRE_NLP_DIR, community_id), 'rb') as f:
        return pickle.load(f)


df_0 = _load_community_df(0)
df_1 = _load_community_df(1)
df_2 = _load_community_df(2)
df_3 = _load_community_df(3)
df_4 = _load_community_df(4)
df_5 = _load_community_df(5)
df_6 = _load_community_df(6)
df_7 = _load_community_df(7)
df_11 = _load_community_df(11)
df_15 = _load_community_df(15)

In [59]:
# For every community frame, 'final_text' is the retweeted text when one is
# present and the tweet's own text otherwise.
for _community_df in (df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_11, df_15):
    _has_retweet = _community_df['retweet_text'].notnull()
    _community_df['final_text'] = np.where(
        _has_retweet,
        _community_df['retweet_text'],
        _community_df['main_text'],
    )

RUN ALL CELLS BELOW ONLY ONCE THE PREVIOUS CELL HAS FINISHED!

In [None]:
# Run pre_process over each community's 'final_text' and store the result
# in a new 'tweet_processed' column.
for _community_df in (df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_11, df_15):
    _community_df['tweet_processed'] = _community_df['final_text'].apply(pre_process)

In [None]:
# Save each community frame (now carrying 'tweet_processed') so the
# preprocessing does not have to be repeated.
_PREPROCESSED_DIR = '/Users/robertpagano/src/metis_project_kojak/network_files/marvel/community_pickles_pre_nlp/preproccessed_text'

_frames_to_save = [
    (0, df_0), (1, df_1), (2, df_2), (3, df_3), (4, df_4),
    (5, df_5), (6, df_6), (7, df_7), (11, df_11), (15, df_15),
]

for _community_id, _community_df in _frames_to_save:
    _target = '{}/df_{}_preprocess.pickle'.format(_PREPROCESSED_DIR, _community_id)
    with open(_target, 'wb') as to_write:
        pickle.dump(_community_df, to_write)


In [None]:
# Preprocessed tweet text per community — the documents each TF-IDF
# vectorizer is fitted on.
# Fix: X_0 previously read from df_1 (copy-paste slip), so every
# "community 0" model was really a second copy of community 1.
X_0 = df_0['tweet_processed']
X_1 = df_1['tweet_processed']
X_2 = df_2['tweet_processed']
X_3 = df_3['tweet_processed']
X_4 = df_4['tweet_processed']
X_5 = df_5['tweet_processed']
X_6 = df_6['tweet_processed']
X_7 = df_7['tweet_processed']
X_11 = df_11['tweet_processed']
X_15 = df_15['tweet_processed']


In [None]:
def _make_tweet_tfidf():
    """Return a fresh, unfitted TF-IDF vectorizer with the shared tweet settings.

    Tokens are runs of two or more lowercase ASCII letters, unigrams and
    bigrams are both extracted, and ``tweet_stopwords`` is the stop-word list.
    """
    return TfidfVectorizer(
        stop_words=tweet_stopwords,
        token_pattern="\\b[a-z][a-z]+\\b",
        ngram_range=(1, 2),
    )


# One independent vectorizer per community.
tfidf_0 = _make_tweet_tfidf()
tfidf_1 = _make_tweet_tfidf()
tfidf_2 = _make_tweet_tfidf()
tfidf_3 = _make_tweet_tfidf()
tfidf_4 = _make_tweet_tfidf()
tfidf_5 = _make_tweet_tfidf()
tfidf_6 = _make_tweet_tfidf()
tfidf_7 = _make_tweet_tfidf()
tfidf_11 = _make_tweet_tfidf()
tfidf_15 = _make_tweet_tfidf()


Community_0

In [None]:
# Community 0 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_0 = tfidf_0.fit_transform(X_0)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_0 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_0 = lsa_0.fit_transform(bag_of_words_0)
lsa_0.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 0.
display_topics(model=lsa_0, feature_names=tfidf_0.get_feature_names(), no_top_words=10)

Community_1

In [None]:
# Community 1 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_1 = tfidf_1.fit_transform(X_1)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_1 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_1 = lsa_1.fit_transform(bag_of_words_1)
lsa_1.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 1.
display_topics(model=lsa_1, feature_names=tfidf_1.get_feature_names(), no_top_words=10)

Community_2

In [None]:
# Community 2 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_2 = tfidf_2.fit_transform(X_2)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_2 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_2 = lsa_2.fit_transform(bag_of_words_2)
lsa_2.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 2.
display_topics(model=lsa_2, feature_names=tfidf_2.get_feature_names(), no_top_words=10)

Community_3

In [None]:
# Community 3 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_3 = tfidf_3.fit_transform(X_3)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_3 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_3 = lsa_3.fit_transform(bag_of_words_3)
lsa_3.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 3.
display_topics(model=lsa_3, feature_names=tfidf_3.get_feature_names(), no_top_words=10)

Community_4

In [None]:
# Community 4 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_4 = tfidf_4.fit_transform(X_4)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_4 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_4 = lsa_4.fit_transform(bag_of_words_4)
lsa_4.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 4.
display_topics(model=lsa_4, feature_names=tfidf_4.get_feature_names(), no_top_words=10)

Community_5

In [None]:
# Community 5 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_5 = tfidf_5.fit_transform(X_5)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_5 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_5 = lsa_5.fit_transform(bag_of_words_5)
lsa_5.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 5.
display_topics(model=lsa_5, feature_names=tfidf_5.get_feature_names(), no_top_words=10)

Community_6

In [None]:
# Community 6 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_6 = tfidf_6.fit_transform(X_6)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_6 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_6 = lsa_6.fit_transform(bag_of_words_6)
lsa_6.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 6.
display_topics(model=lsa_6, feature_names=tfidf_6.get_feature_names(), no_top_words=10)

Community_7

In [None]:
# Community 7 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_7 = tfidf_7.fit_transform(X_7)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_7 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_7 = lsa_7.fit_transform(bag_of_words_7)
lsa_7.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 7.
display_topics(model=lsa_7, feature_names=tfidf_7.get_feature_names(), no_top_words=10)

Community_11

In [None]:
# Community 11 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_11 = tfidf_11.fit_transform(X_11)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_11 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_11 = lsa_11.fit_transform(bag_of_words_11)
lsa_11.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 11.
display_topics(model=lsa_11, feature_names=tfidf_11.get_feature_names(), no_top_words=10)

Community_15

In [None]:
# Community 15 — TF-IDF matrix of the tweets, reduced to 20 LSA topics.
bag_of_words_15 = tfidf_15.fit_transform(X_15)
# random_state is pinned so the randomized SVD solver returns the same
# components (and topic order) on every run.
lsa_15 = TruncatedSVD(n_components=20, random_state=42)
doc_topic_15 = lsa_15.fit_transform(bag_of_words_15)
lsa_15.explained_variance_ratio_

In [None]:
# Top 10 terms of each LSA topic for community 15.
display_topics(model=lsa_15, feature_names=tfidf_15.get_feature_names(), no_top_words=10)

Now I have my top words in each LSA object w/ 20 topics each. I'm going to pickle each model and then use a new file to:
 - Find which tweets score the highest in each topic to get examples
 - Name communities
 - Sentiment analysis (maybe weekend)

In [None]:
# Write each community's fitted LSA model to disk as lsa_<community>.pickle.
_LSA_DIR = '/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_marvel'

_lsa_models = [
    (0, lsa_0), (1, lsa_1), (2, lsa_2), (3, lsa_3), (4, lsa_4),
    (5, lsa_5), (6, lsa_6), (7, lsa_7), (11, lsa_11), (15, lsa_15),
]

for _community_id, _model in _lsa_models:
    _target = '{}/lsa_{}.pickle'.format(_LSA_DIR, _community_id)
    with open(_target, 'wb') as to_write:
        pickle.dump(_model, to_write)
      

NMF Models?

In [None]:
# Community 0 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_0 = NMF(n_components=20, random_state=42)
doc_topic_NMF_0 = nmf_0.fit_transform(bag_of_words_0)
print(nmf_0.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_0, tfidf_0.get_feature_names(), 10)

In [None]:
# Community 1 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_1 = NMF(n_components=20, random_state=42)
doc_topic_NMF_1 = nmf_1.fit_transform(bag_of_words_1)
print(nmf_1.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_1, tfidf_1.get_feature_names(), 10)

In [None]:
# Community 2 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_2 = NMF(n_components=20, random_state=42)
doc_topic_NMF_2 = nmf_2.fit_transform(bag_of_words_2)
print(nmf_2.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_2, tfidf_2.get_feature_names(), 10)

In [None]:
# Community 3 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_3 = NMF(n_components=20, random_state=42)
doc_topic_NMF_3 = nmf_3.fit_transform(bag_of_words_3)
print(nmf_3.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_3, tfidf_3.get_feature_names(), 10)

In [None]:
# Community 4 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_4 = NMF(n_components=20, random_state=42)
doc_topic_NMF_4 = nmf_4.fit_transform(bag_of_words_4)
print(nmf_4.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_4, tfidf_4.get_feature_names(), 10)

In [None]:
# Community 5 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_5 = NMF(n_components=20, random_state=42)
doc_topic_NMF_5 = nmf_5.fit_transform(bag_of_words_5)
print(nmf_5.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_5, tfidf_5.get_feature_names(), 10)

In [None]:
# Community 6 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_6 = NMF(n_components=20, random_state=42)
doc_topic_NMF_6 = nmf_6.fit_transform(bag_of_words_6)
print(nmf_6.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_6, tfidf_6.get_feature_names(), 10)

In [None]:
# Community 7 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_7 = NMF(n_components=20, random_state=42)
doc_topic_NMF_7 = nmf_7.fit_transform(bag_of_words_7)
print(nmf_7.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_7, tfidf_7.get_feature_names(), 10)

In [None]:
# Community 11 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_11 = NMF(n_components=20, random_state=42)
doc_topic_NMF_11 = nmf_11.fit_transform(bag_of_words_11)
print(nmf_11.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_11, tfidf_11.get_feature_names(), 10)

In [None]:
# Community 15 — NMF with 20 topics on the community's TF-IDF matrix.
# random_state is pinned so the factorization is repeatable between runs.
nmf_15 = NMF(n_components=20, random_state=42)
doc_topic_NMF_15 = nmf_15.fit_transform(bag_of_words_15)
print(nmf_15.reconstruction_err_)

# Top 10 terms of each NMF topic.
display_topics(nmf_15, tfidf_15.get_feature_names(), 10)

In [None]:
# Write each community's fitted NMF model to disk as nmf_<community>.pickle.
# Fix: the last entry used to dump the undefined name `nmf_12` into
# nmf_12.pickle, which raised NameError and never saved community 15's model.
_NMF_DIR = '/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_marvel'

_nmf_models = [
    (0, nmf_0), (1, nmf_1), (2, nmf_2), (3, nmf_3), (4, nmf_4),
    (5, nmf_5), (6, nmf_6), (7, nmf_7), (11, nmf_11), (15, nmf_15),
]

for _community_id, _model in _nmf_models:
    _target = '{}/nmf_{}.pickle'.format(_NMF_DIR, _community_id)
    with open(_target, 'wb') as to_write:
        pickle.dump(_model, to_write)


    

Running Topic Modeling on ALL text data, both NMF and LSA, on various topic counts, to feed into DBScan / KMeans for cluster analysis

In [50]:
# ## Creating list of the processed text files to merge, so it's all in the same format and can re-use code
# dfs_to_merge = [df_0, df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10, df_11, df_26, df_40]

In [51]:
# total = 0

# for df in dfs_to_merge:
#     print(df.shape)
#     total += df.shape[0]
    
# print(total)

In [49]:
# df_all_preprocessed = pd.concat(dfs_to_merge)

In [48]:
# df_all_preprocessed.shape

In [153]:
# df_all = df_all_preprocessed.copy()

In [45]:
# df_all.head()

In [46]:
# with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'wb') as to_write:
#     pickle.dump(df_all, to_write)

In [47]:
# with open('/Users/robertpagano/metis_data/project_5/community_pickles_pre_nlp/preprocessed_dfs/df_all.pickle', 'rb') as f:
#     df_all = pickle.load(f)

So the concat worked, the # of rows equals the number of all the rows added up (just to be safe)

Now I'll run a few different LSAs and NMFs on the total data, and move on to clustering

# Preprocess Text - for both LSA and NMF

In [1]:
# # Can come back to tweak this if I notice anything
# tweet_stopwords = stopwords.words('english') + \
#     ['rt', 'https', 'http', 'amp', 'via', 'one', 'around', 'would', 'let', 'could', 'going', 'like', 
#      'get', 'may', 'says', 'say', 'make', 'based', 'even', 'another', 'completely', 'thanks', 'way', 
#      'find', 'used', 'thing', '2019', 'see', 'need', 'know', 'knows', 'think', 'thinks', 'take', 'new', 
#      'day', 'days', 'aoc', 'alexandria', 'ocasio', 'cortez', 'ocasio-cortez', 'pron']

In [2]:
# tfidf_all = TfidfVectorizer(stop_words=tweet_stopwords, token_pattern="\\b[a-z][a-z]+\\b", ngram_range=(1,2))

In [3]:
# X_all = df_all['tweet_processed']

In [4]:
# bag_of_words_all = tfidf_all.fit_transform(X_all)

In [5]:
# type(bag_of_words_all)

# LSA on all text
*10, 15, 20, 25 topics*

In [6]:
# # LSA on 10 topics
# lsa_all_10 = TruncatedSVD(10)
# lsa_all_10.fit_transform(bag_of_words_all)
# lsa_all_10.explained_variance_ratio_

In [7]:
# display_topics(lsa_all_10, tfidf_all.get_feature_names(), 10)

In [8]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_10.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_10, to_write)

---------

In [9]:
# # LSA on 15 topics
# lsa_all_15 = TruncatedSVD(15)
# lsa_all_15.fit_transform(bag_of_words_all)
# lsa_all_15.explained_variance_ratio_

In [10]:
# display_topics(lsa_all_15, tfidf_all.get_feature_names(), 10)

In [11]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_15.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_15, to_write)

---------

In [12]:
# # LSA on 20 topics
# lsa_all_20 = TruncatedSVD(20)
# lsa_all_20.fit_transform(bag_of_words_all)
# lsa_all_20.explained_variance_ratio_

In [13]:
# display_topics(lsa_all_20, tfidf_all.get_feature_names(), 10)

In [14]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_20.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_20, to_write)

---------

In [15]:
# # LSA on 25 topics
# lsa_all_25 = TruncatedSVD(25)
# lsa_all_25.fit_transform(bag_of_words_all)
# lsa_all_25.explained_variance_ratio_

In [16]:
# display_topics(lsa_all_25, tfidf_all.get_feature_names(), 10)

In [17]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/lsa_aoc/lsa_all_25.pickle', 'wb') as to_write:
#     pickle.dump(lsa_all_25, to_write)

# NMF on all text
*10, 15, 20, 25 topics*

In [18]:
# # NMF on 10 topics
# nmf_all_10 = NMF(10)
# nmf_all_10.fit_transform(bag_of_words_all)
# print(nmf_all_10.reconstruction_err_)

In [19]:
# display_topics(nmf_all_10, tfidf_all.get_feature_names(), 10)

In [20]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_10.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_10, to_write)

---------

In [21]:
# # NMF on 15 topics
# nmf_all_15 = NMF(15)
# nmf_all_15.fit_transform(bag_of_words_all)
# print(nmf_all_15.reconstruction_err_)

In [22]:
# display_topics(nmf_all_15, tfidf_all.get_feature_names(), 10)

In [23]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_15.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_15, to_write)

---------

In [24]:
# # NMF on 20 topics
# nmf_all_20 = NMF(20)
# nmf_all_20.fit_transform(bag_of_words_all)
# print(nmf_all_20.reconstruction_err_)

In [25]:
# display_topics(nmf_all_20, tfidf_all.get_feature_names(), 10)

In [26]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_20.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_20, to_write)

---------

In [27]:
# # NMF on 25 topics
# nmf_all_25 = NMF(25)
# nmf_all_25.fit_transform(bag_of_words_all)
# print(nmf_all_25.reconstruction_err_)

# ## setting a variable to the fit transformed

In [28]:
# nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all))
# nmf_model_25_df.shape

In [29]:
# print(nmf_all_25.reconstruction_err_)
# nmf_model_25_df.head()

NOTE TO SELF - if the above works, I'll need to make DFs of all versions again (which will take around another hour) :(

Then I'll concat them with index and screen name, so I can assign values back to the tweeters otherwise. One annoying thing is I didn't pass in tweet ID so I'll have to think about that too

In [30]:
# display_topics(nmf_all_25, tfidf_all.get_feature_names(), 10)

In [31]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/nmf_aoc/nmf_all_25.pickle', 'wb') as to_write:
#     pickle.dump(nmf_all_25, to_write)

Now I will create DF's where I append this info to a dataframe

then try dbscan with these

In [32]:
# df_topic_base = df_all.reset_index()
# df_topic_base.head()

Crap — now I know why I previously saved the `fit_transform(bag_of_words_...)` results: they are what I need to build the DataFrame. For now I'm checking whether I can pass that call directly as the `data` argument. If that doesn't work, I will need to assign each result to its own variable.

In [33]:
# df_topic_base.shape

In [34]:
# lsa_all_10 = TruncatedSVD(10)

# lsa_model_10_df = pd.DataFrame(data=lsa_all_10.fit_transform(bag_of_words_all))
# lsa_model_10_df.shape

In [35]:
# lsa_all_15 = TruncatedSVD(15)

# lsa_model_15_df = pd.DataFrame(data=lsa_all_15.fit_transform(bag_of_words_all))
# lsa_model_15_df.shape

In [36]:
# lsa_all_20 = TruncatedSVD(20)

# lsa_model_20_df = pd.DataFrame(data=lsa_all_20.fit_transform(bag_of_words_all))
# lsa_model_20_df.shape

In [37]:
# lsa_all_25 = TruncatedSVD(25)

# lsa_model_25_df = pd.DataFrame(data=lsa_all_25.fit_transform(bag_of_words_all))
# lsa_model_25_df.shape

In [38]:
# df_lsa_model_10 = pd.merge(df_topic_base, lsa_model_10_df, left_index=True, right_index=True)
# df_lsa_model_15 = pd.merge(df_topic_base, lsa_model_15_df, left_index=True, right_index=True)
# df_lsa_model_20 = pd.merge(df_topic_base, lsa_model_20_df, left_index=True, right_index=True)
# df_lsa_model_25 = pd.merge(df_topic_base, lsa_model_25_df, left_index=True, right_index=True)

---------

In [39]:
# nmf_all_10 = NMF(10)

# nmf_model_10_df = pd.DataFrame(data=nmf_all_10.fit_transform(bag_of_words_all))
# nmf_model_10_df.shape

In [40]:
# nmf_all_15 = NMF(15)

# nmf_model_15_df = pd.DataFrame(data=nmf_all_15.fit_transform(bag_of_words_all))
# nmf_model_15_df.shape

In [41]:
# nmf_all_20 = NMF(20)

# nmf_model_20_df = pd.DataFrame(data=nmf_all_20.fit_transform(bag_of_words_all))
# nmf_model_20_df.shape

In [42]:
# # nmf_model_25_df = pd.DataFrame(data=nmf_all_25.fit_transform(bag_of_words_all)) ALREADY DONE!
# nmf_model_25_df.shape

In [43]:
# df_nmf_model_10 = pd.merge(df_topic_base, nmf_model_10_df, left_index=True, right_index=True)
# df_nmf_model_15 = pd.merge(df_topic_base, nmf_model_15_df, left_index=True, right_index=True)
# df_nmf_model_20 = pd.merge(df_topic_base, nmf_model_20_df, left_index=True, right_index=True)
# df_nmf_model_25 = pd.merge(df_topic_base, nmf_model_25_df, left_index=True, right_index=True)


In [44]:
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_10.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_10, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_15.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_15, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_20.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_20, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_lsa_model_25.pickle', 'wb') as to_write:
#     pickle.dump(df_lsa_model_25, to_write)
    

# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_10.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_10, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_15.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_15, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_20.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_20, to_write)
    
# with open('/Users/robertpagano/metis_data/project_5/topic_modeling/topic_dfs/df_nmf_model_25.pickle', 'wb') as to_write:
#     pickle.dump(df_nmf_model_25, to_write)
    
    

    
