TODO: create a legend with Plotly (see the link in the .py script).
Maybe message the Plotly person about it.
https://plot.ly/scikit-learn/plot-compare-methods/

In [25]:
import argparse
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation 
from sklearn.model_selection import GridSearchCV

"""
Topic modeling on time series data
Sample call from commandline:
python topic_modeling.py --path ~/Documents/Git/Twitter-Mining/streaming_tweets/data/streaming_guns_rights_guncontrol_converted_tweets.tsv
"""


def nmf_analysis(data, optimize=False):
    """
    Vectorize tweet text into a tf-idf matrix and factorize it with NMF.

    data is the collection of documents handed to TfidfVectorizer.fit_transform.
    If optimize is truthy, the tf-idf matrix is passed to grid_search, which
    refits the model over a range of topic counts; otherwise a single
    10-topic model is fitted, its top 5 words per topic are printed, and the
    document-topic matrix is plotted through run_tsne.

    Returns None. I still need a good statistic to evaluate NMF.

    Basic tutorial below:
    http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html
    """

    n_features = 1000
    n_topics = 10
    n_top_words = 5

    # tf-idf = term frequency * inverse document frequency
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installs.
    try:
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
    except AttributeError:
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()

    if optimize:
        # grid_search only prints its findings; it has no return value.
        grid_search(tfidf, tfidf_feature_names, "NMF")
        return

    # NOTE(review): newer scikit-learn releases replace NMF's `alpha` keyword
    # with `alpha_W` / `alpha_H` -- confirm against the installed version.
    nmf = NMF(n_components=n_topics, random_state=0, alpha=.1, l1_ratio=.5).fit(tfidf)
    nmf_transform = nmf.transform(tfidf)
    display_topics(nmf, tfidf_feature_names, n_top_words)

    # run_tsne requires one label per document for coloring; use the index of
    # the strongest component in each row, as lda_analysis does.
    nmf_keys = list(nmf_transform.argmax(axis=1))
    run_tsne(nmf_transform, nmf_keys)

def lda_analysis(data, optimize=False):
    """
    Vectorize tweet text into raw term counts and run LDA on them.

    LDA can only use raw term counts because it is a probabilistic graphical
    model, hence CountVectorizer rather than tf-idf.

    data is the collection of documents handed to CountVectorizer.fit_transform.
    If optimize is truthy, the count matrix is passed to grid_search, which
    reports the model's perplexity as the number of topics changes; otherwise
    a single 2-topic model is fitted, its perplexity and top 15 words per
    topic are printed, and the document-topic matrix is plotted through
    run_tsne, colored by each document's strongest topic.

    Returns None.
    """

    n_features = 1000
    n_topics = 2
    n_top_words = 15

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installs.
    try:
        tf_feature_names = tf_vectorizer.get_feature_names_out()
    except AttributeError:
        tf_feature_names = tf_vectorizer.get_feature_names()

    if optimize:
        # Find optimal number of components. grid_search only prints its
        # findings; it has no return value.
        grid_search(tf, tf_feature_names, "LDA")
        return

    lda = LatentDirichletAllocation(n_components=n_topics, learning_method='online', random_state=0).fit(tf)
    lda_transform = lda.transform(tf)
    print("Model's perplexity:", lda.perplexity(tf))
    display_topics(lda, tf_feature_names, n_top_words)

    # Label every document with the index of its strongest topic.
    lda_keys = list(lda_transform.argmax(axis=1))
    run_tsne(lda_transform, lda_keys)
    
def grid_search(vectorized_data, feature_names, method=None):
    """
    Refit a topic model over several topic counts and print how each one does.

    For every even topic count from 2 to 20 the model is fitted on
    vectorized_data, a fit statistic is printed, and the top 5 words of each
    topic are shown via display_topics. If "LDA" is passed as method, LDA is
    fitted and its perplexity is reported; otherwise, the default is NMF and
    its reconstruction error is reported.

    Nothing is selected automatically and the function returns None: the
    printed statistics are meant to be compared by eye.
    """

    print("Optimizing...")
    topic_counts = range(2, 21, 2)
    n_top_words = 5
    use_lda = method == "LDA"

    for n_components in topic_counts:
        print("{} topics:".format(n_components))

        if use_lda:
            model = LatentDirichletAllocation(
                n_components=n_components, learning_method='online', random_state=0
            ).fit(vectorized_data)
            print("Current model's perplexity is:", model.perplexity(vectorized_data))
        else:
            model = NMF(n_components=n_components, random_state=0).fit(vectorized_data)
            # reconstruction_err_ is a fitted attribute, not a method.
            print("Current model's Frobenius norm of the matrix difference is:", model.reconstruction_err_)

        display_topics(model, feature_names, n_top_words)
        # The document-topic transform is intentionally not computed here: it
        # is only needed for a t-SNE plot, which this search does not draw.
  

def run_tsne(transform, keys):
    """
    Reduce the rows of transform to two dimensions with t-SNE and plot them.

    keys is passed through unchanged to visualize, which uses it to color
    the points. Returns None.
    """

    reducer = TSNE(n_components=2, init='pca', angle=.99, random_state=0, verbose=1)
    embedding = reducer.fit_transform(transform)
    visualize(embedding, keys)


def display_topics(model, feature_names, n_top_words):
    """
    Print the highest-weighted words of every topic in model.components_.

    Each topic produces one comma-separated line of up to n_top_words entries
    from feature_names, ordered from the largest weight downwards, followed
    by a blank line.
    """

    for weights in model.components_:
        # argsort is ascending: flip it, then keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[position] for position in strongest]
        print(",".join(words), "\n")

def visualize(tsne_data, lda_keys):
    """
    Scatter-plot dimensionally reduced data, one color per topic label.

    tsne_data is indexed as tsne_data[:, 0] / tsne_data[:, 1] for the x / y
    coordinates. lda_keys holds one label per point; points sharing a label
    share a color drawn from matplotlib's Set2 colormap.

    Prints the color scale that was used and blocks in plt.show().
    Returns None.
    """

    print("Plotting...")

    # Colors are spread over the labels that actually occur, so each label
    # must be looked up by its rank among those labels rather than by its raw
    # value: with topics {0, 3, 9} present, indexing a 3-entry scale with 9
    # would raise IndexError. When the labels are exactly 0..k-1 the rank
    # equals the label, so the colors are unchanged.
    present_keys, key_ranks = np.unique(lda_keys, return_inverse=True)
    color_scale = np.linspace(0, 1, len(present_keys))
    c = [plt.cm.Set2(color_scale[rank]) for rank in np.ravel(key_ranks)]
    print(color_scale)

    fig, ax = plt.subplots()
    ax.scatter(tsne_data[:, 0], tsne_data[:, 1], color=c, alpha=0.75)
    plt.show()

    
# Tab-separated dump of converted tweets used for this run.
TWEETS_TSV = "~/Documents/Git/Twitter-Mining/streaming_tweets/data/streaming_cohen_trump_FBI_raid_mueller_nda_corruption_corrupt_stormy.jsonl_converted_tweets.tsv"
tweet_dataframe = pd.read_csv(TWEETS_TSV, sep="\t")

# Report how many values are missing in each column before dropping anything.
missing_per_column = tweet_dataframe.isnull().sum()
print(missing_per_column)

# Discard every row that has a missing value, then search over topic counts
# with LDA. (nmf_analysis accepts the same 'filtered_text' column.)
tweet_dataframe = tweet_dataframe.dropna()
lda_analysis(tweet_dataframe["filtered_text"], optimize=True)
     

Unnamed: 0         0
Unnamed: 0.1       0
date               0
likes              0
retweets           0
text               0
filtered_text    449
dtype: int64
Optimizing...


KeyboardInterrupt: 

In [16]:
import pandas as pd
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


def testing_df(data, text):
    cnt = 0
    a = np.where(data.isnull())
   
    print(text.loc[a[0][0]])
def testing_tf(data):
    """
    Fit a CountVectorizer on data and fetch its vocabulary, discarding both.

    The vectorizer uses the same settings as lda_analysis. Nothing is
    returned or printed, so the call is only useful for checking that
    vectorizing data runs without raising.
    """

    n_features = 1000

    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf_vectorizer.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back for older installs.
    try:
        tf_vectorizer.get_feature_names_out()
    except AttributeError:
        tf_vectorizer.get_feature_names()


# Dataset under inspection. The guns-rights dump sits in the same folder as
# streaming_guns_rights_guncontrol_converted_tweets.tsv.
tweets_path = "~/Documents/Git/Twitter-Mining/streaming_tweets/data/streaming_cohen_trump_FBI_raid_mueller_nda_corruption_corrupt_stormy.jsonl_converted_tweets.tsv"
tweet_dataframe = pd.read_csv(tweets_path, sep="\t")

# Rows are deliberately left in place (no dropna) so the missing entries can
# be inspected: print the raw tweet behind the first missing 'filtered_text'.
testing_df(data=tweet_dataframe["filtered_text"], text=tweet_dataframe["text"])

https://t.co/94vRMJdIl2


Need to find where the blanks are in the data and drop whole rows. The way I was doing it before was not dropping whole rows.


In [2]:
from sklearn.pipeline import Pipeline

In [None]:
tweet_dataframe = pd.read_csv(
    "~/Documents/Git/Twitter-Mining/streaming_tweets/data/streaming_guns_rights_guncontrol_converted_tweets.tsv", sep='\t')

# Documents to model: rows with missing values are dropped first, as in the
# lda_analysis run.
# NOTE(review): assumes this dump has the same 'filtered_text' column as the
# other dataset -- confirm.
data = tweet_dataframe.dropna()['filtered_text']

n_features = 1000
n_topics = 2
n_top_words = 15

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installs.
try:
    tf_feature_names = tf_vectorizer.get_feature_names_out()
except AttributeError:
    tf_feature_names = tf_vectorizer.get_feature_names()

lda = LatentDirichletAllocation(n_components=n_topics, random_state=0).fit(tf)
lda_transform = lda.transform(tf)
print("Model's perplexity:", lda.perplexity(tf))
display_topics(lda, tf_feature_names, n_top_words)

# Label every document with the index of its strongest topic.
lda_keys = list(lda_transform.argmax(axis=1))
run_tsne(lda_transform, lda_keys)

# Bundle the already-fitted vectorizer and model into a single pipeline.
pipe = Pipeline(steps=[('CountVectorizer', tf_vectorizer), ('LDA', lda)])

future directions:
co-occurrence word network
better preprocessing
time series of topic change
tweet bot