In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import re
import numpy as np
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [2]:
# Sizes shared by the helper functions defined in this cell.
n_features = 1000     # used as max_features for the TF-IDF vectorizer
n_components = 75     # number of NMF components (one per topic)
n_top_words = 20      # how many top-weighted terms are listed per topic

# NLTK resources consumed by preprocess().
lemmer = WordNetLemmatizer()

stop_words = set(stopwords.words('english'))

def get_top_words(H, feature_names, top_n=None):
    """Tabulate the highest-weighted terms of every topic.

    Parameters
    ----------
    H : 2-D array-like
        One row per topic, one column per term; larger values rank higher.
    feature_names : sequence of str
        Term for each column of ``H``.
    top_n : int, optional
        Number of terms to keep per topic. When omitted, the module-level
        ``n_top_words`` setting is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per topic: the topic index (as a string) followed by its
        terms in descending weight order.
    """
    limit = n_top_words if top_n is None else top_n

    rows = []
    for topic_idx, topic in enumerate(H):
        # Ascending argsort, reversed, then truncated == the `limit` largest weights.
        ranked = np.asarray(topic).argsort()[::-1][:limit]
        rows.append([str(topic_idx)] + [feature_names[i] for i in ranked])

    return pd.DataFrame(rows)

def print_top_docs(topic_idx, W_df, data, n_docs=5):
    """Print the documents that load most heavily on one topic.

    Parameters
    ----------
    topic_idx : int
        Positional column of ``W_df`` to rank by.
    W_df : pandas.DataFrame
        Document-by-topic weights; row order must match ``data``.
    data : pandas.Series
        Documents, looked up positionally via ``.iloc``.
    n_docs : int, default 5
        How many of the top-ranked documents to print.

    Prints ``topic_idx`` first, then the selected documents in descending
    weight order. Returns nothing.
    """
    print(topic_idx)

    # Rank on a plain ndarray so the result is a positional order that does
    # not depend on the DataFrame's index labels.
    weights = W_df.iloc[:, topic_idx].to_numpy()
    ranked_positions = np.argsort(weights)[::-1]

    for doc_index in ranked_positions[:n_docs]:
        print(data.iloc[doc_index])

# Simple preprocessor
def preprocess(x):
    """Normalise one raw document before tokenisation.

    Steps, in order: drop anything matching ``http\\S+``, extract the text
    content with BeautifulSoup (lxml parser), lowercase, split on whitespace,
    discard tokens found in ``stop_words``, lemmatize the rest with
    ``lemmer``, and re-join with single spaces.

    Parameters
    ----------
    x : str
        Raw document text.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    without_urls = re.sub(r'http\S+', '', x)
    plain_text = BeautifulSoup(without_urls, "lxml").get_text()

    kept = []
    for token in plain_text.lower().split():
        # Stop-word filtering happens on the raw token, before lemmatization.
        if token in stop_words:
            continue
        kept.append(lemmer.lemmatize(token))

    return ' '.join(kept)


def find_topics(data_samples, text_col = "tweet_text", include_text=True):
    """Fit a TF-IDF + NMF topic model on a collection of documents.

    Parameters
    ----------
    data_samples : iterable of str
        Raw documents; each one is run through ``preprocess`` by the vectorizer.
    text_col, include_text
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    W_df : pandas.DataFrame
        Output of ``NMF.fit_transform`` with columns ``"topic 0"`` ...
        ``"topic {n_components - 1}"``.
    H_df : pandas.DataFrame
        ``NMF.components_`` with one column per vocabulary term.
    top_words : pandas.DataFrame
        Result of ``get_top_words`` for the fitted components.
    """
    # We override the token_pattern in order to keep @signs and #hashtags
    tfidf_vectorizer = TfidfVectorizer(max_df=0.80, min_df=5,
                                       token_pattern=r'[a-zA-Z0-9@#]+',
                                       # Must be an ordered (min_n, max_n) tuple;
                                       # a set literal has no guaranteed order.
                                       ngram_range=(1, 3),
                                       preprocessor=preprocess,
                                       max_features=n_features)

    tfidf = tfidf_vectorizer.fit_transform(data_samples)

    # NOTE(review): newer scikit-learn releases replace `alpha` with
    # `alpha_W` / `alpha_H` -- confirm against the installed version.
    nmf = NMF(n_components=n_components, random_state=1, init='nndsvda', solver='mu', alpha=.1, l1_ratio=.5)

    W = nmf.fit_transform(tfidf)
    H = nmf.components_

    # Look the vocabulary up once. Prefer get_feature_names_out() where it
    # exists and fall back to the older get_feature_names() otherwise.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = list(tfidf_vectorizer.get_feature_names_out())
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    top_words = get_top_words(H, feature_names)

    W_df = pd.DataFrame(W, columns=["topic {}".format(i) for i in range(n_components)])
    H_df = pd.DataFrame(H, columns=feature_names)

    return W_df, H_df, top_words


In [3]:
def do_it(file_base, id_col="id", text_col="tweet_text"):
    """Run the topic model on one CSV file and write the results to ``out/``.

    Reads ``data/<file_base>.csv``, drops rows whose ``text_col`` is missing,
    fits topics with ``find_topics`` and writes three files:

    * ``out/<file_base>_W.csv`` -- the input rows with their topic weights appended
    * ``out/<file_base>_H.csv`` -- the topic-by-term weights
    * ``out/<file_base>_top_words.csv`` -- the top terms per topic

    Parameters
    ----------
    file_base : str
        File name without directory or ``.csv`` extension.
    id_col : str
        Not used by this function; kept so existing callers keep working.
    text_col : str
        Name of the column holding the document text.
    """
    df = pd.read_csv('data/'+file_base+'.csv')

    # Drop rows without any text, then renumber the rows. W_df comes back with
    # a fresh 0..n-1 index, and pd.concat(axis=1) aligns on index labels, so
    # without the reset the weights would be attached to the wrong rows
    # (and extra NaN rows would appear) whenever any row was dropped.
    df = df.dropna(subset=[text_col]).reset_index(drop=True)

    W_df, H_df, top_words = find_topics(df[text_col])

    #print(top_words)
    #for i in range(n_components):
    #    print_top_docs(i, W_df, df[text_col])

    pd.concat([df, W_df], axis=1).to_csv('out/'+file_base+'_W.csv', float_format='%.4f', index=False)
    H_df.to_csv('out/'+file_base+'_H.csv', float_format='%.4f', index=True)
    top_words.to_csv('out/'+file_base+'_top_words.csv', index=False)

In [4]:
# Topic-model data/obama_tweets.csv using its "tweet_text" column.
do_it(
    file_base="obama_tweets",
    id_col="id",
    text_col="tweet_text",
)

In [5]:
# Topic-model data/kiva_cleaned.csv using its "en_clean" column.
do_it(
    file_base="kiva_cleaned",
    id_col="loan_id",
    text_col="en_clean",
)

In [6]:
# Topic-model data/elonmusk_tweets.csv using its "text" column.
do_it(
    file_base="elonmusk_tweets",
    id_col="id",
    text_col="text",
)

In [7]:
# Topic-model data/2017_trump_tweets.csv using its "tweet" column.
do_it(
    file_base="2017_trump_tweets",
    id_col="id",
    text_col="tweet",
)

In [8]:
# Topic-model data/pence_tweets.csv using its "tweet_text" column.
do_it(
    file_base="pence_tweets",
    id_col="id",
    text_col="tweet_text",
)

In [9]:
# Topic-model data/reutersCSV.csv using its "doc.text" column.
do_it(
    file_base="reutersCSV",
    id_col="pid",
    text_col="doc.text",
)

In [10]:
# Topic-model data/JoeBidenTweets.csv using its "tweet" column.
do_it(
    file_base="JoeBidenTweets",
    id_col="id",
    text_col="tweet",
)

In [11]:
# Topic-model data/amazon_food_reviews_10.csv using its "reviewText" column.
do_it(
    file_base="amazon_food_reviews_10",
    id_col="reviewID",
    text_col="reviewText",
)

In [12]:
# Topic-model data/ISKON_IMB767-XLS-ENG.csv using its "text" column.
do_it(
    file_base="ISKON_IMB767-XLS-ENG",
    id_col="ID",
    text_col="text",
)

In [13]:
# Topic-model data/imdb.small.csv using its "en" column.
do_it(
    file_base="imdb.small",
    id_col="id",
    text_col="en",
)

In [14]:
# Topic-model data/reviews_Grocery_and_Gourmet_Food_5_50000.csv using its "reviewText" column.
do_it(
    file_base="reviews_Grocery_and_Gourmet_Food_5_50000",
    id_col="reviewID",
    text_col="reviewText",
)

In [15]:
# Topic-model data/vaers2.csv using its "SYMPTOM_TEXT" column.
do_it(
    file_base="vaers2",
    id_col="VAERS_ID",
    text_col="SYMPTOM_TEXT",
)

  if (await self.run_code(code, result,  async_=asy)):


In [16]:
# Topic-model data/Hillary_Emails.csv using its "ExtractedBodyText" column.
do_it(
    file_base="Hillary_Emails",
    id_col="Id",
    text_col="ExtractedBodyText",
)