In [1]:
from gensim.corpora import Dictionary
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string

In [36]:
# read with spark because of nested list column 
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("long-parsed-tweets2")

In [37]:
df = df.where(df.lang == "en").sample(0.001)
training_df = df.where(~ df.full_text.like("RT @%")).toPandas()
full_df = df.toPandas()

In [65]:
def replace_entities(row):
    mentions = row['mentions']
    urls = row['urls']
    tweet = row['full_text']
    
    mentions = ['@' + m for m in mentions.split(' ')]
    for m in mentions:
        if len(m) > 0:
            tweet = tweet.replace(m, '@MENTION')
        
    urls = urls.split(' ')
    for u in urls:
        if len(u) > 0:
            tweet = tweet.replace(u, '@URL')
        
    return tweet

In [66]:
training_df['replaced_text'] = training_df.apply(replace_entities, axis=1)

In [104]:
from demoji import replace
import re
from gensim.parsing.preprocessing import strip_multiple_whitespaces, remove_stopwords
from gensim.utils import to_unicode
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS

STOPWORDS = STOPWORDS.union(stopwords.words('english')).union(set('&amp;'))
def my_remove_stopwords(s):
    s = to_unicode(s)
    s = s.lower()
    return " ".join(w for w in s.split() if w not in STOPWORDS)

def regexer(string):
    http = re.compile(r'https?://\S+')
    string = http.sub('@URL', string)
    string = string.replace("-", " ")
    string = string.replace("RT ", "")
    handles_and_hashtags = re.compile(r"[^\w\d#@\s]+")
    string =  handles_and_hashtags.sub('', string)
    return string

custom_filters = [
                  replace,
                  strip_multiple_whitespaces,
                  regexer,
                  my_remove_stopwords,
                  SnowballStemmer("english").stem, 
                 ]
training_df['text'] = training_df.replaced_text.apply(preprocess_string, filters=custom_filters)

def further_replacer(text_list):
    numbers = re.compile(r"\d+")
    new_text_list = []
    for w in text_list:
        if w == '@url':
            new_text_list.append('@URL')
            continue
            
        if w == '@mention':
            new_text_list.append('@MENTION')
            continue
            
        if numbers.match(w):
            new_text_list.append('@NUMBER')
            continue
            
        else:
            new_text_list.append(w)
    
    return new_text_list

training_df['text'] = training_df.text.apply(further_replacer)

In [108]:
training_df["created_at"] = pd.to_datetime(training_df.created_at)

In [107]:
training_docs = training_df.text.to_list()
training_dictionary = Dictionary(docs)
training_corpus = [training_dictionary.doc2bow(tweet) for tweet in training_docs]

full

how to make topics have fewer words?

In [111]:
from gensim.models import ldamulticore
# main hyperparameter is number of topics, 10 may be too little, try 50 or 100 for this random sample dataset
# for coronavirus themed tweets, we could do fewer topics 

# Set training parameters.
# try different number of topics
num_topics = 10
chunksize = 2000 # number of documents passed to a core

# use defaults for iterations and passes and see if modeling is good
passes = 20 # number of passes through corpus
iterations = 400 # could make 100 for coronavirus tweets, but could reduce for faster development iterations 
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    eta='auto',
    iterations=400,
    num_topics=5,
    eval_every=eval_every
)

In [112]:
top_topics = model.top_topics(corpus)

In [113]:
# how 2 do this in spark??
# add corpus as column?
topics = []
for i in range(len(corpus)):
    topics.append(model.get_document_topics(corpus[i], minimum_probability=0.0))

In [12]:
training_df['topics'] = topics

In [13]:
topic_df = pd.DataFrame(topics)

In [14]:
topic_df = topic_df.applymap(lambda x: x[1])

In [15]:
df = pd.concat([pdf.reset_index(drop=True), topic_df.reset_index(drop=True)], axis=1)

In [16]:
df.to_csv("5_topic_model.csv")

In [17]:
text_dict = {}
for i in range(10):
    small_df = df[['full_text', i]]
    small_df = small_df.sort_values(i, ascending=False)
    text_dict[i] = list(small_df.full_text.unique()[:10])

KeyError: '[5] not in index'

In [18]:
import json
with open("5_topics_tweets.json", "w") as f:
    f.write(json.dumps(text_dict, indent = 2, ensure_ascii = False))