In [1]:
from gensim.corpora import Dictionary
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string

In [2]:
# read with spark because of nested list column 
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("long-parsed-tweets2")

In [3]:
df.where(df.lang == "en").count()

12970523

In [4]:
df = df.where(df.lang == "en").sample(0.1)
df = df.withColumn("rt_indicator", when(df.full_text.like("RT @%"), 1).otherwise(0))
pdf = df.toPandas()

In [5]:
import re
def replace_entities(row):
    mentions = row['mentions']
    urls = row['urls']
    tweet = row['full_text']
    
    mentions = ['@' + m for m in mentions.split(' ')]
    for m in mentions:
        if len(m) > 0:
            tweet = tweet.replace(m, '@MENTION')
        
    urls = urls.split(' ')
    for u in urls:
        if len(u) > 0:
            tweet = tweet.replace(u, '@URL')
    http = re.compile(r'https?://\S+')
    tweet = http.sub('@URL', tweet)
        
    return tweet

In [6]:
pdf['cleaned_text'] = pdf.apply(replace_entities, axis=1)

In [7]:
from demoji import replace
import re
from gensim.parsing.preprocessing import strip_multiple_whitespaces, remove_stopwords
from gensim.utils import to_unicode
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS

STOPWORDS = STOPWORDS.union(stopwords.words('english')).union(set('&amp;'))
def my_remove_stopwords(s):
    s = to_unicode(s)
    s = s.lower()
    return " ".join(w for w in s.split() if w not in STOPWORDS)

def replacer(string):
    string = string.replace("-", "")
    string = string.replace("RT ", "")
    handles_and_hashtags = re.compile(r"[^\w\d#@\s]+")
    string =  handles_and_hashtags.sub('', string)
    return string

custom_filters = [
                  strip_multiple_whitespaces,
                  replacer,
                  my_remove_stopwords,
                  SnowballStemmer("english").stem, 
                 ]
pdf['cleaned_text'] = pdf.cleaned_text.apply(preprocess_string, filters=custom_filters)

def further_replacer(text_list):
    numbers = re.compile(r"\d+")
    new_text_list = []
    for w in text_list:
        if w == '@url':
            new_text_list.append('@URL')
            continue
            
        if w == '@mention':
            new_text_list.append('@MENTION')
            continue
            
        if numbers.match(w):
            new_text_list.append('@NUMBER')
            continue
            
        else:
            new_text_list.append(w)
    
    return new_text_list

pdf['cleaned_text'] = pdf.cleaned_text.apply(further_replacer)

In [8]:
pdf["created_at"] = pd.to_datetime(pdf.created_at)

In [9]:
# need to confirm it's right to do dict with full text but other things not with it 
dictionary = Dictionary(pdf.cleaned_text.to_list())

training_df = pdf[pdf.rt_indicator == 0]

training_docs = training_df.cleaned_text.to_list()
training_corpus = [dictionary.doc2bow(tweet) for tweet in training_docs]

full_docs = pdf.cleaned_text.to_list()
full_corpus = [dictionary.doc2bow(tweet) for tweet in full_docs]



how to make topics have fewer words?

In [10]:
from gensim.models import ldamulticore
# main hyperparameter is number of topics, 10 may be too little, try 50 or 100 for this random sample dataset
# for coronavirus themed tweets, we could do fewer topics 

# Set training parameters.
# try different number of topics
num_topics = 10
chunksize = 2000 # number of documents passed to a core

# use defaults for iterations and passes and see if modeling is good
passes = 20 # number of passes through corpus
iterations = 400 # could make 100 for coronavirus tweets, but could reduce for faster development iterations 
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = ldamulticore.LdaMulticore(
    corpus=training_corpus,
    id2word=id2word,
    chunksize=chunksize,
    eta='auto',
    iterations=400,
    num_topics=50,
    eval_every=eval_every
)

In [11]:
top_topics = model.top_topics(training_corpus)

In [12]:
top_topics

[([(0.1881176, '@NUMBER'),
   (0.08079993, '@URL'),
   (0.06435269, 'coronavirus'),
   (0.054149825, 'cases'),
   (0.031111274, 'confirmed'),
   (0.023398168, 'china'),
   (0.022365713, 'new'),
   (0.020884141, '#coronavirus'),
   (0.019575823, 'case'),
   (0.014553761, 'deaths'),
   (0.013449169, 'death'),
   (0.009569494, 'updates'),
   (0.009156577, 'toll'),
   (0.008832387, 'number'),
   (0.008245634, 'infected'),
   (0.008186652, 'reported'),
   (0.007836486, 'worldwide'),
   (0.0073350435, 'wuhan'),
   (0.0072846347, 'novel'),
   (0.0068611545, 'update')],
  -3.212480302717993),
 ([(0.16021077, '@URL'),
   (0.11075835, 'coronavirus'),
   (0.04379708, 'global'),
   (0.03539388, 'outbreak'),
   (0.030140435, 'fears'),
   (0.02553773, 'china'),
   (0.023216948, 'impact'),
   (0.02171014, 'new'),
   (0.02096792, 'emergency'),
   (0.017327406, 'amid'),
   (0.015219435, 'hit'),
   (0.013225736, 'declared'),
   (0.011894473, 'ways'),
   (0.011340712, 'heres'),
   (0.011194554, 'health')

In [13]:
# how 2 do this in spark??
# add corpus as column?
topics = []
for i in range(len(full_corpus)):
    topics.append(model.get_document_topics(full_corpus[i], minimum_probability=0.0))

In [14]:
pdf['topics'] = topics

In [15]:
pdf

Unnamed: 0,created_at,full_text,lang,coordinates,favorite_count,retweet_count,bounding_box,country,place_type,full_name,mentions,urls,hashtags,rt_indicator,cleaned_text,topics
0,2020-03-28 19:27:30+00:00,RT @theofficialFEMI: Life post COVID-19 in Nig...,en,,0,4,,,,,theofficialFEMI,,,1,"[@MENTION, life, post, covid19, nigeria, proba...","[(0, 0.002000269), (1, 0.002000269), (2, 0.002..."
1,2020-03-28 19:27:42+00:00,RT @BillGates: There are few people I’ve learn...,en,,0,2126,,,,,BillGates,,,1,"[@MENTION, people, ive, learned, yearsespecial...","[(0, 0.0025184732), (1, 0.0025184732), (2, 0.0..."
2,2020-03-28 19:28:09+00:00,RT @KGeorgieva: Had a good conversation with #...,en,,0,1045,,,,,KGeorgieva ImranKhanPTI,,Pakistan COVID,1,"[@MENTION, good, conversation, #pakistan, pm, ...","[(0, 0.0016671811), (1, 0.0016671811), (2, 0.0..."
3,2020-03-28 19:28:10+00:00,RT @jenn7399: me when i call it COVID-19 inste...,en,,0,35251,,,,,jenn7399,,,1,"[@MENTION, covid19, instead, rona, @URL]","[(0, 0.0033334587), (1, 0.0033334587), (2, 0.0..."
4,2020-03-28 19:28:11+00:00,RT @tcfsps: We have joined together with local...,en,,0,1,,,,,tcfsps,,,1,"[@MENTION, joined, local, partners, thurston, ...","[(0, 0.0018199659), (1, 0.0018199659), (2, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298059,2020-03-18 08:34:29+00:00,RT @BashirAhmaad: Chairman of the Presidential...,en,,0,338,,,,,BashirAhmaad,,Coronavirus,1,"[@MENTION, chairman, presidential, task, force...","[(0, 0.0014287306), (1, 0.0014287306), (2, 0.0..."
1298060,2020-03-18 08:34:36+00:00,RT @PadmajaJoshi: Former FM on #IndiaFightsCor...,en,,0,55,,,,,PadmajaJoshi,,IndiaFightsCorona,1,"[@MENTION, fm, #indiafightscorona, steps, take...","[(0, 0.14496247), (1, 0.0015403233), (2, 0.001..."
1298061,2020-03-18 08:34:37+00:00,I'm sorry but this is unbelievable 🤦🏻‍♂️ so yo...,en,,0,0,,,,,,twitter.com/JohnsonThompso…,,0,"[im, sorry, unbelievable, warm, weather, train...","[(0, 0.22267032), (1, 0.0008696822), (2, 0.052..."
1298062,2020-03-18 08:34:41+00:00,RT @ABaerbock: Wow. That‘s how politics should...,en,,0,121,,,,,ABaerbock katieporteroc,,,1,"[@MENTION, wow, thats, politics, work, straigh...","[(0, 0.0016679678), (1, 0.0016679678), (2, 0.3..."


In [16]:
topic_df = pd.DataFrame(topics)

In [17]:
topic_df = topic_df.applymap(lambda x: x[1])

In [18]:
df = pd.concat([pdf.reset_index(drop=True), topic_df.reset_index(drop=True)], axis=1)

In [19]:
df.to_csv("50_topic_model_round_2.csv")

In [20]:
text_dict = {}
for i in range(10):
    small_df = df[['full_text', i]]
    small_df = small_df.sort_values(i, ascending=False)
    text_dict[i] = list(small_df.full_text.unique()[:10])

In [21]:
import json
with open("50_topics_tweets_round_2.json", "w") as f:
    f.write(json.dumps(text_dict, indent = 2, ensure_ascii = False))