In [1]:
from gensim.corpora import Dictionary
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string

In [2]:
# read with spark because of nested list column 
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("long-parsed-tweets2")

In [3]:
df.where(df.lang == "en").count()

12970523

In [4]:
df = df.where(df.lang == "en").sample(0.05)
training_df = df.where(~ df.full_text.like("RT @%")).sample(0.05).toPandas()
full_df = df.toPandas()

In [5]:
import datefinder

In [35]:
s = "SeaWorld Entertainment is furloughing 90% of its workers because the coronavirus had forced the company to close its 12 theme parks."
for m in datefinder.find_dates(s, index=True):
    print(m)

(datetime.datetime(1990, 5, 15, 0, 0), (37, 40))
(datetime.datetime(2020, 5, 12, 0, 0), (116, 120))


In [None]:
s[37:]

In [34]:
help(datefinder.find_dates)

Help on function find_dates in module datefinder:

find_dates(text, source=False, index=False, strict=False, base_date=None)
    Extract datetime strings from text
    
    :param text:
        A string that contains one or more natural language or literal
        datetime strings
    :type text: str|unicode
    :param source:
        Return the original string segment
    :type source: boolean
    :param index:
        Return the indices where the datetime string was located in text
    :type index: boolean
    :param strict:
        Only return datetimes with complete date information. For example:
        `July 2016` of `Monday` will not return datetimes.
        `May 16, 2015` will return datetimes.
    :type strict: boolean
    :param base_date:
        Set a default base datetime when parsing incomplete dates
    :type base_date: datetime
    
    :return: Returns a generator that produces :mod:`datetime.datetime` objects,
        or a tuple with the source text and index, if req

In [6]:
from dateutil.parser import parse

In [7]:
import re
def replace_entities(row):
    mentions = row['mentions']
    urls = row['urls']
    tweet = row['full_text']
    
    mentions = ['@' + m for m in mentions.split(' ')]
    for m in mentions:
        if len(m) > 0:
            tweet = tweet.replace(m, '@MENTION')
        
    urls = urls.split(' ')
    for u in urls:
        if len(u) > 0:
            tweet = tweet.replace(u, '@URL')
    http = re.compile(r'https?://\S+')
    tweet = http.sub('@URL', tweet)
        
    return tweet

In [8]:
training_df['entity_replaced'] = training_df.apply(replace_entities, axis=1)

In [9]:
from dateutil.parser import parse
from datetime import datetime
 
def remove_dates(tweet): 
    try:
        parsed = parse(tweet, fuzzy_with_tokens=True) 
        text = ' '.join(parsed[1])
    except:
        text = tweet
    
    return text

In [10]:
training_df['entity_and_date_replaced'] = training_df.entity_replaced.apply(remove_dates)



In [29]:
not_covid_19.full_text[2]

'OCR is particularly focused on ensuring that covered entities do not unlawfully discriminate against people with disabilities when making decisions about their treatment during the COVID-19 health care emergency.\n\nhttps://t.co/sqpsraN5ES'

In [31]:
not_covid_19 = training_df[~training_df['entity_replaced'].str.lower().str.contains("covid-19", case=False, regex=False)]
not_covid_19 = not_covid_19[~not_covid_19['entity_replaced'].str.lower().str.contains("covid19", case=False, regex=False)]
unmatch = not_covid_19[not_covid_19['entity_replaced'] != not_covid_19['entity_and_date_replaced']][['entity_replaced', 'entity_and_date_replaced']]
for r in unmatch.itertuples():
    print(r[0])
    print(r[1])
    print(r[2])
    print("")

26
Literally everything I complained about mid February... especially shortness of breath @URL
Literally everything I complained about mid  ... especially shortness of breath @URL

34
How to Significantly Slow Coronavirus? #Masks4All @URL via @MENTION @MENTION @MENTION
How to Significantly Slow Coronavirus? #Masks All @URL via @MENTION @MENTION @MENTION

41
@MENTION it’s 92 people. @URL
@MENTION it’s  people. @URL

66
Superintendent talks to Local 10 about school changes in midst of coronavirus @URL @URL
Superintendent talks to Local  about school changes in midst of coronavirus @URL @URL

70
SeaWorld Entertainment is furloughing 90% of its workers because the coronavirus had forced the company to close its 12 theme parks.

@URL
SeaWorld Entertainment is furloughing  % of its workers because the coronavirus had forced the company to close its  theme parks.  @URL

84
Thoughts? San Francisco:: China Coronavirus Airlift Flights Land At Travis Air Force Base @URL #Local #News #SyndicatedLo

2692
Report: Human-to-human spread of coronavirus occurred in December @URL
Report: Human-to-human spread of coronavirus occurred in   @URL

2701
How Do I Know If I Have Coronavirus? 5 Questions Answered @URL
How Do I Know If I Have Coronavirus?  Questions Answered @URL

2705
England's first two coronavirus patients confirmed. UK patients transferred to specialist infection unit.

62 countries place immigration control on Chinese citizens
England's first two coronavirus patients confirmed. UK patients transferred to specialist infection unit.   countries place immigration control on Chinese citizens

2712
#Coronavirus after 2 weeks #PlagueInc @URL
#Coronavirus after  weeks #PlagueInc @URL

2722
Coronavirus: 114 tests for the virus carried out in B.C., say health officials /richmondbc @URL
Coronavirus:  tests for the virus carried out in B.C., say health officials /richmondbc @URL

2729
The coronavirus reportedly surfaced in Wuhan, China, on Dec. 8. The local government waited more than

... the foreign ministry just sent us this advisory. ( ) #coronavirus @URL

4497
@MENTIONSABCNewsOnline Is it true that Nelfivir (PI-based ART drug) has been successfully used to treat a 56 year old Corona Virus in Wuhan?
@MENTIONSABCNewsOnline Is it true that Nelfivir (PI-based ART drug) has been successfully used to treat a  year old Corona Virus in Wuhan?

4502
Coronavirus has been declared a health emergency, Amazon is on the way to a $1 trillion valuation and it’s Brexit Day. Here’s what’s moving markets @URL
Coronavirus has been declared a health emergency, Amazon is on the way to a $ trillion valuation and it’s Brexit Day. Here’s what’s moving markets @URL

4507
As the coronavirus spreads, fear is fueling racism and xenophobia – NEWPAPER24 @URL @URL
As the coronavirus spreads, fear is fueling racism and xenophobia – NEWPAPER @URL @URL

4511
My Bacteria, called Coronavirus, just got eradicated in 808 days! #PlagueInc 
Lol xd
My Bacteria, called Coronavirus, just got eradicated in


6283
Ciara &amp; Russell Wilson Donate 1 Million Meals Amid Coronavirus Outbreak @URL di @MENTION
Ciara &amp; Russell Wilson Donate  Million Meals Amid Coronavirus Outbreak @URL di @MENTION

6293
Amazing OFFER !! Click the link
#Oprah #CLUBTWITTER #californialockdown #Kathy #DropOutBernie #Drake #Biden #HS3ATMIDNIGHT #ThisIsUs #NBA #Lakers #LeBron #UnitedStates #America #Trump #ÇanakkaleGeçilmez #COVID #coronavirus #COVID2019 #China #India #BTS
 @URL
Amazing OFFER !! Click the link #Oprah #CLUBTWITTER #californialockdown #Kathy #DropOutBernie #Drake #Biden #HS ATMIDNIGHT #ThisIsUs #NBA #Lakers #LeBron #UnitedStates #America #Trump #ÇanakkaleGeçilmez #COVID #coronavirus #COVID #China #India #BTS  @URL

6294
@MENTION @MENTION BREAKING: Number of worldwide coronavirus cases passes 200,000
@MENTION @MENTION BREAKING: Number of worldwide coronavirus cases passes 

6297
I am going into work in an hour so they can decide who gets to self isolate and work from home and who must work as part o

In [None]:
training_df.loc[0, 'full_text']

In [None]:
parse(training_df.loc[0, 'entity_replaced'], fuzzy_with_tokens=True)

In [None]:
for m in datefinder.find_dates(training_df.loc[0, 'entity_replaced']):
    print(m)

In [None]:
# TODO: figure out how to ignore dates that are parsed out of covid-19 without removing covid-19 from the string

In [None]:
from demoji import replace
import re
from gensim.parsing.preprocessing import strip_multiple_whitespaces, remove_stopwords
from gensim.utils import to_unicode
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.parsing.preprocessing import STOPWORDS

STOPWORDS = STOPWORDS.union(stopwords.words('english')).union(set('&amp;'))
def my_remove_stopwords(s):
    s = to_unicode(s)
    s = s.lower()
    return " ".join(w for w in s.split() if w not in STOPWORDS)

def regexer(string):
    string = string.replace("-", " ")
    string = string.replace("RT ", "")
    handles_and_hashtags = re.compile(r"[^\w\d#@\s]+")
    string =  handles_and_hashtags.sub('', string)
    return string

custom_filters = [
                  replace,
                  strip_multiple_whitespaces,
                  regexer,
                  my_remove_stopwords,
                  SnowballStemmer("english").stem, 
                 ]
training_df['text'] = training_df.replaced_text.apply(preprocess_string, filters=custom_filters)

def further_replacer(text_list):
    numbers = re.compile(r"\d+")
    new_text_list = []
    for w in text_list:
        if w == '@url':
            new_text_list.append('@URL')
            continue
            
        if w == '@mention':
            new_text_list.append('@MENTION')
            continue
            
        if numbers.match(w):
            new_text_list.append('@NUMBER')
            continue
            
        else:
            new_text_list.append(w)
    
    return new_text_list

training_df['text'] = training_df.text.apply(further_replacer)

In [None]:
training_df["created_at"] = pd.to_datetime(training_df.created_at)

In [None]:
training_docs = training_df.text.to_list()
training_dictionary = Dictionary(docs)
training_corpus = [training_dictionary.doc2bow(tweet) for tweet in training_docs]

full

how to make topics have fewer words?

In [None]:
from gensim.models import ldamulticore
# main hyperparameter is number of topics, 10 may be too little, try 50 or 100 for this random sample dataset
# for coronavirus themed tweets, we could do fewer topics 

# Set training parameters.
# try different number of topics
num_topics = 10
chunksize = 2000 # number of documents passed to a core

# use defaults for iterations and passes and see if modeling is good
passes = 20 # number of passes through corpus
iterations = 400 # could make 100 for coronavirus tweets, but could reduce for faster development iterations 
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = ldamulticore.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    eta='auto',
    iterations=400,
    num_topics=5,
    eval_every=eval_every
)

In [None]:
top_topics = model.top_topics(corpus)

In [None]:
# how 2 do this in spark??
# add corpus as column?
topics = []
for i in range(len(corpus)):
    topics.append(model.get_document_topics(corpus[i], minimum_probability=0.0))

In [None]:
training_df['topics'] = topics

In [None]:
topic_df = pd.DataFrame(topics)

In [None]:
topic_df = topic_df.applymap(lambda x: x[1])

In [None]:
df = pd.concat([pdf.reset_index(drop=True), topic_df.reset_index(drop=True)], axis=1)

In [None]:
df.to_csv("5_topic_model.csv")

In [None]:
text_dict = {}
for i in range(10):
    small_df = df[['full_text', i]]
    small_df = small_df.sort_values(i, ascending=False)
    text_dict[i] = list(small_df.full_text.unique()[:10])

In [None]:
import json
with open("5_topics_tweets.json", "w") as f:
    f.write(json.dumps(text_dict, indent = 2, ensure_ascii = False))