In [210]:
import pandas as pd
from collections import defaultdict, Counter
import random
import re
from nltk.tokenize import word_tokenize

In [211]:
path_to_the_file = 'covid19_tweets.csv'
tweets_df = pd.read_csv(path_to_the_file)
tweets_df.head()

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False
1,Tom Basile 🇺🇸,"New York, NY","Husband, Father, Columnist & Commentator. Auth...",2009-04-16 20:06:23,2253,1677,24,True,2020-07-25 12:27:17,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,,Twitter for Android,False
2,Time4fisticuffs,"Pewee Valley, KY",#Christian #Catholic #Conservative #Reagan #Re...,2009-02-28 18:57:41,9275,9525,7254,False,2020-07-25 12:27:14,@diane3443 @wdunlap @realDonaldTrump Trump nev...,['COVID19'],Twitter for Android,False
3,ethel mertz,Stuck in the Middle,#Browns #Indians #ClevelandProud #[]_[] #Cavs ...,2019-03-07 01:45:06,197,987,1488,False,2020-07-25 12:27:10,@brookbanktv The one gift #COVID19 has give me...,['COVID19'],Twitter for iPhone,False
4,DIPR-J&K,Jammu and Kashmir,🖊️Official Twitter handle of Department of Inf...,2017-02-12 06:45:15,101009,168,101,False,2020-07-25 12:27:08,25 July : Media Bulletin on Novel #CoronaVirus...,"['CoronaVirusUpdates', 'COVID19']",Twitter for Android,False


In [212]:
def basic_preprocess_tweet(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.split()

tweets_df['basic_processed_text'] = tweets_df['text'].apply(basic_preprocess_tweet)
tweets_df[['text', 'basic_processed_text']].head()

Unnamed: 0,text,basic_processed_text
0,If I smelled the scent of hand sanitizers toda...,"[if, i, smelled, the, scent, of, hand, sanitiz..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,"[hey, and, wouldnt, it, have, made, more, sens..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,"[trump, never, once, claimed, covid19, was, a,..."
3,@brookbanktv The one gift #COVID19 has give me...,"[the, one, gift, covid19, has, give, me, is, a..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,"[25, july, media, bulletin, on, novel, coronav..."


In [213]:
N = 10000
words_all = [word for tweet in tweets_df['basic_processed_text'] for word in tweet]
vocabu = {word for word, count in Counter(words_all).most_common(N)}

def preprocess_with_special_tokens(tweet, vocabulary):
    processed_tweet = ['<s>'] + tweet + ['</s>']
    return [word if word in vocabulary else 'UNK' for word in processed_tweet]

tweets_df['preprocessed_text_special'] = tweets_df['basic_processed_text'].apply(
    lambda tweet: preprocess_with_special_tokens(tweet, vocabu))
tweets_df[['text', 'preprocessed_text_special']].head()

Unnamed: 0,text,preprocessed_text_special
0,If I smelled the scent of hand sanitizers toda...,"[UNK, if, i, UNK, the, UNK, of, hand, sanitize..."
1,Hey @Yankees @YankeesPR and @MLB - wouldn't it...,"[UNK, hey, and, wouldnt, it, have, made, more,..."
2,@diane3443 @wdunlap @realDonaldTrump Trump nev...,"[UNK, trump, never, once, claimed, covid19, wa..."
3,@brookbanktv The one gift #COVID19 has give me...,"[UNK, the, one, gift, covid19, has, give, me, ..."
4,25 July : Media Bulletin on Novel #CoronaVirus...,"[UNK, 25, july, media, bulletin, on, novel, co..."


In [214]:
tokenized_tweets = tweets_df["preprocessed_text_special"]

In [215]:
def build_n_gram_model(n, tokenized_texts):
    model = {}
    for tokens in tokenized_texts:
        padded_tokens = ['<s>'] * (n-1) + tokens + ['</s>']
        n_grams = zip(*[padded_tokens[i:] for i in range(n)])
        for n_gram in n_grams:
            prefix = n_gram[:-1]
            suffix = n_gram[-1]
            if prefix not in model:
                model[prefix] = {}
            if suffix not in model[prefix]:
                model[prefix][suffix] = 0
            model[prefix][suffix] += 1
    return model

uni_gram_model = build_n_gram_model(1, tokenized_tweets)
bi_gram_model = build_n_gram_model(2, tokenized_tweets)
tri_gram_model = build_n_gram_model(3, tokenized_tweets)

In [216]:
uni_gram_model

{(): {'UNK': 609367,
  'if': 8019,
  'i': 15040,
  'the': 105930,
  'of': 59906,
  'hand': 500,
  'sanitizers': 67,
  'today': 4533,
  'on': 23823,
  'someone': 815,
  'in': 54642,
  'past': 677,
  'would': 2369,
  'think': 1929,
  'they': 5707,
  'were': 3909,
  'so': 6225,
  'that': 15368,
  '</s>': 179108,
  'hey': 582,
  'and': 44886,
  'wouldnt': 229,
  'it': 12942,
  'have': 14648,
  'made': 1212,
  'more': 8382,
  'sense': 329,
  'to': 75038,
  'players': 292,
  'pay': 723,
  'their': 4622,
  'a': 47844,
  'trump': 3443,
  'never': 1182,
  'once': 514,
  'claimed': 114,
  'covid19': 109717,
  'was': 5430,
  'hoax': 351,
  'we': 14982,
  'all': 8555,
  'claim': 207,
  'this': 18452,
  'effort': 186,
  'one': 4928,
  'gift': 94,
  'has': 12998,
  'give': 983,
  'me': 2928,
  'is': 34580,
  'an': 6388,
  'appreciation': 36,
  'for': 31611,
  'simple': 361,
  'things': 968,
  'always': 732,
  'around': 1543,
  '25': 586,
  'july': 1890,
  'media': 1100,
  'bulletin': 196,
  'novel':

In [217]:
bi_gram_model

{('<s>',): {'UNK': 179108},
 ('UNK',): {'if': 2529,
  'the': 12952,
  'of': 4355,
  'that': 1209,
  '</s>': 179108,
  'hey': 512,
  'to': 4976,
  'trump': 767,
  '25': 59,
  'coronavirus': 2485,
  'how': 2328,
  'you': 1840,
  'praying': 26,
  'UNK': 93938,
  'as': 2727,
  'watch': 410,
  'covid19': 17389,
  'order': 54,
  'icon': 9,
  'abstract': 4,
  'no': 1054,
  'lets': 459,
  'rajasthan': 169,
  'man': 194,
  'nagaland': 16,
  'july': 148,
  'discharge': 14,
  'people': 1237,
  'cases': 882,
  'chennai': 46,
  'second': 69,
  'it': 1473,
  'moments': 10,
  'covid': 610,
  'good': 675,
  'your': 711,
  'holy': 34,
  'need': 301,
  'hospital': 367,
  'our': 1285,
  'response': 87,
  'modified': 7,
  'and': 6683,
  'i': 5272,
  'off': 114,
  'tn': 21,
  'ppp': 10,
  'tax': 24,
  'minimum': 3,
  'per': 167,
  'fema': 10,
  'lacks': 5,
  'homes': 17,
  'actor': 47,
  'an': 678,
  'regarding': 40,
  '13': 99,
  'a': 4166,
  'in': 6823,
  'talking': 49,
  'im': 1022,
  'everyones': 7,
  

In [222]:
def generate_text(model, start_with=None, max_length=20):
    if start_with is None:
        start_with = random.choice(list(model.keys()))

    result = list(start_with)
    current = start_with
    for _ in range(max_length):
        possible_next_words = model[current]
        if not possible_next_words:
            break
        next_word = random.choices(list(possible_next_words.keys()), weights=possible_next_words.values())[0]
        if next_word == '</s>':
            break
        result.append(next_word)
        if len(current) > 1:
            current = (*current[1:], next_word)
        else:
            current = (next_word,)
    return ' '.join(result)

In [225]:
def generate_text_uni_gram(model, max_length=20):
    result = []

    for _ in range(max_length):
        possible_next_words = []
        for prefix, suffix_dict in model.items():
            for suffix, count in suffix_dict.items():
                possible_next_words.append((suffix, count))

        if not possible_next_words:
            break

        words, weights = zip(*possible_next_words)
        next_word = random.choices(words, weights=weights)[0]

        if next_word == '</s>':
            break

        result.append(next_word)

    return ' '.join(result)

generated_uni_gram_text = generate_text_uni_gram(uni_gram_model)
print(generated_uni_gram_text)


because complete care UNK resign fight life found UNK of too settings referred worthy UNK on born to from


In [220]:
generated_bi_gram_text = generate_text(bi_gram_model)
generated_bi_gram_text

'innovative ventilator after seeing the UNK'

In [221]:
generated_tri_gram_text = generate_text(tri_gram_model)
generated_tri_gram_text

'major shortages are still out there touting swedens UNK got the UNK amp for enforcing the law surrounding covid19 busy UNK weed'