In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import numpy as np
import statsmodels.api as sm
from io import StringIO

import preprocessor as pp
import emoji
from nltk.tokenize import TweetTokenizer



In [2]:
df = pd.read_csv('../data/monkeypox.csv')
df = df.drop('number', axis=1)
df = df.drop('created_at', axis=1)
# df['text']
text_columns = ['text', 'user description'] 

tweet_tokenizer = TweetTokenizer()

for cl in text_columns:
    df[cl] = [list(map(pp.tokenize, map(emoji.demojize, tweet_tokenizer.tokenize(emoji.emojize(text))))) for text in df[cl]]
    print(df[cl])

0       [Much, of, the, focus, on, $HASHTAG$, recently...
1       [ICYMI, :, The, first, probable, case, of, mon...
2       [WHO, :, $HASHTAG$, outbreak, not, yet, a, glo...
3       [According, to, the, CDC, ,, monkeypox, is, us...
4       [LGBTQ, advocates, and, health, care, organiza...
                              ...                        
5782    [$MENTION$, $MENTION$, $MENTION$, $MENTION$, $...
5783    [$MENTION$, $MENTION$, $MENTION$, $MENTION$, $...
5784    [$MENTION$, It, seems, we, have, a, new, varia...
5785    [$MENTION$, From, what, I, gather, ', Monkey, ...
5786    [WTF, :exclamation_question_mark:, , Monkeypox...
Name: text, Length: 5787, dtype: object
0       [Mother, of, $NUMBER$, ,, ex, wife, of, $NUMBE...
1       [WCHS-TV, serves, the, Charleston-Huntington, ...
2       [Patient, Engagement, |, Revenue, Cycle, Manag...
3       [Home, of, the, Minnesota, $MENTION$, and, $ME...
4       [Spectrum, News, $NUMBER$, offers, the, best, ...
                              ..

In [3]:
# Check if the values needs to be standadized
for cl in df.columns:
    if cl not in text_columns:
        print(cl)
        print(df[cl].unique())


source
['Twitter Web App' 'TweetDeck' 'eClincher' 'Sprout Social'
 'SocialNewsDesk' 'Twitter for iPhone' 'WordPress.com'
 'Monkeypox News & Data' 'Hootsuite Inc.' 'Twitter for Android'
 'TwitterRSSTSOLAgent' 'NewsWorldpress' 'aa.com.tr' 'Tweetbot for iΟS'
 'SocialFlow' 'IFTTT' 'Brandwatch' 'dlvr.it' 'Twitter for iPad'
 'San Francisco PLOW' 'Zapier.com' 'Microsoft Power Platform'
 'Circleboom Publish' 'True Anthem' 'Insatiable Retriever'
 'HelloworldTest123' 'AI' 'Echobox' 'Twitter' 'Sprinklr Publishing'
 'The Tweeted Times' 'Twidere for Android' 'VernonNow' 'ShuswapNow'
 'PentictonNow' 'Kamloops BC Now' 'Welcome 2 Kelowna News'
 'PrinceGeorgeNow' 'VictoriaNow' 'FS_Poster_App'
 'Twitter Media Studio - LiveCut' 'Buffer' 'Revive Social App'
 'br00t4c App 01' 'skiviers' 'presshub_usbot' 'Next Newsfeeds.media'
 'NewsNacho' 'theglobe.co.com' 'Twitter Media Studio'
 'TweetCaster for iOS' 'Akhbaralyawm.com Post' 'POZ Auto Tweet'
 'RH Auto Tweet' 'TS Auto Tweet' 'ContentStudio.io' 'Germany News

In [4]:
for cl in df.columns:
    if cl not in text_columns:
        print(cl)
        print(df[cl].value_counts())


source
Twitter for iPhone     1780
Twitter Web App        1613
Twitter for Android    1095
WordPress.com           357
Twitter for iPad        181
                       ... 
Chronicle.lu v2           1
newslink.app              1
The Global Herald         1
Mirage News Posts         1
SMAP Lite                 1
Name: source, Length: 148, dtype: int64
user is verified
False    5385
True      402
Name: user is verified, dtype: int64
user has url
False    3566
True     2221
Name: user has url, dtype: int64
user created at
2013-08-13 06:16:35    44
2018-02-02 14:55:32    24
2020-03-11 08:05:33    22
2022-01-09 11:48:24    21
2022-01-14 23:40:55    21
                       ..
2022-04-17 19:41:24     1
2022-05-15 15:48:14     1
2016-08-20 18:11:59     1
2020-02-25 23:47:56     1
2019-09-18 11:53:12     1
Name: user created at, Length: 4588, dtype: int64
retweet_count
0      4575
1       550
2       176
3       104
4        73
       ... 
113       1
343       1
185       1
66        1
166

In [5]:
print(df['binary_class'].value_counts())

print(df['ternary_class'].value_counts())


0    4718
1    1069
Name: binary_class, dtype: int64
9    2753
0    1965
1    1069
Name: ternary_class, dtype: int64


In [6]:
spl_strings = ['$HASHTAG$', '_URL_', '$MENTION$']
stop_words = ['thats','weve','hes','theres','ive','im','will','can','cant','dont','youve','us', 
              'youre','youll','theyre','whats','didnt', 'just']

def filter_condition(s):
    return (len(s) > 2) and (s not in spl_strings) and (s not in stop_words)

# TODO: remove punctuation, to lower or upper 
    
for cl in text_columns:
    df[cl] = [list(filter(filter_condition, text)) for text in df[cl]]  

# df['text'] = [list(set(text) - to_filter) for text in df['text']]
# df['text'] = [list(filter(filter_condition, text)) for text in df['text']]

In [7]:
df['text'][0]

['Much',
 'the',
 'focus',
 'recently',
 'has',
 'been',
 'how',
 'its',
 'affecting',
 'people',
 'richer',
 'countries',
 'during',
 'this',
 'current',
 'outbreak',
 'Charles',
 'Mahzude',
 'spoke',
 'about',
 'what',
 'know',
 'from',
 'West',
 'Africa',
 'where',
 'moneypox',
 'endemic',
 'about',
 'how',
 'affects',
 'people',
 'with',
 'HIV']

In [8]:
df['text'][1]

['ICYMI',
 'The',
 'first',
 'probable',
 'case',
 'monkeypox',
 'West',
 'Virginia',
 'has',
 'been',
 'identified',
 'state',
 'health',
 'officials',
 'announced',
 'Friday']

In [9]:
df['text'][2]


['WHO',
 'outbreak',
 'not',
 'yet',
 'global',
 'public',
 'health',
 'emergency',
 'via']

In [10]:
df['text'][3]


['According',
 'the',
 'CDC',
 'monkeypox',
 'usually',
 'spread',
 'prolonged',
 'direct',
 'contact',
 'with',
 'person',
 'who',
 'has',
 'infectious',
 'rash',
 'scab',
 'Monkeypox',
 'symptoms',
 'include',
 'fever',
 'headache',
 'muscle',
 'aches',
 'swollen',
 'lymph',
 'nodes',
 'chills',
 'and',
 'exhaustion']

In [11]:
df['text'][4]


['LGBTQ',
 'advocates',
 'and',
 'health',
 'care',
 'organizations',
 'raised',
 'alarms',
 'Thursday',
 'with',
 'public',
 'health',
 'officials',
 'response',
 'the',
 'monkeypox',
 'outbreak',
 'New',
 'York',
 'amid',
 'concerns',
 'over',
 'access',
 'vaccines',
 'and',
 'updated',
 'data']

In [12]:
df['text'][5]


['Anyone', 'contract', 'monkeypox']

In [13]:
df['text'][6]


['NEW',
 'INFO',
 'The',
 'number',
 'monkeypox',
 'cases',
 'Texas',
 'now',
 '$NUMBER$',
 'according',
 'the',
 'Department',
 'State',
 'Health',
 'Services',
 "That's",
 'from',
 '$NUMBER$',
 'Tuesday']

In [14]:
df['text'][7]


["Here's",
 'how',
 'you',
 'get',
 'tested',
 'for',
 'Monkeypox',
 'you',
 'think',
 "you've",
 'been',
 'infected',
 ':backhand_index_pointing_down:']

In [15]:
df['text'][8]


['The',
 'May',
 'Losing',
 'the',
 'Fight',
 'Against',
 'Monkeypox',
 'Scientists',
 'Say']

In [16]:
df['text'][9]

['$NUMBER$/08',
 '$NUMBER$',
 '$NUMBER$:11',
 'UTC',
 ':newspaper:',
 'Monkeypox',
 'the',
 'Bay',
 'Area',
 'From',
 'Symptoms',
 'How',
 'Find',
 'Vaccine',
 "Here's",
 'What',
 'Know',
 'KQED',
 ':backhand_index_pointing_right:',
 'Latest',
 'monkeypox',
 'news',
 'around',
 'the',
 ':globe_showing_Americas:']