In [3]:
import pandas as pd
import tarfile
import spacy
import nltk
import string
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from collections import Counter

In [4]:
# Load the tweet dataset from a CSV path relative to the notebook's working directory
tweets = pd.read_csv("data/obamacare_19_23.csv")

In [3]:
# First look at the dataset: dimensions, column labels and number of distinct tweet texts
df_shape, df_size = tweets.shape, tweets.size
print(f"Shape of tweets df is{df_shape}", f"; Size of tweets df is {df_size}")
print(f"Columns are{tweets.columns}")
unique_texts = tweets['Text'].nunique()
print(f"Unique tweets are {unique_texts}")

Shape of tweets df is(36724, 3) ; Size of tweets df is 110172
Columns areIndex(['Id', 'Date', 'Text'], dtype='object')
Unique tweets are 14826


In [73]:
# Inspect the 'Date' column: number of distinct values and a sample of the first rows
date_col = tweets['Date']
print(f"Unique dates for tweets are {date_col.nunique()}")
print(date_col.head(3))

Unique dates for tweets are 1421
0    2023-03-31
1    2023-03-31
2    2023-03-31
Name: Date, dtype: object


In [5]:
# Tweet volume per (Month, Year), busiest periods first

# Parse the 'Date' column into datetimes so the .dt accessor can be used
parsed_dates = pd.to_datetime(tweets['Date'])
tweets['Date'] = parsed_dates

# Derive calendar month and year columns from the parsed dates
tweets['Month'] = parsed_dates.dt.month
tweets['Year'] = parsed_dates.dt.year

# Count non-null 'Text' values per (Month, Year), expose the tally as 'count',
# and order the rows from the highest count to the lowest
tweet_count = (
    tweets.groupby(['Month', 'Year'])['Text']
    .count()
    .reset_index()
    .rename(columns={'Text': 'count'})
    .sort_values(by='count', ascending=False)
)

# Show the resulting table
print(tweet_count)

    Month  Year  count
28      7  2020   7341
44     11  2020   3573
36      9  2020   2365
40     10  2020   2322
4       1  2023   1981
1       1  2020   1505
32      8  2020   1319
15      4  2019   1219
11      3  2020   1132
10      3  2019    964
3       1  2022    758
13      3  2022    715
19      5  2019    685
20      5  2020    679
24      6  2020    669
6       2  2020    655
12      3  2021    550
25      6  2021    506
27      7  2019    500
2       1  2021    487
42     10  2022    463
18      4  2022    459
0       1  2019    437
7       2  2021    436
14      3  2023    410
43     11  2019    406
16      4  2020    399
47     12  2019    334
34      8  2022    297
23      6  2019    296
31      8  2019    247
37      9  2021    239
39     10  2019    226
38      9  2022    224
49     12  2021    203
5       2  2019    192
30      7  2022    171
35      9  2019    168
48     12  2020    153
9       2  2023    141
46     11  2022    123
22      5  2022    112
17      4  

In [6]:
# Text analysis starts here

# Fetch the NLTK resources used below, then build the shared cleaning helpers:
# the English stop-word set, the WordNet lemmatizer and the raw tweet text column
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
tweet_text = tweets['Text']

[nltk_data] Downloading package stopwords to C:\Users\Sanjeev
[nltk_data]     HRSCM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sanjeev
[nltk_data]     HRSCM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
#Function to clean text, create tokens
from nltk.tokenize import regexp_tokenize

def clean_text(text):
    """Turn one tweet into a list of cleaned, lower-cased, lemmatized tokens.

    Steps:
      1. Strip URLs (anything starting with ``http`` up to the next whitespace).
      2. Drop every character that is not a word character, whitespace, ``#`` or ``@``.
      3. Tokenize so that hashtags (``#tag``) and handles (``@name``) stay in one piece.
      4. Lower-case each token, discard English stop words, and lemmatize the rest.

    Args:
        text: The raw tweet text. Non-string values (e.g. the NaN pandas uses
            for an empty cell) are treated as an empty tweet.

    Returns:
        list[str]: The cleaned tokens in order of appearance; hashtags and
        handles keep their leading ``#`` / ``@``.
    """
    # Missing values reach this function as float NaN, which re.sub cannot handle
    if not isinstance(text, str):
        return []

    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove non-alphanumeric characters except # and @
    text = re.sub(r'[^\w\s#@]', '', text)

    # Pattern keeps handles and hashtags intact; eg: #obama is not split into # and obama
    pattern = r'\w+|#\w+|@\w+'

    tokens = []
    for raw_token in regexp_tokenize(text, pattern):
        word = raw_token.lower()
        # Filter on the surface form first: the lemmatizer can rewrite a stop word
        # into a form that is no longer on the stop list (e.g. 'was' -> 'wa')
        if word in stop_words:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Filter again in case the lemma itself is a stop word
        if lemma not in stop_words:
            tokens.append(lemma)
    return tokens

# Flatten the cleaned tokens of every tweet into three lists, chosen by leading character
all_tokens, all_hashtags, all_handles = [], [], []
prefix_buckets = {'#': all_hashtags, '@': all_handles}

for tweet_tokens in tweet_text.apply(clean_text):
    for tok in tweet_tokens:
        # Tokens without a '#' or '@' prefix are plain words
        prefix_buckets.get(tok[:1], all_tokens).append(tok)

In [9]:
# Same cleaning as above, this time stored on the tweets dataframe so it can be exported

# Per-tweet token lists, plus the subsets that start with '#' (hashtags) or '@' (handles)
tweets['Tokens'] = tweets['Text'].apply(clean_text)
tweets['Hashtags'] = tweets['Tokens'].apply(
    lambda toks: [tok for tok in toks if tok.startswith('#')]
)
tweets['Handles'] = tweets['Tokens'].apply(
    lambda toks: [tok for tok in toks if tok.startswith('@')]
)

# Write the enriched dataframe to a CSV file, leaving out the index
tweets.to_csv('data/tokenized_tweets.csv', index=False)

In [54]:
# Frequency tables for plain tokens, hashtags and handles (one Counter per list)
token_counts, hashtag_counts, handle_counts = (
    Counter(items) for items in (all_tokens, all_hashtags, all_handles)
)

In [66]:
# Create a table of all tokens, hashtags and handles with frequency.
# Each row is (Type, Value, Frequency); tokens come first, then hashtags, then handles.
twitt_desc = [('token', token, count) for token, count in token_counts.items()]
twitt_desc += [('hashtag', hashtag, count) for hashtag, count in hashtag_counts.items()]
twitt_desc += [('handle', handle, count) for handle, count in handle_counts.items()]
# Build the frame from the rows assembled above (the previous code referenced an
# undefined name, which fails with NameError on a fresh kernel)
twitt_table = pd.DataFrame(twitt_desc, columns=['Type', 'Value', 'Frequency'])

In [67]:
# The ten hashtags with the highest frequency
twitt_table[twitt_table['Type'].eq('hashtag')].sort_values(by='Frequency', ascending=False).head(10)

Unnamed: 0,Type,Value,Frequency
14294,hashtag,#aca,5280
14293,hashtag,#obamacare,5136
14320,hashtag,#healthcare,845
14696,hashtag,#scotus,540
14791,hashtag,#trump,540
14599,hashtag,#covid19,539
14302,hashtag,#affordablecareact,533
14319,hashtag,#medicaid,310
14639,hashtag,#medicare4all,295
14615,hashtag,#preexistingconditions,275


In [68]:
# The ten handles with the highest frequency
twitt_table[twitt_table['Type'].eq('handle')].sort_values(by='Frequency', ascending=False).head(10)

Unnamed: 0,Type,Value,Frequency
18123,handle,@speakerpelosi,3832
20928,handle,@realdonaldtrump,2515
17959,handle,@barackobama,2474
18133,handle,@nytimes,1639
23223,handle,@ambassadorrice,776
18090,handle,@deanobeidallah,738
17965,handle,@joebiden,695
17964,handle,@gop,679
22161,handle,@sebelius,633
18723,handle,@ewarren,612


In [69]:
# The ten plain tokens with the highest frequency
twitt_table[twitt_table['Type'].eq('token')].sort_values(by='Frequency', ascending=False).head(10)

Unnamed: 0,Type,Value,Frequency
57,token,rt,24491
36,token,care,12597
6,token,health,11342
43,token,obamacare,9289
84,token,trump,8945
277,token,amp,7241
52,token,aca,7178
1914,token,woman,6448
188,token,president,5501
126,token,take,5496
