In [5]:
# Package Imports
import snscrape.modules.twitter as sntwitter
import pandas as pd
from collections import Counter
import re
import time 
import json

# Functions

In [6]:
# Reference: https://techcommunity.microsoft.com/t5/educator-developer-blog/how-to-scrape-twitter-data-for-sentiment-analysis-with-python/ba-p/3593365

def scrape_tweets(topic, start_date, end_date, max_tweets=1000000):
    '''
    Collects English language tweets matching a search term within a date range
    
    Parameters: 
        topic (str): search term, typically a hashtag (e.g. "#Oscars2022")
        start_date (str): string for start of date range (e.g. YYYY-MM-DD)
        end_date (str): string for end of date range (e.g. YYYY-MM-DD)
        max_tweets (int or None): maximum number of tweets to collect; 
        None removes the cap. Defaults to 1,000,000.
    
    Returns: 
        df (DataFrame): DataFrame with a 'Date' column (tweet date) and a 
        'tweet' column (tweet contents), one row per collected tweet
    '''
    # Local import keeps this cell self-contained without touching the import cell
    from itertools import islice

    # Filtering to English language tweets between start and end date
    query = "({}) since:{} until:{} lang:en".format(topic, start_date, end_date)

    results = sntwitter.TwitterSearchScraper(query).get_items()

    # islice stops after exactly max_tweets items and never requests one more
    # from the scraper; the earlier `i > NUM_TWEETS` check let one extra through.
    tweets = [[tweet.date, tweet.content] for tweet in islice(results, max_tweets)]

    df = pd.DataFrame(tweets, columns = ['Date', 'tweet'])
    return df

In [7]:
def find_related_hashtags(data, num_topics): 
    '''
    Ranks the hashtags that appear most often across a collection of tweets
    
    Tweets are lowercased before matching, so counting is case-insensitive. 
    A hashtag must contain at least one letter to be counted (e.g. "#123" is skipped). 
    The single most frequent hashtag is dropped from the result.
    NOTE(review): presumably the dropped entry is the hashtag the tweets were 
    scraped for - confirm against callers.
    
    Parameters: 
        data (DataFrame): DataFrame with a 'tweet' column holding tweet text 
        num_topics (int): The number of related hashtags to return 
    
    Returns: 
        hashtags_counter (list): List of (hashtag, count) tuples, most frequent first, 
        excluding the top-ranked hashtag
    '''
    
    hashtag_pattern = re.compile(r'\B#\w*[a-zA-Z]+\w*')
    tally = Counter()

    for text in data['tweet']: 
        # Updating tweet by tweet keeps first-seen order, which decides ties
        tally.update(hashtag_pattern.findall(text.lower()))

    # Ask for one extra entry because the leading one is discarded below
    ranked = tally.most_common(num_topics + 1)
    return ranked[1:]

# Scraping Data

## #Oscars2022

In [None]:
# Scraping tweets for overarching #Oscars2022 hashtag 

# Wall-clock timing so readers know what a full re-run of this cell costs
st = time.time()
# Bound as oscars_df because that is the name find_related_hashtags is later called with
oscars_df = scrape_tweets("#Oscars2022", "2022-03-27", "2022-03-28")
et = time.time()
print(et - st)

# to_pickle writes to disk and returns None, so its result is not assigned to a name
oscars_df.to_pickle("./oscars.pkl")

In [None]:
# Finding 20 most mentioned hashtags in #Oscars2022 tweets
# Reload the saved scrape so this cell does not rely on a variable left over in the kernel.
# NOTE(review): the "#oscars" sub-topic below is pickled as "oscars.pkl", the same path 
# read here, so once the loop has run this file holds the "#oscars" scrape instead of the 
# "#Oscars2022" one. Confirm whether the overwrite is intended before re-running this cell alone.
oscars_df = pd.read_pickle("./oscars.pkl")
oscars_top_20 = find_related_hashtags(oscars_df, 20)

# Subsetting to the most interesting 15 
top_15_oscars = ["#academyawards", "#redcarpet", "#dune", "#oscarsfanfavorite", "#coda", "#thepowerofthedog", "#bestpicture", "#westsidestory", 
"#andrewgarfield", "#zendaya", "#hollywood", "#movies", "#kristenstewart", "#oscarnoms", "#oscars"]

# Scraping all tweets for each of the 15 sub-topics
for hashtag in top_15_oscars: 
    print(hashtag)
    df = scrape_tweets(hashtag, "2022-03-27", "2022-03-28")
    # File name is the hashtag without its leading '#'
    df.to_pickle(hashtag[1:] + ".pkl")


# Find two most common hashtags for 15 sub-topics
oscars_topic_dict = {}
st = time.time()

for topic in top_15_oscars: 
    data = pd.read_pickle(topic[1:] + ".pkl")
    hashtags = find_related_hashtags(data, 2)
    # Keep only the hashtag text from each (hashtag, count) pair
    oscars_topic_dict[topic] = [tag for tag, _count in hashtags]

et = time.time()
print(et - st)

# Save subtopics to json (context manager guarantees the file is flushed and closed)
with open("oscars_subtopics.json", 'w') as subtopics_file:
    json.dump(oscars_topic_dict, subtopics_file)

## #UKPolitics

In [None]:
# Scraping tweets for overarching #UKPolitics hashtag
# Wall-clock timing so readers know what a full re-run of this cell costs
st = time.time()
uk_df = scrape_tweets("#UKPolitics", "2022-07-05", "2022-10-26")
et = time.time()
print(et - st)

# to_pickle writes to disk and returns None, so its result is not assigned to a name
uk_df.to_pickle("./uk.pkl")

In [None]:
# Finding 20 most mentioned hashtags in #UKPolitics tweets
uk_top_20 = find_related_hashtags(uk_df, 20)

# Subselecting to the 15 most interesting
top_15_uk = ["#liztruss", "#borisjohnson", "#uk", "#rishisunak", "#brexit", "#toryshambles", "#politics", "#gtto", "#murdochguttermedia",
            "#progressivealliance", "#dissolvetheunion", "#dailyfail", "#nastyparty", "#rwnjs", "#torybritain"]

# Scraping all tweets for each of the 15 sub-topics
# NOTE(review): the "#uk" sub-topic is pickled as "uk.pkl", the same path the overarching 
# #UKPolitics scrape is saved to, so that file is overwritten here. Confirm this is intended.
for hashtag in top_15_uk: 
    print(hashtag)
    df = scrape_tweets(hashtag, "2022-07-05", "2022-10-26")
    # File name is the hashtag without its leading '#'
    df.to_pickle(hashtag[1:] + ".pkl")


uk_topic_dict = {}

st = time.time()

# Find two most common hashtags for 15 sub-topics
for topic in top_15_uk: 
    data = pd.read_pickle(topic[1:] + ".pkl")
    hashtags = find_related_hashtags(data, 2)
    # Keep only the hashtag text from each (hashtag, count) pair
    uk_topic_dict[topic] = [tag for tag, _count in hashtags]

et = time.time()
print(et - st)

# Save subtopics to json (context manager guarantees the file is flushed and closed)
with open("uk_subtopics.json", 'w') as subtopics_file:
    json.dump(uk_topic_dict, subtopics_file)


# Scrape tweets for additional hashtags (2 most common)
# NOTE(review): this window starts 2022-09-26, shorter than the 2022-07-05 start used 
# for the sub-topics above - confirm the narrower range is deliberate.
extra_topics = ['#london', '#eu', '#generalelectionnow', '#torychaos', '#news' , '#followbackfriday', '#murdochroyalcommission', '#auspol', 
                '#scottishindependence2023', '#indyref2', '#musicvideo', '#newmusic']

for hashtag in extra_topics: 
    print(hashtag)
    df = scrape_tweets(hashtag, "2022-09-26", "2022-10-26")
    df.to_pickle(hashtag[1:] + ".pkl")