# Tweets Scraping Notebook
Following these tutorials: 

1. https://medium.com/@jcldinco/downloading-historical-tweets-using-tweet-ids-via-snscrape-and-tweepy-5f4ecbf19032
2. https://towardsdatascience.com/how-to-scrape-more-information-from-tweets-on-twitter-44fd540b8a1f#7152

### 0. Use snscrape to crawl tweet IDs matching certain search criteria
Social network crawler snscrape: https://github.com/JustAnotherArchivist/snscrape (NOTE: snscrape requires a Python 3.8 environment!)
    
    pip install snscrape

Twitter advanced search criteria: https://github.com/igorbrigadir/twitter-advanced-search

Example terminal command to run:

    snscrape twitter-search -v "(#coronavirus OR #covid19 OR #covid-19 OR #covid) since:2020-05-01 until:2020-05-15 lang:en" > ht_covid_0501_0515.txt

NOTE: Since Twitter has been phasing out geo-tagging, searching for geo-tagged tweets returns very limited results. We therefore skip geo-location for now and instead filter for tweets from US users in the tweet extraction step.

Run the following Python 3.8 script (snscrape_script.py) in a terminal to automate snscrape when crawling tweet IDs over a long period:

    import subprocess
    import datetime
    import snscrape

    if __name__ == "__main__":
        # Crawl window: edit the start & end dates here
        start_date = datetime.date(2020, 5, 16)
        end_date = datetime.date(2020, 11, 23)
        delta = datetime.timedelta(days=15)

        # Fixed part of the snscrape invocation, already split into arguments
        snscrape_cmd = ["snscrape", "--max-results", "1000000", "twitter-search"]

        # One snscrape run per 15-day window; a window is started for as
        # long as its first day is not past end_date.
        while start_date <= end_date:
            since = start_date.isoformat()
            until = (start_date + delta).isoformat()
            query = f"(#coronavirus OR #covid19 OR #covid-19 OR #covid) since:{since} until:{until} lang:en"
            # Output file is named after the MM-DD part of both window dates
            out_path = f"ht_covid_{since[-5:]}_{until[-5:]}.txt"
            print(datetime.datetime.now(), ' ', query)

            # Send snscrape's stdout straight into the .txt file
            with open(out_path, 'w') as f:
                subprocess.run(snscrape_cmd + [query], stdout=f, text=True)
            start_date += delta

### 1. Tokens for Tweepy Authentication
Fill in your Twitter developer account API keys

In [None]:
import pandas as pd
import tweepy

# Twitter API credentials are read from environment variables so that no
# secret is ever stored in (or shared with) the notebook itself. Set
# TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and
# TWITTER_ACCESS_TOKEN_SECRET before running this cell; a missing variable
# raises KeyError naming the variable that is not set.
# NOTE(review): the keys that were previously committed here should be
# treated as leaked and regenerated in the Twitter developer portal.
import os

consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

# OAuth 1.0a user-context authentication; the client waits when a rate limit
# is hit rather than raising.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

### 2. Tweet Content Scraping
Use "snscrape" to scrape historical tweets & write urls to a txt file

Note: For some reason, cloud services like Colab & Deepnote block the snscrape module, so this step has to be done locally.


In [None]:
# Base name (without extension) of the .txt file with the tweet URLs to extract
filename = "ht_covid_09-13_09-28"

# The file has no header row; its single column is loaded under the name "links"
tweet_url = pd.read_csv(f"{filename}.txt", header=None, names=["links"])
tweet_url.head()

Unnamed: 0,links
0,https://twitter.com/iOptimizeRealty/status/131...
1,https://twitter.com/victori92735456/status/131...
2,https://twitter.com/Joel_Agius1/status/1310368...
3,https://twitter.com/unbrothodox/status/1310368...
4,https://twitter.com/albertan48/status/13103685...


In [None]:
# Extract the tweet ID from each URL: it is the last "/"-separated segment.
# Done with vectorized string ops instead of a row-wise apply of a lambda.
tweet_url['id'] = tweet_url["links"].str.rsplit("/", n=1).str[-1]
ids = tweet_url['id'].tolist()

# Number of lookups needed at batch_size IDs per request (ceiling division;
# evaluates to 0 when there are no IDs).
batch_size = 100
total_count = len(ids)
chunks = (total_count - 1) // batch_size + 1

created_at : The time the status was posted.
id : The ID of the status.
id_str : The ID of the status as a string.
text : The text of the status.
entities : The parsed entities of the status such as hashtags, URLs etc.
source : The source of the status.
source_url : The URL of the source of the status.
in_reply_to_status_id : The ID of the status being replied to.
in_reply_to_status_id_str : The ID of the status being replied to, as a string.
in_reply_to_user_id : The ID of the user being replied to.
in_reply_to_user_id_str : The ID of the user being replied to as a string.
in_reply_to_screen_name : The screen name of the user being replied to
user : The User object of the poster of the status.
geo : The geo object of the status.
coordinates : The coordinates of the status.
place : The place of the status.
contributors : The contributors of the status.
is_quote_status : Indicates whether the status is a quoted status or not.
retweet_count : The number of retweets of the status.
favorite_count : The number of likes of the status.
favorited : Indicates whether the status has been favourited by the authenticated user or not.
retweeted : Indicates whether the status has been retweeted by the authenticated user or not.
possibly_sensitive : Indicates whether the status is sensitive or not.
lang : The language of the status.

In [None]:
import time

def fetch_tw(ids, filename):
    """Look up one batch of tweets and return them as a DataFrame.

    The lookup is retried up to 16 times, pausing one minute between
    attempts, because the call can fail transiently.

    Args:
        ids: Tweet IDs to look up in a single ``api.statuses_lookup`` call.
        filename: Unused; kept so existing callers keep working.

    Returns:
        A DataFrame with one row per status returned by the lookup and the
        columns ``tweet_id``, ``screen_name``, ``tweet``, ``date`` and
        ``user_location``. It is empty (no columns) when ``ids`` is empty or
        the lookup returns no statuses.

    Raises:
        Exception: Whatever the final attempt raised, once every attempt
            has failed.
    """
    if len(ids) == 0:
        # Nothing to look up: skip the API round trip (and its retries).
        return pd.DataFrame()

    max_attempts = 16
    for attempt in range(max_attempts):
        try:
            # NOTE(review): tweepy 4 renamed this method to lookup_statuses;
            # confirm against the installed tweepy version.
            list_of_tw_status = api.statuses_lookup(ids, tweet_mode="extended")
            break
        except Exception as e:
            if attempt == max_attempts - 1:
                # Out of retries: surface the real error instead of falling
                # through with the result variable unbound.
                raise
            print(e, 'Wait 1 min & retry...Attempt:', attempt)
            time.sleep(60)

    # Collect plain dicts and build the frame once; row-by-row
    # DataFrame.append is quadratic and no longer exists in pandas 2.x.
    rows = [
        {
            "tweet_id": status.id,
            "screen_name": status.user.screen_name,
            "tweet": status.full_text,
            "date": status.created_at,
            "user_location": status.user.location,
        }
        for status in list_of_tw_status
    ]
    return pd.DataFrame(rows)

Fetch tweets in batches and write them to a CSV file

In [None]:
# Fetch every batch of IDs, keeping the per-batch frames in a list so they
# can be combined with a single concat (DataFrame.append copies the whole
# frame on every call and was removed in pandas 2.0).
frames = []
for i in range(chunks):
    batch = ids[i*batch_size:(i+1)*batch_size]
    frames.append(fetch_tw(batch, filename))

# pd.concat raises on an empty list, so fall back to an empty frame when
# there was nothing to fetch; empty batches are left out of the concat.
frames = [frame for frame in frames if not frame.empty]
dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# NOTE(review): mode="a" appends to an existing CSV (header included), so
# re-running this cell adds the rows again - confirm this is intended.
dataset.to_csv(filename + ".csv", mode="a")

In [None]:
# Extract content from tweet ID files (.txt)
import pandas as pd
import tweepy
import time
import extract_states

def tweetID2contents(filename, batch_size=100):
    """Fetch the tweets listed in ``<filename>.txt`` and append them to ``<filename>.csv``.

    The .txt file is read as a header-less, single-column list of tweet
    URLs. The tweet ID is taken from the last "/"-separated segment of each
    URL, the IDs are looked up through Tweepy in batches, and the collected
    rows are written to the CSV file in append mode.

    Args:
        filename: Base name (without extension) shared by the input .txt
            file and the output .csv file.
        batch_size: Number of tweet IDs sent per lookup request.

    Raises:
        Exception: Whatever the last lookup attempt of a batch raised, once
            all of its retries have failed.
    """

    # Your Tweepy API keys
    consumer_key = ""
    consumer_secret = ""
    access_token = ""
    access_token_secret = ""

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)

    def fetch_batch(ids):
        """Look up one batch of IDs, retrying on failure; return a DataFrame."""
        if len(ids) == 0:
            return pd.DataFrame()

        max_attempts = 16
        for attempt in range(max_attempts):
            try:
                # NOTE(review): tweepy 4 renamed this method to
                # lookup_statuses; confirm against the installed version.
                statuses = api.statuses_lookup(ids, tweet_mode="extended")
                break
            except Exception as e:
                if attempt == max_attempts - 1:
                    # Out of retries: surface the real error instead of
                    # continuing with the result variable unbound.
                    raise
                print(e, 'Wait 1 min & retry...Attempt:', attempt)
                time.sleep(60)

        # Build the frame once from plain dicts; row-by-row DataFrame.append
        # is quadratic and no longer exists in pandas 2.x.
        return pd.DataFrame([
            {
                "tweet_id": status.id,
                "screen_name": status.user.screen_name,
                "tweet": status.full_text,
                "date": status.created_at,
                "user_location": status.user.location,
            }
            for status in statuses
        ])

    # Extract the tweet ID (last path segment) from every URL
    tweet_url = pd.read_csv(filename + ".txt", index_col=None, header=None, names=["links"])
    tweet_url['id'] = tweet_url["links"].str.rsplit("/", n=1).str[-1]
    ids = tweet_url['id'].tolist()

    # Fetch tweet contents batch by batch, then combine them in one concat
    frames = [
        fetch_batch(ids[start:start + batch_size])
        for start in range(0, len(ids), batch_size)
    ]
    frames = [frame for frame in frames if not frame.empty]
    # pd.concat raises on an empty list, so fall back to an empty frame
    dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # NOTE(review): mode="a" appends to an existing CSV (header included), so
    # calling this twice for the same filename adds the rows again.
    dataset.to_csv(filename + ".csv", mode="a")


In [None]:
# Automate extracting tweet files
import datetime

start_date = datetime.date(2020, 5, 1)
end_date = datetime.date(2020, 6, 15)
delta = datetime.timedelta(days=15)

# Walk the date range one 15-day window at a time, stopping before a window
# that would start on or after end_date.
while start_date < end_date:
    since = start_date.isoformat()
    until = (start_date + delta).isoformat()
    # Third entry is the "MM-DD_MM-DD" tag used in the crawl file names
    fill = [since, until, f"{since[-5:]}_{until[-5:]}"]
    # Base name of the .txt file with the tweet URLs for this window
    filename = f"ht_covid_{fill[2]}"
    print(datetime.datetime.now(), filename)
    # Fetch the tweet contents for this window and write them to CSV
    tweetID2contents(filename)

    start_date += delta

2020-12-04 00:05:52.217886 ht_covid_05-01_05-16
2020-12-04 03:06:12.327026 ht_covid_05-16_05-31
2020-12-04 06:04:33.100475 ht_covid_05-31_06-15


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cfafe70e-b705-46d4-9ab8-a3eb30c9a7b3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>