# Extraction of historical tweets

## Importing all libraries

In [54]:
pip install tweepy

Note: you may need to restart the kernel to use updated packages.


In [55]:
import tweepy
import yaml
import pandas as pd
import json
import datetime
import time
from google.cloud import storage

For our project we will use tweets from the last 2 years, retrieved through the Twitter API. Accessing this data requires a Twitter developer account with [Academic Research credentials](https://developer.twitter.com/en/products/twitter-api/academic-research). The keys are stored in the `twitter_keys.yaml` file.

In [56]:
# Parse the YAML file that holds the API credentials (and the bucket name).
# The context manager closes the file handle as soon as parsing is done,
# instead of leaving it open for the lifetime of the kernel.
config_path = "twitter_keys.yaml"
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [58]:
# Unpack every credential from the parsed YAML mapping in one statement.
(api_key, api_key_secret, access_token, access_token_secret, bearer_token) = (
    config[entry]
    for entry in (
        'API Key',
        'API Key Secret',
        'Access token',
        'Access token secret',
        'Bearer token',
    )
)

To get the data, we will use [tweepy.Client](https://docs.tweepy.org/en/stable/client.html) and its [search_all_tweets](https://docs.tweepy.org/en/stable/client.html#tweepy.Client.search_all_tweets) method.

In [59]:
# Twitter client authenticated with the bearer token; with
# wait_on_rate_limit enabled it sleeps when the rate limit is reached.
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)

In [74]:
# Boundaries of the period to query; the tweets are later requested day by day.
start_date = datetime.date(year=2021, month=10, day=31)
end_date = datetime.date(year=2022, month=3, day=13)

# Function to generate dates
def gen_dates(start_date, end_date):
    """Yield every date after ``start_date`` up to and including ``end_date``.

    The first value produced is ``start_date`` plus one day, so ``start_date``
    itself is never yielded.

    Args:
        start_date: ``datetime.date`` marking the day before the first result.
        end_date: ``datetime.date`` of the last day to yield.

    Yields:
        ``datetime.date`` objects in ascending order, one per day. Nothing is
        yielded when ``end_date`` is not after ``start_date``.
    """
    # Defined locally so the function does not depend on a global that may
    # not exist in a fresh kernel.
    one_day = datetime.timedelta(days=1)
    current = start_date
    # `<` rather than `!=` so a reversed range ends immediately instead of
    # looping forever.
    while current < end_date:
        current += one_day
        yield current

In [75]:
# One timestamp string per generated day, in the format the query expects:
# the date followed by the literal midnight suffix 'T00:00:00Z'.
dates = [f"{day}T00:00:00Z" for day in gen_dates(start_date, end_date)]


We will look for tweets that contain the word *Bitcoin* with these characteristics:
- Not a retweet
- English
- Verified Accounts


In [65]:
# Collect the raw API responses, querying one window (consecutive pair of
# timestamps in `dates`) at a time.
final_tweets = []
for window_start, window_end in zip(dates, dates[1:]):
    pages_in_window = 0
    paginator = tweepy.Paginator(
        client.search_all_tweets,
        query='Bitcoin -is:retweet lang:en is:verified',
        user_fields=['username', 'public_metrics', 'description', 'location'],
        tweet_fields=['created_at', 'geo', 'public_metrics', 'text'],
        expansions='author_id',
        start_time=window_start,
        end_time=window_end,
        max_results=200,
    )
    for response in paginator:
        # Pause one second after each page before continuing.
        time.sleep(1)
        final_tweets.append(response)
        pages_in_window += 1
        # Keep at most 20 pages per window.
        if pages_in_window == 20:
            break

Rate limit exceeded. Sleeping for 412 seconds.


In [66]:
# Number of API response pages collected (each response can hold several tweets).
len(final_tweets)

558

Once we have the raw data from the Twitter API, we will keep only the information we care about:
- Username
- Followers
- Tweets
- Description 
- Location
- ID (tweet)
- ID (Author)
- Created at 
- Tweet
- Likes
- Retweets
- Quotes

We convert the `created_at` field into a string so that it can later be saved as a timestamp.

In [67]:
# Flatten the raw API responses into one plain dictionary per tweet.
result = []
# Author lookup keyed by user id; it accumulates across all responses.
user_dict = {}
# Loop through each response object
for response in final_tweets:
    # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep.
    # A page without expanded users has no 'users' entry, so fall back to an empty list.
    for user in (response.includes or {}).get('users', []):
        user_dict[user.id] = {'username': user.username,
                              'followers': user.public_metrics['followers_count'],
                              'tweets': user.public_metrics['tweet_count'],
                              'description': user.description,
                              'location': user.location
                             }
    # `response.data` is None when a page contains no tweets; iterate over nothing in that case.
    for tweet in response.data or []:
        # For each tweet, find the author's information
        author_info = user_dict[tweet.author_id]
        metrics = tweet.public_metrics
        # Put all of the information we want to keep in a single dictionary for each tweet.
        # Ids and the creation time are stored as strings.
        result.append({'id': str(tweet.id),
                       'author_id': str(tweet.author_id),
                       'username': author_info['username'],
                       'author_followers': author_info['followers'],
                       'author_tweets': author_info['tweets'],
                       'author_description': author_info['description'],
                       'author_location': author_info['location'],
                       'text': tweet.text,
                       'created_at': str(tweet.created_at),
                       'retweets': metrics['retweet_count'],
                       'replies': metrics['reply_count'],
                       'likes': metrics['like_count'],
                       'quote_count': metrics['quote_count']
                      })

In [68]:
# Total number of flattened tweet records.
len(result)

97281

In [69]:
# Spot-check a single record (arbitrary index) to see the flattened structure.
result[32983]

{'id': '1469462654808961024',
 'author_id': '17899712',
 'username': 'Kevin_Jackson',
 'author_followers': 77459,
 'author_tweets': 209109,
 'author_description': 'USA Today & Wall Street Journal bestselling author, advisor, & technologist. https://t.co/JQmBpxpgul https://t.co/5cowsJ2iMP…',
 'author_location': 'Virginia, USA',
 'text': '@ILokeli, Earn Bitcoin while protecting yourself from text and voice scams with Gabriel Crypto. Use code TNS2021 after checkout to get 60 points you can exchange for crypto!\nhttps://t.co/eWm1Th5GzQ\nhttps://t.co/bWBkHhnBrJ',
 'created_at': '2021-12-11 00:22:52+00:00',
 'retweets': 0,
 'replies': 0,
 'likes': 0,
 'quote_count': 0}

Finally, we load this data into our **tweets_crypto** bucket. We store the data with the following structure: `tweets/year/month/tweet_<tweet_id>.json`.

In [5]:
# Name of the destination bucket, read from the parsed YAML config.
bucket = config['bucket']
# Use a dedicated name for the storage client so that the Twitter `client`
# created earlier in the notebook is not overwritten.
storage_client = storage.Client()
gcs_bucket = storage_client.get_bucket(bucket)

In [6]:
# Show which bucket name was read from the config.
print(bucket)

tweets_crypto


In [None]:
# Write each tweet record to its own JSON blob, grouped by the year and month
# taken from the leading characters of its `created_at` string.
for record in result:
    created = record['created_at']
    year, month = created[:4], created[5:7]
    blob_name = f"tweets/{year}/{month}/tweet_{record['id']}.json"
    with gcs_bucket.blob(blob_name).open(mode='w') as handle:
        json.dump(record, handle)

As the data takes a while to load into the bucket, we stored the data from November 2021 up to today in the `nov_march_2022.json` file. This file will also be uploaded into the bucket, in the same way as the previous data.

In [71]:
# Serialize the whole list of tweet records into a single JSON string.
json_result = json.dumps(result)

In [73]:
# Dump the records themselves rather than the already-serialized `json_result`
# string. Dumping a JSON string again would double-encode it, so the file would
# hold one quoted string instead of a list of tweet objects.
with open('nov_march_2022.json', 'w') as outfile:
    json.dump(result, outfile)

## Extraction of tweets last 7 days

Here is a way to extract data from the last 7 days; this will be used to feed our model with new data.

In [7]:
# Authenticate with the API key pair plus the user access token pair,
# then build the API wrapper used by the searches below.
auth = tweepy.OAuthHandler(consumer_key=api_key, consumer_secret=api_key_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth=auth)

In [8]:
# Search settings: the hashtag to look for and how many tweets to pull in total.
keywords = '#bitcoin'
limit = 10
search_cursor = tweepy.Cursor(
    api.search_tweets,
    q=keywords,
    lang='en',
    count=100,
    tweet_mode='extended',
)
tweets = search_cursor.items(limit)

In [9]:
columns = ['User', 'Tweet', 'Date', 'Likes', 'Location', 'Followers']
data = []

# Build one row per tweet, with the fields in the same order as `columns`.
for tweet in tweets:
    author = tweet.user
    data.append([
        author.screen_name,
        tweet.full_text,
        tweet.created_at,
        tweet.favorite_count,
        author.location,
        author.followers_count,
    ])

#df = pd.DataFrame(data, columns=columns)

In [10]:
data = []
keywords = '#bitcoin'
limit = 10

# Run the hashtag search and keep the raw JSON payload of every tweet.
raw_cursor = tweepy.Cursor(
    api.search_tweets,
    q=keywords,
    lang='en',
    count=100,
    tweet_mode='extended',
)
for status in raw_cursor.items(limit):
    data.append(status._json)

In [37]:
# Keep a trimmed view of every English tweet alongside its raw payload.
data1 = []
for i in data:
    if i["lang"] == "en":
        test = {
            "id": i["id"],
            "lang": i["lang"],
            "created_at": i["created_at"],
            "text": i["full_text"],
            "retweet_count": i["retweet_count"],
            "raw_data": i
        }
        # Append inside the language check: outside it, a non-English tweet
        # would re-append the previous entry (or raise NameError if it came first).
        data1.append(test)

## References

[Twitter V2 Full Archive Search](https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-all)