# TWEEPY SCRAPING

## Imports

In [14]:
import tweepy
import pandas as pd
import time
import os
from dotenv import load_dotenv
load_dotenv()
import jsonpickle
import json

## API Connection

**Keys:**

In [15]:
# Twitter API credentials, read from the process environment (load_dotenv() was
# called in the imports cell). A variable that is not set comes back as None.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ.get(variable)
    for variable in (
        'CONSUMER_KEY',
        'CONSUMER_SECRET',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

**Setup:**

Application-only auth is used instead of access-token auth because it allows searching at a rate greater than 18K tweets per 15 minutes.

In [16]:
# Alternative: access-token authentication, kept here for reference only.
# It is left commented out because the notebook uses application-only auth instead.
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)
# api = tweepy.API(auth,wait_on_rate_limit=True)

In [17]:
# Application-only authentication: only the consumer key/secret are needed.
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)

# Shared API client used by the scraping functions below. When the rate limit
# is hit it sleeps until the window resets (and prints a notice) instead of failing.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

## Scraping functions

**Save tweets to .txt:**

Reference code can be found in this [link](https://bhaskarvk.github.io/2015/01/how-to-use-twitters-search-rest-api-most-effectively./#:~:text=Application%20only%20auth%20has%20higher,Auth%20using%20the%20Tweepy%20API.)

In [18]:
def save_tweets_to_txt(searchQuery,maxTweets):
    """Download tweets matching a query and save them to a text file.

    Results are fetched page by page through the notebook-level ``api`` client.
    After each page the next request is limited to tweets with an id lower than
    the last tweet received, so the search walks backwards through the results.
    Every tweet is written as one JSON document per line.

    The loop ends when ``maxTweets`` tweets have been written, when a page comes
    back empty, or when tweepy raises a ``TweepError`` (the error is printed and
    the download stops; it is not re-raised).

    Args:
        searchQuery: Query string sent to the search endpoint. It is also used
            to build the output file name ``'<searchQuery>-tweets.txt'``, which
            is created (or overwritten) relative to the current directory.
        maxTweets: Upper bound on the number of tweets to save. The last page
            is requested with a smaller ``count`` so this bound is not exceeded.

    Returns:
        None. Progress and the final tally are printed.
    """
    page_size = 100  # maximum `count` accepted by the standard search endpoint
    fName = '{0}-tweets.txt'.format(searchQuery)
    sinceId = None  # set to a tweet id to only fetch tweets newer than that id
    max_id = -1  # no upper bound yet: the first request starts from the newest tweet
    tweetCount = 0

    print("Downloading max {0} tweets".format(maxTweets))

    with open(fName, 'w', encoding='utf-8') as f:
        while tweetCount < maxTweets:
            # Build the request once instead of repeating the call per combination
            # of optional bounds; never ask for more tweets than are still wanted.
            search_params = {
                'q': searchQuery,
                'count': min(page_size, maxTweets - tweetCount),
            }
            if sinceId:
                search_params['since_id'] = sinceId
            if max_id > 0:
                search_params['max_id'] = str(max_id - 1)

            try:
                new_tweets = api.search(**search_params)
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break

            if not new_tweets:
                print("No more tweets found")
                break

            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) +'\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            # The next page must end just below the last tweet of this page.
            max_id = new_tweets[-1].id

    print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


Run the following code to load the saved .txt file into a list of JSON objects:

In [19]:
# Template (left commented out): set file_name to a '<query>-tweets.txt' file
# written by save_tweets_to_txt. Each line of that file holds one tweet as JSON,
# so the list comprehension parses the file into one object per tweet.
# file_name=''

# with open(file_name) as file:
#     status = [json.loads(line) for line in file]

**Save tweets to DataFrame:**

In [20]:
def tweets_to_df(searchQuery,maxTweets,language='',geocode=None):
    """Download tweets matching a query and return them as a DataFrame.

    Results are fetched page by page through the notebook-level ``api`` client
    with ``tweet_mode='extended'``. After each page the next request is limited
    to tweets with an id lower than the last tweet received, so the search walks
    backwards through the results. The loop ends when ``maxTweets`` tweets have
    been collected, when a page comes back empty, or when tweepy raises a
    ``TweepError`` (the error is printed and whatever was collected so far is
    returned).

    Args:
        searchQuery: Query string sent to the search endpoint.
        maxTweets: Upper bound on the number of tweets to collect. The last page
            is requested with a smaller ``count`` so this bound is not exceeded.
        language: Language code sent as ``lang``. An empty value means no
            language filter is sent.
        geocode: Optional location filter, either a ``(latitude, longitude,
            radius)`` sequence or an already formatted ``"lat,long,radius"``
            string. ``None``, an empty sequence, or all-blank parts mean no
            location filter. The values are forwarded as given; the Twitter
            search API expects the radius to carry its unit (e.g. ``"10km"``).

    Returns:
        pandas.DataFrame with one row per tweet, built from each tweet's raw
        JSON payload. Empty if nothing was downloaded.

    Raises:
        ValueError: If ``geocode`` is only partially filled in (it must contain
            exactly three non-blank parts or none at all).
    """
    page_size = 100  # maximum `count` accepted by the standard search endpoint

    # If results from a specific ID onwards are reqd, set since_id to that ID.
    # else default to no lower limit, go as far back as API allows
    sinceId = None

    # If results only below a specific ID are reqd, set max_id to that ID.
    # else default to no upper limit, start from the most recent tweet matching the search query.
    max_id = -1
    tweetCount = 0

    # Parameters that stay the same for every page.
    base_params = {'q': searchQuery, 'tweet_mode': 'extended'}
    if language:
        base_params['lang'] = language

    # Normalise the location filter; validate before any request is made.
    if isinstance(geocode, str):
        geo_parts = geocode.split(',')
    else:
        geo_parts = list(geocode or [])
    geo_parts = ['' if part is None else str(part).strip() for part in geo_parts]
    if any(geo_parts):
        if len(geo_parts) != 3 or not all(geo_parts):
            raise ValueError(
                "geocode needs latitude, longitude and radius, got {0!r}".format(geocode)
            )
        base_params['geocode'] = ','.join(geo_parts)

    print("Downloading max {0} tweets".format(maxTweets))

    tweets=[]

    while tweetCount < maxTweets:
        # Never ask for more tweets than are still wanted.
        search_params = dict(base_params, count=min(page_size, maxTweets - tweetCount))
        if sinceId:
            search_params['since_id'] = sinceId
        if max_id > 0:
            search_params['max_id'] = str(max_id - 1)

        try:
            new_tweets = api.search(**search_params)
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        tweets.extend(tweet._json for tweet in new_tweets)
        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        # The next page must end just below the last tweet of this page.
        max_id = new_tweets[-1].id

    print ("Downloaded {0} tweets".format(tweetCount))
    return pd.DataFrame(tweets)


## Scraping & saving to CSV 🙌🏼

In [None]:
# Ask for the search settings interactively.
max_tweets = int(input('Set maximun tweets: '))  # must be a whole number, otherwise int() raises
query = input('Choose query: ')
language = input('Choose language or leave empty: ')

# Location filter parts, prompted in the order latitude, longitude, radius.
geocode = [input(prompt) for prompt in ('latitude: ', 'longitude: ', 'radius: ')]

In [25]:
# Run the download with the settings collected above; the result is one row per tweet.
tweets = tweets_to_df(
    searchQuery=query,
    maxTweets=max_tweets,
    language=language,
    geocode=geocode,
)

Downloading max 50000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 400 tweets
Downloaded 500 tweets
Downloaded 600 tweets
Downloaded 700 tweets
Downloaded 800 tweets
Downloaded 900 tweets
Downloaded 1000 tweets
Downloaded 1100 tweets
Downloaded 1200 tweets
Downloaded 1300 tweets
Downloaded 1400 tweets
Downloaded 1500 tweets
Downloaded 1600 tweets
Downloaded 1700 tweets
Downloaded 1800 tweets
Downloaded 1900 tweets
Downloaded 2000 tweets
Downloaded 2100 tweets
Downloaded 2200 tweets
Downloaded 2300 tweets
Downloaded 2400 tweets
Downloaded 2500 tweets
Downloaded 2600 tweets
Downloaded 2700 tweets
Downloaded 2800 tweets
Downloaded 2900 tweets
Downloaded 3000 tweets
Downloaded 3100 tweets
Downloaded 3200 tweets
Downloaded 3300 tweets
Downloaded 3400 tweets
Downloaded 3500 tweets
Downloaded 3600 tweets
Downloaded 3700 tweets
Downloaded 3800 tweets
Downloaded 3900 tweets
Downloaded 4000 tweets
Downloaded 4100 tweets
Downloaded 4200 tweets
Downloaded 430

Downloaded 34536 tweets
Downloaded 34636 tweets
Downloaded 34736 tweets
Downloaded 34836 tweets
Downloaded 34936 tweets
Downloaded 35036 tweets
Downloaded 35136 tweets
Downloaded 35236 tweets
Downloaded 35336 tweets
Downloaded 35436 tweets
Downloaded 35536 tweets
Downloaded 35636 tweets
Downloaded 35736 tweets
Downloaded 35836 tweets
Downloaded 35936 tweets
Downloaded 36036 tweets
Downloaded 36136 tweets
Downloaded 36236 tweets
Downloaded 36336 tweets
Downloaded 36436 tweets
Downloaded 36536 tweets
Downloaded 36635 tweets
Downloaded 36735 tweets
Downloaded 36835 tweets
Downloaded 36935 tweets
Downloaded 37035 tweets
Downloaded 37135 tweets
Downloaded 37235 tweets
Downloaded 37335 tweets
Downloaded 37435 tweets
Downloaded 37533 tweets
Downloaded 37633 tweets
Downloaded 37733 tweets
Downloaded 37830 tweets
Downloaded 37930 tweets
Downloaded 38030 tweets
Downloaded 38130 tweets
Downloaded 38227 tweets
Downloaded 38327 tweets
Downloaded 38427 tweets
Downloaded 38527 tweets
Downloaded 38627

Rate limit reached. Sleeping for: 485


Downloaded 45027 tweets
Downloaded 45127 tweets
Downloaded 45227 tweets
Downloaded 45327 tweets
Downloaded 45427 tweets
Downloaded 45527 tweets
Downloaded 45627 tweets
Downloaded 45727 tweets
Downloaded 45827 tweets
Downloaded 45927 tweets
Downloaded 46027 tweets
Downloaded 46116 tweets
Downloaded 46216 tweets
Downloaded 46316 tweets
Downloaded 46416 tweets
Downloaded 46516 tweets
Downloaded 46616 tweets
Downloaded 46716 tweets
Downloaded 46816 tweets
Downloaded 46916 tweets
Downloaded 47016 tweets
Downloaded 47116 tweets
Downloaded 47216 tweets
Downloaded 47316 tweets
Downloaded 47416 tweets
Downloaded 47516 tweets
Downloaded 47616 tweets
Downloaded 47716 tweets
Downloaded 47816 tweets
Downloaded 47916 tweets
Downloaded 48016 tweets
Downloaded 48116 tweets
Downloaded 48216 tweets
Downloaded 48316 tweets
Downloaded 48416 tweets
Downloaded 48516 tweets
Downloaded 48616 tweets
Downloaded 48716 tweets
Downloaded 48816 tweets
Downloaded 48916 tweets
Downloaded 49016 tweets
Downloaded 49116

**Change truncated tweets to full_text**

In [26]:
# For retweets, replace ``full_text`` with the text stored in the nested
# ``retweeted_status`` payload; all other rows keep their own ``full_text``.
# The ``retweeted_status`` column is absent when no downloaded tweet carried that
# key, so guard against it instead of failing with a KeyError.
if 'retweeted_status' in tweets.columns:
    tweets['full_text'] = [
        retweet['full_text'] if isinstance(retweet, dict) else text
        for text, retweet in zip(tweets['full_text'], tweets['retweeted_status'])
    ]

In [27]:
# Optional manual check (left commented out): print every tweet text with a
# 1.5 second pause between tweets so the output can be read as it scrolls.
# import time
# for tweet in tweets.full_text:
#     print(tweet)
#     time.sleep(1.5)

In [28]:
# Save the tweets as '<query>.csv' (runs of whitespace in the query collapsed to
# single spaces), without writing the DataFrame index.
tweets.to_csv('{0}.csv'.format(' '.join(query.split())), index=False)