# TWEEPY SCRAPING

## Imports

In [86]:
import tweepy
import pandas as pd
import time
import os
from dotenv import load_dotenv
load_dotenv()
import jsonpickle
import json


## API Connection

**Keys:**

In [87]:
# Twitter API credentials are read from the environment (populated from a
# local .env file by load_dotenv() in the imports cell). A variable that is
# not set yields None.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ.get(variable_name)
    for variable_name in (
        'CONSUMER_KEY',
        'CONSUMER_SECRET',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

**Setup:**

Application-only auth is used here instead of access-token (user) auth because it raises the search rate limit above the 18K tweets per 15 minutes that access-token auth allows.

In [88]:
# Access-token (user) authentication, kept for reference only. It is disabled
# in favour of the application-only auth set up in the next cell.
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)
# api = tweepy.API(auth,wait_on_rate_limit=True)

In [89]:
# Application-only authentication: only the consumer key/secret are needed.
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)

# Shared client used by the scraping functions below. With these options the
# client waits when the rate limit is hit and prints a notice while waiting.
api = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

## Scraping functions

**Save tweets to .txt:**

Reference code can be found in this [link](https://bhaskarvk.github.io/2015/01/how-to-use-twitters-search-rest-api-most-effectively./#:~:text=Application%20only%20auth%20has%20higher,Auth%20using%20the%20Tweepy%20API.)

In [90]:
def save_tweets_to_txt(searchQuery,maxTweets):
    """Download up to ``maxTweets`` tweets matching ``searchQuery`` into a text file.

    Pages backwards through the search results with the module-level ``api``
    client and writes one JSON document per line to
    ``'<searchQuery>-tweets.txt'`` in the current working directory,
    overwriting any existing file of that name.

    Args:
        searchQuery: Search query string; also used verbatim in the file name.
        maxTweets: Upper bound on the number of tweets to save.

    Returns:
        None. Progress and the final count are printed.

    A ``tweepy.TweepError`` stops the download: the error is printed and the
    tweets written so far are kept.
    """
    fName = '{0}-tweets.txt'.format(searchQuery)
    page_size = 100   # largest page size this notebook requests per call
    max_id = None     # stays None until the first page has been read
    tweetCount = 0

    print("Downloading max {0} tweets".format(maxTweets))

    # Explicit encoding so the file does not depend on the platform default.
    with open(fName, 'w', encoding='utf-8') as f:
        while tweetCount < maxTweets:
            # Ask only for what is still missing so the total never exceeds
            # maxTweets (a fixed page of 100 would overshoot, e.g. 200 for 150).
            params = {'q': searchQuery,
                      'count': min(page_size, maxTweets - tweetCount)}
            if max_id is not None:
                # Continue strictly below the oldest tweet already saved.
                params['max_id'] = str(max_id - 1)

            try:
                new_tweets = api.search(**params)
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break

            if not new_tweets:
                print("No more tweets found")
                break

            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id

    print ("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


Run the following code to load the saved .txt file into a list of JSON objects (one per line):

In [91]:
# Template: set file_name to a '<query>-tweets.txt' file written by
# save_tweets_to_txt (one JSON document per line), then uncomment to load
# every line into a list of dicts.
# file_name=''

# with open(file_name) as file:
#     status = [json.loads(line) for line in file]

**Save tweets to DataFrame:**

In [92]:
def _format_geocode(geocode):
    """Return a 'latitude,longitude,radius' search string, or None for no filter."""
    if not geocode:
        return None
    if isinstance(geocode, str):
        return geocode.strip() or None
    parts = [str(part).strip() for part in geocode]
    if not any(parts):
        # Every field left blank: the user did not ask for a geographic filter.
        return None
    if len(parts) != 3 or not all(parts):
        print("Ignoring incomplete geocode: {0}".format(geocode))
        return None
    return ','.join(parts)


def tweets_to_df(searchQuery,maxTweets,language='',geocode=None):
    """Download up to ``maxTweets`` tweets matching ``searchQuery`` into a DataFrame.

    Pages backwards through the search results with the module-level ``api``
    client, requesting extended tweets (``tweet_mode='extended'``).

    Args:
        searchQuery: Search query string.
        maxTweets: Upper bound on the number of tweets to download.
        language: Language code passed as ``lang``; an empty value means no
            language filter.
        geocode: Optional ``[latitude, longitude, radius]`` sequence (or a
            ready-made ``'lat,long,radius'`` string). It is sent to the search
            only when all three parts are non-empty. The Twitter search API
            expects the radius to carry a unit, e.g. ``'10km'`` or ``'5mi'``.

    Returns:
        pandas.DataFrame with one row per tweet, built from each tweet's raw
        JSON payload. Empty if nothing was downloaded.

    A ``tweepy.TweepError`` stops the download: the error is printed and the
    tweets collected so far are returned.
    """
    # Parameters shared by every page request; optional filters are only
    # included when the caller actually supplied them.
    params = {'q': searchQuery, 'tweet_mode': 'extended'}
    if language:
        params['lang'] = language
    geocode_filter = _format_geocode(geocode)
    if geocode_filter:
        params['geocode'] = geocode_filter

    page_size = 100   # largest page size this notebook requests per call
    max_id = None     # stays None until the first page has been read
    tweets = []

    print("Downloading max {0} tweets".format(maxTweets))

    while len(tweets) < maxTweets:
        # Ask only for what is still missing so the total never exceeds maxTweets.
        params['count'] = min(page_size, maxTweets - len(tweets))
        if max_id is not None:
            # Continue strictly below the oldest tweet already collected.
            params['max_id'] = str(max_id - 1)

        try:
            new_tweets = api.search(**params)
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        tweets.extend(tweet._json for tweet in new_tweets)
        print("Downloaded {0} tweets".format(len(tweets)))
        max_id = new_tweets[-1].id

    print ("Downloaded {0} tweets".format(len(tweets)))
    return pd.DataFrame(tweets)


## Scraping & saving to CSV 🙌🏼

In [95]:
# Interactive search parameters. Language and the three geocode fields may be
# left blank.
max_tweets = int(input('Set maximun tweets: '))
query = input('Choose query: ')
language = input('Choose language or leave empty: ')
# Prompts are asked in order: latitude, longitude, radius.
geocode = [input(prompt) for prompt in ('latitude: ', 'longitude: ', 'radius: ')]

Set maximun tweets: 2000
Choose query: todes
Choose language or leave empty: es
latitude: 
longitude: 
radius: 


In [96]:
# Download the tweets for the parameters chosen above into a DataFrame.
tweets = tweets_to_df(
    searchQuery=query,
    maxTweets=max_tweets,
    language=language,
    geocode=geocode,
)

Downloading max 2000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 400 tweets
Downloaded 500 tweets
Downloaded 600 tweets
Downloaded 700 tweets
Downloaded 800 tweets
Downloaded 900 tweets
Downloaded 1000 tweets
Downloaded 1100 tweets
Downloaded 1200 tweets
Downloaded 1300 tweets
Downloaded 1400 tweets
Downloaded 1500 tweets
Downloaded 1600 tweets
Downloaded 1700 tweets
Downloaded 1800 tweets
Downloaded 1900 tweets
Downloaded 2000 tweets
Downloaded 2000 tweets


**Change truncated tweets to full_text**

In [97]:
# For retweets, replace the top-level text with the full text of the original
# tweet stored in the nested 'retweeted_status' payload. That column only
# exists when at least one downloaded tweet is a retweet, so check for it
# instead of raising KeyError on a result set without retweets (or an empty one).
if 'retweeted_status' in tweets.columns:
    tweets['full_text'] = [
        original['full_text'] if isinstance(original, dict) else text
        for text, original in zip(tweets['full_text'], tweets['retweeted_status'])
    ]

In [100]:
# Save as '<query>.csv', collapsing runs of whitespace in the query to single
# spaces; the DataFrame index is not written.
tweets.to_csv('{0}.csv'.format(' '.join(query.split())), index=False)