# TWEEPY SCRAPING

## Imports

In [1]:
import tweepy
import pandas as pd
import time
import os
from dotenv import load_dotenv
load_dotenv()
import jsonpickle
import json

## API Connection

**Keys:**

In [2]:
# Twitter credentials come from the environment (populated by load_dotenv());
# any variable that is not set yields None.
consumer_key, consumer_secret, access_token, access_token_secret = (
    os.environ.get(env_name)
    for env_name in (
        'CONSUMER_KEY',
        'CONSUMER_SECRET',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

**Setup:**

Application-only auth is used here instead of access-token (user) auth because it allows searching at a rate greater than 18K tweets per 15 minutes.

In [3]:
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)
# api = tweepy.API(auth,wait_on_rate_limit=True)

In [4]:
# Application-only auth: only the consumer key/secret pair is passed here;
# access_token / access_token_secret are loaded earlier but not used by this handler.
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
# NOTE(review): per the tweepy 3.x docs, wait_on_rate_limit makes the client wait
# for the rate limit to reset instead of failing, and wait_on_rate_limit_notify
# prints a notice while it waits — confirm against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

## Scraping functions

**Save tweets to .txt:**

Reference code can be found in this [link](https://bhaskarvk.github.io/2015/01/how-to-use-twitters-search-rest-api-most-effectively./#:~:text=Application%20only%20auth%20has%20higher,Auth%20using%20the%20Tweepy%20API.)

In [5]:
def save_tweets_to_txt(searchQuery,maxTweets):
    """Download up to ``maxTweets`` search results and write them to a text file.

    The raw JSON payload of every tweet is written as one line of
    ``'<searchQuery>-tweets.txt'`` in the current working directory. An
    existing file with that name is overwritten.

    Results are paged from newest to oldest: after each page the next request
    asks only for tweets with an id lower than the last one received.

    Args:
        searchQuery: Query string passed to ``api.search`` as ``q``; also used
            to build the output file name.
        maxTweets: Upper bound (integer) on the number of tweets to save. The
            last page is shortened so that no more than this many are written.

    Returns:
        None. Progress and the final count are printed.
    """
    fName = '{0}-tweets.txt'.format(searchQuery)
    page_size = 100  # largest page requested per call
    max_id = None    # id of the oldest tweet saved so far; None before the first page
    tweetCount = 0

    print("Downloading max {0} tweets".format(maxTweets))

    # Explicit encoding so the output does not depend on the platform default.
    with open(fName, 'w', encoding='utf-8') as f:
        while tweetCount < maxTweets:
            remaining = maxTweets - tweetCount
            params = {'q': searchQuery, 'count': min(page_size, remaining)}
            if max_id is not None:
                # max_id is inclusive, so subtract one to avoid re-fetching
                # the oldest tweet of the previous page.
                params['max_id'] = str(max_id - 1)

            try:
                new_tweets = api.search(**params)
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break

            if not new_tweets:
                print("No more tweets found")
                break

            # Never keep more than was asked for, even if the page is larger.
            new_tweets = new_tweets[:remaining]
            for tweet in new_tweets:
                f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
            tweetCount += len(new_tweets)
            print("Downloaded {0} tweets".format(tweetCount))
            max_id = new_tweets[-1].id

    print("Downloaded {0} tweets, Saved to {1}".format(tweetCount, fName))


Run the following code to load the saved .txt file into a list of JSON objects (one per tweet):

In [6]:
# file_name=''

# with open(file_name) as file:
#     status = [json.loads(line) for line in file]

**Save tweets to DataFrame:**

In [7]:
def tweets_to_df(searchQuery,maxTweets,language='',geocode=None):
    """Download up to ``maxTweets`` search results into a DataFrame.

    Tweets are requested in extended mode and paged from newest to oldest:
    after each page the next request asks only for tweets with an id lower
    than the last one received.

    Args:
        searchQuery: Query string passed to ``api.search`` as ``q``.
        maxTweets: Upper bound (integer) on the number of tweets returned. The
            last page is shortened so that no more than this many are kept.
        language: Language code forwarded as ``lang``. An empty value means
            "no language filter" and the parameter is not sent at all.
        geocode: Optional location filter, either a ready-made
            ``"latitude,longitude,radius"`` string or a sequence of those
            three parts. It is applied only when all three parts are
            non-empty; otherwise no location filter is sent.

    Returns:
        pandas.DataFrame with one row per tweet, built from each tweet's raw
        JSON payload. Empty if nothing was downloaded. If the API raises
        ``tweepy.TweepError`` the tweets collected so far are returned.
    """
    page_size = 100  # largest page requested per call

    # Parameters that stay the same for every page.
    base_params = {'q': searchQuery, 'tweet_mode': 'extended'}
    if language:
        base_params['lang'] = language

    if isinstance(geocode, str):
        geo_filter = geocode.strip()
    else:
        geo_parts = ['' if part is None else str(part).strip()
                     for part in (geocode or [])]
        complete = len(geo_parts) == 3 and all(geo_parts)
        geo_filter = ','.join(geo_parts) if complete else ''
    if geo_filter:
        base_params['geocode'] = geo_filter

    # No upper id limit at first: start from the most recent matching tweet.
    max_id = None
    tweetCount = 0

    print("Downloading max {0} tweets".format(maxTweets))

    tweets = []

    while tweetCount < maxTweets:
        remaining = maxTweets - tweetCount
        params = dict(base_params, count=min(page_size, remaining))
        if max_id is not None:
            # max_id is inclusive, so subtract one to avoid re-fetching the
            # oldest tweet of the previous page.
            params['max_id'] = str(max_id - 1)

        try:
            new_tweets = api.search(**params)
        except tweepy.TweepError as e:
            # Just exit if any error
            print("some error : " + str(e))
            break

        if not new_tweets:
            print("No more tweets found")
            break

        # Never keep more than was asked for, even if the page is larger.
        new_tweets = new_tweets[:remaining]
        tweets.extend(tweet._json for tweet in new_tweets)
        tweetCount += len(new_tweets)
        print("Downloaded {0} tweets".format(tweetCount))
        max_id = new_tweets[-1].id

    print("Downloaded {0} tweets".format(tweetCount))
    return pd.DataFrame(tweets)


## Scraping & saving to CSV 🙌🏼

In [9]:
# Collect the search parameters interactively; the prompts run in this order.
# int() raises ValueError if the answer is not a whole number.
max_tweets = int(input('Set maximun tweets: '))
query = input('Choose query: ')
language = input('Choose language or leave empty: ')
# Three raw strings [latitude, longitude, radius]; each is '' when its prompt is left blank.
geocode = [input('latitude: '), input('longitude: '),input('radius: ')]

Set maximun tweets: 1000
Choose query: todes
Choose language or leave empty: es
latitude: 
longitude: 
radius: 


In [10]:
tweets = tweets_to_df(query,max_tweets,language,geocode)

Downloading max 1000 tweets
Downloaded 100 tweets
Downloaded 200 tweets
Downloaded 300 tweets
Downloaded 400 tweets
Downloaded 500 tweets
Downloaded 600 tweets
Downloaded 700 tweets
Downloaded 800 tweets
Downloaded 900 tweets
Downloaded 1000 tweets
Downloaded 1000 tweets


**Change truncated tweets to full_text**

In [11]:
def get_full_text(df):
    """Replace the text of retweets with the full text of the original tweet.

    For every row whose ``retweeted_status`` holds a tweet payload (a dict),
    ``full_text`` is overwritten with that payload's ``full_text``. Rows that
    are not retweets, or whose payload has no ``full_text``, keep their own
    text.

    Args:
        df: DataFrame with a ``full_text`` column and, optionally, a
            ``retweeted_status`` column.

    Returns:
        The same DataFrame object; ``full_text`` is updated in place. When
        there is no ``retweeted_status`` column (no retweets in the result)
        the frame is returned untouched.
    """
    if 'retweeted_status' not in df.columns:
        return df

    # Built as a plain list so that no scratch column is added to (or dropped
    # from) the caller's frame, and so that an empty frame is handled too.
    df['full_text'] = [
        original.get('full_text', own_text) if isinstance(original, dict) else own_text
        for own_text, original in zip(df['full_text'], df['retweeted_status'])
    ]
    return df
    

In [12]:
tweets_df = get_full_text(tweets)

In [13]:
# tweets_df.to_csv(' '.join(query.split())+'.csv', index=False)

In [14]:
tweets_df.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,metadata,source,in_reply_to_status_id,...,retweet_count,favorite_count,favorited,retweeted,lang,quoted_status_id,quoted_status_id_str,quoted_status,possibly_sensitive,extended_entities
0,Fri Jul 24 18:13:28 +0000 2020,1286726205593411584,1286726205593411584,alch qué bueno que el sein se salió de 1d y no...,False,"[0, 132]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'es', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,...,41,0,False,False,es,,,,,
1,Fri Jul 24 18:13:27 +0000 2020,1286726200254255106,1286726200254255106,"La Real Academia Española ratificó el no al ""t...",False,"[0, 92]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'es', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,...,4119,0,False,False,es,,,,,
2,Fri Jul 24 18:13:11 +0000 2020,1286726136291110912,1286726136291110912,"@PJudicialChile Que no sea una excepcion, que ...",False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","{'iso_language_code': 'es', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,...,8,0,False,False,es,,,,,
3,Fri Jul 24 18:13:09 +0000 2020,1286726126497398786,1286726126497398786,La presión de la ola feminista arrasó con la m...,False,"[0, 265]","{'hashtags': [{'text': 'SomosMás', 'indices': ...","{'iso_language_code': 'es', 'result_type': 're...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,...,1,1,False,False,es,1.286716e+18,1.2867160513849467e+18,{'created_at': 'Fri Jul 24 17:33:07 +0000 2020...,False,
4,Fri Jul 24 18:12:59 +0000 2020,1286726084764078082,1286726084764078082,"La Real Academia Española ratificó el no al ""t...",False,"[0, 208]","{'hashtags': [{'text': 'PAZ', 'indices': [189,...","{'iso_language_code': 'es', 'result_type': 're...","<a href=""http://twitter.com/download/android"" ...",,...,0,1,False,False,es,,,,,


### TODO

1. Top words


2. Most retweeted tweets

In [80]:
def get_top_tweets(df,top_nb):
    """Return the texts of the most retweeted tweets, without repeats.

    Note: a later cell in this notebook defines ``get_top_tweets`` again;
    once that cell runs, it replaces this version.

    Args:
        df: DataFrame with ``retweet_count`` and ``full_text`` columns.
        top_nb: Maximum number of texts to return.

    Returns:
        List of distinct ``full_text`` values ordered by descending
        ``retweet_count``, at most ``top_nb`` long.
    """
    ranked_texts = df.sort_values(by='retweet_count', ascending=False)['full_text']

    top = []
    for text in ranked_texts:
        if len(top) >= top_nb:
            # The list is full; nothing further down can change the result.
            break
        if text not in top:
            top.append(text)
    return top

In [81]:
get_top_tweets(tweets_df,5)

['La Real Academia Española ratificó el no al "todes". Una estupidez menos',
 'No al "todes": la Real Academia Española le puso un freno al lenguaje inclusivo - https://t.co/013hGKLAhx',
 'Tengo una tarea para todes! Me ayudan a difundir? Este domingo hay un altísimo festival latinoamericano gratuito "Somos antídoto" organizado por #EFAC Espacio de la Fraternidad Argentino Cubana. Ah y es gratuito. A ver esos RT! https://t.co/T9xAcCFDiT',
 '"Cuando decimos todes no hacemos el ridículo, le hablamos a les que antes no le hablaban".\nLo dijo @alferdez hace una semana.\nSí, chiques, dijo TODES en cadena nacional.\nPorque dice lo que hace.',
 '"Les animales son personas porque tienen personalidad" todes.\n😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂 me parto😭😭\nCasi me muero. https://t.co/jXu8Vglmno']

In [107]:
def _summarise_tweet(row):
    """Build the summary dict for one tweet row (a pandas Series)."""
    original = row.get('retweeted_status')
    if isinstance(original, dict):
        # Retweet: credit the author and likes of the original tweet.
        author = original.get('user')
        favorites = original.get('favorite_count')
    else:
        # Not a retweet (value is missing/NaN): fall back to the row itself.
        author = row.get('user')
        favorites = row.get('favorite_count')
    if not isinstance(author, dict):
        author = {}
    return {"text": row['full_text'],
            "name": author.get('name'),
            "screen_name": author.get('screen_name'),
            "favorite_count": favorites,
            "retweeted_count": row['retweet_count']}


def get_top_tweets(df,top_nb):
    """Return a summary of the most retweeted tweets, without repeated texts.

    This definition replaces the earlier ``get_top_tweets`` cell of this
    notebook, which returned only the texts.

    Args:
        df: DataFrame with ``retweet_count`` and ``full_text`` columns.
            ``retweeted_status`` is used when it holds the original tweet's
            payload; otherwise the row's own ``user`` and ``favorite_count``
            columns are used if present.
        top_nb: Maximum number of tweets to return.

    Returns:
        List of dicts, ordered by descending ``retweet_count``, with the keys
        ``text``, ``name``, ``screen_name``, ``favorite_count`` and
        ``retweeted_count``. Author fields are ``None`` when unavailable.
    """
    ranked = df.sort_values(by='retweet_count', ascending=False)

    seen_texts = set()
    top_rows = []
    for _, row in ranked.iterrows():
        if len(top_rows) >= top_nb:
            break
        text = row['full_text']
        if text in seen_texts:
            # Several retweets of the same original share one text.
            continue
        seen_texts.add(text)
        top_rows.append(row)

    return [_summarise_tweet(row) for row in top_rows]

In [108]:
get_top_tweets(tweets_df,5)

[{'text': 'La Real Academia Española ratificó el no al "todes". Una estupidez menos',
  'name': 'Guillermo Salatino',
  'screen_name': 'GuilleSalatino',
  'favorite_count': 15884,
  'retweeted_count': 4119},
 {'text': 'No al "todes": la Real Academia Española le puso un freno al lenguaje inclusivo - https://t.co/013hGKLAhx',
  'name': 'Estela Tramontini',
  'screen_name': 'EstelaTramonti2',
  'favorite_count': 1422,
  'retweeted_count': 847},
 {'text': 'Tengo una tarea para todes! Me ayudan a difundir? Este domingo hay un altísimo festival latinoamericano gratuito "Somos antídoto" organizado por #EFAC Espacio de la Fraternidad Argentino Cubana. Ah y es gratuito. A ver esos RT! https://t.co/T9xAcCFDiT',
  'name': 'gaby delelisi',
  'screen_name': 'gabydelelisi',
  'favorite_count': 844,
  'retweeted_count': 836},
 {'text': '"Cuando decimos todes no hacemos el ridículo, le hablamos a les que antes no le hablaban".\nLo dijo @alferdez hace una semana.\nSí, chiques, dijo TODES en cadena nac