In [None]:
import tweepy
import configparser
import csv
import json
import time
import os
import numpy as np
import pandas as pd

In [None]:
# Twitter authentication via provided credentials
# interpolation=None disables configparser value interpolation, so '%' characters
# inside the keys are read literally.
config = configparser.ConfigParser(interpolation=None)
# Open the credentials file in a context manager so the handle is closed as
# soon as parsing is done instead of being leaked.
with open('./twitter_credentials.cfg') as credentials_file:
    config.read_file(credentials_file)

consumer_key = config.get('TWITTER','API_KEY')
consumer_secret = config.get('TWITTER','API_SECRET_KEY')

In [None]:
# Authenticate with the consumer key/secret pair read from the credentials file
auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
api = tweepy.API(auth, wait_on_rate_limit=True)

# Path to store collected tweet and user data
path = './twitter_data/'
# The CSV helpers in this notebook open files under `path` in append mode,
# which raises FileNotFoundError if the directory is missing, so create it here.
os.makedirs(path, exist_ok=True)

In [None]:
# Helper function to handle twitter API rate limit
def limit_handled(cursor, max_consecutive_errors=3):
    """Yield items from a tweepy cursor iterator while tolerating API errors.

    Parameters
    ----------
    cursor : iterator exposing ``next()``
        Typically the result of ``tweepy.Cursor(...).items()`` or ``.pages()``.
    max_consecutive_errors : int, optional
        Number of back-to-back non-rate-limit ``TweepError`` failures after
        which iteration stops. Without this bound a persistent error would be
        retried forever.

    Yields
    ------
    The items produced by ``cursor.next()``.

    Notes
    -----
    * On ``tweepy.RateLimitError`` the error is printed and the generator
      sleeps 15 minutes before retrying.
    * Any other ``TweepError`` is printed and the call is retried, up to
      ``max_consecutive_errors`` times in a row; then the generator ends and
      the caller keeps whatever was yielded so far.
    """
    consecutive_errors = 0
    while True:
        try:
            item = cursor.next()
        except StopIteration:
            # Cursor exhausted: end the generator normally.
            return
        except tweepy.RateLimitError as rle:
            print(rle)
            time.sleep(15 * 60) # 15 minutes
            continue
        # Catch any other Twitter API exceptions
        except tweepy.error.TweepError as te:
            print(te)
            consecutive_errors += 1
            if consecutive_errors >= max_consecutive_errors:
                # The error is not going away; stop instead of looping forever.
                return
            continue
        # A successful fetch resets the failure streak.
        consecutive_errors = 0
        # Yield outside the try block so exceptions raised by the consumer are
        # never mistaken for API errors.
        yield item

In [None]:
# Helper function to get data from a specified tweet and save it to a CSV file
# Assumption: Twitter authorization and tweepy initialization have been done
def get_tweet_data(tweet, path, trend_to_search):
    """Append one row describing ``tweet`` to ``{path}{trend_to_search}_tweets.csv``.

    Parameters
    ----------
    tweet : tweepy status object
        Must expose the attributes read below (``id_str``, ``text``,
        ``entities``, ``user`` ...).
    path : str
        Directory prefix for the output file; it is concatenated as-is, so it
        must end with a path separator.
    trend_to_search : str
        Trend name, used as the file-name prefix.

    Only the first entry of ``entities["user_mentions"]`` is recorded; when the
    tweet mentions nobody the two mention columns hold ``''`` and ``nan``.
    The header row is written only when the file is still empty.
    """
    # Read the mention from the `tweet` argument itself (not from a global),
    # so the function works for whatever status object it is given.
    mentions = tweet.entities.get("user_mentions") or []
    if mentions:
        mention_name = mentions[0]["screen_name"]
        mention_id = mentions[0]["id_str"]
    else:
        mention_name, mention_id = '', np.nan

    #transform the tweepy tweet into a 2D array that will populate the csv
    tweet_data = [[tweet.id_str, tweet.created_at, tweet.text, tweet.source, tweet.favorite_count,
                   tweet.in_reply_to_screen_name, tweet.in_reply_to_status_id_str, tweet.in_reply_to_user_id_str,
                   mention_name, mention_id,
                   tweet.retweet_count, tweet.lang, tweet.user.id_str, tweet.user.screen_name
                   ]]

    # write the csv
    # newline='' is what the csv module expects (avoids blank rows on Windows
    # and keeps embedded newlines intact); utf-8 so emoji and other non-ASCII
    # tweet text can always be written.
    with open(f'{path}{trend_to_search}_tweets.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not os.path.getsize(f.name):
            writer.writerow(["tweet_id","tweet_created_at","tweet_text","tweet_source","likes",
                             "in_reply_to_name","in_reply_to_status_id","in_reply_to_user_id",
                             "user_mentions_name", "user_mentions_id",
                             "retweet_cnt","tweet_lang", "user_id", "user_screen_name"
                            ])
        writer.writerows(tweet_data)

In [None]:
# Helper function to get info on retweeters of a tweet and save data to a CSV file
# Assumption: Twitter authorization and tweepy initialization have been done
def get_retweeter_data(status, path, trend_to_search):
    """Append the retweeters of ``status`` to ``{path}{trend_to_search}_retweeters.csv``.

    Nothing is done unless ``status`` has at least one retweet and is not
    itself a retweet (i.e. has no ``retweeted_status`` attribute).

    Parameters
    ----------
    status : tweepy status object
    path : str
        Directory prefix for the output file (concatenated as-is).
    trend_to_search : str
        Trend name, used as the file-name prefix.

    Uses the module-level ``api`` client. Each retweeter is resolved with a
    single ``api.get_user`` call; a retweeter whose lookup raises a
    ``TweepError`` is printed and skipped so the remaining ones are kept.
    """
    if status.retweet_count <= 0 or hasattr(status, 'retweeted_status'):
        return

    retweeters_data = []
    for retweeter_id in tweepy.Cursor(api.retweeters, id=status.id).items():
        try:
            # One lookup per retweeter: both id and screen name come from it.
            retweeter = api.get_user(retweeter_id)
        except tweepy.error.TweepError as te:
            print(te)
            continue
        retweeters_data.append((status.id, status.user.id_str, status.user.screen_name,
                                retweeter.id_str, retweeter.screen_name))

    # write the csv (newline='' as required by the csv module)
    with open(f'{path}{trend_to_search}_retweeters.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not os.path.getsize(f.name):
            writer.writerow(["tweet_id","orig_user_id", "orig_user_name", "retweeter_id", "retweeter_name"])
        writer.writerows(retweeters_data)

In [None]:
# Below function could be used to make lookup requests for ids 100 at a time leading to 18K lookups in each 15 min window
# Source: https://stackoverflow.com/questions/31000178/how-to-get-large-list-of-followers-tweepy
def get_usernames(userids):
    """Resolve a list of user ids to user objects via ``api.lookup_users``.

    The ids are sent in consecutive batches of 100. If a batch fails, the
    traceback is printed and the users collected so far are returned, so the
    result may be shorter than ``userids``.

    Parameters
    ----------
    userids : list
        User ids to look up (must support ``len`` and slicing).

    Returns
    -------
    list
        The user objects returned by ``api.lookup_users``, in batch order.
    """
    import traceback

    batch_size = 100
    users = []
    u_count = len(userids)
    print(u_count)
    try:
        # Stepping by batch_size never produces an empty trailing slice, so no
        # request is sent with zero ids (also when u_count is 0 or a multiple
        # of 100).
        for start in range(0, u_count, batch_size):
            users.extend(api.lookup_users(user_ids=userids[start:start + batch_size]))
    except Exception:
        # Best effort: report the failure and keep the partial result. Using
        # `Exception` rather than a bare except lets KeyboardInterrupt through.
        traceback.print_exc()
    return users

In [None]:
# Function to save followers of a specified user to a CSV file 
def get_followers(id_str, screen_name, trend_to_search, max_followers=5000):
    """Append the followers of ``screen_name`` to ``{path}{trend_to_search}_followers.csv``.

    Parameters
    ----------
    id_str : str
        Id of the user whose followers are collected (written to the CSV only).
    screen_name : str
        Screen name used for the ``api.followers_ids`` query.
    trend_to_search : str
        Trend name, used as the file-name prefix.
    max_followers : int, optional
        Upper bound on the number of follower ids fetched (default 5000).

    Reads the module-level ``api`` and ``path``. Returns without touching the
    file when no follower ids, or no resolved follower users, are available.
    """
    # Collect follower ids through the error-tolerant cursor wrapper
    cursor = tweepy.Cursor(api.followers_ids, screen_name=screen_name, count=5000).items(max_followers)
    followers_id_list = list(limit_handled(cursor))
    if not followers_id_list:
        return

    #Calling the function to get users from the list of follower ids
    followers_users = get_usernames(followers_id_list)
    if not followers_users:
        return

    # Extract the follower information
    followers_list = [(id_str, screen_name, follower.id_str, follower.screen_name) for follower in followers_users]

    # write the csv (newline='' as required by the csv module)
    with open(f'{path}{trend_to_search}_followers.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not os.path.getsize(f.name):
            writer.writerow(["orig_user_id", "orig_user_name", "follower_id", "follower_name"])
        writer.writerows(followers_list)

In [None]:
# Function to save friends of a specified user to a CSV file
def get_friends(id_str, screen_name, trend_to_search, max_friends=5000):
    """Append the friends of ``screen_name`` to ``{path}{trend_to_search}_friends.csv``.

    Parameters
    ----------
    id_str : str
        Id of the user whose friends are collected (written to the CSV only).
    screen_name : str
        Screen name used for the ``api.friends_ids`` query.
    trend_to_search : str
        Trend name, used as the file-name prefix.
    max_friends : int, optional
        Upper bound on the number of friend ids fetched (default 5000).

    Reads the module-level ``api`` and ``path``. Returns without touching the
    file when no friend ids, or no resolved friend users, are available.
    """
    # Collect friend ids through the error-tolerant cursor wrapper
    cursor = tweepy.Cursor(api.friends_ids, screen_name=screen_name, count=5000).items(max_friends)
    friends_id_list = list(limit_handled(cursor))
    if not friends_id_list:
        return

    #Calling the function to get users from the list of friend ids
    friends_users = get_usernames(friends_id_list)
    if not friends_users:
        return

    # Extract the friends information
    friends_list = [(id_str, screen_name, friend.id_str, friend.screen_name) for friend in friends_users]

    # write the csv (newline='' as required by the csv module)
    with open(f'{path}{trend_to_search}_friends.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not os.path.getsize(f.name):
            writer.writerow(["orig_user_id", "orig_user_name", "friend_id", "friend_name"])
        writer.writerows(friends_list)

In [None]:
# Helper function to get info of a specified user
# Assumption: Twitter authorization and tweepy initialization have been done
def get_user_data(tweet, path, trend_to_search):
    """Record the author of ``tweet`` and then collect their followers and friends.

    One row with the author's profile fields is appended to
    ``{path}{trend_to_search}_user_data.csv`` (the header is written only when
    the file is empty). Afterwards ``get_followers`` and ``get_friends`` are
    called for the same user.

    Parameters
    ----------
    tweet : tweepy status object
        Only ``tweet.user`` is read.
    path : str
        Directory prefix for the user-data file (concatenated as-is).
    trend_to_search : str
        Trend name, used as the file-name prefix.

    NOTE(review): ``get_followers``/``get_friends`` do not take a ``path``
    argument; they write under the module-level ``path``, which may differ
    from the one passed here.
    """
    author = tweet.user

    #transform the tweet author into a 2D array that will populate the csv
    user_data = [[author.id_str, author.name, author.screen_name, author.location,
                  author.description, author.followers_count, author.friends_count,
                  author.created_at, author.verified]]

    # write the csv
    # newline='' keeps multi-line descriptions intact (csv module requirement);
    # utf-8 so non-ASCII names/descriptions can always be written.
    with open(f'{path}{trend_to_search}_user_data.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        if not os.path.getsize(f.name):
            writer.writerow(["user_id", "user_name", "user_screen_name", "user_location",
                             "user_description", "followers_count","friends_count",
                             "user_created_at","user_verified"
                            ])
        writer.writerows(user_data)
    get_followers(author.id_str, author.screen_name, trend_to_search)
    get_friends(author.id_str, author.screen_name, trend_to_search)

In [None]:
# Choose a trend topic to initiate data collection
# The response is an array of “trend” objects that encode the name of the trending topic, 
# the query parameter that can be used to search for the topic on Twitter Search, and the Twitter Search URL.
World_WOE_ID = 1
US_WOE_ID = 23424977
NY_WOE_ID = 2459115

# Highest selectable index: trends 0..TOP_TRENDS_CNT are listed
TOP_TRENDS_CNT = 9

# The response is used directly; the previous json dumps/loads round trip
# did not change it.
trends = api.trends_place(World_WOE_ID)
if not trends or not trends[0]['trends']:
    # Fail here with a clear message instead of leaving `trend_to_search`
    # undefined and hitting a NameError in the collection cell.
    raise RuntimeError(f'No trends returned for WOEID {World_WOE_ID}')

# Never offer (or index) more trends than were actually returned
top_trends = trends[0]['trends'][:TOP_TRENDS_CNT + 1]
for i, trend in enumerate(top_trends):
    print(f'{i}: trend: {trend["name"]}, Volume: {trend["tweet_volume"]}')

try:
    # The modulo folds out-of-range (and negative) numbers back into the listed range
    choice = int(input(f'Choose a trend (enter a number 0-{len(top_trends) - 1}):')) % len(top_trends)
except Exception:
    choice = 0
    print("Illegal Value: Default value of '0' is chosen")
trend_to_search = top_trends[choice]['name']

In [None]:
# Search for the chosen trend tweets and call functions to collect tweet/user data
# get_user_data also fetches the author's followers and friends, so it is run
# only once per author: repeating it for every tweet by the same user would
# append duplicate rows and spend extra API requests.
seen_user_ids = set()
for status in tweepy.Cursor(api.search, q=trend_to_search).items():
    get_tweet_data(status, path, trend_to_search)
    if status.user.id_str not in seen_user_ids:
        seen_user_ids.add(status.user.id_str)
        get_user_data(status, path, trend_to_search)
    get_retweeter_data(status, path, trend_to_search)