# Scraping Twitter with Tweepy

In [6]:
# Imports
import numpy as np
import pandas as pd

import tweepy

from datetime import datetime
import re
import time

In [7]:
# twitter_keys = {
#         'consumer_key':        '----INPUT KEYS HERE----',
#         'consumer_secret':     '----INPUT KEYS HERE----',
#         'access_token_key':    '----INPUT KEYS HERE----',
#         'access_token_secret': '----INPUT KEYS HERE----'
#     }



#Setup access to API
auth = tweepy.OAuthHandler(twitter_keys['consumer_key'], twitter_keys['consumer_secret'])
auth.set_access_token(twitter_keys['access_token_key'], twitter_keys['access_token_secret'])

api = tweepy.API(auth)

### Current features in play

In [3]:
features = ['verified', 
            #'created_at',
            'hour_created',
            #'lang',
            #'acct_location',
            'geo_enabled', 
            'default_profile', 
            'default_profile_image', 
            'favourites_count', 
            'followers_count', 
            'friends_count', 
            'statuses_count', 
            'average_tweets_per_day',
            #'avg_daily_followers', 
            #'avg_daily_friends',
            #'avg_daily_favorites',
            'popularity', 
            'tweet_to_followers', 
            'follower_acq_rate', 
            'friends_acq_rate', 
            #'favs_rate'
           ]

## Scraping functions

### Scrape user info from Twitter handle

In [3]:
def get_user_info(screen_name):
    '''
    Input: a Twitter handle (screen_name)
    Returns: a list of account-level information -- 
            [handle, created_at, account_age_days, verified, geo_enabled, lang, location, 
                default_profile, default_profile_image, favourites_count, followers_count, 
                friends_count, statuses_count, average_tweets_per_day]
    '''
    try:      
        # Get user information from screen name
        user = api.get_user(screen_name)
        
        # account attributes to return
        handle = user.screen_name
        created_at = user.created_at.strftime('%Y-%m-%d %H:%M:%S')
        account_age_days = (datetime.now() - user.created_at).days       
        verified = user.verified
        geo_enabled = user.geo_enabled
        lang = user.lang
        location = user.location
        default_profile = user.default_profile
        default_profile_image = user.default_profile_image
        favourites_count = user.favourites_count
        followers_count = user.followers_count
        friends_count = user.friends_count
        statuses_count = user.statuses_count
        average_tweets_per_day = np.round(statuses_count / account_age_days, 3)
        
        # organizing list to be returned
        account_info = [handle, created_at, account_age_days, verified, geo_enabled, lang, location, 
                        default_profile, default_profile_image, favourites_count, followers_count, 
                        friends_count, statuses_count, average_tweets_per_day]
        
    except BaseException as e:
          print('failed on_status,',str(e))
          time.sleep(3)
    
    return account_info 

In [4]:
#get_user_info('scrapfishies')

['scrapfishies',
 '2020-07-08 14:28:25',
 105,
 False,
 False,
 None,
 'Oakland, CA',
 True,
 False,
 17,
 2,
 62,
 14,
 0.133]

### Scrape multiple users from a list into a dataframe

In [3]:
def account_level_df(list_of_users):
    '''
    Input: a list of Twitter users (by handle or screen name)
    Returns: a pandas dataframe of account-level details provided 
                using the get_user_info() function
    '''
    try: 
        # Scrape each account and compile into a list
        accounts = [get_user_info(user) for user in list_of_users]
        
        # Assemble accounts list into a pandas dataframe
        headers = ['handle', 'created_at', 'account_age_days', 'verified', 'geo_enabled', 'lang', 'location', 
                        'default_profile', 'default_profile_image', 'favourites_count', 'followers_count', 
                        'friends_count', 'statuses_count', 'average_tweets_per_day']   
        
        df = pd.DataFrame(accounts, columns=headers)
    
    except BaseException as e:
          print('failed on_status,',str(e))
          time.sleep(3)
        
    return df    

In [7]:
#users = ['scrapfishies', 'aoc', 'FloridaMan_']

#account_level_df(users)

Unnamed: 0,handle,created_at,account_age_days,verified,geo_enabled,lang,location,default_profile,default_profile_image,favourites_count,followers_count,friends_count,statuses_count,average_tweets_per_day
0,scrapfishies,2020-07-08 14:28:25,102,False,False,,"Oakland, CA",True,False,17,2,61,14,0.137
1,AOC,2010-04-28 22:38:40,3825,True,False,,"Bronx + Queens, NYC",False,False,26910,9086267,2747,11275,2.948
2,FloridaMan_,2014-07-24 02:57:55,2278,False,False,,,True,True,1,20,0,8,0.004


## Scrape for predictions

In [18]:
### STILL IN PROGRESS

def get_user_features(screen_name):
    '''
    Input: a Twitter handle (screen_name)
    Returns: a list of account-level information used to make a prediction 
            whether the user is a bot or not
    '''
    try:      
        # Get user information from screen name
        user = api.get_user(screen_name)
        
        # account features to return for predicton
        handle = user.screen_name
        created_at = user.created_at.strftime('%Y-%m-%d %H:%M:%S')
        account_age_days = (datetime.now() - user.created_at).days       
        verified = user.verified
        geo_enabled = user.geo_enabled
        lang = user.lang
        location = user.location
        default_profile = user.default_profile
        default_profile_image = user.default_profile_image
        favourites_count = user.favourites_count
        followers_count = user.followers_count
        friends_count = user.friends_count
        statuses_count = user.statuses_count
        average_tweets_per_day = np.round(statuses_count / account_age_days, 3)
        
        # manufactured features
        hour_created = user.created_at.strftime('%H')
        popularity = np.round(np.log(1 + friends_count) * np.log(1 + followers_count), 3)
        tweet_to_followers = np.round(np.log(1 + statuses_count) / np.log(1 + followers_count), 3)
        follower_acq_rate = np.round(np.log(1 + (followers_count / account_age_days)), 3)
        friends_acq_rate = np.round(np.log(1 + (friends_count / account_age_days)), 3)
        
        # organizing list to be returned
        account_features = [verified, hour_created, geo_enabled, default_profile, default_profile_image, 
                           favourites_count, followers_count, friends_count, statuses_count, 
                           average_tweets_per_day, popularity, tweet_to_followers, follower_acq_rate, 
                           friends_acq_rate]
        
    except BaseException as e:
          print('failed on_status,',str(e))
          time.sleep(3)
    
    return account_features 

In [None]:
features = ['verified', 
            #'created_at',
            'hour_created',
            #'lang',
            #'acct_location',
            'geo_enabled', 
            'default_profile', 
            'default_profile_image', 
            'favourites_count', 
            'followers_count', 
            'friends_count', 
            'statuses_count', 
            'average_tweets_per_day',
            #'avg_daily_followers', 
            #'avg_daily_friends',
            #'avg_daily_favorites',
            'popularity', 
            'tweet_to_followers', 
            'follower_acq_rate', 
            'friends_acq_rate', 
            #'favs_rate'
           ]

In [19]:
user_info = np.matrix(get_user_features('scrapfishies'))
user_info

matrix([['False', '14', 'False', 'True', 'False', '17', '2', '62', '14',
         '0.133', '4.552', '2.465', '0.019', '0.464']], dtype='<U5')