# Scraping Twitter with Tweepy

In [26]:
# Imports
import os
import pickle
import re
import time
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import tweepy

In [2]:
# Twitter API credentials are read from environment variables so that secrets never
# have to be pasted into (or committed with) the notebook. Set these before starting
# the kernel:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#   TWITTER_ACCESS_TOKEN_KEY, TWITTER_ACCESS_TOKEN_SECRET
twitter_keys = {
    'consumer_key':        os.environ.get('TWITTER_CONSUMER_KEY'),
    'consumer_secret':     os.environ.get('TWITTER_CONSUMER_SECRET'),
    'access_token_key':    os.environ.get('TWITTER_ACCESS_TOKEN_KEY'),
    'access_token_secret': os.environ.get('TWITTER_ACCESS_TOKEN_SECRET'),
}

# Fail here with a readable message instead of a NameError/auth error further down
missing_keys = [name for name, value in twitter_keys.items() if not value]
if missing_keys:
    raise RuntimeError(
        'Missing Twitter credentials: ' + ', '.join(missing_keys)
        + ' (set the matching TWITTER_* environment variables)'
    )

#Setup access to API
auth = tweepy.OAuthHandler(twitter_keys['consumer_key'], twitter_keys['consumer_secret'])
auth.set_access_token(twitter_keys['access_token_key'], twitter_keys['access_token_secret'])

api = tweepy.API(auth)

### Current features in play

In [3]:
# Model inputs currently in use, in the order the model expects them.
features = [
    # account flags and creation hour
    'verified', 'hour_created', 'geo_enabled', 'default_profile', 'default_profile_image',
    # raw counts
    'favourites_count', 'followers_count', 'friends_count', 'statuses_count',
    # rates and ratios
    'average_tweets_per_day', 'popularity', 'tweet_to_followers',
    'follower_acq_rate', 'friends_acq_rate',
]
# Candidate features that are currently switched off:
#   'created_at', 'lang', 'acct_location', 'avg_daily_followers',
#   'avg_daily_friends', 'avg_daily_favorites', 'favs_rate'

## Scraping functions

### Scrape user info from Twitter handle

In [4]:
def get_user_info(screen_name):
    '''
    Look up a Twitter account through the module-level `api` client and
    collect its account-level attributes.

    Input: a Twitter handle (screen_name)
    Returns: a list of account-level information -- 
            [handle, created_at, account_age_days, verified, geo_enabled, lang, location, 
                default_profile, default_profile_image, favourites_count, followers_count, 
                friends_count, statuses_count, average_tweets_per_day]
    Raises: whatever exception the lookup hit, re-raised after it has been 
            logged and a 3 second pause has elapsed
    '''
    try:
        # Keyword form is accepted by both older and newer Tweepy releases
        # (newer ones reject positional arguments to get_user)
        user = api.get_user(screen_name=screen_name)

        # Twitter timestamps are UTC. Depending on the Tweepy release created_at may be
        # naive or timezone-aware, so normalise to aware-UTC before subtracting
        created = user.created_at
        if created.tzinfo is None:
            created = created.replace(tzinfo=timezone.utc)
        account_age_days = (datetime.now(timezone.utc) - created).days

        # account attributes to return
        handle = user.screen_name
        created_at = user.created_at.strftime('%Y-%m-%d %H:%M:%S')
        verified = user.verified
        geo_enabled = user.geo_enabled
        lang = user.lang
        location = user.location
        default_profile = user.default_profile
        default_profile_image = user.default_profile_image
        favourites_count = user.favourites_count
        followers_count = user.followers_count
        friends_count = user.friends_count
        statuses_count = user.statuses_count
        # accounts younger than one day have an age of 0; use 1 day as the denominator
        average_tweets_per_day = np.round(statuses_count / max(account_age_days, 1), 3)

        # organizing list to be returned
        account_info = [handle, created_at, account_age_days, verified, geo_enabled, lang, location, 
                        default_profile, default_profile_image, favourites_count, followers_count, 
                        friends_count, statuses_count, average_tweets_per_day]

    except Exception as e:
        print(f'failed to fetch user info for {screen_name!r}:', str(e))
        # brief pause before the caller issues another request
        time.sleep(3)
        # surface the real error rather than an UnboundLocalError on the return below
        raise

    return account_info

In [5]:
#get_user_info('scrapfishies')

### Scrape multiple users from a list into a dataframe

In [6]:
def account_level_df(list_of_users):
    '''
    Scrape several accounts and tabulate the results.

    Input: a list of Twitter users (by handle or screen name)
    Returns: a pandas dataframe of account-level details provided 
                using the get_user_info() function. Handles whose lookup 
                fails are reported and skipped; if nothing could be scraped 
                the dataframe is empty but still has every column.
    '''
    headers = ['handle', 'created_at', 'account_age_days', 'verified', 'geo_enabled', 'lang', 'location', 
               'default_profile', 'default_profile_image', 'favourites_count', 'followers_count', 
               'friends_count', 'statuses_count', 'average_tweets_per_day']

    # Scrape each account individually so one bad handle does not abort the batch
    accounts = []
    for user in list_of_users:
        try:
            accounts.append(get_user_info(user))
        except Exception as e:
            print(f'skipping {user!r}:', str(e))

    # Assemble accounts list into a pandas dataframe
    return pd.DataFrame(accounts, columns=headers)

In [7]:
#users = ['scrapfishies', 'aoc', 'FloridaMan_']

#account_level_df(users)

## Scrape for predictions

In [22]:
### STILL IN PROGRESS

def get_user_features(screen_name):
    '''
    Look up a Twitter account through the module-level `api` client and 
    build the feature vector used for bot prediction.

    Input: a Twitter handle (screen_name)
    Returns: a list of account-level information used to make a prediction 
            whether the user is a bot or not --
            [verified, hour_created, geo_enabled, default_profile, default_profile_image, 
                favourites_count, followers_count, friends_count, statuses_count, 
                average_tweets_per_day, popularity, tweet_to_followers, follower_acq_rate, 
                friends_acq_rate]
    Raises: whatever exception the lookup hit, re-raised after it has been 
            logged and a 3 second pause has elapsed
    '''
    try:
        # Keyword form is accepted by both older and newer Tweepy releases
        # (newer ones reject positional arguments to get_user)
        user = api.get_user(screen_name=screen_name)

        # Twitter timestamps are UTC. Depending on the Tweepy release created_at may be
        # naive or timezone-aware, so normalise to aware-UTC before subtracting
        created = user.created_at
        if created.tzinfo is None:
            created = created.replace(tzinfo=timezone.utc)
        account_age_days = (datetime.now(timezone.utc) - created).days
        # accounts younger than one day have an age of 0; use 1 day as the rate denominator
        age_for_rates = max(account_age_days, 1)

        # account features to return for prediction
        verified = user.verified
        geo_enabled = user.geo_enabled
        default_profile = user.default_profile
        default_profile_image = user.default_profile_image
        favourites_count = user.favourites_count
        followers_count = user.followers_count
        friends_count = user.friends_count
        statuses_count = user.statuses_count
        average_tweets_per_day = np.round(statuses_count / age_for_rates, 3)

        # manufactured features
        hour_created = user.created_at.hour
        popularity = np.round(np.log(1 + friends_count) * np.log(1 + followers_count), 3)
        # NOTE(review): followers_count == 0 makes the denominator log(1) == 0, so this
        # evaluates to inf/nan instead of raising -- confirm how the training data treated it
        tweet_to_followers = np.round(np.log(1 + statuses_count) / np.log(1 + followers_count), 3)
        follower_acq_rate = np.round(np.log(1 + (followers_count / age_for_rates)), 3)
        friends_acq_rate = np.round(np.log(1 + (friends_count / age_for_rates)), 3)

        # organizing list to be returned
        account_features = [verified, hour_created, geo_enabled, default_profile, default_profile_image, 
                            favourites_count, followers_count, friends_count, statuses_count, 
                            average_tweets_per_day, popularity, tweet_to_followers, follower_acq_rate, 
                            friends_acq_rate]

    except Exception as e:
        print(f'failed to fetch user features for {screen_name!r}:', str(e))
        # brief pause before the caller issues another request
        time.sleep(3)
        # surface the real error rather than an UnboundLocalError on the return below
        raise

    return account_features

In [9]:
# Column names for the prediction features, listed in the same order as the
# values returned by get_user_features().
features = [
    'verified', 'hour_created', 'geo_enabled',
    'default_profile', 'default_profile_image',
    'favourites_count', 'followers_count', 'friends_count', 'statuses_count',
    'average_tweets_per_day',
    'popularity', 'tweet_to_followers',
    'follower_acq_rate', 'friends_acq_rate',
]

In [23]:
# Wrap the feature list in a 1-row matrix; dtype='O' keeps the mixed bool/int/float
# values as Python objects instead of coercing them to one numeric dtype.
scrapfish = np.matrix(np.array(get_user_features('scrapfishies'), dtype='O'))
scrapfish

matrix([[False, 14, False, True, False, 18, 2, 63, 14, 0.132, 4.569,
         2.465, 0.019, 0.466]], dtype=object)

In [18]:
# NOTE(review): same statement as the preceding cell. The saved output shows hour_created
# as the string '14', presumably from an earlier revision of get_user_features -- this
# cell looks stale and is a candidate for removal.
scrapfish = np.matrix(np.array(get_user_features('scrapfishies'), dtype='O'))
scrapfish

matrix([[False, '14', False, True, False, 18, 2, 63, 14, 0.132, 4.569,
         2.465, 0.019, 0.466]], dtype=object)

In [25]:
# Without an explicit dtype the mixed bool/int/float list is coerced to a single
# numeric dtype (the saved output shows every value as a float).
user_info = np.matrix(get_user_features('scrapfishies'))
user_info

matrix([[0.000e+00, 1.400e+01, 0.000e+00, 1.000e+00, 0.000e+00,
         1.800e+01, 2.000e+00, 6.300e+01, 1.400e+01, 1.320e-01,
         4.569e+00, 2.465e+00, 1.900e-02, 4.660e-01]])

In [11]:
# NOTE(review): leftover scratch cell -- `sc` is not assigned anywhere in the visible
# notebook, so this would raise NameError on a fresh kernel; candidate for deletion.
sc

''

In [24]:
# Show the raw feature list for one handle; values appear in the same order as `features`.
get_user_features('scrapfishies')

[False,
 14,
 False,
 True,
 False,
 18,
 2,
 63,
 14,
 0.132,
 4.569,
 2.465,
 0.019,
 0.466]

In [51]:
def bot_or_not(twitter_handle):
    '''
    Classify a Twitter account with the notebook-level `xgb_model`.

    Input: a Twitter handle (screen name)
    Returns: "Bot" if the model predicts class 1, otherwise "Not a bot"

    `xgb_model` must already be loaded when this function is called.
    '''
    # Column names, in the same order as the values returned by get_user_features()
    feature_names = ['verified', 'hour_created', 'geo_enabled', 'default_profile', 'default_profile_image', 
                     'favourites_count', 'followers_count', 'friends_count', 'statuses_count', 
                     'average_tweets_per_day', 'popularity', 'tweet_to_followers', 'follower_acq_rate', 
                     'friends_acq_rate']

    # One-row float array (plain ndarray; NumPy advises against np.matrix)
    feature_row = np.asarray(get_user_features(twitter_handle), dtype=float).reshape(1, -1)
    user_df = pd.DataFrame(feature_row, columns=feature_names)

    prediction = xgb_model.predict(user_df)[0]

    return "Bot" if prediction == 1 else "Not a bot"

In [56]:
def bot_proba(twitter_handle):
    '''
    Estimate how likely a Twitter account is to be a bot, using the 
    notebook-level `xgb_model`.

    Input: a Twitter handle (screen name)
    Returns: the model's predicted probability for class 1 (bot); the same 
             value is also printed as a percentage

    `xgb_model` must already be loaded when this function is called.
    '''
    # One-row float array (plain ndarray; NumPy advises against np.matrix)
    user = np.asarray(get_user_features(twitter_handle), dtype=float).reshape(1, -1)

    # row 0 is the only sample; column 1 holds the probability of class 1
    proba = xgb_model.predict_proba(user)[0, 1]

    print(f'Probability of being a bot: {proba*100:.2f}%')

    return proba

In [72]:
# Example prediction for one handle.
# NOTE(review): bot_or_not() reads the global xgb_model, which is loaded in a cell further
# down this notebook -- that cell has to run first on a fresh kernel.
bot_or_not('best_in_dumbest')

'Bot'

In [73]:
# Example bot probability for the same handle; prints a percentage and returns the raw value.
bot_proba('best_in_dumbest')

Probability of being a bot: 97.96%


0.97964954

## Importing model for predictions!

In [33]:
# NOTE(review): bot_or_not() and bot_proba() read the global xgb_model, so on a fresh
# kernel this cell has to run before either of them is called.
# SECURITY: pickle.load can execute arbitrary code while deserialising; only load a
# model.pickle that you produced yourself or otherwise trust.
with open('model.pickle','rb') as read_file:
    xgb_model = pickle.load(read_file)

In [34]:
# Display the loaded classifier and its hyperparameters.
xgb_model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.05, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1.8, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)