In [None]:
# Install the two Twitter client packages that the import cell below loads
# (`tweepy` and `twitter`).
!pip install tweepy
!pip install twitter

In [None]:
import datetime
import json
import os
import pickle
import re
import time
from urllib.parse import quote

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import tweepy
import twitter
from twitter import OAuth, Twitter, TwitterHTTPError
import urllib3
import wptools

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Get the most followed twitter profiles by scraping trackalytics.com.
# Result: `user_list`, a list of [display_name, "@handle"] pairs in page order.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

url = 'https://www.trackalytics.com/the-most-followed-twitter-profiles/page/'

user_list = []

# A single pool for the whole crawl so connections are reused between pages
http = urllib3.PoolManager()


def FromRaw(r):
    """Return `r` as text; bytes are decoded as UTF-8, dropping undecodable bytes."""
    return r if isinstance(r, str) else r.decode('utf-8', 'ignore')


for num in range(1, 655):
    try:
        query = url + str(num) + '/'
        response = http.request('GET', query).data

        soup = BeautifulSoup(FromRaw(response), 'html.parser')
        # The first <tr> is skipped (presumably the table header row)
        rows = soup.find_all('tr')[1:]
    except Exception as e:
        print('page error:', num, e)
        continue

    for row_num, row in enumerate(rows, start=1):
        try:
            # The second link of the row carries the display name (title
            # attribute) and the handle (second-to-last path segment of href)
            new = row.find_all('a')[1]
            screen_name = new.get('title')
            username = "@" + new.get('href').split('/')[-2]

            user_list.append([screen_name, username])

        except Exception as e:
            # Report page and row so a malformed row can be located; a bad row
            # must not discard the remaining rows of the page
            print('link error:', num, row_num, e)
            continue

In [None]:
# Method to get the wikipedia page for each user

def search_wiki(search_word):
    """Search English Wikipedia and return the titles of the hits.

    Parameters
    ----------
    search_word : str
        Search term. Spaces are replaced by underscores and the result is
        URL-quoted before it is appended to the API query.

    Returns
    -------
    list of str
        The `title` of every entry in the response's `query.search` list, in
        response order. An empty list is returned (and the error printed) when
        the term cannot be encoded, the request fails, or the response does
        not have the expected structure.
    """
    try:
        # Compile search query (inside the try so that an unusable search
        # term, e.g. None, is reported instead of aborting the caller)
        query = ('https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch='
                 + quote(search_word.replace(' ', '_')))

        # Query wikipedia
        wikiresponse = urllib3.PoolManager().request('GET', query)
        wikijson2 = json.loads(wikiresponse.data)

        results_list = []
        for hit in wikijson2['query']['search']:
            try:
                results_list.append(hit['title'])
            except (KeyError, TypeError):
                # Entry without a usable title; skip it and keep the rest
                print('no results:', hit)
                continue
        return results_list
    except Exception as e:
        print('Query error:', search_word, e)
        return []

In [None]:
# Method to retrieve the birthday from the infobox in a wikipedia page

def query_birthday(wiki_link):
    """Return the birth date found in the infobox of a Wikipedia page.

    Parameters
    ----------
    wiki_link : str
        Page title passed to `wptools.page`.

    Returns
    -------
    datetime.date
        Date built from the first `YYYY|M|D` group in the infobox's
        `birth_date` field.

    Raises
    ------
    ValueError
        If the page has no infobox / `birth_date` field, if the field holds no
        `YYYY|M|D` group, or if that group is not a valid calendar date.
    """
    page = wptools.page(wiki_link, verbose=False, silent=True)
    page.get_parse()

    infobox = page.data.get('infobox') or {}
    raw_birth_date = infobox.get('birth_date')
    if not raw_birth_date:
        raise ValueError('no birth_date in infobox of %r' % (wiki_link,))

    # Raw string: `\d` and `\|` are regex escapes, not string escapes
    match = re.search(r'\d{4}\|\d+\|\d+', raw_birth_date)
    if match is None:
        raise ValueError('no YYYY|M|D date in birth_date %r' % (raw_birth_date,))

    return datetime.datetime.strptime(match.group(), "%Y|%m|%d").date()


In [None]:
# Returns the zodiac sign given a date
# Source: https://www.geeksforgeeks.org/program-display-astrological-sign-zodiac-sign-given-date-birth/

def zodiac_sign(date):
    """Return the name of the western zodiac sign for a date.

    `date` must expose `month` and `day` attributes (e.g. `datetime.date`).
    Each month is split at one cut-off day: days before it belong to the sign
    that started in the previous month, the cut-off day and later days belong
    to the sign that starts in this month.
    """
    # month -> (first day of the new sign, sign before that day, sign from that day)
    cusps = {
        1: (20, 'Capricorn', 'Aquarius'),
        2: (19, 'Aquarius', 'Pisces'),
        3: (21, 'Pisces', 'Aries'),
        4: (20, 'Aries', 'Taurus'),
        5: (21, 'Taurus', 'Gemini'),
        6: (21, 'Gemini', 'Cancer'),
        7: (23, 'Cancer', 'Leo'),
        8: (23, 'Leo', 'Virgo'),
        9: (23, 'Virgo', 'Libra'),
        10: (23, 'Libra', 'Scorpio'),
        11: (22, 'Scorpio', 'Sagittarius'),
        12: (22, 'Sagittarius', 'Capricorn'),
    }

    switch_day, outgoing_sign, incoming_sign = cusps[date.month]
    if date.day >= switch_day:
        return incoming_sign
    return outgoing_sign

In [None]:
# Returns the element of the zodiac sign
    
def zodiac_element(zodiac_sign):
    """Return the classical element of a zodiac sign name.

    The result is 'Earth', 'Water', 'Fire' or 'Air'; any value that is not one
    of the twelve capitalised sign names yields the empty string.
    """
    element_groups = (
        ('Earth', ['Capricorn', 'Taurus', 'Virgo']),
        ('Water', ['Cancer', 'Pisces', 'Scorpio']),
        ('Fire', ['Leo', 'Aries', 'Sagittarius']),
        ('Air', ['Libra', 'Gemini', 'Aquarius']),
    )

    for element, signs in element_groups:
        if zodiac_sign in signs:
            return element
    return ''

In [None]:
# Get the birthday, sign, element and wikipedia page for each user.
# Result: `info`, mapping display name ->
#   [name, handle, 0 (followers placeholder), birthday, sign, element, wiki title]

info = {}
for user in user_list:
    # Get name and twitter username from user
    name, username = user[:2]
    if not name or not username:
        # Row scraped without a display name or handle: nothing to search for
        continue

    # Find Wiki pages corresponding to twitter page (handle hits first)
    pages = search_wiki(username) + search_wiki(name)
    pages = [p for p in pages if 'List of' not in p]
    if not pages:
        # No pages were found
        continue

    # All title comparisons are case-insensitive
    name_lower = name.lower()
    username_lower = username.lower()
    first_name = name_lower.split(' ')[0]
    last_name = name_lower.split(' ')[-1]

    # Titles containing both the first and the last word of the twitter name
    # are prioritized ('&' excludes pages about several people)
    name_matches = [
        p for p in pages
        if first_name in p.lower() and last_name in p.lower() and '&' not in p
    ]

    # Titles whose first word and last word (ignoring a trailing
    # "(disambiguation)" part) both occur in the handle or in the name
    username_matches = []
    for p in pages:
        title_first = p.split(' ')[0].lower()
        title_last = p.split(' (')[0].split(' ')[-1].lower()
        in_handle = title_first in username_lower and title_last in username_lower
        in_name = title_first in name_lower and title_last in name_lower
        if in_handle or in_name:
            username_matches.append(p)

    # Sort by length as celebrities often have several related pages
    matches = sorted(name_matches, key=len)

    # Best name match if there is one, otherwise the first search result
    wiki_link = matches[0] if matches else pages[0]

    # Extract birthday from wiki_link
    try:
        birthday = query_birthday(wiki_link)
    except Exception:
        print(user)
        print(wiki_link)

        # Second and last attempt with the next-best candidate. Compare by
        # value: the same title can come back from both searches as distinct
        # string objects, and the page that just failed must not be retried.
        new_matches = [m for m in username_matches + name_matches + pages if m != wiki_link]
        if not new_matches:
            print('failed')
            continue

        wiki_link = new_matches[0]
        print(wiki_link)
        try:
            birthday = query_birthday(wiki_link)
        except Exception:
            # No birthday found
            print('failed')
            continue

    # Calculate zodiac from birthday
    zodiac = zodiac_sign(birthday)
    element = zodiac_element(zodiac)
    info[name] = [name, username, 0, birthday, zodiac, element, wiki_link]


In [None]:
# Assemble the per-user records collected in `info` into one dataframe,
# one row per user and one column per record field
columns = ['Name', 'Handle', 'FollowersCount', 'Birthday', 'ZodiacSign', 'Element', 'WikiLink']
records = [record for record in info.values()]
df = pd.DataFrame(data=records, columns=columns)

In [None]:
# Read the twitter API credentials from a local JSON file
# (the file itself is not part of the notebook, for privacy)

with open("twitter_credentials.json", "r") as credentials_file:
    raw_credentials = credentials_file.read()
creds = json.loads(raw_credentials)



In [None]:
# Query tweets from twitter API using tweepy package.
# Result: `tweets`, mapping handle -> list of tweet texts (at most 50 each).

# Initialize API. wait_on_rate_limit makes tweepy sleep until the rate-limit
# window resets instead of raising in the middle of the collection.
auth = tweepy.OAuthHandler(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])
auth.set_access_token(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'])
api = tweepy.API(auth, wait_on_rate_limit=True)

tweets = {}
for twitter_user in df.Handle:
    # Up to 50 tweets from the user's timeline, retweets excluded.
    # NOTE(review): `since` and `lang` look like search parameters - confirm
    # that user_timeline honours them in the installed tweepy version.
    tweets[twitter_user] = []
    try:
        timeline = tweepy.Cursor(
            api.user_timeline,
            id=twitter_user,
            since="2017-10-20",
            lang="en",
            include_rts=False,
        ).items(50)
        for status in timeline:
            tweets[twitter_user].append(status.text)
    except Exception as e:
        # One failing account (e.g. protected, suspended, renamed) must not
        # abort the collection for every remaining user; keep what was fetched
        print('tweets error:', twitter_user, e)

In [None]:
# Query friends from twitter API (directly from twitter API).
# Result: `friends`, mapping handle -> list of the user ids that handle follows.

# Initialize API
twitter = Twitter(auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))


def _friends_ids_page(screen_name, cursor):
    """Fetch one page of friend ids, sleeping and retrying while rate limited.

    Only HTTP 429 (rate limit) is retried; any other TwitterHTTPError is
    re-raised so the caller can decide what to do with that account.
    """
    while True:
        try:
            return twitter.friends.ids(screen_name=screen_name, cursor=cursor)
        except TwitterHTTPError as err:
            status = getattr(getattr(err, 'e', None), 'code', None)
            if status != 429:
                raise
            print("Rate limit reached, going to sleep")
            time.sleep(15*60)


friends = {}
for username in df.Handle:
    # Handles are stored as "@name"; query with the bare screen name
    screen_name = username.lstrip('@')
    friends[username] = []

    # Cursor -1 requests the first page (max size 5000); a next_cursor of 0
    # marks the last page, so no extra rate-limited request is spent on
    # discovering that nothing is left
    cursor = -1
    try:
        while cursor != 0:
            query = _friends_ids_page(screen_name, cursor)
            friends[username] += query["ids"]
            cursor = query["next_cursor"]
    except TwitterHTTPError as err:
        # E.g. protected or deleted account: keep what was collected, move on
        print("friends error:", username, err)


In [1]:
# Query ids from twitter API (directly from twitter API).
# Result: `ids`, mapping "@" + lower-cased screen name -> numeric user id.

# Initialize API
twitter = Twitter(auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))


def _lookup_users(screen_names):
    """Run one users/lookup request, sleeping and retrying while rate limited.

    `screen_names` is a comma-separated string of screen names. Only HTTP 429
    (rate limit) is retried; any other TwitterHTTPError is re-raised.
    twitter API docs: https://dev.twitter.com/rest/reference/get/users/lookup
    """
    while True:
        try:
            return twitter.users.lookup(screen_name=screen_names)
        except TwitterHTTPError as err:
            status = getattr(getattr(err, 'e', None), 'code', None)
            if status != 429:
                raise
            print(err)
            time.sleep(15*60)


# Handles are stored as "@name"; the lookup takes bare screen names
usernames = [handle.lstrip('@') for handle in df['Handle']]

ids = {}
# Look the users up in batches of 100 screen names per request
chunks = [usernames[x:x+100] for x in range(0, len(usernames), 100)]

for chunk in chunks:
    usernames_string = ",".join(chunk)
    try:
        subquery = _lookup_users(usernames_string)
    except TwitterHTTPError as e:
        # A failed batch is reported and skipped so the remaining batches
        # are still processed
        print('lookup error:', e)
        continue

    for user in subquery:
        # Keys are "@" + lower-cased screen name.
        # NOTE(review): a later df['Handle'].map(ids) only finds these keys
        # if the stored handles are lower case too - confirm.
        ids['@' + user['screen_name'].lower()] = user['id']

NameError: name 'df' is not defined

In [None]:
# Add the rest of the columns to the dataframe
df['Tweets'] = df['Handle'].map(tweets)
df['Following'] = df['Handle'].map(friends)

# Store the ids as integers while keeping missing ids missing: the nullable
# 'Int64' dtype holds <NA> next to exact 64-bit integers. Building the column
# straight from the dict avoids a detour through float64 (NaN), which cannot
# represent large ids exactly, and avoids a -1 sentinel that would have to be
# removed again.
df['Id'] = pd.array([ids.get(handle) for handle in df['Handle']], dtype='Int64')

In [None]:
# Save dataframe
# Drop helper columns before saving. errors='ignore' keeps the cell runnable
# when a column is absent: 'Unnamed: 0' is not created anywhere in this
# notebook (presumably a left-over index column from a CSV round-trip), and
# 'FollowersCount' is already gone when the cell is run a second time.
df = df.drop(columns=['Unnamed: 0', 'FollowersCount'], errors='ignore')

# Make sure the target directory exists before writing the pickle
os.makedirs("./datafiles", exist_ok=True)
df.to_pickle("./datafiles/full_df.pkl")