# Data gathering file for Zodiac project

This file generates a dataframe used for the [twitter-zodiac project](https://github.com/simoneengelbr/twitter-zodiac). The data is gathered from www.trackalytics.com, the Twitter API, and the Wikipedia API. The data consists of Twitter user information and tweets from >2000 of the most popular Twitter users. Based on the Twitter user information, a search of Wikipedia is performed and the birthday of a likely match is extracted. The resulting data should, however, be cleaned manually. A clean version of the dataframes can be found as a CSV file in the repository.

## Imports

In [None]:
import datetime
import json
import pickle
import re
import time
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import tweepy
import twitter
import urllib3
import wptools
from bs4 import BeautifulSoup
from twitter import OAuth, Twitter, TwitterHTTPError

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Most popular Twitter Profiles

The most popular Twitter profiles are scraped from trackalytics.com. The profiles are listed in a paginated table, and the site has 655 pages as of November 2020.

In [None]:
### Get the most followed twitter profiles by scraping trackalytics.com ###

# trackalytics 'Most Followed Twitter Profiles' page url
url = 'https://www.trackalytics.com/the-most-followed-twitter-profiles/page/'

# Number of pages in the paginated table (655 as of November 2020)
num_pages = 655

# list of [screen name, username] for each popular page
user_list = []

# One connection pool is reused for all pages instead of creating one per request
http = urllib3.PoolManager()

# Iterate through site pages; range() excludes its stop value, hence the + 1
for num in range(1, num_pages + 1):
    try:
        # Load page
        query = url + str(num) + '/'
        response = http.request('GET', query).data

        # Decode leniently (undecodable bytes are dropped) to avoid decoding warnings
        html = response if isinstance(response, str) else response.decode('utf-8', 'ignore')

        # Parse page html
        soup = BeautifulSoup(html, 'html.parser')
    except Exception as e:
        # Best-effort scrape: report the page and move on to the next one
        print('page error:', num, e)
        continue

    # Extract row elements in table of users (the first row is skipped)
    for row in soup.find_all('tr')[1:]:
        try:
            # The second link of the row carries the twitter user
            new = row.find_all('a')[1]
            screen_name = new.get('title')
            username = "@" + new.get('href').split('/')[-2]  # get username from link
        except Exception as e:
            # A malformed row only costs that row, not the remainder of the page
            print('link error:', num, e)
            continue

        # Store user in user_list
        user_list.append([screen_name, username])

## Wikipedia page by Twitter user

The list of Twitter users is enriched with birthday information if a matching Wikipedia page with a birthday can be found.

First, a number of methods are defined, and then a list of information for each user is compiled.

In [None]:
# Method to search Wikipedia for a given search word. Returns list of wiki-links

def search_wiki(search_word):
    """Search English Wikipedia and return the titles of the matching pages.

    Args:
        search_word: Search term; spaces are replaced by underscores and the
            result is URL-quoted before it is sent.

    Returns:
        List of page titles in the order the search API returned them. An
        empty list is returned (and the error printed) if the request or the
        decoding of the response fails.
    """
    # Compile search query
    query = 'https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=' + quote(search_word.replace(' ', '_'))
    try:
        # Query wikipedia
        wikiresponse = urllib3.PoolManager().request('GET', query)
        search_hits = json.loads(wikiresponse.data)['query']['search']

        # Compile list of wikilink results
        results_list = []

        # extract wikilinks from search results in JSON format
        for hit in search_hits:
            try:
                results_list.append(hit['title'])
            except (KeyError, TypeError):
                # Only a malformed hit is skipped; interrupts are no longer swallowed
                print('no results:', hit)
                continue
        return results_list
    except Exception as e:
        print('Query error:', search_word, e)
        return []

In [None]:
# Method to retrieve a birthday from the Wikipedia page with the supplied wiki-link, returns date object.
# Throws error if no parsable birthday is found

# Matches a "YYYY|M|D" run inside the infobox birth_date value
# (presumably the arguments of a birth-date template -- confirm against real infoboxes)
_BIRTH_DATE_PATTERN = re.compile(r'\d{4}\|\d+\|\d+')


def query_birthday(wiki_link):
    """Return the birthday found in the infobox of a Wikipedia page.

    Args:
        wiki_link: Title of the Wikipedia page to parse.

    Returns:
        datetime.date built from the first "YYYY|M|D" run in the infobox
        'birth_date' field.

    Raises:
        ValueError: If 'birth_date' holds no "YYYY|M|D" run, or the run is not
            a valid calendar date.
        KeyError, TypeError: If the page has no infobox or no 'birth_date'
            field (propagated from the lookups below).
    """
    # wptools is used to read wikipedia infoboxes, as these are often inconsistent
    page = wptools.page(wiki_link, verbose=False, silent=True)
    page.get_parse()

    # attempt to extract the birthday from infobox birth_date
    birth_field = page.data['infobox']['birth_date']
    match = _BIRTH_DATE_PATTERN.search(birth_field)
    if match is None:
        # Explicit error instead of an AttributeError on None.group()
        raise ValueError('no parsable birth date for %r: %r' % (wiki_link, birth_field))

    # parse birthday to date object
    return datetime.datetime.strptime(match.group(), "%Y|%m|%d").date()


In [None]:
# Method to determine zodiac sign from date object. Returns the zodiac sign of the given date
# Code adapted from: https://www.geeksforgeeks.org/program-display-astrological-sign-zodiac-sign-given-date-birth/

# month -> (first day of the sign starting in that month,
#           sign before that day, sign from that day onwards)
_ZODIAC_BY_MONTH = {
    1: (20, 'Capricorn', 'Aquarius'),
    2: (19, 'Aquarius', 'Pisces'),
    3: (21, 'Pisces', 'Aries'),
    4: (20, 'Aries', 'Taurus'),
    5: (21, 'Taurus', 'Gemini'),
    6: (21, 'Gemini', 'Cancer'),
    7: (23, 'Cancer', 'Leo'),
    8: (23, 'Leo', 'Virgo'),
    9: (23, 'Virgo', 'Libra'),
    10: (23, 'Libra', 'Scorpio'),
    11: (22, 'Scorpio', 'Sagittarius'),
    12: (22, 'Sagittarius', 'Capricorn'),
}


def zodiac_sign(date):
    """Return the zodiac sign for a date.

    Args:
        date: Object with integer `month` and `day` attributes, e.g. a
            datetime.date.

    Returns:
        Name of the zodiac sign as a string, e.g. 'Aries'.

    Raises:
        ValueError: If `date.month` is not in 1..12.
    """
    try:
        first_day, earlier_sign, later_sign = _ZODIAC_BY_MONTH[date.month]
    except KeyError:
        # Explicit error instead of falling through with an unbound result
        raise ValueError('month must be in 1..12, got %r' % (date.month,)) from None

    return earlier_sign if date.day < first_day else later_sign

In [None]:
# Method to determine zodiac element. Returns the element of the given zodiac sign

def zodiac_element(zodiac_sign):
    """Return the element ('Earth', 'Water', 'Fire' or 'Air') of a zodiac sign.

    Args:
        zodiac_sign: Capitalised sign name, e.g. 'Leo'.

    Returns:
        The element name, or an empty string when the sign is not recognised.
    """
    signs_by_element = (
        ('Earth', ['Capricorn', 'Taurus', 'Virgo']),
        ('Water', ['Cancer', 'Pisces', 'Scorpio']),
        ('Fire', ['Leo', 'Aries', 'Sagittarius']),
        ('Air', ['Libra', 'Gemini', 'Aquarius']),
    )

    for element, signs in signs_by_element:
        if zodiac_sign in signs:
            return element

    # Unrecognised sign
    return ''

In [None]:
# Get the wiki-link, birthday, sign, and element for each user

# Dictionary of user information, keyed by twitter screen name
info = {}

for user in user_list:
    # Get name and twitter username from user
    name, username = user[:2]

    # A user scraped without a screen name cannot be searched for
    if not name:
        continue

    # Find Wiki pages corresponding to twitter username and screen name
    pages = search_wiki(username) + search_wiki(name)

    # Remove some non-person Wikipedia results (ie. 'List of most followed twitter profiles')
    pages = [p for p in pages if 'List of' not in p]

    # No pages were found, twitter user is dropped
    if not pages:
        continue

    # If twitter name (first and last) appears in searches, these are prioritized
    name_lower = name.lower()
    username_lower = username.lower()
    first_lower = name_lower.split(' ')[0]
    last_lower = name_lower.split(' ')[-1]
    name_matches = [
        p for p in pages
        if first_lower in p.lower() and last_lower in p.lower() and '&' not in p
    ]

    # Sort by length as celebrities often have several related pages, ie. Discography
    matches = sorted(name_matches, key=len)

    # Best name match if there is one, otherwise the first result from the query
    wiki_link = matches[0] if matches else pages[0]

    # Extract birthday from wiki-link
    try:
        birthday = query_birthday(wiki_link)
    except Exception:
        # Initial match did not yield birthday.
        # Second and last attempt, this time compiling all results in prioritized list

        # If wikilink name (first and last) appears in twitter username or
        # screen name, these are also prioritized. Both sides are lower-cased
        # so the comparison is case-insensitive.
        username_matches = []
        for p in pages:
            title_first = p.split(' ')[0].lower()
            title_last = p.split(' (')[0].split(' ')[-1].lower()
            in_username = title_first in username_lower and title_last in username_lower
            in_name = title_first in name_lower and title_last in name_lower
            if in_username or in_name:
                username_matches.append(p)

        # Compile list; compare by value so that duplicate titles of the page
        # that already failed are excluded as well
        new_matches = [m for m in username_matches + name_matches + pages if m != wiki_link]

        if not new_matches:
            # No alternative page left, twitter user is dropped
            print('FAILURE:', wiki_link, user)
            continue

        # select new wiki-link
        wiki_link = new_matches[0]

        try:
            # Extract birthday from wiki-link
            birthday = query_birthday(wiki_link)
        except Exception:
            # No birthday found, twitter user is dropped
            print('FAILURE:', wiki_link, user)
            continue

    # Determine zodiac from birthday
    zodiac = zodiac_sign(birthday)

    # Determine element from zodiac
    element = zodiac_element(zodiac)

    # compile results as list in info dictionary
    info[name] = [name, username, birthday, zodiac, element, wiki_link]


In [None]:
# Build the dataframe from the per-user rows collected in `info`
columns = [
    'Name',
    'Handle',
    'Birthday',
    'ZodiacSign',
    'Element',
    'WikiLink',
]
df = pd.DataFrame(data=list(info.values()), columns=columns)

## Twitter API queries

Now the data for each user will be enriched with information from Twitter. The Twitter API will be used to query the last 50 tweets from each user. Thereafter, the profiles that each user follows, called friends, will be queried. As the Twitter friends are returned as Twitter user IDs, the Twitter ID of each user will also be queried so that friendships within the dataset can be determined.

In [None]:
# Read the twitter API credentials from a local JSON file (kept out of the notebook for privacy)

with open("twitter_credentials.json") as file:
    creds = json.loads(file.read())

In [None]:
# Query tweets from twitter API using tweepy package

# Initialize API. wait_on_rate_limit makes tweepy wait for the rate limit to
# replenish instead of raising in the middle of the loop.
auth = tweepy.OAuthHandler(creds['CONSUMER_KEY'], creds['CONSUMER_SECRET'])
auth.set_access_token(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'])
api = tweepy.API(auth, wait_on_rate_limit=True)

# Handle -> list of tweet texts
tweets = {}

# Loop through each Twitter username in dataframe
for twitter_user in df.Handle:

    tweets[twitter_user] = []

    # Getting up to 50 of the last tweets (retweets excluded)
    timeline = tweepy.Cursor(api.user_timeline, id=twitter_user, since="2017-10-20", lang="en", include_rts=False)
    try:
        for status in timeline.items(50):
            tweets[twitter_user].append(status.text)
    except Exception as e:
        # One failing account must not abort the remaining users. The tweepy
        # error class differs between versions, hence the broad catch.
        print('tweet error:', twitter_user, e)

In [None]:
# Query friends from twitter API
# This query can only load 5000 friends at a time, friendships will need to be queried in a loop
# There is a rate limit of 15 calls pr 15 minutes

# Initialize API (note: this rebinds the name `twitter` from the imported module to the API client)
twitter = Twitter(auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))


def _query_friend_ids(client, **params):
    """Request one page of friend IDs, retrying once after a rate-limit sleep.

    Args:
        client: API client exposing `friends.ids`.
        **params: Keyword arguments forwarded to `client.friends.ids`.

    Returns:
        The response of `client.friends.ids(**params)`.

    Raises:
        TwitterHTTPError: If the error is known not to be a rate limit, or if
            the retry fails as well.
    """
    try:
        return client.friends.ids(**params)
    except TwitterHTTPError as err:
        # Assumes TwitterHTTPError exposes the underlying HTTP error as `.e`
        # (TODO confirm). If it does not, status is None and the original
        # sleep-and-retry behaviour is kept.
        status = getattr(getattr(err, 'e', None), 'code', None)
        if status is not None and status != 429:
            # Not a rate limit: sleeping 15 minutes would not help
            raise
        print("Rate limit potentially reached, going to sleep")
        time.sleep(15*60)
        # Retry (will fail if user does not exist)
        return client.friends.ids(**params)


# Handle -> list of friend IDs
friends = {}

# Loop through each Twitter username in dataframe
for username in df.Handle:
    # Compile list of friends
    friends[username] = []

    # -1 requests the first page; a next_cursor of 0 marks the last page, so no
    # extra (rate-limited) call is spent on fetching an empty page
    cursor = -1
    while cursor != 0:
        try:
            query = _query_friend_ids(twitter, cursor=cursor, screen_name=username)
        except TwitterHTTPError as e:
            # Keep whatever was collected for this user and go on with the next one
            print('friends error:', username, e)
            break

        # Add friend query IDs to friend list
        friends[username] += query["ids"]
        cursor = query["next_cursor"]


In [None]:
# Query ids from twitter API

# Initialize API (note: this rebinds the name `twitter` from the imported module to the API client)
twitter = Twitter(auth=OAuth(creds['ACCESS_TOKEN'], creds['ACCESS_SECRET'], creds['CONSUMER_KEY'], creds['CONSUMER_SECRET']))


# List of twitter usernames without '@'
usernames = [handle[1:] for handle in df['Handle']]

# Lower-cased username -> handle exactly as stored in the dataframe, so the
# ids can be keyed by values that df['Handle'].map(ids) will actually find
handle_by_username = {handle[1:].lower(): handle for handle in df['Handle']}

# Handle -> twitter user ID
ids = {}

# load usernames in chunks of 100, as API can take 100 usernames in each query
chunks = [usernames[x:x+100] for x in range(0, len(usernames), 100)]

for chunk in chunks:
    usernames_string = ",".join(chunk)

    try:
        subquery = twitter.users.lookup(screen_name = usernames_string)
    except TwitterHTTPError:
        print("User ID rate limit reached, going to sleep")
        time.sleep(15*60)
        try:
            subquery = twitter.users.lookup(screen_name = usernames_string)
        except TwitterHTTPError as e:
            # Give up on this chunk only; the remaining chunks are still queried
            print('user ID error:', usernames_string, e)
            continue

    # Store each resulting user ID in ids dictionary
    for user in subquery:
        returned_name = user['screen_name'].lower()
        # Fall back to the lower-cased '@name' key when the handle is unknown
        ids[handle_by_username.get(returned_name, '@' + returned_name)] = user['id']

In [None]:
# Add the results to the dataframe as new columns
df['Tweets'] = df['Handle'].map(tweets)
df['Following'] = df['Handle'].map(friends)

# Save the id as int instead of as double without affecting the missing values.
# The nullable 'Int64' dtype keeps missing ids as <NA>; building the column
# directly from Python ints avoids a float64 round trip, which cannot represent
# integers above 2**53 exactly.
df['Id'] = pd.array([ids.get(handle) for handle in df['Handle']], dtype='Int64')

## Conclusion

The result of this execution is a dataframe of popular Twitter users and their birthdays, tweets, and friends. Note that the data should be manually cleaned, as the Wikipedia page matching is error-prone.

In [None]:
# Save dataframe as file

# The dataframe built above has no 'Unnamed: 0' column, so the drop must not
# fail when it is absent (presumably the column is an index left over from
# re-loading a previously saved csv -- confirm).
df.drop(columns=['Unnamed: 0'], inplace=True, errors='ignore')

# Make sure the output directory exists before writing
Path("./datafiles").mkdir(parents=True, exist_ok=True)
df.to_pickle("./datafiles/full_df.pkl")
df.to_csv("./datafiles/full_df.csv")