In [11]:
# Imports needed for this ipython file

from collections import Counter
import ConfigParser
import matplotlib.pyplot as plt
import networkx as nx
import sys
import time
import pickle
import os
from TwitterAPI import TwitterAPI

%matplotlib inline

In [2]:
def get_twitter(config_file):
    """Build a TwitterAPI client from credentials stored in a config file.

    Args:
      config_file ... A config file in ConfigParser format; its [twitter] section
                      must provide consumer_key, consumer_secret, access_token
                      and access_token_secret.
    Returns:
      An instance of TwitterAPI.
    """
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    # Option names in the positional order passed to TwitterAPI.
    option_names = ('consumer_key', 'consumer_secret',
                    'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', option) for option in option_names]
    return TwitterAPI(*credentials)

# Module-level client: the request helpers defined later in this notebook read
# this global `twitter` rather than receiving it as an argument.
twitter = get_twitter('twitter.cfg')
print('Established Twitter connection.')

Established Twitter connection.


In [3]:
def robust_request(twitter, resource, params, max_tries=2):
    """ If a Twitter request fails, sleep for 15 minutes.
    Do this at most max_tries times before quitting.
    Args:
      twitter .... A TwitterAPI object.
      resource ... A resource string to request.
      params ..... A parameter dictionary for the request.
      max_tries .. The maximum number of tries to attempt.
    Returns:
      The response object when its status code is 200;
      -1 when the response text contains "Not authorized";
      -2 when the response text contains "page does not exist";
      None when every attempt failed for any other reason.
    """
    for _ in range(max_tries):
        try:
            request = twitter.request(resource, params)
        except Exception:
            # Catch Exception, not a bare `except:`, so KeyboardInterrupt and
            # SystemExit can still stop a crawl. Rebuild the client and retry
            # once; a second failure propagates to the caller.
            twitter = get_twitter('twitter.cfg')
            request = twitter.request(resource, params)
        if request.status_code == 200:
            return request
        if "Not authorized" in request.text:
            return -1
        if "page does not exist" in request.text:
            return -2
        # sys.stderr.write behaves the same on Python 2 and 3.
        sys.stderr.write('Got error: %s \nsleeping for 15 minutes.\n' % request.text)
        sys.stderr.flush()
        time.sleep(61 * 15)  # 915 s: 15 minutes plus a 15-second margin
    return None

In [4]:
def get_followers(screen_name):
    """ Return a dictionary of the users that follow this person on Twitter.

    Followers are requested 200 at a time from 'followers/list'. After every
    14 successful pages the function sleeps for 15 minutes to avoid a
    rate-limit error.

    Args:
        screen_name: a string of a Twitter screen name
    Returns:
        A dictionary of {follower_screen_name: user_info} pairs, where user_info
        is the user dictionary from the response. If a request fails
        (robust_request returns None, -1 or -2), a warning is written to stderr
        and the followers collected so far are returned.
    """
    followers = {}
    cursor = -1
    key = 0          # number of follower records seen so far (progress only)
    rate_limit = 1   # pages requested in the current window, starting at 1
    while True:
        if rate_limit == 15:
            rate_limit = 1
            sys.stderr.write('Avoided rate_limit error: \nsleeping for 15 minutes.\n')
            time.sleep(61 * 15)
        request = robust_request(twitter, 'followers/list',
                                 {'screen_name': screen_name, 'count': 200, 'cursor': cursor})

        # robust_request signals failure with None / -1 / -2, none of which has
        # a .json() method. Stop and keep what we already have.
        if request is None or request == -1 or request == -2:
            sys.stderr.write('Request for followers of %s failed; '
                             'returning %d followers collected so far.\n'
                             % (screen_name, len(followers)))
            break

        json_response = request.json()

        # Get up to 200 followers from json_response
        for follower in json_response['users']:
            followers[follower['screen_name']] = follower
            key = key + 1

        # A next_cursor of 0 marks the last page
        if json_response["next_cursor"] == 0:
            break

        # Update cursor to get the next page of followers
        cursor = json_response["next_cursor"]
        rate_limit = rate_limit + 1
        print("Collected %d followers" % key)
    return followers

In [5]:
def get_followers_and_pickle_object(account_name):
    """ Calls get_followers() to collect all followers of a Twitter account
        and then saves it by pickling for future processing purposes.

    Args: A string representing the screen_name of a Twitter account,
          whose followers we need.
    Returns : A dictionary of { screen_name : user_info } pairs of followers,
              where user_info is a dictionary object containing user information.
              The same dictionary is written to "raw_data/<account_name>".
    """
    followers = get_followers(account_name)
    # `with` closes the file even if pickling fails part-way through.
    with open("raw_data/" + account_name, 'wb') as pickle_file:
        pickle.dump(followers, pickle_file)
    return followers

In [25]:
def get_followers_from_pickle(account_name):
    """ Returns all followers of a Twitter account by retrieving saved data,
        using pickle method.

    Args: A string representing the screen_name of a Twitter account,
          whose followers we need (the file "raw_data/<account_name>" is read).
    Returns : A dictionary of { screen_name : user_info } pairs of followers,
              where user_info is a dictionary object containing user information
    """
    # The files are written with 'wb', so they must be read with 'rb': text mode
    # can corrupt the byte stream on some platforms and fails on Python 3.
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # notebook produced.
    with open("raw_data/" + account_name, 'rb') as pickle_file:
        return pickle.load(pickle_file)

In [61]:
# Fetch every follower of @iitalumniassn (IIT alumni association) from the
# Twitter API and pickle the result to raw_data/iitalumniassn.
iit_alumni_followers = get_followers_and_pickle_object('iitalumniassn')

In [6]:
# Reload the pickled followers of @iitalumniassn and report how many there are.
iit_alumni_followers = get_followers_from_pickle('iitalumniassn')
print("Found %d followers for iitalumniassn" % len(iit_alumni_followers))

Found 1861 followers for iitalumniassn


In [6]:
# Fetch every follower of @IITCareers (IIT career services) from the Twitter
# API and pickle the result to raw_data/IITCareers.
iit_career_followers = get_followers_and_pickle_object('IITCareers')

In [8]:
# Reload the pickled followers of @IITCareers and report how many there are.
iit_career_followers = get_followers_from_pickle('IITCareers')
print("Found %d followers for IITCareers" % len(iit_career_followers))

Found 643 followers for IITCareers


In [None]:
# Fetch every follower of @illinoistech (IIT main university account) from the
# Twitter API and pickle the result to raw_data/illinoistech.
iit_univ_followers = get_followers_and_pickle_object('illinoistech')

In [9]:
# Reload the pickled followers of @illinoistech and report how many there are.
iit_univ_followers = get_followers_from_pickle('illinoistech')
print("Found %d followers for IIT University" % len(iit_univ_followers))

Found 5426 followers for IIT University


In [None]:
# Fetch every follower of @Illinois_Alma (UIUC main university account) from
# the Twitter API and pickle the result to raw_data/Illinois_Alma.
uiuc_univ_followers = get_followers_and_pickle_object('Illinois_Alma')

In [16]:
# Reload the pickled followers of @Illinois_Alma and report how many there are.
uiuc_univ_followers = get_followers_from_pickle('Illinois_Alma')
print("Found %d followers for UIUC University" % len(uiuc_univ_followers))

Found 40017 followers for UIUC University


In [None]:
# Fetch every follower of @UIAA (UIUC alumni account) from the Twitter API and
# pickle the result to raw_data/UIAA.
uiuc_alumni_followers = get_followers_and_pickle_object('UIAA')

In [45]:
# Reload the pickled followers of @UIAA and report how many there are.
uiuc_alumni_followers = get_followers_from_pickle('UIAA')
print("Found %d followers for UIUC Alumni" % len(uiuc_alumni_followers))

Found 5960 followers for UIUC Alumni


In [37]:
def get_all_screen_names(*args):
    """ Merges the screen names found in several follower dictionaries.

    Args: Any number of follower dictionaries keyed by screen_name.

    Returns : A set holding the union of the keys of every dictionary passed in
              (an empty set when called without arguments).
    """
    all_names = set()
    for follower_dict in args:
        all_names.update(follower_dict.keys())
    return all_names

In [11]:
# Union of the three IIT follower groups, plus a count of how many raw
# follower records it was built from.
iit_followers = get_all_screen_names(iit_alumni_followers, iit_career_followers, iit_univ_followers)
print("Found %d unique accounts from %d accounts " % (
    len(iit_followers),
    sum(len(group) for group in (iit_alumni_followers, iit_career_followers, iit_univ_followers))))

Found 6565 unique accounts from 7930 accounts 


In [13]:
# Pickling iit_followers for future calculations.
# `with` closes the file even if pickle.dump raises.
with open("processed_data/iit_followers", 'wb') as fileObject:
    pickle.dump(iit_followers, fileObject)

In [8]:
# Loading iit_followers from pickle.
# Binary mode ('rb') matches the 'wb' used when the file was written.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("processed_data/iit_followers", 'rb') as fileObject:
    iit_followers = pickle.load(fileObject)

In [10]:
def get_friends(screen_name):
    """ Collects the accounts that this person follows ("friends") on Twitter.

    Pages of up to 200 friends are requested from 'friends/list' until the
    response reports a next_cursor of 0.

    Args:
        screen_name: a string of a Twitter screen name
    Returns:
        A dictionary of {screen_name : description} per friend of the given account.
        If a request fails, the value from robust_request is passed on instead:
        0 when it returned None, -1 for "Not authorized", -2 for
        "page does not exist".
    """
    friends = {}
    total = 0
    next_page = -1
    while True:
        response = robust_request(twitter, 'friends/list',
                                  {'screen_name': screen_name, 'count': 200, 'cursor': next_page})

        if response is None:
            return 0
        # Pass the error codes of robust_request straight through.
        for error_code in (-1, -2):
            if response == error_code:
                return error_code

        page = response.json()

        # Record every friend on this page
        for friend in page['users']:
            friends[friend['screen_name']] = friend['description']
            total += 1

        # A next_cursor of 0 marks the last page
        next_page = page["next_cursor"]
        if next_page == 0:
            break
        print("Collected %d friends for %s" % (total, screen_name))
    print("Returning --- %d friends for %s" % (total, screen_name))
    return friends

In [None]:
# Build { follower : get_friends(follower) } for every IIT follower in a single
# comprehension. Nothing is written to disk here, so an interruption loses the
# whole crawl; get_friends_and_pickle_object (defined below) checkpoints instead.
friends_of_iit_followers = { x : get_friends(x) for x in iit_followers}

In [11]:
def get_friends_and_pickle_object(univ_followers, univ_name):
    """ Calls get_friends() for every follower of a university and checkpoints
        the accumulated results by pickling after each follower.

    Args: An iterable of strings representing the screen_names of all followers of a university,
          A string representing the university's name (used in the pickle file name).

    Returns : A dictionary of { screen_name : friends } pairs, where friends is
              whatever get_friends() returned for that follower: a
              { friend_screen_name : description } dictionary, or one of its
              failure codes (0, -1, -2).
    """
    friends_of_univ_followers = {}
    for follower in univ_followers:
        friends_of_univ_followers[follower] = get_friends(follower)
        # Rewrite the whole pickle after every follower so an interrupted crawl
        # keeps everything collected so far. `with` closes the file even if
        # pickling fails.
        with open("raw_data/friends_of_" + univ_name + "_followers", 'wb') as pickle_file:
            pickle.dump(friends_of_univ_followers, pickle_file)
    return friends_of_univ_followers

In [None]:
# Crawl the friends of every IIT follower, checkpointing to
# raw_data/friends_of_iit_followers after each one.
get_friends_and_pickle_object(iit_followers,"iit")

In [16]:
# Print the position, in the iteration order of iit_followers, of every
# follower whose screen name contains "nancyzhang2" (4717 when last run).
for idx,follower in enumerate(iit_followers):
    if "nancyzhang2" in follower:
        print idx

4717


In [21]:
# Resume slice: followers from position 1548 onward in the iteration order of
# iit_followers.
# NOTE(review): iit_followers is presumably a set (built by get_all_screen_names);
# set ordering is not a documented guarantee, so confirm the offset still points
# at the intended follower before re-running.
iit_followers_2 =  list(iit_followers)[1548:]

In [71]:
#Need to be executed for second half split_pickle
def get_friend_ids(screen_name):
    """ Return the ids of all accounts that this person follows on Twitter.

    Pages of up to 5000 ids are requested from 'friends/ids' and accumulated
    until the response reports a next_cursor of 0.

    Args:
        screen_name: a string of a Twitter screen name
    Returns:
        A list of ids, one per friend of the given account, covering every page.
        If a request fails, the value from robust_request is passed on instead:
        0 when it returned None, -1 for "Not authorized", -2 for
        "page does not exist".
    """
    friends = []
    cursor = -1
    while True:
        request = robust_request(twitter, 'friends/ids',
                                 {'screen_name': screen_name, 'count': 5000, 'cursor': cursor})

        if request is None:
            return 0
        elif request == -1:   # Authentication error : Can't access friend data
            return -1
        elif request == -2:   # Page does not exist error
            return -2

        json_response = request.json()

        # Accumulate this page. Assigning instead of extending would keep only
        # the last page for accounts with more than 5000 friends.
        friends.extend(json_response['ids'])

        # A next_cursor of 0 marks the last page
        if json_response["next_cursor"] == 0:
            break

        # Update cursor to get the next page of ids
        cursor = json_response["next_cursor"]
        print("Collected %d friends for %s" % (len(friends), screen_name))
    print("Returning --- %d friends for %s" % (len(friends), screen_name))
    return friends

In [72]:
#Need to be executed for second half split_pickle
def get_friend_ids_and_pickle_object(univ_followers, univ_name, split_pickle = False):
    """ Calls get_friend_ids() for every follower of a university and pickles
        the results after each follower.

    Args: An iterable of strings representing the screen_names of all followers of a university,
          A string representing the university's name (used in the pickle file names).
          A boolean indicating whether to write one pickle file per follower.

    Returns : When split_pickle is false, a dictionary of { screen_name : friend_ids }
              pairs, where friend_ids is whatever get_friend_ids() returned for that
              follower (a list of ids, or a failure code 0, -1 or -2); the same
              dictionary is rewritten to
              "raw_data/friend_ids_of_<univ_name>_followers" after each follower.
              When split_pickle is true, each follower's entry is written to
              "raw_data/friend_ids_of_<univ_name>_<follower>_followers" and then
              dropped from memory, so the returned dictionary is empty.
    """
    friends_of_univ_followers = {}
    for follower in univ_followers:
        friends_of_univ_followers[follower] = get_friend_ids(follower)

        if split_pickle:
            file_name = ("raw_data/friend_ids_of_" + univ_name + "_" + follower
                         + "_followers")
        else:
            file_name = "raw_data/friend_ids_of_" + univ_name + "_followers"

        # `with` closes the file even if pickling fails.
        with open(file_name, 'wb') as pickle_file:
            pickle.dump(friends_of_univ_followers, pickle_file)

        if split_pickle:
            # The entry is on disk; drop it so memory use stays flat.
            friends_of_univ_followers.clear()

    return friends_of_univ_followers

In [None]:
# Crawl friend ids for the iit_followers_2 slice; results are checkpointed to
# raw_data/friend_ids_of_iit2_followers.
get_friend_ids_and_pickle_object(iit_followers_2,"iit2")

In [11]:
# Third resume slice: followers from position 2777 onward, and its size.
iit_followers_3 = list(iit_followers)[2777:]
print(len(iit_followers_3))

3788


In [None]:
# Crawl friend ids for the iit_followers_3 slice; results are checkpointed to
# raw_data/friend_ids_of_iit3_followers.
get_friend_ids_and_pickle_object(iit_followers_3,"iit3")

In [17]:
# Fourth resume slice, starting at position 4717 (the index printed by the
# "nancyzhang2" lookup cell).
iit_followers_4 =  list(iit_followers)[4717:]

In [None]:
# Crawl friend ids for the iit_followers_4 slice; results are checkpointed to
# raw_data/friend_ids_of_iit4_followers.
get_friend_ids_and_pickle_object(iit_followers_4,"iit4")

# Collecting data for Northwestern University

In [10]:
def get_followers_and_robust_pickle(screen_name, chunk_size=1000):
    """
    Retrieves the followers of an account and pickles them in chunks, each chunk
        a dictionary of { follower_screen_name: {follower_object} } pairs.
        A new file is written once the buffer holds at least chunk_size followers.

    Followers are requested 200 at a time from 'followers/list'. After every
    14 successful pages the function sleeps for 15 minutes to avoid a
    rate-limit error.

    Args:
        screen_name: a string of a Twitter screen name
        chunk_size: minimum number of buffered followers that triggers a flush
                    to "raw_data/<screen_name><count>" (default 1000)

    Returns:
        None. The followers exist only in the pickled chunk files. If a request
        fails (robust_request returns None, -1 or -2), a warning is written to
        stderr and the followers buffered so far are still saved.
    """
    followers = {}
    cursor = -1
    key = 0          # number of follower records seen so far
    rate_limit = 1   # pages requested in the current window, starting at 1
    while True:
        if rate_limit == 15:
            rate_limit = 1
            sys.stderr.write('Avoided rate_limit error: \nsleeping for 15 minutes.\n')
            time.sleep(61 * 15)
        request = robust_request(twitter, 'followers/list',
                                 {'screen_name': screen_name, 'count': 200, 'cursor': cursor})

        # robust_request signals failure with None / -1 / -2, none of which has
        # a .json() method. Stop and fall through to the final flush.
        if request is None or request == -1 or request == -2:
            sys.stderr.write('Request for followers of %s failed; '
                             'saving what was collected so far.\n' % screen_name)
            break

        json_response = request.json()

        # Get up to 200 followers from json_response
        for follower in json_response['users']:
            followers[follower['screen_name']] = follower
            key = key + 1

        # A next_cursor of 0 marks the last page
        if json_response["next_cursor"] == 0:
            break

        # Update cursor to get the next page of followers
        cursor = json_response["next_cursor"]
        rate_limit = rate_limit + 1

        # Flush on buffer size, not `key % 1000 == 0`: a single page with fewer
        # than 200 users would make the running count miss every multiple of
        # 1000, so the old check would never flush again.
        if len(followers) >= chunk_size:
            with open("raw_data/" + screen_name + str(key), 'wb') as pickle_file:
                pickle.dump(followers, pickle_file)
            followers = {}
            print("Pickling %d followers and clearing followers object" % key)

        print("Collected %d followers" % key)

    # Save whatever is left in the buffer. The file name uses key + 1, as in the
    # original implementation.
    if followers:
        with open("raw_data/" + screen_name + str(key + 1), 'wb') as pickle_file:
            pickle.dump(followers, pickle_file)
        print("Pickling %d followers and clearing followers object" % key)

In [None]:
# Fetch the followers of @NUCareerAdvance (Northwestern University career
# services) and pickle them in chunks under raw_data/.
# NOTE: get_followers_and_robust_pickle has no return statement, so this
# variable is None; the data is read back later from the pickled files.
NUCareerAdvance_followers = get_followers_and_robust_pickle('NUCareerAdvance')

In [None]:
# Fetch the followers of @nualumni (Northwestern University alumni) and pickle
# them in chunks under raw_data/.
# NOTE: get_followers_and_robust_pickle has no return statement, so this
# variable is None; the data is read back later from the pickled files.
nualumni_followers = get_followers_and_robust_pickle('nualumni')

In [None]:
# Fetch the followers of @NorthwesternU (Northwestern University) and pickle
# them in chunks under raw_data/.
# NOTE: get_followers_and_robust_pickle has no return statement, so this
# variable is None; the data is read back later from the pickled files.
NorthwesternU_followers = get_followers_and_robust_pickle('NorthwesternU')

In [28]:
def get_filenames(account_name, path = 'raw_data'):
    """ Lists the entries of a directory whose names contain an account's screen_name.

    Args: account_name; the text to look for. Any entry whose name contains it
          (as a substring, anywhere in the name) is selected.
          path; the directory to search (default 'raw_data').

    Returns : A list of matching file names (names only, without the directory),
              in the order os.listdir() reports them.
    """
    return [entry for entry in os.listdir(path) if account_name in entry]

# Show the raw_data entries whose names contain "NorthwesternU".
print(get_filenames('NorthwesternU'))

['NorthwesternU1000', 'NorthwesternU10000', 'NorthwesternU11000', 'NorthwesternU12000', 'NorthwesternU13000', 'NorthwesternU14000', 'NorthwesternU15000', 'NorthwesternU16000', 'NorthwesternU17000', 'NorthwesternU18000', 'NorthwesternU19000', 'NorthwesternU2000', 'NorthwesternU20000', 'NorthwesternU21000', 'NorthwesternU22000', 'NorthwesternU23000', 'NorthwesternU24000', 'NorthwesternU25000', 'NorthwesternU26000', 'NorthwesternU27000', 'NorthwesternU28000', 'NorthwesternU29000', 'NorthwesternU3000', 'NorthwesternU30000', 'NorthwesternU31000', 'NorthwesternU32000', 'NorthwesternU4000', 'NorthwesternU5000', 'NorthwesternU6000', 'NorthwesternU7000', 'NorthwesternU8000', 'NorthwesternU9000']


In [29]:
def aggregate_data(account_name):
    """
    Collect data from different pickled objects related to an account
    and aggregate them to another pickle

    Every file reported by get_filenames(account_name) is loaded with
    get_followers_from_pickle() and merged into one dictionary. When a key
    occurs in several files, the file processed last wins. The merged
    dictionary is written to "raw_data/<account_name>".

    Args: account_name; a string representing the screen_name of the account which we want to aggregate.
    """
    aggregated_followers = {}
    for file_name in get_filenames(account_name):
        aggregated_followers.update(get_followers_from_pickle(file_name))

    # `with` closes the file even if pickling fails.
    with open("raw_data/" + account_name, 'wb') as pickle_file:
        pickle.dump(aggregated_followers, pickle_file)

In [30]:
# Merge the pickled @NorthwesternU chunk files into raw_data/NorthwesternU.
aggregate_data('NorthwesternU')

In [31]:
# Merge the pickled @nualumni chunk files into raw_data/nualumni.
aggregate_data('nualumni')

In [32]:
# Merge the pickled @NUCareerAdvance chunk files into raw_data/NUCareerAdvance.
aggregate_data('NUCareerAdvance')

In [33]:
# Reload the aggregated followers of @NorthwesternU and report how many there are.
nu_univ_followers = get_followers_from_pickle('NorthwesternU')
print("Found %d followers for Northwestern University" % len(nu_univ_followers))

Found 31999 followers for Northwestern University


In [34]:
# Reload the aggregated followers of @nualumni and report how many there are.
# NOTE(review): the message says "Northwestern University" although this count
# is for the alumni account; the label looks copy-pasted.
nu_alumni_followers = get_followers_from_pickle('nualumni')
print("Found %d followers for Northwestern University" % len(nu_alumni_followers))

Found 7816 followers for Northwestern University


In [35]:
# Reload the aggregated followers of @NUCareerAdvance and report how many there are.
# NOTE(review): the message says "Northwestern University" although this count
# is for the career-services account; the label looks copy-pasted.
nu_career_followers = get_followers_from_pickle('NUCareerAdvance')
print("Found %d followers for Northwestern University" % len(nu_career_followers))

Found 3738 followers for Northwestern University


In [49]:
# Sorted union of the three Northwestern follower groups, plus a count of how
# many raw follower records it was built from.
nu_followers = sorted(get_all_screen_names(nu_univ_followers, nu_alumni_followers, nu_career_followers))
print("Found %d unique accounts from %d accounts " % (
    len(nu_followers),
    sum(len(group) for group in (nu_univ_followers, nu_alumni_followers, nu_career_followers))))

Found 35984 unique accounts from 43553 accounts 


In [52]:
# Pickling nu_followers as a SORTED LIST for future calculations.
# `with` closes the file even if pickle.dump raises.
with open("processed_data/nu_followers", 'wb') as fileObject:
    pickle.dump(nu_followers, fileObject)

In [53]:
# Loading nu_followers from pickle.
# Binary mode ('rb') matches the 'wb' used when the file was written.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("processed_data/nu_followers", 'rb') as fileObject:
    nu_followers = pickle.load(fileObject)

### Splitting the university followers into two lists to reduce the time needed to collect their friend ids

In [58]:
# Pickling first half of nu_followers for collecting their friend ids.
# `//` keeps the slice index an integer on Python 3 and under
# `from __future__ import division`; `/` would yield a float there.
with open("processed_data/nu_followers_first_half", 'wb') as fileObject:
    pickle.dump(nu_followers[:len(nu_followers) // 2], fileObject)

In [59]:
# Pickling second half of nu_followers for collecting their friend ids.
# `//` keeps the slice index an integer on Python 3 and under
# `from __future__ import division`; `/` would yield a float there.
with open("processed_data/nu_followers_second_half", 'wb') as fileObject:
    pickle.dump(nu_followers[len(nu_followers) // 2:], fileObject)

In [60]:
# Loading first half of nu_followers from pickle.
# Binary mode ('rb') matches the 'wb' used when the file was written.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("processed_data/nu_followers_first_half", 'rb') as fileObject:
    nu_followers_1 = pickle.load(fileObject)

In [61]:
# Loading second half of nu_followers from pickle.
# Binary mode ('rb') matches the 'wb' used when the file was written.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open("processed_data/nu_followers_second_half", 'rb') as fileObject:
    nu_followers_2 = pickle.load(fileObject)

In [None]:
# Collecting the friend ids of the first half of nu_followers. With
# split_pickle = True each follower gets its own file,
# raw_data/friend_ids_of_nu_<follower>_followers, and the call returns an
# empty dictionary.
get_friend_ids_and_pickle_object(nu_followers_1, 'nu', split_pickle = True)

Returning --- 1292 friends for 005ia
Returning --- 349 friends for 007MAV
Returning --- 882 friends for 007kengom
Returning --- 1965 friends for 0118mag3
Returning --- 20 friends for 0179849769
Returning --- 358 friends for 027_rmm

In [None]:
# Collecting the friend ids of the second half of nu_followers. With
# split_pickle = True each follower gets its own file,
# raw_data/friend_ids_of_nu_<follower>_followers, and the call returns an
# empty dictionary.
get_friend_ids_and_pickle_object(nu_followers_2, 'nu', split_pickle = True)