## HEAD

#### HEAD01 - toggle user settings

In [1]:
## user switches: sampling, API collection, and rebuild-vs-cache choices for
## the time consuming tasks
settings = {
    ## when true, only a small sample of the roster is processed
    'test_mode': True,
    ## toggles twitter api pulls in PULL01-03
    'collect_data': False,
    ## rebuild vs cache load for word_data (MUNG02)
    'rebuild_word_data': True,
    ## rebuild vs cache load for tweet_words (MUNG03)
    'rebuild_tweet_words': True,
    ## rebuild vs cache load for user_token_tally (MUNG04)
    'rebuild_user_token': True,
}

#### HEAD02 - load libraries

In [2]:
##########==========##########==========##########==========##########==========
import tweepy
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep
from os.path import exists
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

#### HEAD03 - load data files

In [3]:
## read in roster of handles
user_data = pd.read_excel("A_Input/twitter_handles.xlsx")

## drop all but a handful of cases if in test mode; the target of 30 handles
## is padded by 1 / 0.7 because the API only returns data for about 70% of
## screen_name requests (see PULL02)
if settings['test_mode']:
    user_data = user_data.sample(
        ## cap at the roster size: sampling more rows than exist raises
        n = min(len(user_data), int(30 / 0.7)),
        ## fixed seed so repeated test-mode runs draw the same handles
        random_state = 0
        )

## read in twitter credentials; initialize api connection
## (twitter_credentials first holds the key table, then the auth handler)
twitter_credentials = pd.read_csv('../api_keys/twitter.csv').set_index('item')
twitter_credentials = tweepy.OAuth1UserHandler(
    consumer_key = twitter_credentials.loc['API Key', 'string'],
    consumer_secret = twitter_credentials.loc['API Key Secret', 'string'],
    access_token = twitter_credentials.loc['Access Token', 'string'],
    access_token_secret = twitter_credentials.loc['Access Token Secret', 'string']
    )
api = tweepy.API(twitter_credentials)

  warn(msg)


#### HEAD04 - create build or cache decision function

In [4]:
## build switching function to execute code or cache results
def build_or_cache(address, function, build_bool):
    """Return a dataset, either rebuilt by `function` or loaded from a csv cache.

    address: path of the csv cache file (read and written with pandas).
    function: zero-argument callable returning an object with a `to_csv`
        method; only called when a rebuild happens.
    build_bool: when true, rebuild even if the cache file already exists.

    Returns the freshly built object, or the DataFrame read from `address`.
    """
    if build_bool or not exists(address):
        x = function()
        x.to_csv(address)
        return x
    ## to_csv above stores the index as the first csv column, so read it back
    ## as the index; otherwise the cached frame gains an 'Unnamed: 0' data
    ## column that the freshly built frame does not have.
    ## NOTE(review): frames with multi-level column headers still do not
    ## round-trip through this plain read_csv call -- confirm callers cope.
    return pd.read_csv(address, index_col = 0)

## HAND

#### HAND01 - extract handles from roster URLs

In [5]:
## derive each handle by removing the site prefix from the roster url and
## trimming surrounding whitespace
user_data['handle'] = (
    user_data['url']
    .str.replace('https://twitter.com/', '', regex = False)
    .str.strip()
)

## PULL

#### PULL01 - query API for each roster handle's user_timeline data

In [6]:
## extract tweet data from api object
def refine_tweet_data(x):
    """Flatten a sequence of status objects into one DataFrame row per status.

    x: sequence of objects exposing id, created_at, lang, full_text and an
       author with screen_name and verified attributes.

    Returns a DataFrame with columns tweet_id, created_at, lang, full_text,
    screen_name and verified (an empty DataFrame when x is empty).
    """
    records = [
        {
            'tweet_id': status.id,
            'created_at': status.created_at,
            'lang': status.lang,
            'full_text': status.full_text,
            'screen_name': status.author.screen_name,
            'verified': status.author.verified,
        }
        for status in x
    ]
    return pd.DataFrame(records)

## define function to pull user tweet data and apply function to extract tweet data
def pull_tweet_data(handles = user_data.handle, a = api):
    """Pull each handle's timeline from the API and stack the results.

    handles: iterable of screen names; the default is bound to
        user_data.handle at the moment this cell is executed.
    a: API client exposing user_timeline; defaults to the notebook's api.

    Returns one DataFrame holding the refine_tweet_data rows of every handle
    whose request succeeded. Handles whose request fails are skipped and
    reported in a single summary line.

    Raises ValueError when no handle returned any data.
    """
    tweet_data = list()
    skipped = list()
    for i in handles:
        try:
            user_tweets = a.user_timeline(
                screen_name = i, count = 200, tweet_mode = 'extended',
                exclude_replies = True, include_rts = False)
            tweet_data.append(refine_tweet_data(user_tweets))
            sleep(0.1)
        except Exception as error:
            ## best effort: one failed handle must not abort the whole pull,
            ## but record it so failures are visible; catching Exception
            ## (not everything) lets a keyboard interrupt stop the loop
            skipped.append((i, type(error).__name__))

    if skipped:
        print('pull_tweet_data skipped {} handle(s): {}'.format(
            len(skipped), skipped))

    ## pd.concat rejects an empty list; fail with an explanatory message
    if not tweet_data:
        raise ValueError('no tweets were retrieved for any requested handle')
    return pd.concat(tweet_data)

## execute code: query the API only when collection is switched on; otherwise
## tweet_data is read from disk in PULL03
if settings['collect_data']:
    tweet_data = pull_tweet_data()

#### PULL02 - tabulate tweet statistics

In [7]:
## compile tweet count and verification status at the user level
def summarize_user_activity(td, ud = user_data):
    """Add per-user tweet statistics to the roster.

    td: DataFrame with screen_name and verified columns, one row per tweet.
    ud: roster DataFrame with handle and url columns; the default is bound to
        user_data at the moment this cell is executed.

    Returns (ud, td_original): the roster with `tweets` (tweet count) and
    `verified` (mean of the verified flag) merged in, and the notebook-level
    tweet_data without its verified column.
    """
    ## NOTE(review): the tweet table handed back is read from the notebook-level
    ## tweet_data variable, not from the td argument
    td_original = tweet_data

    ## calculate tweet summary statistics; string aggregation names give a
    ## fixed column order (a set of callables is unordered) and avoid passing
    ## numpy callables to agg
    td = td.groupby('screen_name')['verified'].agg(['mean', 'size'])
    td = td.rename({'size': 'tweets', 'mean': 'verified'}, axis = 1)

    ## merge statistics into the user_data object
    ud = pd.merge(ud, td, left_on = 'handle', right_on = 'screen_name',
                  how = 'left')
    td_original = td_original.drop(['verified'], axis = 1)
    ## NOTE(review): set_index needs a 'screen_name' column to exist after the
    ## merge (or an 'index' column for the rename to act on) -- confirm the
    ## roster supplies one
    ud = ud.drop(['url'], axis = 1).reset_index(drop = True
            ).rename({'index': 'screen_name'}, axis = 1
                    ).set_index('screen_name')
    return ud, td_original

## execute code
if settings['collect_data']:
    user_data, tweet_data = summarize_user_activity(
        tweet_data[['screen_name', 'verified']])

    ## The Twitter API only returns data for 70% of screen_name requests.
    ## Drop all api-denied requests: those are the rows left without tweet
    ## statistics by the merge. Only the API-derived columns are checked so a
    ## gap in an unrelated roster column does not discard a handle, and the
    ## step is skipped when nothing was collected (user_data is then replaced
    ## by the cached copy in PULL03).
    user_data = user_data.dropna(subset = ['tweets', 'verified'])

#### PULL03 - save datasets to disk

In [8]:
## reload the saved user/tweet csvs, or write them when data was just collected
if not settings['collect_data']:
    user_data = pd.read_csv('B_Process/user_data.csv.gz')
    tweet_data = pd.read_csv('B_Process/tweet_data.csv.gz')
else:
    user_data.to_csv('B_Process/user_data.csv.gz')
    tweet_data.to_csv('B_Process/tweet_data.csv.gz')

## MUNG

#### MUNG01 - parse tweet text into tokens

In [9]:
## tokenize and remove capitalization
def nlp_tokenize_tweet(x):
    """Lower-case the text x and return the result of word_tokenize on it."""
    return word_tokenize(x.lower())

## store the token list of every tweet's full text in a new column
tweet_data['tokens'] = tweet_data['full_text'].apply(nlp_tokenize_tweet)

#### MUNG02 - create word/token level dataset

In [10]:
## create word/token level dataset and identify valid word tokens
def make_word_data(td = tweet_data):
    """Build a one-row-per-word table with counts, validity, tag and lemma.

    td: DataFrame with a `tokens` column holding a list of tokens per row;
        the default is bound to tweet_data at the moment this cell is executed.

    Returns a DataFrame with columns:
        count - number of occurrences of the word across all token lists
        valid - True when the word is frequent enough and its tag starts
                with a letter
        word  - the raw word
        pos   - lower-cased first character of the tag from pos_tag
                ('.' for words that were not tagged)
        token - lemma for valid words, None otherwise

    Errors raised by the lemmatizer are no longer swallowed.
    """
    ## first letter of the tag -> WordNet part-of-speech code; adjective tags
    ## start with 'j' while WordNet names adjectives 'a'
    wordnet_pos = {'n': 'n', 'v': 'v', 'r': 'r', 'j': 'a'}

    ## flatten token lists and count occurrences
    word_data = list()
    for i in td.tokens:
        word_data += i
    word_data = pd.Series(word_data, name = 'count').value_counts()
    word_data = word_data.sort_values(ascending = False)
    word_data = pd.DataFrame(word_data)

    ## determine which tokens occur often enough to warrant inclusion
    word_data['valid'] = word_data['count'] > max(
        word_data['count'].quantile(0.2), 3)
    word_data['word'] = word_data.index

    ## determine part of speech for eligible tokens
    speech_part = word_data['word'].loc[word_data['valid']].values
    speech_part = pos_tag(speech_part)
    speech_part = [i[1][0].lower() for i in speech_part]
    word_data['pos'] = '.'
    word_data.loc[word_data['valid'], 'pos'] = speech_part

    ## words whose tag does not start with a letter are not kept as valid
    word_data['valid'] = word_data['valid'] & word_data['pos'].isin(
        list('abcdefghijklmnopqrstuvwxyz'))

    ## lemmatize the valid words; words whose part of speech WordNet does not
    ## cover keep their raw form as the token
    WNL = WordNetLemmatizer()
    word_data['token'] = None
    for i in word_data.index[word_data['valid'].values]:
        pos = wordnet_pos.get(word_data.at[i, 'pos'])
        if pos is None:
            word_data.at[i, 'token'] = i
        else:
            word_data.at[i, 'token'] = WNL.lemmatize(i, pos = pos)

    return word_data.reset_index(drop = True)

## rebuild word_data or load its cached csv, depending on the settings toggle
word_data = build_or_cache(
    'B_Process/word_data.csv.gz',
    make_word_data,
    settings['rebuild_word_data'],
)

#### MUNG03 - generate a tokens x tweets link database

In [11]:
def make_tweet_token_data(td = tweet_data, wd = word_data):
    """Build a long table linking every tweet to the tokens it contains.

    td: DataFrame with tweet_id and tokens (list of words) columns.
    wd: DataFrame with word and token columns.
    Both defaults are bound to the notebook variables at the moment this cell
    is executed.

    Returns a DataFrame with columns tweet_id, words and tokens, one row per
    word occurrence; rows whose word has no token are dropped.
    """
    ## word -> token lookup; rows with a missing or repeated word are removed
    ## first so the lookup index is unique (a csv-cached wd can contain words
    ## that were parsed as missing values)
    lookup = wd.dropna(subset = ['word']).drop_duplicates(subset = 'word')
    lookup = lookup.set_index('word')['token']

    ## replicate tweet ids
    n = td.tokens.apply(len).values
    tweet_tokens = pd.Series(np.repeat(td.tweet_id.values, n), name = "tweet_id")
    tweet_tokens = pd.DataFrame(tweet_tokens)

    ## allocate words to the new dataset
    words = list()
    for i in td.tokens:
        words += i
    tweet_tokens['words'] = words

    ## convert words to tokens; words absent from the lookup become missing
    ## (instead of raising) and are removed with the other token-less rows
    tweet_tokens['tokens'] = tweet_tokens['words'].map(lookup)
    tweet_tokens = tweet_tokens.dropna()

    return tweet_tokens.reset_index(drop = True)

## rebuild tweet_words or load its cached csv, depending on the settings toggle
tweet_words = build_or_cache(
    'B_Process/tweet_words.csv.gz',
    make_tweet_token_data,
    settings['rebuild_tweet_words'],
)
## remove the token-list column from tweet_data
tweet_data = tweet_data.drop(columns = 'tokens')

In [12]:
## display the tweet_id / words / tokens link table
tweet_words

Unnamed: 0,tweet_id,words,tokens
0,1552338287771193344,honored,honor
1,1552338287771193344,to,to
2,1552338287771193344,be,be
3,1552338287771193344,recognized,recognize
4,1552338287771193344,among,among
...,...,...,...
184557,1499403305985916930,million,million
184558,1499403305985916930,toxic-exposed,toxic-exposed
184559,1499403305985916930,veterans,veteran
184560,1499403305985916930,1/2,1/2


#### MUNG04 - generate a tokens x users count; drop tokens used by fewer than 3 users or by too many users

In [23]:
def make_user_token_matrix(td = tweet_data, tw = tweet_words):
    """Count how often each user wrote each token and filter the tokens.

    td: DataFrame with screen_name and tweet_id columns.
    tw: DataFrame with tweet_id and tokens columns.
    Both defaults are bound to the notebook variables at the moment this cell
    is executed.

    Returns a DataFrame indexed by screen_name whose columns are a two-level
    index ('count', token) holding integer counts, limited to tokens used by
    more than 2 users and by fewer than int(0.8 * number of users) users.
    """
    ## count of number of times each user wrote each token; only the two
    ## counted columns are kept so any extra column in tw (for example an
    ## 'Unnamed: 0' column from a csv cache) cannot split the counts
    tw = tw.merge(right = td[['screen_name', 'tweet_id']],
                  how = 'left', on = 'tweet_id')
    tw = tw[['screen_name', 'tokens']].groupby('screen_name')
    tw = tw.value_counts()
    tw.name = 'count'
    tw = tw.reset_index().set_index('screen_name')
    tw = tw.pivot(columns = 'tokens').fillna(0).astype(int)

    ## keep tokens used by at least 3 users and by fewer than 80 percent of users
    valid_usage = (tw > 0).astype(int).sum().values
    valid_usage = (valid_usage > 2
                  ) & (valid_usage < int(tw.shape[0] * 0.8))

    return tw.loc[:, valid_usage]


## rebuild the user x token tally or load its cached csv, per the settings toggle
user_token_matrix = build_or_cache(
    'B_Process/user_token_tally.csv',
    make_user_token_matrix,
    settings['rebuild_user_token'],
)

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
tokens,%,','d,'ll,'m,'re,'ve,*,+,..,...,❌,❤️,➡️,⬇️,🇨🇦,🇺🇸,🎉,👇,📺,🚨
screen_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AryaCanada,33,0,0,0,1,1,2,0,0,9,...,0,0,0,0,0,0,0,0,0,0
CharlieAngusNDP,0,0,1,1,3,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cooper4SAE,6,5,0,0,2,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DonDavies,7,0,0,0,0,0,0,1,2,0,...,0,0,0,0,0,0,0,2,0,0
DrNealDunnFL2,0,0,0,0,3,5,0,4,0,0,...,0,0,0,1,0,1,0,0,0,1
Jamie_Schmale,3,1,0,0,0,0,0,3,0,0,...,0,0,0,0,1,0,0,1,0,0
JerryMoran,1,3,0,0,3,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
JoyceMurray,5,6,0,1,1,0,1,0,0,0,...,0,0,0,8,4,1,0,0,0,0
Laurel_BC,3,2,0,0,8,3,1,0,0,0,...,0,0,0,0,0,0,1,1,0,2
MartinBowRiver,2,10,0,3,4,1,3,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [24]:
## display the user x token count matrix
user_token_matrix

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count,count
tokens,%,','d,'ll,'m,'re,'ve,*,+,..,...,❌,❤️,➡️,⬇️,🇨🇦,🇺🇸,🎉,👇,📺,🚨
screen_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
AryaCanada,33,0,0,0,1,1,2,0,0,9,...,0,0,0,0,0,0,0,0,0,0
CharlieAngusNDP,0,0,1,1,3,2,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cooper4SAE,6,5,0,0,2,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
DonDavies,7,0,0,0,0,0,0,1,2,0,...,0,0,0,0,0,0,0,2,0,0
DrNealDunnFL2,0,0,0,0,3,5,0,4,0,0,...,0,0,0,1,0,1,0,0,0,1
Jamie_Schmale,3,1,0,0,0,0,0,3,0,0,...,0,0,0,0,1,0,0,1,0,0
JerryMoran,1,3,0,0,3,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
JoyceMurray,5,6,0,1,1,0,1,0,0,0,...,0,0,0,8,4,1,0,0,0,0
Laurel_BC,3,2,0,0,8,3,1,0,0,0,...,0,0,0,0,0,0,1,1,0,2
MartinBowRiver,2,10,0,3,4,1,3,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [None]:
## TODO: fix issue with api-skipped handles not dropping correctly