## HEAD

#### HEAD01 - toggle user settings

In [1]:
## user settings: each flag chooses between redoing a time consuming task (True)
## and reusing the results already saved on disk (False)
settings = {
    'collect_data': False, ## toggles twitter api pulls in PULL01-03
    'rebuild_word_data': False, ## toggle rebuild/load from cache for word_data
    'rebuild_tweet_words': False, ## toggle rebuild/cache for tweet_words
    'rebuild_user_token': False ## toggle rebuild/cache for user_token_tally
    }

#### HEAD02 - load libraries

In [2]:
##########==========##########==========##########==========##########==========
import tweepy
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
from time import sleep
from os.path import exists
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

#### HEAD03 - load data files

In [3]:
## read in roster of handles
user_data = pd.read_excel("A_Input/twitter_handles.xlsx")

## read in twitter credentials; initialize api connection
## the credentials csv is indexed by its 'item' column and each secret is read
## from its 'string' column
twitter_credentials = pd.read_csv('../api_keys/twitter.csv').set_index('item')
## NOTE: twitter_credentials is rebound below, from the credentials table to the
## tweepy OAuth1 handler that is built from it
twitter_credentials = tweepy.OAuth1UserHandler(
    consumer_key = twitter_credentials.loc['API Key', 'string'],
    consumer_secret = twitter_credentials.loc['API Key Secret', 'string'],
    access_token = twitter_credentials.loc['Access Token', 'string'],
   access_token_secret = twitter_credentials.loc['Access Token Secret', 'string']
    )
api = tweepy.API(twitter_credentials)

  warn(msg)


#### HEAD04 - create build or cache decision function

In [4]:
## build switching function to execute code or cache results
def build_or_cache(address, function, build_bool, index_col = None):
    """Rebuild a dataframe and cache it as a csv, or load the cached csv.

    Parameters
    ----------
    address : str
        Path of the csv cache file.
    function : callable
        Zero-argument callable returning a pandas DataFrame; only called when
        a rebuild happens.
    build_bool : bool
        When True the data is rebuilt even if the cache file exists.
    index_col : optional
        Forwarded to pd.read_csv when loading from the cache. to_csv writes the
        row index as the first csv column, so pass index_col = 0 to get the
        same index back that the build path returns. The default (None) keeps
        the earlier behaviour: the saved index comes back as an ordinary
        column and the frame gets a fresh integer index.

    Returns
    -------
    pandas.DataFrame
        The freshly built frame (after writing it to address), or the frame
        read from address.
    """
    if build_bool or not exists(address):
        x = function()
        x.to_csv(address)
        return x
    return pd.read_csv(address, index_col = index_col)

## HAND

#### HAND01 - extract handles from roster URLs

In [5]:
## derive each account's handle by removing the site prefix from its roster url,
## trim surrounding whitespace, then index the roster by handle
user_data['handle'] = (
    user_data['url']
    .str.replace('https://twitter.com/', '', regex = False)
    .str.strip()
)
user_data = user_data.set_index('handle')

## PULL

#### PULL01 - query API for each roster handle's user_timeline data

In [6]:
## extract tweet data from api object
def refine_tweet_data(x):
    """Flatten a sequence of tweet objects into a dataframe indexed by tweet id.

    Parameters
    ----------
    x : iterable
        Tweet objects exposing id, created_at, lang, full_text and an author
        with screen_name and verified attributes.

    Returns
    -------
    pandas.DataFrame
        One row per tweet, indexed by 'tweet_id', with columns created_at,
        lang, full_text, screen_name and verified. An empty x gives an empty
        frame with the same index name and columns.
    """
    columns = [
        'tweet_id', 'created_at', 'lang', 'full_text', 'screen_name', 'verified'
    ]
    tweet_data = [
        {
            'tweet_id': tweet.id, 'created_at': tweet.created_at,
            'lang': tweet.lang, 'full_text': tweet.full_text,
            'screen_name': tweet.author.screen_name,
            'verified': tweet.author.verified
        }
        for tweet in x
    ]
    ## naming the columns explicitly keeps set_index from raising a KeyError
    ## when x is empty and there are no records to infer the columns from
    return pd.DataFrame(tweet_data, columns = columns).set_index('tweet_id')

## define function to pull user tweet data and apply function to extract tweet data
def pull_tweet_data(x = user_data.index.values, a = api):
    """Pull the timeline of every handle and stack the results in one dataframe.

    Parameters
    ----------
    x : iterable of str
        Screen names to query. The default is the roster index as it was when
        this function was defined.
    a : api object
        Object providing user_timeline; defaults to the notebook's api.

    Returns
    -------
    pandas.DataFrame
        Concatenation of refine_tweet_data's output for every handle that was
        retrieved successfully.

    Raises
    ------
    ValueError
        If no handle produced any data.
    """
    import warnings

    tweet_data = list()
    for i in x:
        try:
            user_tweets = a.user_timeline(
                screen_name = i, count = 200, tweet_mode = 'extended',
                exclude_replies = True, include_rts = False)
            tweet_data.append(refine_tweet_data(user_tweets))
            sleep(0.1)
        except Exception as error:
            ## stay best-effort (one bad handle must not stop the pull), but
            ## report the skip and let KeyboardInterrupt/SystemExit through
            warnings.warn('skipped handle {}: {!r}'.format(i, error))
    if not tweet_data:
        ## pd.concat would raise a ValueError here too; this names the cause
        raise ValueError('no tweet data was retrieved for any requested handle')
    return pd.concat(tweet_data)

## execute code: pull fresh tweets only when data collection is switched on
if settings['collect_data']:
    tweet_data = pull_tweet_data()

#### PULL02 - tabulate tweet statistics

In [7]:
## move verified account data from tweet dataset to user dataset
def summarize_verified(td, ud = user_data, td_full = None):
    """Summarise tweets per account and move 'verified' to the user table.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet-level frame with 'screen_name' and 'verified' columns.
    ud : pandas.DataFrame
        User-level frame indexed by handle, with a 'url' column. The default is
        the roster as it was when this function was defined.
    td_full : pandas.DataFrame, optional
        Full tweet-level frame from which 'verified' is dropped. When omitted
        the notebook-level tweet_data is used, as it was implicitly before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ud without 'url', indexed by 'screen_name', with added 'tweets' (tweet
        count) and 'verified' (mean of the verified flag) columns; and the
        tweet-level frame without its 'verified' column.
    """
    td_original = tweet_data if td_full is None else td_full

    ## calculate tweet summary statistics; named aggregation fixes the column
    ## names and their order, which aggregating with a set did not guarantee
    td = td.groupby('screen_name')['verified'].agg(
        tweets = 'size', verified = 'mean')

    ## merge statistics into the user_data object
    ud = pd.merge(ud, td, left_index = True, right_index = True, how = 'left')
    td_original = td_original.drop(['verified'], axis = 1)
    ## rename_axis labels the index whether or not it already carries a name
    ud = ud.drop(['url'], axis = 1).rename_axis('screen_name')
    return ud, td_original

## execute code: only runs on freshly collected tweets; passes just the
## screen_name/verified columns and rebinds both notebook-level datasets
if settings['collect_data']:
    user_data, tweet_data = summarize_verified(tweet_data[['screen_name', 'verified']])

#### PULL03 - save datasets to disk

In [8]:
## save user/tweet datasets to disk as csvs
if settings['collect_data']:
    user_data.to_csv('B_Process/user_data.csv.gz')
    tweet_data.to_csv('B_Process/tweet_data.csv.gz')

## always reload from disk so freshly collected and cached runs hand the same
## layout to later cells: to_csv writes the index out and read_csv returns it as
## an ordinary column, which is how MUNG03/MUNG04 access tweet_id
user_data = pd.read_csv('B_Process/user_data.csv.gz')
tweet_data = pd.read_csv('B_Process/tweet_data.csv.gz')

## MUNG

#### MUNG01 - parse tweet text into tokens

In [9]:
## tokenize and remove capitalization
def nlp_tokenize_tweet(x):
    """Lower-case a tweet's text and split it into tokens.

    Parameters
    ----------
    x : str
        Tweet text. Any non-string value (e.g. the NaN that pandas reads back
        for an empty csv cell, or None) is treated as a tweet without tokens.

    Returns
    -------
    list
        The output of word_tokenize on the lower-cased text, or an empty list
        when x is not a string.
    """
    ## missing text has no lower(); return no tokens instead of raising
    if not isinstance(x, str):
        return []
    return word_tokenize(x.lower())

## execute code: add a 'tokens' column holding nlp_tokenize_tweet's output for
## each tweet's full_text
tweet_data['tokens'] = tweet_data.full_text.apply(nlp_tokenize_tweet)

#### MUNG02 - create word/token level dataset

In [10]:
## create word/token level dataset and identify valid word tokens
def make_word_data(td = tweet_data):
    """Build a word-level table with counts, part of speech and lemma.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet-level frame with a 'tokens' column of token lists. The default is
        tweet_data as bound when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word (also the index), most frequent first, with:
        'count' (occurrences), 'valid' (count above the 20% quantile and an
        alphabetic part-of-speech code), 'word', 'pos' (lower-cased first
        letter of the pos_tag tag, '.' for words that were not tagged) and
        'token' (lemma, or None for rows that are not valid).
    """

    ## flatten token lists and count occurrences
    all_words = [word for token_list in td.tokens for word in token_list]
    ## name the counts after value_counts rather than before it: pandas 2.x
    ## names the index after the counted series, which would put a 'count'
    ## index next to the 'count' column and duplicate the header in the cache
    word_data = pd.Series(all_words).value_counts().rename('count')
    word_data = word_data.sort_values(ascending = False)
    word_data = pd.DataFrame(word_data)

    ## determine which tokens occur often enough to warrant inclusion
    word_data['valid'] = word_data['count'] > word_data['count'].quantile(0.2)
    word_data['word'] = word_data.index

    ## determine part of speech for eligible tokens
    eligible = word_data.loc[word_data['valid'], 'word'].tolist()
    speech_part = [tag[0].lower() for _, tag in pos_tag(eligible)]
    word_data['pos'] = '.'
    word_data.loc[word_data['valid'], 'pos'] = speech_part

    ## lemmatize
    WNL = WordNetLemmatizer()
    ## pos_tag marks adjectives with tags starting in 'J', while the wordnet
    ## lemmatizer expects 'a'; without this mapping adjectives always fell
    ## through to the un-lemmatized fallback
    wordnet_pos = {'j': 'a'}
    letters = 'abcdefghijklmnopqrstuvwxyz'
    tokens = list()
    valid = list()
    rows = zip(word_data['word'], word_data['pos'], word_data['valid'])
    for word, pos, is_valid in rows:
        token = None
        if is_valid:
            if pos in letters:
                try:
                    token = WNL.lemmatize(word, pos = wordnet_pos.get(pos, pos))
                except Exception:
                    ## part of speech codes the lemmatizer cannot handle keep
                    ## the raw word as their token
                    token = word
            else:
                ## punctuation/symbol tags are not words
                is_valid = False
        tokens.append(token)
        valid.append(is_valid)
    ## assign whole columns once instead of writing cell by cell
    word_data['token'] = tokens
    word_data['valid'] = valid

    return word_data

## execute code: rebuild word_data when requested or when no cache file exists,
## otherwise load it from the csv cache
word_data = build_or_cache(
    address = 'B_Process/word_data.csv.gz',
    function = make_word_data,
    build_bool = settings['rebuild_word_data']
    )

  return pd.read_csv(address)


#### MUNG03 - generate a tokens x tweets link database

In [11]:
def make_tweet_token_data(td = tweet_data, wd = word_data):
    """Build a long table linking every tweet to the tokens of its words.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet-level frame with a 'tokens' column of word lists and the tweet id
        either in a 'tweet_id' column or as the index.
    wd : pandas.DataFrame
        Word-level frame with 'word' and 'token' columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'tweet_id' with columns 'words' (the raw word) and 'tokens'
        (its token from wd); rows with a missing value are dropped.
    """

    ## replicate tweet ids
    tweet_ids = td['tweet_id'] if 'tweet_id' in td.columns else td.index
    n = td.tokens.apply(len).values
    tweet_tokens = pd.DataFrame({'tweet_id': np.repeat(tweet_ids.values, n)})

    ## allocate words to the new dataset
    tweet_tokens['words'] = [word for words in td.tokens for word in words]

    ## convert words to tokens; use the wd argument (not the notebook-level
    ## word_data) and look words up through the 'word' column, so the result
    ## does not depend on how wd happens to be indexed (built vs cached)
    word_to_token = dict(zip(wd['word'], wd['token']))
    tweet_tokens['tokens'] = tweet_tokens['words'].map(word_to_token)
    tweet_tokens = tweet_tokens.dropna().set_index('tweet_id')

    return tweet_tokens

## execute code: rebuild tweet_words when requested or when no cache file
## exists, otherwise load it from the csv cache
tweet_words = build_or_cache(
    address = 'B_Process/tweet_words.csv.gz',
    function = make_tweet_token_data,
    build_bool = settings['rebuild_tweet_words']
    )
## drop the tokens column from tweet_data once tweet_words is available
tweet_data = tweet_data.drop('tokens', axis = 1)

#### MUNG04 - generate a tokens x users count summary

In [21]:
def make_user_token_tally(td = tweet_data, tw = tweet_words):
    """Tally tokens per screen_name and spread the tokens across columns.

    td supplies the screen_name of each tweet_id; tw supplies the tweet-level
    words/tokens. Presumably tw holds only tweet_id, words and tokens, so the
    tally is per (screen_name, token) pair -- confirm against the cache file.
    Returns the pivoted tally with missing combinations filled with 0 and
    cast to int.
    """
    ## attach the author of every tweet to its tokens
    authors = td[['screen_name', 'tweet_id']]
    tagged = tw.merge(right = authors, how = 'left', on = 'tweet_id')
    tagged = tagged.drop(['tweet_id', 'words'], axis = 1)

    ## count occurrences within each author, then pivot tokens into columns
    tally = tagged.groupby('screen_name').value_counts()
    tally = tally.reset_index().set_index('screen_name')
    return tally.pivot(columns = 'tokens').fillna(0).astype(int)

## execute code: rebuild user_token_tally when requested or when no cache file
## exists, otherwise load it from the csv cache
user_token_tally = build_or_cache(
    address = 'B_Process/user_token_tally.csv',
    function = make_user_token_tally,
    build_bool = settings['rebuild_user_token']
    )

In [None]:
## TODO: remove the row index from all dataframe outputs