In [1]:
##########==========##########==========##########==========##########==========

## HEAD

#### HEAD01 - load libraries

In [2]:
## import standard libraries
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

## HEAD
import pickle
from os import mkdir
from os.path import exists

## HAND
from time import sleep, time
import tweepy

## MUNG
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

#### HEAD02 - user settings and setting validation

In [3]:
## toggles for executing code versus loading cached results; 'order' is the
## pipeline position that validate_cache_bools uses to cascade rebuilds
set_cache = {
    ## True = use the complete roster; False = sample a small subset (testing)
    'use_full_roster': {'order': 0, 'bool': False},
    ## generate user data from file on disk
    'collect_user_data': {'order': 1, 'bool': False},
    ## download tweets from Twitter api and make a dataset
    'collect_tweet_data': {'order': 2, 'bool': False},
    ## build a dataset of words from tweets
    'rebuild_word_data': {'order': 3, 'bool': False},
    ## build a linking dataset of words to tweets
    'rebuild_tweet_words': {'order': 4, 'bool': False},
    ## aggregate tweet_words to be a matrix of word usage rates for each user
    'rebuild_user_token': {'order': 5, 'bool': False},
    ## re-run fit_all_models instead of loading the cached model_data table
    'rebuild_model_data': {'order': 6, 'bool': True},
    }

## one row per toggle (columns 'order' and 'bool'), sorted by pipeline order
set_cache = pd.DataFrame(set_cache).T.sort_values('order')

## create toggles for other user settings (currently empty)
set_other = dict()

#### HEAD03 - ensure directory structure and validate cache settings

In [4]:
## check that essential directories exist; make directories as needed
def ensure_directories_exist(
    dir_list = ['A_Input', 'B_Process', 'C_Output', 'B_Process/cache']):
    """Create every directory in dir_list that does not exist yet.

    Parameters
    ----------
    dir_list : list of str
        Directory paths, relative to the working directory or absolute.
        Nested paths are allowed in any order because missing parent
        directories are created as well.
    """
    ## local import keeps this block self-contained; the file imports mkdir only
    from os import makedirs
    for i in dir_list:
        ## paths that already exist (of any kind) are left untouched
        if not exists(i):
            ## exist_ok guards against the path appearing between check and call
            makedirs(i, exist_ok = True)

## validate cache settings (ensure that settings are self-consistent)
def validate_cache_bools(sc = set_cache):
    """Cascade rebuild flags down the pipeline.

    Walks the rows of sc in index order. Rows with order 0 are skipped; once
    any other row has a truthy 'bool', that row and every later non-zero-order
    row are set to True. sc is modified in place and also returned.
    """
    rebuild_downstream = False
    for step in sc.index:
        ## order 0 marks a setting that is not a pipeline stage
        if sc.loc[step, 'order'] == 0:
            continue
        if rebuild_downstream or sc.loc[step, 'bool']:
            rebuild_downstream = True
            sc.loc[step, 'bool'] = True
    return sc
        
## execute code: create the default working directories, then cascade the
## rebuild toggles so stages after a rebuilt stage are rebuilt too
ensure_directories_exist()
set_cache = validate_cache_bools()

#### HEAD04 - create shells for future objects and load twitter credentials

In [5]:
## user_data (placeholder; replaced by the roster table in HAND02)
user_data = dict()

## tweet_data (placeholder; replaced by the tweet table in PULL01)
tweet_data = dict()

## word_data (placeholder; replaced by the word table in MUNG01)
word_data = dict()

## user_word_data (placeholder; not reassigned anywhere else in this notebook)
user_word_data = dict()

## twitter_credentials: table indexed by 'item'; HAND01 reads its 'string'
## column for the API key, API key secret, access token and token secret
twitter_credentials = pd.read_csv('../api_keys/twitter.csv').set_index('item')

#### HEAD05 - write functions to toggle between executing functions and loading cached results from code

In [6]:
## toggle function for regenerating objects versus loading from disk
def execute_or_cache(address, function, build_bool):
    """Return function()'s result, using a pickle file at address as a cache.

    Parameters
    ----------
    address : str
        Path of the pickle cache file.
    function : callable
        Zero-argument callable that builds the object.
    build_bool : bool
        If True, or if address does not exist, function is called and its
        result is written to address. Otherwise the cached object is loaded.

    Returns
    -------
    The freshly built or unpickled object.
    """
    if build_bool or not exists(address):
        x = function()
        ## serialize before touching the file so that a pickling failure
        ## cannot leave an empty or truncated cache behind
        payload = pickle.dumps(x)
        with open(address, 'wb') as conn:
            conn.write(payload)
    else:
        ## NOTE: unpickling can execute arbitrary code; only load cache files
        ## that this notebook wrote itself
        with open(address, 'rb') as conn:
            x = pickle.load(conn)
    return x

In [7]:
##########==========##########==========##########==========##########==========

## HAND - Gather Twitter handles for test accounts

#### HAND01 - initialize twitter connection

In [8]:
## build the OAuth handler under its own name so that twitter_credentials
## remains the credentials table and this cell can be re-run safely
twitter_auth = tweepy.OAuth1UserHandler(
    consumer_key = twitter_credentials.loc['API Key', 'string'],
    consumer_secret = twitter_credentials.loc['API Key Secret', 'string'],
    access_token = twitter_credentials.loc['Access Token', 'string'],
    access_token_secret = twitter_credentials.loc['Access Token Secret', 'string']
    )

## api client used by pull_tweet_data
twitter_api = tweepy.API(twitter_auth)

#### HAND02 - load user data from file and extract handles

In [9]:
def load_user_data(addr = 'A_Input/twitter_handles.xlsx', sc = set_cache):
    """Read the user roster and derive a lower-case Twitter handle per row.

    Parameters
    ----------
    addr : str
        Path of the Excel roster; it must contain a 'url' column.
    sc : pandas.DataFrame
        Cache/toggle table; sc.loc['use_full_roster', 'bool'] selects the
        full roster (True) or a fixed-seed test sample (False).

    Returns
    -------
    pandas.DataFrame
        The roster (or sample) with an added 'handle' column.
    """
    
    ## read user data from disk
    ud = pd.read_excel(addr)
    
    ## sample from data if in test mode; cap the sample size at the roster
    ## size so that rosters with fewer than 125 rows do not raise
    if not sc.loc['use_full_roster', 'bool']:
        ud = ud.sample(min(125, ud.shape[0]), random_state = 4431)
    
    ## extract user handles
    ud['handle'] = ud.url.str.replace('https://twitter.com/', '',
                                        regex = False).str.strip().str.lower()
    
    ## return user_data object
    return ud

## load the roster and extract handles from its urls, or reuse the pickled
## copy when 'collect_user_data' is off and the cache file exists
user_data = execute_or_cache(
    address = 'B_Process/cache/user_data.pickle',
    function = load_user_data,
    build_bool = set_cache.loc['collect_user_data', 'bool']
    )

In [10]:
##########==========##########==========##########==========##########==========

## PULL - Pull Twitter data from the API

#### PULL01 - query API for tweet data from each user

In [11]:
## extract tweet data from api object (helps pull_tweet_data)
def refine_tweet_data(x):
    """Flatten a sequence of tweet objects into a DataFrame.

    Each element of x must expose id, created_at, lang, full_text and an
    author with screen_name and verified. Returns one row per element with
    columns tweet_id, created_at, lang, full_text, screen_name, verified.
    """
    records = [
        {
            'tweet_id': x[pos].id,
            'created_at': x[pos].created_at,
            'lang': x[pos].lang,
            'full_text': x[pos].full_text,
            'screen_name': x[pos].author.screen_name,
            'verified' : x[pos].author.verified,
        }
        for pos in range(len(x))
    ]
    return pd.DataFrame(records)

## pull data from the api
def pull_tweet_data(handles = user_data.handle, a = twitter_api):
    """Download up to 200 recent original tweets for each handle.

    Parameters
    ----------
    handles : iterable of str
        Screen names to query.
    a : tweepy.API
        Client whose user_timeline method is called once per handle.

    Returns
    -------
    pandas.DataFrame
        Concatenated refine_tweet_data output with 'screen_name' lower-cased.

    Raises
    ------
    ValueError
        If no handle could be retrieved at all.
    """
    frames = list()
    for i in handles:
        start_time = time()
        try:
            user_tweets = a.user_timeline(
                screen_name = i, count = 200, tweet_mode = 'extended', 
                exclude_replies = True, include_rts = False)
            frames.append(refine_tweet_data(user_tweets))
        except Exception as err:
            ## best effort: one failing account must not abort the whole pull,
            ## but say which one failed (KeyboardInterrupt still propagates)
            print('Skipped handle ' + str(i) + ': ' + repr(err))
        ## throttle to at most one request per second, also after a failure
        sleep(max(1 - (time() - start_time), 0))
    if len(frames) == 0:
        raise ValueError('No tweets could be retrieved for any handle')
    tweet_data = pd.concat(frames)
    tweet_data['screen_name'] = tweet_data['screen_name'].str.lower()
    return tweet_data

## execute code: query the api for every roster handle, or reuse the pickled
## tweet table when 'collect_tweet_data' is off and the cache file exists
tweet_data = execute_or_cache(
    address = 'B_Process/cache/tweet_data.pickle',
    function = pull_tweet_data,
    build_bool = set_cache.loc['collect_tweet_data', 'bool']
    )

#### PULL02 - tabulate tweet statistics, divide users into train/test sets

In [12]:
## compile tweet count and verification status at the user level
def enhance_user_tweet_data(td = tweet_data, ud = user_data):
    """Attach per-user tweet statistics and an ML split label to the roster.

    Returns a tuple (ud, td):
      * ud - roster without 'url', plus 'verified' (mean of the tweet-level
        flag), 'tweets' (tweet count, 0 when none) and 'ml_set'
        ('train' / 'test' drawn 80/20 with a fixed seed; 'exclude' for
        users without tweets)
      * td - the tweet table without its 'verified' column
    """
    
    ## calculate tweet summary statistics per screen name
    per_tweet = td[['screen_name', 'verified']].copy()
    share_verified = per_tweet.groupby('screen_name').mean()
    tweet_count = per_tweet['screen_name'].value_counts()
    user_stats = pd.concat([share_verified, tweet_count], axis = 1)
    user_stats = user_stats.reset_index()
    user_stats.columns = ['handle', 'verified', 'tweets']
    
    ## merge statistics into the user_data object
    ud = pd.merge(ud, user_stats, on = 'handle', how = 'left')
    ud = ud.drop(['url'], axis = 1).reset_index(drop = True)
    ud = ud.fillna({'tweets': 0}).astype({'tweets': int})
    
    ## divide users into train and test subsets
    split_labels = pd.Series(['train', 'test'], name = 'ml_set')
    split_draw = split_labels.sample(
        n = ud.shape[0], replace = True, weights = [0.8, 0.2],
        random_state = 2006)
    ud['ml_set'] = split_draw.values
    ud.loc[ud.tweets == 0, 'ml_set'] = 'exclude'
    
    ## verification status now lives at the user level only
    return ud, td.drop(['verified'], axis = 1)

## execute code (not cached): rebinds both tables, so re-running this cell
## without first re-running the cells above operates on already-enhanced data
user_data, tweet_data = enhance_user_tweet_data()

In [13]:
##########==========##########==========##########==========##########==========

## MUNG - Process Twitter data to model-ready format

#### MUNG01 - tokenize words and generate a word-level dataset

In [14]:
## tokenize, remove capitalization, and remove duplicate tokens
def nlp_tokenize_tweet(x):
    """Return the distinct lower-case tokens of the text x.

    Tokens are kept in order of first appearance, so the result is the same
    in every kernel session (set iteration order for strings is not).
    """
    x = x.lower()
    x = word_tokenize(x)
    ## dict keys are unique and remember insertion order
    x = list(dict.fromkeys(x))
    return x

## create word/token level dataset and identify valid word tokens
def make_word_data(td = tweet_data):
    """Build the word table from tweet text.

    Side effect: adds a 'tokens' column (list of distinct tokens per tweet)
    to td in place; make_tweet_token_data reads that column.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word with columns 'count' (number of tweets
        containing it), 'valid', 'word', 'pos' (lower-cased first letter of
        the pos_tag tag, '.' for words that were not tagged) and 'token'
        (lemma; None for words that are not valid).
    """
    
    ## tokenize tweet data text (mutates td; see docstring)
    td['tokens'] = td.full_text.apply(nlp_tokenize_tweet)

    ## flatten token lists and count occurrences
    word_data = list()
    for i in td.tokens:
        word_data += i
    word_data = pd.Series(word_data, name = 'count').value_counts()
    word_data = word_data.sort_values(ascending = False)
    word_data = pd.DataFrame(word_data)
    
    ## determine which tokens occur often enough to warrant inclusion
    word_data['valid'] = word_data['count'] > max(
        word_data['count'].quantile(0.2), 3)
    word_data['word'] = word_data.index
    
    ## determine part of speech for eligible tokens
    speech_part = word_data['word'].loc[word_data['valid']].values
    speech_part = pos_tag(speech_part)
    speech_part = [i[1][0].lower() for i in speech_part]
    word_data['pos'] = '.'
    word_data.loc[word_data['valid'], 'pos'] = speech_part
    
    ## lemmatize; only rows that passed the frequency filter are visited
    ## (selected explicitly rather than by breaking at the first invalid row)
    WNL = WordNetLemmatizer()
    word_data['token'] = None
    letters = 'abcdefghijklmnopqrstuvwxyz'
    for i in list(word_data.index[word_data['valid'].values]):
        if word_data.loc[i, 'pos'] in letters:
            ## NOTE(review): 'pos' is the first letter of the tagger's tag;
            ## letters the lemmatizer rejects presumably land in the except
            ## branch (e.g. adjective tags starting with 'j') - confirm
            ## whether a tag-to-WordNet mapping is wanted
            try:
                word_data.loc[i, 'token'] = WNL.lemmatize(
                    word_data.loc[i, 'word'],
                    pos = word_data.loc[i, 'pos']
                )
            except Exception:
                ## fall back to the raw word when lemmatization fails
                word_data.loc[i, 'token'] = word_data.loc[i, 'word']
        else:
            ## punctuation and symbols are not words
            word_data.loc[i, 'valid'] = False
        
    return word_data.reset_index(drop = True)

## execute code: build the word table, or reuse the pickled copy when
## 'rebuild_word_data' is off and the cache file exists (in that case
## make_word_data does not run and tweet_data gets no 'tokens' column)
word_data = execute_or_cache(
    address = 'B_Process/cache/word_data.pickle',
    function = make_word_data,
    build_bool = set_cache.loc['rebuild_word_data', 'bool']
    )

#### MUNG03 - generate a tokens x tweets link database

In [15]:
## build a linking dataset of words to tweets
def make_tweet_token_data(td = tweet_data, wd = word_data):
    """Link every tweet to the lemmatized tokens it contains.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet table with 'tweet_id' and 'full_text'; a 'tokens' column of
        token lists is used when present.
    wd : pandas.DataFrame
        Word table with 'word' and 'token' columns.

    Returns
    -------
    pandas.DataFrame
        Columns 'tweet_id', 'words', 'tokens'; one row per (tweet, word)
        pair whose word has a token in wd.
    """
    
    wd = wd.set_index('word')
    
    ## the 'tokens' column only exists if make_word_data ran in this session;
    ## when word_data came from the cache, tokenize here instead
    if 'tokens' in td.columns:
        token_lists = td['tokens']
    else:
        token_lists = td.full_text.apply(nlp_tokenize_tweet)
    
    ## replicate tweet ids, once per word in the tweet
    n = token_lists.apply(len).values.astype(int)
    tweet_tokens = pd.Series(np.repeat(td.tweet_id.values, n), name = "tweet_id")
    tweet_tokens = pd.DataFrame(tweet_tokens)
    
    ## allocate words to the new dataset
    tweet_tokens['words'] = [w for toks in token_lists for w in toks]
    
    ## convert words to tokens; words without a token (not valid, or missing
    ## from a stale cached word table) become missing and are dropped
    tweet_tokens['tokens'] = tweet_tokens['words'].map(wd['token'])
    tweet_tokens = tweet_tokens.dropna()

    return tweet_tokens.reset_index(drop = True)

## execute code: build the word-to-tweet link table, or reuse the pickled
## copy when 'rebuild_tweet_words' is off and the cache file exists
tweet_words = execute_or_cache(
    address = 'B_Process/cache/tweet_words.pickle',
    function = make_tweet_token_data,
    build_bool = set_cache.loc['rebuild_tweet_words', 'bool']
    )

#### MUNG04 - generate a tokens x users count; drop tokens used by fewer than 3 users or by roughly 90 percent or more of users

In [16]:
## aggregate tweet_words to be a matrix of word usage rates for each user
def make_user_token_matrix(td = tweet_data, tw = tweet_words, ud = user_data):
    """Build a users x tokens matrix of token usage per 1,000 tweets.

    Parameters
    ----------
    td : pandas.DataFrame
        Tweet table with 'screen_name' and 'tweet_id'.
    tw : pandas.DataFrame
        Link table with 'tweet_id', 'words' and 'tokens'.
    ud : pandas.DataFrame
        User table with 'handle' and 'tweets' (tweet count).

    Returns
    -------
    pandas.DataFrame
        A 'screen_name' column followed by one integer column per kept token.
    """
    
    ## count of number of times each user wrote each token
    tw = tw.merge(right = td[['screen_name', 'tweet_id']],
                  how = 'left', on = 'tweet_id')
    tw = tw.drop(['tweet_id', 'words'], axis = 1).groupby('screen_name')
    tw = tw.value_counts()
    tw.name = 'count'
    tw = tw.reset_index().set_index('screen_name')
    tw = tw.pivot(columns = 'tokens').fillna(0).astype(int)
    tw = tw.droplevel(axis = 1, level = 0)
    
    ## keep tokens used by more than 2 users and by fewer than 90 percent
    ## (rounded down) of users
    valid_usage = (tw > 0).astype(int).sum().values
    valid_usage = (valid_usage > 2
                  ) & (valid_usage < int(tw.shape[0] * 0.9))
    tw = tw.loc[:, valid_usage]
    
    ## standardize matrix as words per 1,000 tweets; select the column by
    ## name so a one-user matrix still yields a Series (squeeze would
    ## collapse a 1 x 1 frame to a scalar)
    denom = pd.DataFrame({'handle': tw.index}).merge(
        ud[['handle', 'tweets']],
        how = 'left', on = 'handle'
        ).set_index('handle')['tweets'].fillna(1)
    denom.loc[denom < 1] = 1
    tw = (tw.divide(denom, axis = 0) * 1000).astype(int)
    
    ## name the index explicitly so reset_index always produces a
    ## 'screen_name' column, whatever name survived the steps above
    tw = tw.rename_axis(index = 'screen_name').reset_index()
    
    return tw

## execute code: build the user x token matrix, or reuse the pickled copy when
## 'rebuild_user_token' is off and the cache file exists; either way the
## result is re-indexed by screen name after loading
user_token_matrix = execute_or_cache(
    address = 'B_Process/cache/user_token_matrix.pickle',
    function = make_user_token_matrix,
    build_bool = set_cache.loc['rebuild_user_token', 'bool']
    ).set_index('screen_name')

In [17]:
##########==========##########==========##########==========##########==========

## MODE – Train models and tune hyperparameters

#### MODE01 - build x/y train/test datasets

In [18]:
## unpack train and test datasets
def split_xy_data(ml_cat):
    """Return (x, y1, y2) for the users whose ml_set equals ml_cat.

    Reads the module-level user_data and user_token_matrix.
      * x  - rows of user_token_matrix for those users
      * y1 - 1 where 'group' is 'USA House', else 0
      * y2 - 1 where 'party' is 'Republican', else 0
    """
    in_split = user_data.ml_set == ml_cat
    handles = user_data.loc[in_split, 'handle'].values
    
    ## features
    x = user_token_matrix.loc[handles, :]
    
    ## labels, looked up by handle
    by_handle = user_data.set_index('handle')
    y1 = (by_handle.loc[handles, 'group'] == 'USA House').astype(int)
    y2 = (by_handle.loc[handles, 'party'] == 'Republican').astype(int)
    
    return x, y1, y2

## execute code: feature matrix plus the 'USA House' (y1) and 'Republican'
## (y2) indicator labels for the train and test users
train_x, train_y1, train_y2 = split_xy_data('train')
test_x, test_y1, test_y2    = split_xy_data('test')

In [19]:
## drop tweet_words to free up extra space

In [20]:
## NOTE: raises NameError if this cell is re-run without rebuilding tweet_words
del tweet_words

#### MODE02 - formulate model database

In [21]:
## generate dataset of all model variations to be tested
def make_model_data():
    """Build the table of model formulations to fit.

    One row per combination of model ('LR', 'NB', 'RF', 'AB'), stage
    ('1S', '2S') and feature matrix ('FM', 'PC'), indexed by the three codes
    joined together. Score columns start at -1, 'params' holds the
    hyperparameter grid for the row's model and 'object' an empty list.
    Reads the module-level train_y1 and train_y2 for the 'NB' priors.
    """
    
    ## generate model formulations
    model_data = pd.DataFrame({
        algo + stage + feats: {'model': algo, 'stage': stage, 'x': feats}
        for algo in ['LR', 'NB', 'RF', 'AB']
        for stage in ['1S', '2S']
        for feats in ['FM', 'PC']
    }).T
    
    ## create slots for time and accuracy scores
    for score in ['precision', 'recall', 'f1', 'time']:
        model_data[score] = -1
    
    ## generate lists of hyperparameters to test
    rate_y1 = train_y1.mean()
    rate_y2 = train_y2.mean()
    powers_6_to_9 = range(6, 10)
    model_params = {
        'LR': {'C': [2. ** i for i in range(-2, 6)]},
        'NB': {'priors': [(0.5, 0.5), (1 - rate_y1, rate_y1),
                          (1 - rate_y2, rate_y2)]},
        'RF': {'n_estimators': [2**i for i in powers_6_to_9],
               'min_samples_leaf': [2**i for i in range(2, 5)]},
        'AB':{'n_estimators':  [int(2**i) for i in powers_6_to_9],
              'learning_rate': [2**i for i in range(-2, 3)]},
    }
    ## rows of the same model share one grid dictionary
    model_data['params'] = [
        model_params[model_data.loc[row, 'model']] for row in model_data.index]
    
    ## create slot for GridSearchCV object
    model_data['object'] = [list() for row in model_data.index]
    
    ## return results
    return model_data

## execute code: one row per model / stage / feature-matrix combination
model_data = make_model_data()

#### MODE03 - fit models and find best parameters for each model

In [22]:
def fit_1s_models(md, x, y1, y2):
    """Visit every one-stage model in md (fitting is not implemented yet).

    Prints a progress line for each row whose 'stage' is not '2S' and
    returns md unchanged. x, y1 and y2 are accepted but not used yet.
    """
    
    ## iterate through all one stage models
    for model_id in md.index:
        if md.loc[model_id, 'stage'] != '2S':
            print('Model in progress: ' + model_id)
            
            ## FM v PC feature matrix (placeholder: no branch acts yet)
            wants_pc = md.loc[model_id, 'x'] == 'PC'
            
            ## fit model (placeholder)
            
            ## extract statistics (precision, recall, f1, time elapsed)
            
    ## express object
    return md
        
def fit_2s_models(md, x, y1, y2):
    """Visit every two-stage model in md (fitting is not implemented yet).

    Prints a progress line for each row whose 'stage' is not '1S' and
    returns md unchanged. x, y1 and y2 are accepted but not used yet.
    """
    
    ## iterate through all two stage models
    for model_id in md.index:
        if md.loc[model_id, 'stage'] != '1S':
            print('Model in progress: ' + model_id)
            
            ## FM v PC feature matrix (placeholder: no branch acts yet)
            wants_pc = md.loc[model_id, 'x'] == 'PC'
            
            ## fit stage 1 model (placeholder)
            
            ## fit stage 2 model (placeholder)
            
            ## extract statistics (precision, recall, f1, time elapsed)
            
    ## express object
    return md

def fit_all_models(md = model_data, x = train_x, y1 = train_y1, y2 = train_y2):
    """Run the one-stage pass and then the two-stage pass over md."""
    after_one_stage = fit_1s_models(md = md, x = x, y1 = y1, y2 = y2)
    return fit_2s_models(md = after_one_stage, x = x, y1 = y1, y2 = y2)

## execute code: run every model formulation, or reuse the pickled table when
## 'rebuild_model_data' is off and the cache file exists
model_data = execute_or_cache(
    address = 'B_Process/cache/model_data.pickle',
    function = fit_all_models,
    build_bool = set_cache.loc['rebuild_model_data', 'bool']
    )

Model in progress: LR1SFM
Model in progress: LR1SPC
Model in progress: NB1SFM
Model in progress: NB1SPC
Model in progress: RF1SFM
Model in progress: RF1SPC
Model in progress: AB1SFM
Model in progress: AB1SPC
Model in progress: LR2SFM
Model in progress: LR2SPC
Model in progress: NB2SFM
Model in progress: NB2SPC
Model in progress: RF2SFM
Model in progress: RF2SPC
Model in progress: AB2SFM
Model in progress: AB2SPC


In [23]:
##########==========##########==========##########==========##########==========

## EVAL

In [24]:
##########==========##########==========##########==========##########==========

## TOPI

In [25]:
##########==========##########==========##########==========##########==========

## FOOT

In [26]:
##########==========##########==========##########==========##########==========