In [1]:
##########==========##########==========##########==========##########==========

## HEAD

#### HEAD01 - toggle user settings

In [2]:
## determine whether to cache data from some time consuming tasks
## selecting True for any option increases run time and memory usage
## selecting True for all options fully executes the code from start-to-finish
## notebook-wide switches; later cells look these up by key
settings = {
    'num_parallel_cores': 13, ## passed as n_jobs to the sklearn searches
    'full_data_mode': True, ## model all data, not just a sample for code dev
    
    'collect_data': False, ## toggles twitter api pulls in PULL01-03
    
    'rebuild_word_data': False, ## rebuild vs cache load for word_data MUNG02
    'rebuild_tweet_words': False, ## rebuild vs cache load tweet_words MUNG03
    'rebuild_user_token': False, ## rebuild v cache user_token_matrix MUNG04
    
    'rebuild_logistic_hparams': False, ## rebuild v cache ml training HYPE04
    'rebuild_bayes_hparams': False, ## rebuild v cache ml training HYPE05
    'rebuild_forest_hparams': False, ## rebuild v cache ml training HYPE06
    'rebuild_adaboost_hparams': False, ## rebuild v cache ml training HYPE07
    
    'rebuild_important_features': False, ## rebuild/cache var importance EVAL04
    }

#### HEAD02 - load libraries

In [3]:
import tweepy
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from time import sleep
from os.path import exists
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.inspection import permutation_importance
from sklearn.cluster import AgglomerativeClustering

#### HEAD03 - load data files

In [4]:
## read in roster of handles
user_data = pd.read_excel("A_Input/twitter_handles.xlsx")

## drop all but a handful of cases if in test mode
## (the fixed random_state keeps the 125-row sample identical between runs)
if not settings['full_data_mode']:
    user_data = user_data.sample(125, random_state = 5542)

## read in twitter credentials; initialize api connection
## the csv is indexed by its 'item' column; secrets are read from 'string'
## NOTE: twitter_credentials is rebound below, from the credentials table
## to the OAuth handler built from it
twitter_credentials = pd.read_csv('../api_keys/twitter.csv').set_index('item')
twitter_credentials = tweepy.OAuth1UserHandler(
    consumer_key = twitter_credentials.loc['API Key', 'string'],
    consumer_secret = twitter_credentials.loc['API Key Secret', 'string'],
    access_token = twitter_credentials.loc['Access Token', 'string'],
   access_token_secret = twitter_credentials.loc['Access Token Secret', 'string']
    )
api = tweepy.API(twitter_credentials)

  warn(msg)


#### HEAD04 - create build or cache decision function

In [5]:
## build switching function to execute code or cache results
def build_or_cache_csv(address, function, build_bool):
    """Return a DataFrame, either rebuilt by `function` or read from a csv.

    address: path of the csv cache file
    function: zero-argument callable returning a DataFrame
    build_bool: when True, always rebuild and overwrite the cache

    The cache is only read when it exists and no rebuild was requested;
    otherwise `function` runs and its result is written (without the index).
    """
    ## fast path: reuse the cached csv
    if not build_bool and exists(address):
        return pd.read_csv(address)

    ## slow path: rebuild, refresh the cache, hand back the fresh object
    fresh = function()
    fresh.to_csv(address, index = False)
    return fresh
    
def build_or_cache_pickle(address, function, build_bool):
    """Return an object, either rebuilt by `function` or loaded from a pickle.

    address: path of the pickle cache file
    function: zero-argument callable returning any picklable object
    build_bool: when True, always rebuild and overwrite the cache

    Files are opened with `with` so the handle is closed even when
    pickling or unpickling raises.
    """
    if build_bool or not exists(address):
        x = function()
        with open(address, 'wb') as conn:
            pickle.dump(x, conn)
        return x
    else:
        ## NOTE(review): pickle.load can execute arbitrary code; only point
        ## this at cache files this notebook wrote itself
        with open(address, 'rb') as conn:
            return pickle.load(conn)

In [6]:
##########==========##########==========##########==========##########==========

## HAND – Gather Twitter handles for test accounts

#### HAND01 - extract handles from roster URLs

In [7]:
## extract handles from roster urls:
## remove the literal 'https://twitter.com/' prefix (regex = False), then
## trim surrounding whitespace and lower-case the remainder
user_data['handle'] = user_data.url.str.replace('https://twitter.com/', '',
            regex = False).str.strip().str.lower()

In [8]:
##########==========##########==========##########==========##########==========

## PULL - Pull Twitter data from the API

#### PULL01 - query API for each roster handle's user_timeline data

In [9]:
## extract tweet data from api object
def refine_tweet_data(x):
    """Flatten a sequence of tweet objects into a DataFrame, one row each.

    Every element of x must expose id, created_at, lang, full_text and an
    author with screen_name and verified attributes.
    """
    rows = [
        {
            'tweet_id': tweet.id,
            'created_at': tweet.created_at,
            'lang': tweet.lang,
            'full_text': tweet.full_text,
            'screen_name': tweet.author.screen_name,
            'verified': tweet.author.verified,
        }
        for tweet in x
    ]
    return pd.DataFrame(rows)

## define function to pull user tweet data and apply function to extract tweet data
def pull_tweet_data(handles = user_data.handle, a = api):
    """Pull each handle's timeline and stack the results in one DataFrame.

    handles: iterable of screen names (default: the roster's handle column
        as it exists when this function is defined)
    a: api object exposing user_timeline (default: the module-level api)

    Returns the concatenated refine_tweet_data output with screen_name
    lower-cased. Raises ValueError when no handle could be pulled at all.
    """
    tweet_data = list()
    for i in handles:
        try:
            user_tweets = a.user_timeline(
                screen_name = i, count = 200, tweet_mode = 'extended', 
                exclude_replies = True, include_rts = False)
            tweet_data.append(refine_tweet_data(user_tweets))
            ## pause between successful requests
            sleep(1)
        except Exception as error:
            ## still best-effort (skip the handle), but no longer silent and
            ## no longer trapping KeyboardInterrupt / SystemExit
            print('WARNING: skipped handle', repr(i), '-', repr(error))

    ## pd.concat on an empty list raises an opaque ValueError; say why instead
    if len(tweet_data) == 0:
        raise ValueError('no tweet data could be pulled for any handle')

    tweet_data = pd.concat(tweet_data)
    tweet_data['screen_name'] = tweet_data['screen_name'].str.lower()
    return tweet_data

## execute code (only when fresh api pulls are switched on in settings)
if settings['collect_data']:
    tweet_data = pull_tweet_data()

#### PULL02 - tabulate tweet statistics, divide users into train/tune/test sets

In [10]:
## compile tweet count and verification status at the user level
def enhance_user_data(td, ud = user_data):
    """Add per-user tweet statistics and a train/test label to the roster.

    td: tweet-level DataFrame with 'screen_name' and 'verified' columns
    ud: roster with 'handle' and 'url' columns (default: module-level
        user_data as it exists when this function is defined)

    Returns (ud, td_original): the enhanced roster, and the module-level
    tweet_data with its 'verified' column dropped.
    NOTE: td_original is read from the global tweet_data, not from td.
    """
    ## calculate tweet summary statistics
    td_original = tweet_data
    td = td.copy()
    verified = td.groupby('screen_name').mean()
    tweets = td['screen_name'].value_counts()
    td = pd.concat([verified, tweets], axis = 1).reset_index()
    td.columns = ['handle', 'verified', 'tweets']
    
    ## merge statistics into the user_data object
    ## (left join keeps roster users that have no tweets; their count is 0)
    ud = pd.merge(ud, td, on = 'handle', how = 'left')
    td_original = td_original.drop(['verified'], axis = 1)
    ud = ud.drop(['url'], axis = 1).reset_index(drop = True)
    ud = ud.fillna({'tweets': 0}).astype({'tweets': int})
    
    ## divide users into train and test subsets (weights 0.8 / 0.2);
    ## the fixed random_state makes the assignment repeatable
    ml_set = pd.Series(['train', 'test'], name = 'ml_set').sample(
                n = ud.shape[0], replace = True, weights = [0.8, 0.20],
                random_state = 2006)
    ud['ml_set'] = ml_set.values
    ## users without tweets get their own label instead of train/test
    ud.loc[ud.tweets == 0, 'ml_set'] = 'exclude'
    
    return ud, td_original

## execute code
## only the two columns used for the user-level summary are passed in
if settings['collect_data']:
    user_data, tweet_data = enhance_user_data(
        tweet_data[['screen_name', 'verified']])

#### PULL03 - save datasets to disk

In [11]:
## save user/tweet datasets to disk as csvs after a fresh collection;
## otherwise load the copies saved by an earlier run
if settings['collect_data']:
    user_data.to_csv('B_Process/user_data.csv', index = False)
    tweet_data.to_csv('B_Process/tweet_data.csv', index = False)
else:
    user_data = pd.read_csv('B_Process/user_data.csv')
    tweet_data = pd.read_csv('B_Process/tweet_data.csv')

In [12]:
##########==========##########==========##########==========##########==========

## MUNG - Process Twitter data to model-ready format

#### MUNG01 - parse tweet text into tokens

In [13]:
## tokenize, remove capitalization, and remove duplicate tokens
def nlp_tokenize_tweet(x):
    """Lower-case a tweet, tokenize it, and return its distinct tokens.

    The result is a list built from a set, so each token appears once and
    the ordering of tokens is not defined.
    """
    distinct_tokens = set(word_tokenize(x.lower()))
    return list(distinct_tokens)

## execute code: add a 'tokens' column holding each tweet's token list
tweet_data['tokens'] = tweet_data.full_text.apply(nlp_tokenize_tweet)

#### MUNG02 - create word/token level dataset

In [14]:
## create word/token level dataset and identify valid word tokens
def make_word_data(td = tweet_data):
    """Build a word-level table: count, validity flag, POS letter, lemma.

    td: DataFrame with a 'tokens' column of token lists (default: the
        module-level tweet_data as it exists when this function is defined)

    Returns a DataFrame with columns count, valid, word, pos and token.
    """

    ## flatten token lists and count occurrences
    word_data = list()
    for i in td.tokens:
        word_data += i
    word_data = pd.Series(word_data, name = 'count').value_counts()
    word_data = word_data.sort_values(ascending = False)
    word_data = pd.DataFrame(word_data)
    
    ## determine which tokens occur often enough to warrant inclusion:
    ## count must exceed the larger of the 20th-percentile count and 3
    word_data['valid'] = word_data['count'] > max(
        word_data['count'].quantile(0.2), 3)
    word_data['word'] = word_data.index
    
    ## determine part of speech for eligible tokens
    ## (only the first letter of each tag is kept, lower-cased)
    speech_part = word_data['word'].loc[word_data['valid']].values
    speech_part = pos_tag(speech_part)
    speech_part = [i[1][0].lower() for i in speech_part]
    word_data['pos'] = '.'
    word_data.loc[word_data['valid'], 'pos'] = speech_part
    
    ## lemmatize
    WNL = WordNetLemmatizer()
    word_data['token'] = None
    for i in word_data.word:
        ## rows are sorted by descending count, so the first invalid row
        ## means every remaining row is invalid as well
        if not word_data.loc[i, 'valid']: 
            break
        if word_data.loc[i, 'pos'] in 'abcdefghijklmnopqrstuvwxyz':
            try:
                word_data.loc[i, 'token'] = WNL.lemmatize(
                    word_data.loc[i, 'word'],
                    pos = word_data.loc[i, 'pos']
                )
            except Exception:
                ## lemmatizer could not handle this word/pos pair: keep the
                ## raw word (KeyboardInterrupt/SystemExit now propagate)
                word_data.loc[i, 'token'] = word_data.loc[i, 'word']
        else:
            ## tag does not start with a letter: exclude the token
            word_data.loc[i, 'valid'] = False
        
    return word_data.reset_index(drop = True)

## execute code: rebuild word_data when requested or when no cache exists,
## otherwise read it back from the csv cache
word_data = build_or_cache_csv(
    address = 'B_Process/word_data.csv',
    function = make_word_data,
    build_bool = settings['rebuild_word_data']
    )

  return pd.read_csv(address)


#### MUNG03 - generate a tokens x tweets link database

In [15]:
def make_tweet_token_data(td = tweet_data, wd = word_data):
    """Build a long table linking each tweet id to its words and tokens.

    td: DataFrame with 'tweet_id' and a 'tokens' column of word lists
    wd: word table with 'word' and 'token' columns
    (both defaults are the module-level objects at definition time)

    Returns one row per (tweet, word) with columns tweet_id, words, tokens;
    rows with any missing value are dropped.
    """
    token_lookup = wd.set_index('word')

    ## replicate each tweet id once per word in that tweet
    words_per_tweet = td.tokens.apply(len).values
    repeated_ids = np.repeat(td.tweet_id.values, words_per_tweet)
    tweet_tokens = pd.DataFrame(pd.Series(repeated_ids, name = "tweet_id"))

    ## lay the words out in the same tweet-by-tweet order as the ids
    tweet_tokens['words'] = [
        word for word_list in td.tokens for word in word_list]

    ## convert words to tokens via the word table
    tweet_tokens['tokens'] = token_lookup.loc[
        tweet_tokens.words.values, 'token'].values

    return tweet_tokens.dropna().reset_index(drop = True)

## execute code: rebuild or load the tweet x word link table
tweet_words = build_or_cache_csv(
    address = 'B_Process/tweet_words.csv',
    function = make_tweet_token_data,
    build_bool = settings['rebuild_tweet_words']
    )
## drop the per-tweet token lists from tweet_data after this step
tweet_data = tweet_data.drop('tokens', axis = 1)

#### MUNG04 - generate a users x tokens count matrix; drop tokens used by too few or too many users

In [16]:
def make_user_token_matrix(td = tweet_data, tw = tweet_words, ud = user_data):
    """Build a user x token matrix of token usage per 1,000 tweets.

    td: tweet table with 'screen_name' and 'tweet_id'
    tw: tweet x word link table with 'tweet_id', 'words', 'tokens'
    ud: roster with 'handle' and 'tweets'
    (defaults are the module-level objects at definition time)

    Returns a DataFrame with one row per user and one column per kept token.
    """
    
    ## count of number of times each user wrote each token
    tw = tw.merge(right = td[['screen_name', 'tweet_id']],
                  how = 'left', on = 'tweet_id')
    tw = tw.drop(['tweet_id', 'words'], axis = 1).groupby('screen_name')
    tw = tw.value_counts()
    tw.name = 'count'
    tw = tw.reset_index().set_index('screen_name')
    ## users become rows, tokens become columns; unused pairs count as 0
    tw = tw.pivot(columns = 'tokens').fillna(0).astype(int)
    tw = tw.droplevel(axis = 1, level = 0)
    
    ## remove tokens that fewer than 3 or more than 80 percent of users use
    valid_usage = (tw > 0).astype(int).sum().values
    valid_usage = (valid_usage > 2
                  ) & (valid_usage < int(tw.shape[0] * 0.8))
    tw = tw.loc[:, valid_usage]
    
    ## standardize matrix as words per 1,000 tweets
    ## denom is each user's tweet count; missing or sub-1 values become 1
    denom = pd.DataFrame({'handle': tw.index}).merge(
        ud[['handle', 'tweets']],
        how = 'left', on = 'handle'
        ).set_index('handle').squeeze().fillna(1)
    denom.loc[denom < 1] = 1
    tw = (tw.divide(denom, axis = 0) * 1000).astype(int)
    ## NOTE(review): this assumes reset_index names the restored index column
    ## 'level_0'; the caller needs a 'screen_name' column - confirm
    tw = tw.reset_index().rename({'level_0':'screen_name'}, axis = 1)
    
    return tw


## execute code: rebuild or load the matrix, then index it by screen_name
user_token_matrix = build_or_cache_csv(
    address = 'B_Process/user_token_matrix.csv',
    function = make_user_token_matrix,
    build_bool = settings['rebuild_user_token']
    ).set_index('screen_name')

In [17]:
##########==========##########==========##########==========##########==========

## HYPE – Train models and tune hyperparameters

#### HYPE01 - build x/y train/test datasets

In [18]:
## unpack train and test datasets
def split_xy_data(ml_cat):
    """Return (x, y1, y2) for the users whose ml_set equals ml_cat.

    x: rows of user_token_matrix for those users
    y1: 1 where the user's group is 'USA House', else 0
    y2: 1 where the user's party is 'Republican', else 0
    Reads the module-level user_data and user_token_matrix.
    """
    in_subset = user_data.ml_set == ml_cat
    handles = user_data.loc[in_subset, 'handle'].values
    roster = user_data.set_index('handle')

    x = user_token_matrix.loc[handles, :]
    y1 = (roster.loc[handles, 'group'] == 'USA House').astype(int)
    y2 = (roster.loc[handles, 'party'] == 'Republican').astype(int)

    return x, y1, y2

## execute code: x = token matrix rows, y1 = 'USA House' flag,
## y2 = 'Republican' flag, for the train and test user subsets
train_x, train_y1, train_y2 = split_xy_data('train')
test_x, test_y1, test_y2    = split_xy_data('test')

In [19]:
## delete objects to clear up memory in prep for modeling
## (only the module-level names are removed here)
del tweet_words, tweet_data, user_token_matrix, word_data

#### HYPE02 - build pca versions of x dataset

In [20]:
## fit pca model in order to simplify and enhance feature matrix
## (fit on the training rows only; no n_components is set)
model_pca = PCA().fit(train_x)

## generate pca transformations of all feature matrices
## the test rows are projected with the model fitted on the training rows
train_x_pca = model_pca.transform(train_x)
test_x_pca  = model_pca.transform(test_x)

In [21]:
## time trial pca using the GridSearchCV interface (for consistency)
## the grid holds five identical 'auto' candidates so the fit time can be
## averaged; the scorer returns a constant because only timings are read
pca_time_cost = GridSearchCV(
    estimator = PCA(),
    param_grid = {'svd_solver': ['auto' for i in range(0, 5)]},
    n_jobs = settings['num_parallel_cores'],
    scoring = lambda a,b: 1,
    refit = False
    ).fit(train_x)
## collapse the per-candidate mean fit times into a single number
pca_time_cost = pca_time_cost.cv_results_['mean_fit_time'].mean()

#### HYPE03 - formulate generic function for hyperparameter search

In [22]:
from sklearn.base import clone

## find best hyperparameters
def find_hparams():
    """Fit six independent clones of the module-level `the_model`.

    Variants (returned as a dict keyed by these labels, fitted in order):
      feature_stage1  - token features, target y1, all training users
      feature_stage2  - token features, target y2, users with y1 == 1 only
      pca_stage1      - pca features,   target y1, all training users
      pca_stage2      - pca features,   target y2, users with y1 == 1 only
      feature_unitary - token features, target y2, all training users
      pca_unitary     - pca features,   target y2, all training users
    """
    ## boolean mask of training users whose stage-one label is positive
    only_s2 = train_y1.astype(bool).values

    fit_plan = [
        ('feature_stage1', train_x, train_y1),
        ('feature_stage2', train_x.loc[only_s2, :], train_y2.loc[only_s2]),
        ('pca_stage1', train_x_pca, train_y1),
        ('pca_stage2', train_x_pca[only_s2, :], train_y2.loc[only_s2]),
        ('feature_unitary', train_x, train_y2),
        ('pca_unitary', train_x_pca, train_y2),
    ]

    ## each variant gets its own unfitted copy of the search object
    fitted = dict()
    for label, features, target in fit_plan:
        fitted[label] = clone(the_model).fit(X = features, y = target)
    return fitted


#### HYPE04 - find best logistic regression model parameters

In [23]:
## formulate model: l1-penalised logistic regression, 5-fold search on f1
## the C grid runs over powers of two from 2**-2 to 2**5
## find_hparams reads this object through the module-level name the_model
the_model = GridSearchCV( 
    estimator = LogisticRegression(penalty = 'l1',
                                   class_weight = 'balanced',
                                   solver = 'saga',
                                   max_iter = 2**10
                                  ),
    scoring = 'f1',
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {'C': [2. ** i for i in range(-2, 6)]}
)

## execute code: run the search, or load the pickled result if cached
logistic_hparams = build_or_cache_pickle(
    address = 'B_Process/models/logistic_hparams.pickle',
    function = find_hparams,
    build_bool = settings['rebuild_logistic_hparams']
    )

#### HYPE05 - find best naive bayes model parameters

In [24]:
## formulate model: gaussian naive bayes, 5-fold search on f1
## candidate priors: uniform, the y1 class balance, the y2 class balance
## (the same three candidates are tried whichever target is being fitted)
the_model = GridSearchCV( 
    estimator = GaussianNB(),
    scoring = 'f1',
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {'priors': [
        (0.5, 0.5), 
        (1 - train_y1.mean(), train_y1.mean()),
        (1 - train_y2.mean(), train_y2.mean()),
    ]}
)

## execute code: run the search, or load the pickled result if cached
bayes_hparams = build_or_cache_pickle(
    address = 'B_Process/models/bayes_hparams.pickle',
    function = find_hparams,
    build_bool = settings['rebuild_bayes_hparams']
    )

#### HYPE06 - find best random forest model parameters

In [25]:
## formulate model: class-balanced random forest, 5-fold search on f1
## n_estimators in {64, 128, 256, 512}; min_samples_leaf in {4, 8, 16}
## NOTE(review): no random_state is set, so rebuilt results may differ
the_model = GridSearchCV( 
    estimator = RandomForestClassifier(class_weight = 'balanced'),
    scoring = 'f1',
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {
        'n_estimators': [2**i for i in range(6, 10)],
        'min_samples_leaf': [2**i for i in range(2, 5)]
    }
)

## execute code: run the search, or load the pickled result if cached
forest_hparams = build_or_cache_pickle(
    address = 'B_Process/models/forest_hparams.pickle',
    function = find_hparams,
    build_bool = settings['rebuild_forest_hparams']
    )

#### HYPE07 - find best AdaBoost model parameters and train model

In [26]:
## formulate model: AdaBoost with its default weak learner, 5-fold f1 search
## n_estimators in {64, 128, 256, 512}; learning_rate in {0.25, 0.5, 1, 2, 4}
## the former base_estimator = None argument only restated the default and
## is deprecated/removed in newer scikit-learn releases, so it is omitted
the_model = GridSearchCV( 
    estimator = AdaBoostClassifier(),
    scoring = 'f1',
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {
        'n_estimators':  [int(2**i) for i in range(6, 10)],
        'learning_rate': [2**i for i in range(-2, 3)]
    }
)

## execute code: run the search, or load the pickled result if cached
adaboost_hparams = build_or_cache_pickle(
    address = 'B_Process/models/adaboost_hparams.pickle',
    function = find_hparams,
    build_bool = settings['rebuild_adaboost_hparams']
    )

In [27]:
##########==========##########==========##########==========##########==========

## EVAL - Analyse model performance and interpret

#### EVAL01 - compile hyperparameter search statistics

In [28]:
def extract_hparam_stats(x):
    """Tabulate one fitted search object's cross-validation results.

    x: object exposing n_features_in_ and a cv_results_ mapping

    Returns a DataFrame with two-level columns ('stats', ...),
    ('params', ...) and ('features', 'n'), sorted by rank_test_score.
    """
    feature_count = x.n_features_in_
    cv_results = x.cv_results_

    ## performance statistics: one row per candidate, one column per stat
    stat_names = ['mean_fit_time', 'mean_score_time', 'mean_test_score',
                  'rank_test_score']
    stat_table = pd.DataFrame([cv_results[name] for name in stat_names]).T
    stat_table.columns = stat_names

    ## candidate parameter values, placed beside the statistics
    param_table = pd.DataFrame(cv_results['params'])
    combined = pd.concat(
        {'stats': stat_table, 'params': param_table}, axis = 1)

    ## number of input features seen during fitting
    combined[('features', 'n')] = feature_count

    return combined.sort_values(('stats', 'rank_test_score'))

def compile_hparam_stats(all_models = {
    'logistic': logistic_hparams, 'bayes': bayes_hparams,
    'forest': forest_hparams, 'adaboost': adaboost_hparams},
    pca_time = pca_time_cost):
    """Stack the search statistics of every algorithm/variant in one table.

    all_models: dict of algorithm name -> dict of variant name -> fitted
        search object (default: the four module-level search results)
    pca_time: seconds added to the fit time of the 'pca_unitary' and
        'pca_stage1' variants to account for fitting the pca model

    Returns a DataFrame with sorted two-level columns, one row per
    (algorithm, variant, candidate).
    """
    
    ## results container; grows with the input rather than assuming a fixed
    ## 4 algorithms x 6 variants = 24 entries
    hparam_data = list()
    
    ## extract hyperparameter search statistics for all models
    for i in all_models.keys():
        for j in all_models[i].keys():
            stats_ij = extract_hparam_stats(all_models[i][j])
            stats_ij[('model', 'algorithm')] = i
            stats_ij[('model', 'variant')] = j
            hparam_data.append(stats_ij)
            
    ## compile as a dataframe and return results
    hparam_data = pd.concat(hparam_data, axis = 0).reset_index(drop = True)
    hparam_data = hparam_data.reindex(sorted(hparam_data.columns), axis = 1)

    ## adjust fit times to account for pca model fitting
    ## (stage2 is excluded, so a staged pair is charged for the pca once)
    i = hparam_data[('model', 'variant')].isin(['pca_unitary', 'pca_stage1'])
    j = ('stats', 'mean_fit_time')
    hparam_data.loc[i, j] = hparam_data.loc[i, j].values + pca_time
    
    return hparam_data

## execute code: compile the search statistics and save a csv copy
hparam_stats = compile_hparam_stats()
hparam_stats.to_csv('C_Output/hparam_stats.csv')

#### EVAL02 - compile best model performance statistics

In [29]:
def construct_best_model_stats(stats = hparam_stats):
    """Build the per-model summary table that performance scores go into.

    stats: hyperparameter statistics with two-level columns (default: the
        module-level hparam_stats at definition time)

    Returns one row per (algorithm, variant) with the feature count, the
    combined run time and empty ('perform', ...) columns.
    """
    
    ## filter existing stats from hparams to populate dataset
    ## keep only the rank-1 rows and drop the parameter columns
    stats = stats.drop('params', axis = 1).loc[
        stats[('stats', 'rank_test_score')] == 1, ]
    stats = stats.drop(('stats', 'rank_test_score'), axis = 1)
    ## strip a trailing digit so '..._stage1' and '..._stage2' share a label
    stats[('model', 'variant')] = stats[('model', 'variant')].str.replace(
        '[0-9]$', '', regex = True)
    ## summing merges the two stages of a staged model into one row
    ## NOTE(review): this also sums ('features', 'n') across the merged rows,
    ## and several rows tied at rank 1 would be summed too - confirm intended
    stats = stats.groupby([('model', 'algorithm'), ('model', 'variant')]).sum()
    stats = stats.reset_index()
    
    ## delete test_scores for multi-stage models
    ## (the whole mean_test_score column is deleted a few lines below)
    i = stats[('model', 'variant')].isin(['feature_stage', 'pca_stage'])
    stats.loc[i, ('stats', 'mean_test_score')] = np.nan
    
    ## clean up columns: run_time = fit time + score time
    stats[('time', 'run_time')] = stats[('stats', 'mean_fit_time')] +\
        stats[('stats', 'mean_score_time')]
    del stats[('stats', 'mean_fit_time')]
    del stats[('stats', 'mean_score_time')]
    del stats[('stats', 'mean_test_score')]
    
    ## add empty columns for performance data
    stats[('perform', 'precision')] = np.nan
    stats[('perform', 'recall')] = np.nan
    stats[('perform', 'f1')] = np.nan
    stats[('perform', 'f1_train')] = np.nan
    
    return stats

def score_best_model_performance(all_models = {
    'logistic': logistic_hparams, 'bayes': bayes_hparams,
    'forest': forest_hparams, 'adaboost': adaboost_hparams}):
    """Score each algorithm/variant's best estimator against y2.

    all_models: dict of algorithm name -> dict of variant name -> fitted
        search object (default: the four module-level search results)

    Staged variants predict positive only when both stage estimators do.
    Returns the construct_best_model_stats table with precision, recall,
    f1 (test set) and f1_train filled in, rounded to 3 decimals.
    """
    
    ## generate container for best model statistics
    i = [('model', 'algorithm'), ('model', 'variant')]
    bms = construct_best_model_stats().set_index(i)
    
    ## iterate through models
    for i in bms.index:
        
        ## select dataset
        if i[1] in ['pca_stage', 'pca_unitary']:
            trx = train_x_pca
            tex = test_x_pca
        else:
            trx = train_x
            tex = test_x
            
        ## make prediction; the all-zero default is what gets scored if the
        ## estimator fails to predict
        train_predict = train_y1 * 0
        test_predict  = test_y1 * 0
        
        try:
            if i[1] in ['pca_stage', 'feature_stage']:
                stage1 = all_models[i[0]][i[1] + '1'].best_estimator_
                stage2 = all_models[i[0]][i[1] + '2'].best_estimator_
                train_predict = stage1.predict(trx) * stage2.predict(trx)
                test_predict  = stage1.predict(tex) * stage2.predict(tex)
            else:
                stage0 = all_models[i[0]][i[1]].best_estimator_
                train_predict = stage0.predict(trx)
                test_predict  = stage0.predict(tex)
        except (KeyError, AttributeError):
            ## a missing model entry or best_estimator_ is a real error and
            ## was never swallowed by the original code either
            raise
        except Exception as error:
            ## keep the fallback predictions, but report the failure instead
            ## of silently scoring all-zero predictions
            print('WARNING: prediction failed for', i, '-', repr(error))
        
        ## calculate statistics
        bms.loc[i, ('perform', 'precision')] = precision_score(
            y_true = test_y2, y_pred = test_predict)
        bms.loc[i, ('perform', 'recall')] = recall_score(
            y_true = test_y2, y_pred = test_predict)
        bms.loc[i, ('perform', 'f1')] = f1_score(
            y_true = test_y2, y_pred = test_predict)
        bms.loc[i, ('perform', 'f1_train')] = f1_score(
            y_true = train_y2, y_pred = train_predict)

    ## return model
    return bms.round(3)
    
## execute code: score every algorithm/variant's best estimator
best_model_stats = score_best_model_performance()

#### EVAL03 - calculate comparison performance stats for random guess model

In [30]:
def estimate_guess_confuse_matrix(y1, y2):
    """Joint probability table of two independent 0/1 outcomes.

    y1, y2: objects with a .mean() giving each outcome's positive rate

    Returns a 2x2 DataFrame labelled 'neg'/'pos' on both axes. Columns
    follow y1 and rows follow y2, so .loc['pos', 'neg'] is the probability
    that y2 is positive while y1 is negative.
    """
    
    ## tabulate probability of a positive at random
    pos1  = y1.mean()
    pos2  = y2.mean()
    
    ## calculate confusion matrix stats (outer keys become columns)
    confusion_matrix = {
        'neg': {
            'neg': (1 - pos1) * (1 - pos2),
            'pos': (1 - pos1) * pos2
            },
        'pos': {
            'neg': pos1 * (1 - pos2),
            'pos': pos1 * pos2
            }
        }
    confusion_matrix = pd.DataFrame(confusion_matrix)
    
    ## check results and return; the four cells sum to 1 only up to
    ## floating-point rounding, so compare with a tolerance
    assert np.isclose(confusion_matrix.sum().sum(), 1)
    return confusion_matrix

def estimate_staged_guess_confuse_matrix(y1p, y1o, y2p, y2o):
    """Combine two per-stage guess matrices into one joint 2x2 matrix.

    y1p, y1o: inputs for the stage-one matrix
    y2p, y2o: inputs for the stage-two matrix
    (each pair is passed straight to estimate_guess_confuse_matrix)

    Three cells are built from products of stage cells; the ('neg', 'neg')
    cell is whatever remains so that the matrix sums to 1.
    """
    ## per-stage matrices
    first = estimate_guess_confuse_matrix(y1p, y1o)
    second = estimate_guess_confuse_matrix(y2p, y2o)

    ## name the cells that the joint matrix is built from
    first_pp, second_pp = first.loc['pos', 'pos'], second.loc['pos', 'pos']
    first_pn, second_pn = first.loc['pos', 'neg'], second.loc['pos', 'neg']
    first_np, second_np = first.loc['neg', 'pos'], second.loc['neg', 'pos']

    ## start from an all-zero matrix with the same labels
    joint_matrix = first * 0
    joint_matrix.loc['pos', 'pos'] = first_pp * second_pp
    joint_matrix.loc['pos', 'neg'] = (
        first_pn * second_pn + first_pn * second_pp + first_pp * second_pn)
    joint_matrix.loc['neg', 'pos'] = (
        first_np * second_np + first_np * second_pp + first_pp * second_np)

    ## remainder cell; must be assigned after the three cells above
    joint_matrix.loc['neg', 'neg'] = 1 - joint_matrix.sum().sum()

    return joint_matrix

def score_guess_performance(cm, algo_var):
    """Turn a 2x2 guess matrix into a one-row performance table.

    cm: DataFrame labelled 'neg'/'pos' on both axes
    algo_var: (algorithm, variant) pair used to label the row

    Returns a one-row DataFrame indexed by (algorithm, variant) with the
    feature count (0), run time (NaN), precision, recall, f1 (rounded to
    3 decimals) and an empty f1_train.
    """
    ## the three cells the scores are computed from
    both_pos = cm.loc['pos', 'pos']
    precision = both_pos / (both_pos + cm.loc['pos', 'neg'])
    recall = both_pos / (both_pos + cm.loc['neg', 'pos'])

    ## f1 is computed from the unrounded precision and recall
    f1 = 2 * precision * recall / (precision + recall)

    ## assemble the record; key order fixes the column order
    record = {
        ('model', 'algorithm'): algo_var[0],
        ('model', 'variant'): algo_var[1],
        ('features', 'n'): 0,
        ('time', 'run_time'): np.nan,
        ('perform', 'precision'): precision.round(3),
        ('perform', 'recall'): recall.round(3),
        ('perform', 'f1'): f1.round(3),
        ('perform', 'f1_train'): None,
    }

    ## package as a single-row frame keyed by algorithm and variant
    table = pd.DataFrame(pd.Series(record)).T
    return table.set_index([('model', 'algorithm'), ('model', 'variant')])

## execute code - multistage model
## guess rates come from the training labels, outcomes from the test labels
con_mat = estimate_staged_guess_confuse_matrix(
    y1p = train_y1, y1o = test_y1, y2p = train_y2, y2o = test_y2)
random_performance = score_guess_performance(con_mat, ('random', 'stage'))

## repeat with training labels on both sides to fill in f1_train
con_mat = estimate_staged_guess_confuse_matrix(
    y1p = train_y1, y1o = train_y1, y2p = train_y2, y2o = train_y2)
random_performance.loc[('random', 'stage'), ('perform', 'f1_train')
    ] = score_guess_performance(con_mat, ('random', 'stage')).loc[
    ('random', 'stage'), ('perform', 'f1')]

## execute code - single stage model
## same two steps (test f1, then f1_train) using the y2 labels only
con_mat = estimate_guess_confuse_matrix(train_y2, test_y2)
x = score_guess_performance(con_mat, ('random', 'unitary'))
con_mat = estimate_guess_confuse_matrix(train_y2, train_y2)
x.loc[('random', 'unitary'), ('perform', 'f1_train')
    ] = score_guess_performance(con_mat, ('random', 'unitary')).loc[
    ('random', 'unitary'), ('perform', 'f1')]

## package: append both random baselines to the model results, then
## remove the temporary objects
random_performance = pd.concat([random_performance, x], axis = 0)
best_model_stats = pd.concat([best_model_stats, random_performance])
del con_mat, random_performance, x

In [31]:
## write best_model_stats to disk (pickle for reloading, csv for reading)
with open('B_Process/best_model_stats.pickle', 'wb') as conn:
    pickle.dump(best_model_stats, conn)
best_model_stats.to_csv('C_Output/best_model_stats.csv')

## the model choice that follows is made by hand from this table
print('NOTE: do manual interpretation of best_model_stats to choose models')
#### EVAL04 - determine variable importance

In [32]:
## repeats the reminder printed at the end of the previous cell
print('Reminder: do manual interpretation of best_model_stats to choose models')

Reminder: do manual interpretation of best_model_stats to choose models


In [33]:
## determine most important feature for best models
def find_important_features(hparam):
    """Permutation importance of one algorithm's 'feature_unitary' model.

    hparam: dict of variant name -> fitted search object

    The best estimator is evaluated on the test token features against
    test_y2 with f1 scoring and 5 repeats per feature.
    """
    best_unitary = hparam['feature_unitary'].best_estimator_
    return permutation_importance(
        estimator = best_unitary,
        X = test_x,
        y = test_y2,
        scoring = 'f1',
        n_jobs = settings['num_parallel_cores'],
        n_repeats = 5)

def loop_through_models(
    mod_list = [forest_hparams, bayes_hparams], func = find_important_features):
    """Apply func to each entry of mod_list and return the results in order.

    Defaults: the forest and bayes search results, in that order, and
    find_important_features (all bound when this function is defined).
    """
    return [func(model_set) for model_set in mod_list]

def convert_to_dict(x):
    """Label a two-item sequence: first item 'forest', second item 'bayes'."""
    return {'forest': x[0], 'bayes': x[1]}

## execute code: compute or load the importance results (a list in
## forest, bayes order), then label them by algorithm name
important_features = build_or_cache_pickle(
    address = 'B_Process/models/important_features.pickle',
    function = loop_through_models,
    build_bool = settings['rebuild_important_features']
    )
important_features = convert_to_dict(important_features)

In [34]:
## compile importance results

def analyze_importance(important = important_features):
    """Rank tokens by permutation importance and flag the top-using group.

    important: dict of algorithm name -> importance result exposing
        importances_mean (default: module-level important_features at
        definition time). The argument is not modified.

    Returns a DataFrame indexed by token with one importance column per
    algorithm, their 'mean', and 0/1 columns 1, 2, 3 marking which user
    group(s) have the highest rounded mean usage of the token.
    """
    
    ## extract importance scores into a new dict; rebinding the caller's
    ## entries in place made a second call on the same dict fail
    scores = {i: important[i].importances_mean for i in important.keys()}
    important = pd.DataFrame(scores)
    
    ## filter and label: keep tokens with a positive score in any model
    important.index = train_x.columns
    important = important.loc[important.max(axis = 1) > 0, :]
    important['mean'] = important.mean(axis = 1)
    important = important.sort_values('mean', ascending = False).round(3)
    
    ## tabulate differences between groups in usage of words
    ## group codes: 1 = y1 is 0, 2 = y1 is 1 and y2 is 0, 3 = y1 and y2 are 1
    all_groups = ((1 - train_y1) * 1)
    all_groups = all_groups + (train_y2 * 3)
    all_groups = all_groups + (train_y1 * (1 - train_y2) * 2)
    x = train_x.copy()
    x['group'] = all_groups
    x = x.groupby('group').mean().round().astype(int)
    x = x.T.loc[important.index, :]

    ## flag the group(s) at each token's row maximum; the maximum is taken
    ## once, before any column is replaced by flags
    row_max = x.max(axis = 1)
    x = x.eq(row_max, axis = 0).astype(int)

    important = pd.concat([important, x], axis = 1)
    important = important.sort_values([1, 2, 3], ascending = False)
    
    return important

## execute code: build the word importance table and save a csv copy
word_importance = analyze_importance()
word_importance.to_csv('C_Output/word_importance.csv')

#### EVAL07 - extract predicted class probabilities by party and state

In [62]:
def find_y2_prob(x = forest_hparams, ud = user_data.copy()):
    """Average predicted positive-class probability by party and state.

    x: dict of variant name -> fitted search object; its 'feature_unitary'
        best estimator supplies predict_proba
    ud: roster with 'handle', 'party' and 'state' columns
    (defaults are bound when this function is defined)

    Returns a DataFrame indexed by (party, state) with the mean 'prob'.
    """
    ## calculate probabilities for train and test users, in that order,
    ## keeping only the second probability column
    forest = x['feature_unitary'].best_estimator_
    stacked = np.concatenate(
        [forest.predict_proba(train_x), forest.predict_proba(test_x)],
        axis = 0)
    prob = pd.DataFrame(pd.Series(stacked[:, 1], name = 'prob'))

    ## label rows by user, then attach the y2 labels by that same index
    prob.index = pd.concat([pd.Series(train_x.index), pd.Series(test_x.index)])
    prob['y2'] = pd.concat([pd.Series(train_y2), pd.Series(test_y2)])

    ## merge into user object and calculate statistics
    ud['state'] = ud['state'].str.strip()
    scored_users = ud.merge(prob, left_on = 'handle', right_index = True)
    by_party_state = scored_users[['party', 'state', 'prob']].groupby(
        ['party', 'state'])

    return by_party_state.mean()
    
    

## execute code: compute the party x state scores, save a csv copy, and
## display the table as the cell output
state_scores = find_y2_prob()
state_scores.to_csv('C_Output/state_scores.csv')
state_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,prob
party,state,Unnamed: 2_level_1
Conservative,Alberta,0.160214
Conservative,British Columbia,0.167948
Conservative,Manitoba,0.217902
Conservative,New Brunswick,0.153643
Conservative,Nova Scotia,0.113046
...,...,...
Republican,Virginia,0.888933
Republican,Washington,0.713758
Republican,West Virginia,0.858978
Republican,Wisconsin,0.859306


In [36]:
##########==========##########==========##########==========##########==========

## FOOT - display objects as needed