In [1]:
##########==========##########==========##########==========##########==========

## HEAD

#### HEAD 01 - toggle user settings

In [2]:
## determine whether to cache data from some time consuming tasks
settings = {
    'num_parallel_cores': 13,
    'test_mode': False, ## when true, only a small of data is collected
    
    'collect_data': False, ## toggles twitter api pulls in PULL01-03
    
    'rebuild_word_data': False, ## rebuild vs cache load for word_data MUNG02
    'rebuild_tweet_words': False, ## rebuild vs cache load tweet_words MUNG03
    'rebuild_user_token': False, ## rebuild v cache load user_token_tally MUNG04
    
    'use_pca': False, ## use a pca simplification of the features matrix
    
    'redo_hparam_logistic': False, ## recalculate/cache TRAI03
    'redo_hparam_naive_bayes': False, ## recalculate/cache TRAI04
    'redo_hparam_random_forest': False, ## recalculate/cache TRAI05
    'redo_hparam_adaboost': False ## recalculate/cache TRAI06
    }

#### HEAD02 - load libraries

In [3]:
import tweepy
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from time import sleep
from os.path import exists
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

#### HEAD03 - load data files

In [4]:
## read in roster of handles
user_data = pd.read_excel("A_Input/twitter_handles.xlsx")

## drop all but a handful of cases if in test mode
if settings['test_mode']:
    user_data = user_data.sample(125, random_state = 5542)

## read in twitter credentials; initialize api connection+
twitter_credentials = pd.read_csv('../api_keys/twitter.csv').set_index('item')
twitter_credentials = tweepy.OAuth1UserHandler(
    consumer_key = twitter_credentials.loc['API Key', 'string'],
    consumer_secret = twitter_credentials.loc['API Key Secret', 'string'],
    access_token = twitter_credentials.loc['Access Token', 'string'],
   access_token_secret = twitter_credentials.loc['Access Token Secret', 'string']
    )
api = tweepy.API(twitter_credentials)

  warn(msg)


#### HEAD04 - create build or cache decision function

In [5]:
## build switching function to execute code or cache results
def build_or_cache_csv(address, function, build_bool):
    if build_bool or not exists(address):
        x = function()
        x.to_csv(address, index = False)
        return x
    else:
        return pd.read_csv(address)
    
def build_or_cache_pickle(address, function, build_bool):
    if build_bool or not exists(address):
        x = function()
        conn = open(address, 'wb')
        pickle.dump(x, conn)
        conn.close()
        return x
    else:
        conn = open(address, 'rb')
        x = pickle.load(conn)
        conn.close()
        return x

In [6]:
##########==========##########==========##########==========##########==========

## HAND – Gather Twitter handles for test accounts

#### HAND01 - extract handles from roster URLs

In [7]:
## extract handles from roster urls
user_data['handle'] = user_data.url.str.replace('https://twitter.com/', '',
            regex = False).str.strip().str.lower()

In [8]:
##########==========##########==========##########==========##########==========

## PULL - Pull Twitter data from the API

#### PULL01 - query API for each roster handle's user_timeline data

In [9]:
## extract tweet data from api object
def refine_tweet_data(x):
    tweet_data = list()
    for i in range(0, len(x)):
        tweet_data.append({
            'tweet_id': x[i].id, 
            'created_at': x[i].created_at, 
            'lang': x[i].lang,
            'full_text': x[i].full_text,
            'screen_name': x[i].author.screen_name,
            'verified' : x[i].author.verified
        })
    return pd.DataFrame(tweet_data)

## define function to pull user tweet data and apply function to extract tweet data
def pull_tweet_data(handles = user_data.handle, a = api):
    tweet_data = list()
    for i in handles:
        try:
            user_tweets = a.user_timeline(
                screen_name = i, count = 200, tweet_mode = 'extended', 
                exclude_replies = True, include_rts = False)
            tweet_data.append(refine_tweet_data(user_tweets))
            sleep(1)
        except:
            pass
    tweet_data = pd.concat(tweet_data)
    tweet_data['screen_name'] = tweet_data['screen_name'].str.lower()
    return tweet_data

## execute code
if settings['collect_data']:
    tweet_data = pull_tweet_data()

#### PULL02 - tabulate tweet statistics, divide users into train/tune/test sets

In [10]:
## combile tweet count and verification status at the user level
def enhance_user_data(td, ud = user_data):
    
    def np_size(x): return x.size
    
    ## calculate tweet summary statistics
    td_original = tweet_data
    td = td.copy()
    verified = td.groupby('screen_name').mean()
    tweets = td['screen_name'].value_counts()
    td = pd.concat([verified, tweets], axis = 1).reset_index()
    td.columns = ['handle', 'verified', 'tweets']
    
    ## merge statistics into the user_data object
    ud = pd.merge(ud, td, on = 'handle', how = 'left')
    td_original = td_original.drop(['verified'], axis = 1)
    ud = ud.drop(['url'], axis = 1).reset_index(drop = True)
    ud = ud.fillna({'tweets': 0}).astype({'tweets': int})
    
    ## divide users into train, and test subsets
    ml_set = pd.Series(['train', 'test'], name = 'ml_set').sample(
                n = ud.shape[0], replace = True, weights = [0.8, 0.20],
                random_state = 2006)
    ud['ml_set'] = ml_set.values
    ud.loc[ud.tweets == 0, 'ml_set'] = 'exclude'
    
    return ud, td_original

## execute code
if settings['collect_data']:
    user_data, tweet_data = enhance_user_data(
        tweet_data[['screen_name', 'verified']])

#### PULL03 - save datasets to disk

In [11]:
## save user/tweet datasets to disk as csvs
if settings['collect_data']:
    user_data.to_csv('B_Process/user_data.csv', index = False)
    tweet_data.to_csv('B_Process/tweet_data.csv', index = False)
else:
    user_data = pd.read_csv('B_Process/user_data.csv')
    tweet_data = pd.read_csv('B_Process/tweet_data.csv')

In [12]:
##########==========##########==========##########==========##########==========

## MUNG - Process Twitter data to model-ready format

#### MUNG01 - parse tweet text into tokens

In [13]:
## tokenize, remove capitalization, and remove duplicate tokens
def nlp_tokenize_tweet(x):
    x = x.lower()
    x = word_tokenize(x)
    x = list(set(x))
    return x

## execute code
tweet_data['tokens'] = tweet_data.full_text.apply(nlp_tokenize_tweet)

#### MUNG02 - create word/token level dataset

In [14]:
## create word/token level dataset and identify valid word tokens
def make_word_data(td = tweet_data):

    ## flatten token lists and count occurances
    word_data = list()
    for i in td.tokens:
        word_data += i
    word_data = pd.Series(word_data, name = 'count').value_counts()
    word_data = word_data.sort_values(ascending = False)
    word_data = pd.DataFrame(word_data)
    
    ## determine which tokens occur often enough to warrant inclusion
    word_data['valid'] = word_data['count'] > max(
        word_data['count'].quantile(0.2), 3)
    word_data['word'] = word_data.index
    
    ## determine part of speech for eligible tokens
    speech_part = word_data['word'].loc[word_data['valid']].values
    speech_part = pos_tag(speech_part)
    speech_part = [i[1][0].lower() for i in speech_part]
    word_data['pos'] = '.'
    word_data.loc[word_data['valid'], 'pos'] = speech_part
    
    ## lemmatize
    WNL = WordNetLemmatizer()
    word_data['token'] = None
    for i in word_data.word:
        if not word_data.loc[i, 'valid']: 
            break
        if word_data.loc[i, 'pos'] in 'abcdefghijklmnopqrstuvwxyz':
            try:
                word_data.loc[i, 'token'] = WNL.lemmatize(
                    word_data.loc[i, 'word'],
                    pos = word_data.loc[i, 'pos']
                )
            except:
                word_data.loc[i, 'token'] = word_data.loc[i, 'word']
        else:
            word_data.loc[i, 'valid'] = False
        
    return word_data.reset_index(drop = True)

## execute code
word_data = build_or_cache_csv(
    address = 'B_Process/word_data.csv',
    function = make_word_data,
    build_bool = settings['rebuild_word_data']
    )

  return pd.read_csv(address)


#### MUNG03 - generate a tokens x tweets link database

In [15]:
def make_tweet_token_data(td = tweet_data, wd = word_data):
    
    wd = wd.set_index('word')
    
    ## replicate tweet ids
    n = td.tokens.apply(len).values
    tweet_tokens = pd.Series(np.repeat(td.tweet_id.values, n), name = "tweet_id")
    tweet_tokens = pd.DataFrame(tweet_tokens)
    
    ## allocate words to the new dataset
    words = list()
    for i in td.tokens:
        words += i
    tweet_tokens['words'] = words
    
    ## convert words to tokens
    tweet_tokens['tokens'] = wd.loc[
        tweet_tokens.words.values, 'token'].values
    tweet_tokens = tweet_tokens.dropna()

    return tweet_tokens.reset_index(drop = True)

## execute code
tweet_words = build_or_cache_csv(
    address = 'B_Process/tweet_words.csv',
    function = make_tweet_token_data,
    build_bool = settings['rebuild_tweet_words']
    )
tweet_data = tweet_data.drop('tokens', axis = 1)

#### MUNG04 - generate a tokens x users count; drop tokens with only one user

In [16]:
def make_user_token_matrix(td = tweet_data, tw = tweet_words, ud = user_data):
    
    ## count of number of times each user wrote each token
    tw = tw.merge(right = td[['screen_name', 'tweet_id']],
                  how = 'left', on = 'tweet_id')
    tw = tw.drop(['tweet_id', 'words'], axis = 1).groupby('screen_name')
    tw = tw.value_counts()
    tw.name = 'count'
    tw = tw.reset_index().set_index('screen_name')
    tw = tw.pivot(columns = 'tokens').fillna(0).astype(int)
    tw = tw.droplevel(axis = 1, level = 0)
    
    ## remove tokens that fewer than 3 or more than 80 percent of users use
    valid_usage = (tw > 0).astype(int).sum().values
    valid_usage = (valid_usage > 2
                  ) & (valid_usage < int(tw.shape[0] * 0.8))
    tw = tw.loc[:, valid_usage]
    
    ## standardize matrix as words per 1,000 tweets
    denom = pd.DataFrame({'handle': tw.index}).merge(
        ud[['handle', 'tweets']],
        how = 'left', on = 'handle'
        ).set_index('handle').squeeze().fillna(1)
    denom.loc[denom < 1] = 1
    tw = (tw.divide(denom, axis = 0) * 1000).astype(int)
    tw = tw.reset_index().rename({'level_0':'screen_name'}, axis = 1)
    
    return tw


## execute code
user_token_matrix = build_or_cache_csv(
    address = 'B_Process/user_token_matrix.csv',
    function = make_user_token_matrix,
    build_bool = settings['rebuild_user_token']
    ).set_index('screen_name')

In [17]:
##########==========##########==========##########==========##########==========

## TRAI – Train models and tune hyperparameters

#### TRAI00 - Unpack train, and test datasets

In [18]:
## unpack train and test datasets
def split_xy_data(ml_cat):
    i = user_data.loc[user_data.ml_set == ml_cat, 'handle'].values
    
    x = user_token_matrix.loc[i, :]
    
    y1 = user_data.set_index('handle').loc[i, 'group'] == 'USA House'
    y1 = y1.astype(int)
    
    y2 = user_data.set_index('handle').loc[i, 'party'] == 'Republican'
    y2 = y2.astype(int)
    
    return x, y1, y2

## execute code
train_x, train_y1, train_y2 = split_xy_data('train')
test_x, test_y1, test_y2    = split_xy_data('test')

In [19]:
## delete objects to clear up memory in prep for modeling
del tweet_words, tweet_data, user_token_matrix, word_data

#### TRAI01 - generate PCA simplification of features matrix

In [20]:
## fit pca model in order to simplify and enhance feature matrix
model_pca = PCA().fit(train_x)

## generate pca transformations of all feature matrices
train_x_pca = model_pca.transform(train_x)
test_x_pca  = model_pca.transform(test_x)

if settings['use_pca']:
    train_x = train_x_pca
    test_x  = test_x_pca

#### TRAI02 - estimate model performance at random chance (for comparison)

In [21]:
## estimate the confusion matrix that would result by random chance
   ## if we predicted labels based on frequently in the training set.
   ## This function predicts results for a single stage.
def estimate_random_confusion_matrix(y1, y2):
    
    ## tabulate probability of a positive at random
    pos1  = y1.mean()
    pos2  = y2.mean()
    
    ## calculate confusion matrix stats
    confusion_matrix = {
        'neg': {
            'neg': (1 - pos1) * (1 - pos2),
            'pos': (1 - pos1) * pos2
            },
        'pos': {
            'neg': pos1 * (1 - pos2),
            'pos': pos1 * pos2

            }
        }
    confusion_matrix = pd.DataFrame(confusion_matrix)
    
    ## check results and return
    assert confusion_matrix.sum().sum() == 1
    return confusion_matrix

## generate the joint confusion matrix for a multi-stage random chance model
def estimate_multistage_random_confusion_matrix(y1p, y1o, y2p, y2o, name):
    
    ## generate confusion matrix for each stage
    stage1 = estimate_random_confusion_matrix(y1p, y1o)
    stage2 = estimate_random_confusion_matrix(y2p, y2o)
    
    ## generate confusion matrix for both stages combined
    joint_matrix = stage1 * 0
    joint_matrix.loc['pos', 'pos'] = stage1.loc['pos', 'pos'] *\
                                    stage2.loc['pos', 'pos']
    joint_matrix.loc['pos', 'neg'] = stage1.loc['pos', 'neg'] *\
                                    stage2.loc['pos', 'neg'] +\
                                    stage1.loc['pos', 'neg'] *\
                                    stage2.loc['pos', 'pos'] +\
                                    stage1.loc['pos', 'pos'] *\
                                    stage2.loc['pos', 'neg']
    joint_matrix.loc['neg', 'pos'] = stage1.loc['neg', 'pos'] *\
                                    stage2.loc['neg', 'pos'] +\
                                    stage1.loc['neg', 'pos'] *\
                                    stage2.loc['pos', 'pos'] +\
                                    stage1.loc['pos', 'pos'] *\
                                    stage2.loc['neg', 'pos']
    joint_matrix.loc['neg', 'neg'] = 1 - joint_matrix.sum().sum()
    
    ## calculate precision and recall
    performance = dict()
    performance['Precision'] = joint_matrix.loc['pos', 'pos'] / (
        joint_matrix.loc['pos', 'pos'] + joint_matrix.loc['pos', 'neg'])
    performance['Recall'] = joint_matrix.loc['pos', 'pos'] / (
        joint_matrix.loc['pos', 'pos'] + joint_matrix.loc['neg', 'pos'])
    performance = pd.DataFrame({name: performance}).T.round(3)
    
    return performance

## execute code (and create performance statistics container object)
performance_stats = list()
performance_stats.append(estimate_multistage_random_confusion_matrix(
    train_y1, train_y1, train_y2, train_y2, ('random', 'train')))
performance_stats.append(estimate_multistage_random_confusion_matrix(
    train_y1, test_y1, train_y2, test_y2, ('random', 'test')))
performance_stats = pd.concat(performance_stats)

#### TRAI03 - Build a two-stage logistic model using features

In [22]:
## extract performance statistics from cross-validation
def capture_generic_cv_stats(cv_results, param_list):
    param_list = ['param_' + x for x in param_list]
    i = ['mean_fit_time', 'mean_test_score'] + param_list
    statistics = [cv_results[ii] for ii in i]
    statistics = pd.DataFrame(statistics)
    statistics.index = i
    return statistics

## use grid search with 5-fold cross-validation to find best hyperparameters
   ## note: deliberately keeping the_model out of find_generic_hparams arg list
   ## to prevent function from using a frozen snapsnot of the object
the_model = GridSearchCV( 
    estimator = LogisticRegression(penalty = 'l1',
                                   class_weight = 'balanced',
                                   solver = 'saga',
                                   max_iter = 2**10
                                  ),
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {'C': [2. ** i for i in range(-2, 6)]}
)

def find_generic_hparams(x = train_x, y1 = train_y1, y2 = train_y2):
    ## fit models
    model1 = the_model.fit(X = train_x, y = train_y1)
    model2 = the_model.fit(X = train_x, y = train_y2, sample_weight = train_y1)
    
    ## return best fit hyperparameters
    hyperparameters = pd.DataFrame({'model1': model1.best_params_,
                                    'model2':model2.best_params_})
    hparms = {
        'hparam': hyperparameters,
        'fit1': capture_generic_cv_stats(
            model1.cv_results_, list(hyperparameters.index)),
        'fit2': capture_generic_cv_stats(
            model2.cv_results_, list(hyperparameters.index)),
        }
    for i in hyperparameters.index:
        hparms[i] = hyperparameters.loc[i, :].values
    return hparms

## execute code  logistic_hyperparameters = find_logistic_hparams()
logistic_hyperparameters = build_or_cache_pickle(
    address = 'B_Process/logistic_hyperparameters.pickle',
    function = find_generic_hparams,
    build_bool = settings['redo_hparam_logistic']
)

In [23]:
## train model using the best hyperparameters
def train_logistic_models(x = train_x, y1 = train_y1, y2 = train_y2, C = [1, 1]):
    
    ## formulate models
    model1 = LogisticRegression(penalty = 'l1', class_weight = 'balanced',
                               solver = 'saga', max_iter = 1e3, C = C[0])
    model2 = LogisticRegression(penalty = 'l1', class_weight = 'balanced',
                               solver = 'saga', max_iter = 1e3, C = C[1])
    ## fit models
    model1 = model1.fit(X = train_x, y = train_y1)
    model2 = model2.fit(X = train_x, y = train_y2, sample_weight = train_y1)
    
    ## return models
    return model1, model2

## execute code
logistic_models = train_logistic_models(C = logistic_hyperparameters['C'])



In [24]:
## evaluate model performance the training and testing datasets
def evaluate_generic_model(mod, name, tr_x = train_x, tr_y2 = train_y2,
                           te_x = test_x, te_y2 = test_y2):
    
    ## predict values on training and testing sets
    prediction = {
        'train': mod[0].predict(tr_x) * mod[1].predict(tr_x),
        'test':  mod[0].predict(te_x) * mod[1].predict(te_x)
        }
    
    ## score predictions
    performance = {
        (name, 'train'): {
            'Precision': precision_score(
                            y_true = train_y2, y_pred = prediction['train']), 
            'Recall': recall_score(
                            y_true = train_y2, y_pred = prediction['train']),
            'F1': f1_score(
                            y_true = train_y2, y_pred = prediction['train'])
            },
        (name, 'test'):  {
            'Precision': precision_score(
                            y_true = test_y2, y_pred = prediction['test']), 
            'Recall': recall_score(
                            y_true = test_y2, y_pred = prediction['test']),
            'F1': f1_score(
                            y_true = test_y2, y_pred = prediction['test'])
            }
        }
    performance = pd.DataFrame(performance).round(3).T
    
    return performance

## execute code
logistic_performance = evaluate_generic_model(logistic_models, 'logistic')
performance_stats = pd.concat([logistic_performance,performance_stats], axis = 0)

#### TRAI04 - Build a two-stage naive bayes model using features

In [25]:
## define the new model
the_model = GridSearchCV( 
    estimator = GaussianNB(),
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {'priors': [
        (0.5, 0.5), 
        (1 - train_y1.mean(), train_y1.mean()),
        (1 - train_y2.mean(), train_y2.mean()),
    ]}
)

## define new training function
def train_naive_bayes_models(priors,
    x = train_x, y1 = train_y1, y2 = train_y2):
    
    ## formulate models
    model1 = GaussianNB(priors = priors[0])
    model2 = GaussianNB(priors = priors[1])

    ## fit models
    model1 = model1.fit(X = train_x, y = train_y1)
    model2 = model2.fit(X = train_x, y = train_y2, sample_weight = train_y1)
    
    ## return models
    return model1, model2

## execute modeling code, recycling as much as possible from logistic
naive_bayes_hyperparameters = build_or_cache_pickle(
    address = 'B_Process/naive_bayes_hyperparameters.pickle',
    function = find_generic_hparams,
    build_bool = settings['redo_hparam_naive_bayes']
)
naive_bayes_models = train_naive_bayes_models(
    priors = naive_bayes_hyperparameters['priors'])
naive_bayes_performance = evaluate_generic_model(
    naive_bayes_models, 'naive_bayes')
performance_stats = pd.concat(
    [naive_bayes_performance, performance_stats], axis = 0)

#### TRAI05 - build a two-stage random forest model

In [26]:
## define the new model
the_model = GridSearchCV( 
    estimator = RandomForestClassifier(class_weight = 'balanced'),
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {
        'n_estimators': [2**i for i in range(6, 10)],
        'min_samples_leaf': [2**i for i in range(2, 5)]
    }
)

## define new training function
def train_random_forest_models(n_estimators,  min_samples_leaf,
    x = train_x, y1 = train_y1, y2 = train_y2):
    
    ## formulate models
    model1 = RandomForestClassifier(
        n_estimators = n_estimators[0],
        min_samples_leaf = min_samples_leaf[0]
    )
    model2 = RandomForestClassifier(
        n_estimators = n_estimators[1],
        min_samples_leaf = min_samples_leaf[1]
    )

    ## fit models
    model1 = model1.fit(X = train_x, y = train_y1)
    model2 = model2.fit(X = train_x, y = train_y2, sample_weight = train_y1)
    
    ## return models
    return model1, model2

## execute modeling code, recycling as much as possible from logistic
random_forest_hyperparameters = build_or_cache_pickle(
    address = 'B_Process/random_forest_hyperparameters.pickle',
    function = find_generic_hparams,
    build_bool = settings['redo_hparam_random_forest']
)
random_forest_models = train_random_forest_models(
    n_estimators = random_forest_hyperparameters['n_estimators'],
    min_samples_leaf = random_forest_hyperparameters['min_samples_leaf']
)
random_forest_performance = evaluate_generic_model(
    random_forest_models, 'random_forest')
performance_stats = pd.concat(
    [random_forest_performance, performance_stats], axis = 0)

#### TRAI06 - build a two-stage AdaBoost model

In [27]:
## define the new model
the_model = GridSearchCV( 
    estimator = AdaBoostClassifier(base_estimator = None),
    cv = 5, 
    n_jobs = settings['num_parallel_cores'], 
    param_grid = {
        'n_estimators':  [int(2**i) for i in range(6, 10)],
        'learning_rate': [2**i for i in range(-2, 3)]
    }
)

## define new training function
def train_adaboost_models(n_estimators, learning_rate,
    x = train_x, y1 = train_y1, y2 = train_y2):
    
    ## formulate models
    model1 = AdaBoostClassifier(
        n_estimators = n_estimators[0],
        learning_rate = learning_rate[0]
    )
    model2 = AdaBoostClassifier(
        n_estimators = n_estimators[1],
        learning_rate = learning_rate[1]
    )

    ## fit models
    model1 = model1.fit(X = train_x, y = train_y1)
    model2 = model2.fit(X = train_x, y = train_y2, sample_weight = train_y1)
    
    ## return models
    return model1, model2

## execute modeling code, recycling as much as possible from logistic
adaboost_hyperparameters = build_or_cache_pickle(
    address = 'B_Process/adaboost_hyperparameters.pickle',
    function = find_generic_hparams,
    build_bool = settings['redo_hparam_adaboost']
)
adaboost_models = train_adaboost_models(
    n_estimators = adaboost_hyperparameters['n_estimators'].astype(int),
    learning_rate = adaboost_hyperparameters['learning_rate']
)
adaboost_performance = evaluate_generic_model(
    adaboost_models, 'adaboost')
performance_stats = pd.concat(
    [adaboost_performance, performance_stats], axis = 0)

#### TRAI07 - save performance results

In [28]:
## save performance results to disk
addr = 'B_Process/performance_stats{0}.csv'
if settings['use_pca']: addr = addr.format('_pca')
else: addr = addr.format('')
performance_stats.to_csv(addr)

In [None]:
print('TODO: Add prediction runtimes to hyperparameters, add f1 to random model')

In [29]:
##########==========##########==========##########==========##########==========

## FOOT - display useful statistics

In [30]:
user_data.ml_set.value_counts()

train      646
test       139
exclude     11
Name: ml_set, dtype: int64

In [31]:
performance_stats

Unnamed: 0,Unnamed: 1,Precision,Recall,F1
adaboost,train,1.0,0.821,0.902
adaboost,test,0.971,0.791,0.872
random_forest,train,1.0,0.858,0.923
random_forest,test,0.976,0.93,0.952
naive_bayes,train,1.0,0.826,0.905
naive_bayes,test,0.944,0.791,0.861
logistic,train,1.0,0.821,0.902
logistic,test,0.946,0.814,0.875
random,train,0.191,0.191,
random,test,0.191,0.156,
