<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"><li><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Manual-verification-of-predicted-results" data-toc-modified-id="Manual-verification-of-predicted-results-0.0.1"><span class="toc-item-num">0.0.1&nbsp;&nbsp;</span>Manual verification of predicted results</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import os, sys, re, json, nltk
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from scipy.sparse import hstack


# all LIWC fields
#ALL_FIELDS = [
#    'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 
#    'Sixltr', 'Dic', 'function', 'pronoun', 'ppron', 'i', 
#    'we', 'you', 'shehe', 'they', 'ipron', 'article', 'prep', 
#    'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'compare', 
#    'interrog', 'number', 'quant', 'affect', 'posemo', 'negemo', 
#    'anx', 'anger', 'sad', 'social', 'family', 'friend', 'female', 
#    'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 
#    'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 
#    'body', 'health', 'sexual', 'ingest', 'drives', 'affiliation', 
#    'achieve', 'power', 'reward', 'risk', 'focuspast', 'focuspresent', 
#    'focusfuture', 'relativ', 'motion', 'space', 'time', 'work', 
#    'leisure', 'home', 'money', 'relig', 'death', 'informal', 'swear', 
#    'netspeak', 'assent', 'nonflu', 'filler', 'AllPunc', 'Period', 
#    'Comma', 'Colon', 'SemiC', 'QMark', 'Exclam', 'Dash', 'Quote', 
#    'Apostro', 'Parenth', 'OtherP'
#]
# Subset of LIWC fields used as the default feature columns by get_liwc()
# when no explicit `fields` argument is given. The complete list of LIWC
# fields is kept, commented out, directly above for reference.
ALL_FIELDS = [
    "i", "we", "you", "they", "posemo", "negemo", "anx", "anger", "sad", "social",
    "family", "friend", "cogproc", "focuspast", "focuspresent", "focusfuture",
    "relig", "death"
]
    
    
def read_csv(f, index="id", parse_dates=None):
    """Read a csv file and return it as a data frame indexed by string ids.

    Args:
        f: path (or file-like object) handed to ``pd.read_csv``.
        index: name of the column to use as the index.
        parse_dates: forwarded unchanged to ``pd.read_csv`` (e.g. a list of
            column names to parse as datetimes).

    Returns:
        DataFrame whose index is the ``index`` column with every value
        converted to ``str``.
    """

    # ``infer_datetime_format`` is intentionally not passed: it is deprecated
    # since pandas 2.0 (where it has no effect) and was only ever a parsing
    # speed-up, so dropping it does not change the parsed values.
    df = pd.read_csv(f, encoding="ISO-8859-1", parse_dates=parse_dates)

    # build a new frame instead of mutating in place
    df = df.set_index(index)

    # ids are kept as strings, not integers
    df.index = df.index.map(str)
    return df


def get_liwc(df, fields=None, binary=False):
    """Return LIWC features as a 2-D numpy array.

    The 'posemo' and 'negemo' columns are replaced by overall-sentiment
    indicators before extraction: with ``sent = posemo - negemo``, 'posemo'
    becomes 1 where ``sent > 0`` (else 0) and 'negemo' becomes 1 where
    ``sent < 0`` (else 0). The input frame itself is not modified.

    Args:
        df: data frame containing at least 'posemo', 'negemo' and every
            column named in ``fields``.
        fields: list of column names to extract; defaults to ALL_FIELDS.
        binary: if True, every value is reduced to 1 (value > 0) or 0.

    Returns:
        numpy array of shape (len(df), len(fields)).
    """

    # work on a copy so the caller's frame is untouched (this also prevents
    # SettingWithCopyWarning when df is a slice of another frame)
    df2 = df.copy()

    # detect overall sentiment
    sent = df2['posemo'] - df2['negemo']
    df2.loc[:, 'posemo'] = np.where(sent > 0., 1, 0)
    df2.loc[:, 'negemo'] = np.where(sent < 0., 1, 0)

    # set fields
    if fields is None:
        fields = ALL_FIELDS

    # get matrix; DataFrame.as_matrix() was removed in pandas 1.0, whereas
    # .values is available in every pandas version
    X = df2[fields].values
    if binary:
        X = (X > 0).astype(int)
    print("X.shape: {}".format(X.shape))
    print("X: {}".format(X))
    return X


def add_liwc(X, df, fields=None, binary=False):
    """Append the LIWC feature columns extracted from df to the right of X.

    The LIWC block comes from get_liwc(df, fields, binary); the result is
    the horizontal (column-wise) stack of X and that block.
    """

    liwc_block = get_liwc(df, fields=fields, binary=binary)
    combined = hstack((X, liwc_block))
    return combined

In [2]:
# LIWC2015-annotated csv inputs
hmc_file = "data/hawaii_missile_crisis-all-labelled-LIWC2015.csv"
us_file = "data/us_tweets-20180113-LIWC2015.csv"
tex_file = "data/offenders-LIWC2015.csv"

# the two tweet files share the default "id" index and a 'created_at'
# timestamp column that is parsed on load
hmc_df, us_df = [read_csv(csv_path, parse_dates=['created_at'])
                 for csv_path in (hmc_file, us_file)]

# the offenders file is indexed by its "Execution #" column instead
tex_df = read_csv(tex_file, index="Execution #")

In [3]:
# build dataset: every tweet labelled mortality_salience == 1, plus the
# earliest len(ms_df)//2 tweets from each of two fixed time windows
ms_df = hmc_df[hmc_df['mortality_salience'] == 1].sort_values(by=['created_at'])
n_per_window = len(ms_df) // 2
created_at = hmc_df['created_at']

# first window: 2017-01-14 18:07:00 through the end of that day
# (an earlier variant of this cell ended the window at 18:21:00)
in_pre_window = ((created_at >= datetime(2017, 1, 14, 18, 7, 0)) &
                 (created_at <= datetime(2017, 1, 14, 23, 59, 59)))
pre_df = hmc_df[in_pre_window].sort_values(by=['created_at'])[:n_per_window]

# second window: 2018-03-03 18:07:00 through the end of that day
# (an earlier variant of this cell ended the window at 18:21:00)
in_post_window = ((created_at >= datetime(2018, 3, 3, 18, 7, 0)) &
                  (created_at <= datetime(2018, 3, 3, 23, 59, 59)))
post_df = hmc_df[in_post_window].sort_values(by=['created_at'])[:n_per_window]

# combine in chronological order and report the class balance
hi_df = pd.concat([pre_df, ms_df, post_df]).sort_values(by=['created_at'])
ms_label = hi_df['mortality_salience']
print("mortality_salience tweets: {}".format(len(hi_df[ms_label == 1])))
print("non-mortality_salience tweets: {}".format(len(hi_df[ms_label == 0])))

mortality_salience tweets: 488
non-mortality_salience tweets: 488


In [4]:
import operator
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score


def tweet_token_filter(x, pos=False, stem=False):
    """Clean a tweet and return its list of tokens.

    Processing order: strip URLs; tokenize with TweetTokenizer (lowercasing,
    dropping @handles, shortening repeated characters); replace every run of
    characters other than letters, digits, '-' and "'" with a space;
    tokenize again; optionally stem; optionally tag part of speech.

    Args:
        x: raw tweet text.
        pos: if True, each token is returned as "<POS tag>_<token>" using
            nltk.pos_tag. Tagging runs after stemming when both are set.
        stem: if True, each token is reduced with PorterStemmer.

    Returns:
        list of token strings.
    """

    # remove urls
    x = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', x).strip()

    # tokenize, lowercase, and remove handles
    tt = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    x = ' '.join(tt.tokenize(x))

    # remove all special characters; the pattern is a raw string because
    # '\-' in a normal string literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+). The compiled regex is unchanged.
    x = re.sub(r"[^A-Za-z0-9\-']+", ' ', x).strip()
    tokens = tt.tokenize(x)

    # do stemming
    if stem:
        ps = PorterStemmer()
        tokens = [ps.stem(token) for token in tokens]

    # extract part of speech; pos_tag yields (token, tag) pairs and the tag
    # is placed first in the emitted feature string
    if pos:
        tokens = ["{}_{}".format(tag, token) for token, tag in nltk.pos_tag(tokens)]

    return tokens


def _apply_liwc_mode(X_text, df, do_liwc, binary, stage):
    """Combine vectorized text features with LIWC features according to do_liwc.

    ``stage`` ("training" or "testing") is only used in the progress message.
    Raises RuntimeError for any do_liwc value other than True, False or "only".
    """
    if do_liwc is True:
        X_out = add_liwc(X_text, df, binary=binary)
        print("{} shape after LIWC features added: {}".format(stage, X_out.shape))
        return X_out
    if do_liwc is False:
        return X_text.toarray()
    if do_liwc == "only":
        return get_liwc(df, binary=binary)
    raise RuntimeError("Unknown value for do_liwc: {}".format(do_liwc))


def analyze(v, classifier, X_train, y_train, X_test, y_test, do_liwc=False,
            train_text_col='text', test_text_col='text', ms_test=False):
    """Vectorize, select features, cross-validate, train and predict.

    Steps: fit ``v`` on the training text; keep the features selected by an
    L1-penalised LinearSVC; optionally append the class label (``ms_test``)
    and/or LIWC features; run 10-fold cross validation; fit the classifier
    on the full training set; prepare the test set the same way and predict.

    Args:
        v: unfitted vectorizer exposing fit_transform/transform,
            ``vocabulary_`` and ``binary``.
        classifier: factory called as ``classifier(None)`` that returns a
            fresh, unfitted estimator.
        X_train: training data frame holding ``train_text_col`` (and the
            LIWC columns when do_liwc is True or "only").
        y_train: training labels.
        X_test: test data frame holding ``test_text_col`` (and the LIWC
            columns when do_liwc is True or "only").
        y_test: test labels, or None to skip scoring.
        do_liwc: False for text features only, True for text plus LIWC
            features, "only" for LIWC features alone.
        train_text_col: name of the text column in X_train.
        test_text_col: name of the text column in X_test.
        ms_test: debugging aid; appends the class label itself as a feature
            (random labels are generated for the test set if y_test is None).

    Returns:
        Tuple ``(v, clf, y_test_predicted, score, vocab_weights)``. ``score``
        is the average precision score, or None when y_test is None.
        ``vocab_weights`` is a list of (word, weight) pairs with positive
        feature-selector weight sorted ascending; it is empty when y_test is
        None or do_liwc == "only".

    Raises:
        RuntimeError: if do_liwc is not True, False or "only".
    """

    # transform training tweet text to vector
    X_train_transformed = v.fit_transform(X_train[train_text_col])
    print("training shape after vectorization: {}".format(X_train_transformed.shape))

    # feature selection: the L1 penalty drives most coefficients to zero and
    # SelectFromModel keeps the features the fitted model deems important
    fs_clf = LinearSVC(C=0.05, penalty="l1", dual=False)
    fs_clf = fs_clf.fit(X_train_transformed, y_train)
    fs_model = SelectFromModel(fs_clf, prefit=True)
    X_train_transformed = fs_model.transform(X_train_transformed)
    print("training shape after feature selection: {}".format(X_train_transformed.shape))

    # add ms_test feature which is the class label; use to test that
    # machinery in this function is working; we expect 100% accuracy if
    # ms_test is set
    if ms_test:
        # np.asarray replaces Series.as_matrix(), removed in pandas 1.0
        X_train_transformed = hstack((X_train_transformed,
                                      np.reshape(np.asarray(y_train), (-1, 1))))
        print("training shape after ms_test added: {}".format(X_train_transformed.shape))

    # add LIWC features to training vector or not or use it only
    X_train_transformed = _apply_liwc_mode(X_train_transformed, X_train, do_liwc,
                                           v.binary, "training")

    # perform 10-fold cross validation
    clf = classifier(None)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train,
                                scoring="accuracy", cv=10)
    print("cv_scores: {}".format(cv_scores))
    print("cv_scores mean: {}".format(cv_scores.mean()))

    # train classifier on the full training set with a fresh estimator
    clf = classifier(None)
    clf.fit(X_train_transformed, y_train)

    # transform test tweet text to vector
    X_test_transformed = v.transform(X_test[test_text_col])
    print("testing shape after vectorization: {}".format(X_test_transformed.shape))

    # use selected features
    X_test_transformed = fs_model.transform(X_test_transformed)
    print("testing shape after feature_selection: {}".format(X_test_transformed.shape))

    # add ms_test feature (see the training-side comment above)
    if ms_test:
        if y_test is None:
            # randint's upper bound is exclusive, so (0, 2) draws from {0, 1};
            # it replaces the deprecated np.random.random_integers(0, 1, ...)
            y_test = np.random.randint(0, 2, (X_test_transformed.shape[0], 1))
            X_test_transformed = hstack((X_test_transformed, y_test))
        else:
            X_test_transformed = hstack((X_test_transformed,
                                         np.reshape(np.asarray(y_test), (-1, 1))))
        print("testing shape after ms_test added: {}".format(X_test_transformed.shape))
    if y_test is not None:
        print("y_test: {}".format(y_test))
        print("y_test sum: {}".format(y_test.sum()))

    # add LIWC features to test vector or not or use it only
    X_test_transformed = _apply_liwc_mode(X_test_transformed, X_test, do_liwc,
                                          v.binary, "testing")

    # predict on test
    y_test_predicted = clf.predict(X_test_transformed)
    print(np.sum(y_test_predicted))

    # defaults cover the unscored case and the LIWC-only case, which
    # previously left vocab_weights unbound at the return statement
    score = None
    vocab_weights = []

    if y_test is not None:
        # print reports
        score = average_precision_score(y_test, y_test_predicted)
        print("average_precision_score: {}".format(score))
        print(classification_report(y_test, y_test_predicted))

        # vocabulary weights: fs_clf was fit on the full vectorized matrix,
        # so vocabulary_ indices address its coefficient columns directly
        if do_liwc != "only":
            vocab_weights = sorted(
                [(word, fs_clf.coef_[0, index])
                 for word, index in v.vocabulary_.items()
                 if fs_clf.coef_[0, index] > 0.],
                key=lambda pair: pair[1])

    return v, clf, y_test_predicted, score, vocab_weights


def count_vec(tokenizer, ngram, binary):
    """Build a CountVectorizer over 1..ngram-grams with English stop words."""
    options = {
        "tokenizer": tokenizer,
        "stop_words": "english",
        "ngram_range": (1, ngram),
        "binary": binary,
    }
    return CountVectorizer(**options)


def tfidf_vec(tokenizer, ngram, binary):
    """Build a TfidfVectorizer over 1..ngram-grams with English stop words.

    Terms appearing in more than 90% of documents or in fewer than 3
    documents are dropped (max_df=0.9, min_df=3).
    """
    options = {
        "tokenizer": tokenizer,
        "stop_words": "english",
        "max_df": 0.9,
        "min_df": 3,
        "ngram_range": (1, ngram),
        "binary": binary,
    }
    return TfidfVectorizer(**options)


def rank_clf_methods(tokenizers, vectorizers, classifiers, X_train, y_train,
                     X_test, y_test):
    """Run analyze() over every configuration and rank the runs by score.

    The grid is the cross product of tokenizer, vectorizer, classifier,
    LIWC on/off, binary on/off and n-gram size (currently only 1).

    Args:
        tokenizers: dict mapping name -> tokenizer callable.
        vectorizers: dict mapping name -> factory called as
            ``factory(tokenizer, ngram, binary)``.
        classifiers: dict mapping name -> classifier factory passed to analyze().
        X_train, y_train, X_test, y_test: forwarded unchanged to analyze().

    Returns:
        List of ``[score, info]`` pairs sorted by score, highest first, where
        ``info`` is a dict describing the run ('id', 'score', 'vocab_weights',
        'tok', 'clf', 'vec', 'liwc', 'binary', 'ngram'). Runs whose score is
        None (analyze() returns None when y_test is None) are placed last.
    """
    from itertools import product

    # analyze() also accepts liwc="only"; it is not part of the search grid
    liwc_modes = [False, True]
    binary_modes = [False, True]
    ngram_sizes = range(1, 2)

    clfs = []
    # product() iterates in the same order as the equivalent nested loops
    for tok_type, vec_type, clf_type, liwc, binary, ngram in product(
            tokenizers, vectorizers, classifiers,
            liwc_modes, binary_modes, ngram_sizes):
        print("#" * 80)
        print("tok: {}".format(tok_type))
        print("vec: {}".format(vec_type))
        print("clf: {}".format(clf_type))
        print("liwc: {}".format(liwc))
        print("binary: {}".format(binary))
        print("ngram: {}".format(ngram))

        vectorizer = vectorizers[vec_type](tokenizers[tok_type], ngram, binary)
        v, clf, y_pred, score, vocab_weights = analyze(
            vectorizer, classifiers[clf_type], X_train, y_train,
            X_test, y_test, liwc)

        # named run_id so the builtin id() is not shadowed
        run_id = '{}_{}_{}_liwc-{}_binary-{}_ngram-{}'.format(
            tok_type, vec_type, clf_type, liwc, binary, ngram)
        clfs.append([score, {
            'id': run_id,
            'score': score,
            'vocab_weights': vocab_weights,
            'tok': tok_type,
            'clf': clf_type,
            'vec': vec_type,
            'liwc': liwc,
            'binary': binary,
            'ngram': ngram,
        }])

    def _rank_key(entry):
        # None is not orderable in Python 3; the leading flag sorts scored
        # runs ahead of unscored ones under reverse=True
        run_score = entry[0]
        if run_score is None:
            return (False, 0.0)
        return (True, run_score)

    return sorted(clfs, key=_rank_key, reverse=True)

    
# name -> tokenizer callable taking the raw tweet text
tokenizers = dict(
    tt=lambda text: tweet_token_filter(text),
    pos=lambda text: tweet_token_filter(text, pos=True),
    stem=lambda text: tweet_token_filter(text, stem=True),
)

# name -> vectorizer factory called as factory(tokenizer, ngram, binary)
vectorizers = dict(
    countvec=count_vec,
    tfidfvec=tfidf_vec,
)

# name -> classifier factory; the single argument is ignored and exists
# only because analyze() calls each factory as classifier(None)
classifiers = dict(
    logreg=lambda _unused: LogisticRegression(),
    svm=lambda _unused: SVC(kernel="linear"),
    nb_bern=lambda _unused: BernoulliNB(),
    nb_mult=lambda _unused: MultinomialNB(),
)

In [5]:
# stratified train/test split on the mortality_salience label, with a fixed
# random_state so the split is repeatable
ms_labels = hi_df['mortality_salience']
X_train, X_test, y_train, y_test = train_test_split(
    hi_df, ms_labels, stratify=ms_labels, random_state=69)

# report the class balance of each split
train_is_ms = X_train['mortality_salience']
test_is_ms = X_test['mortality_salience']
print("mortality_salience tweets in train: {}".format(len(X_train[train_is_ms == 1])))
print("non-mortality_salience tweets in train: {}".format(len(X_train[train_is_ms == 0])))
print("mortality_salience tweets in test: {}".format(len(X_test[test_is_ms == 1])))
print("non-mortality_salience tweets in test: {}".format(len(X_test[test_is_ms == 0])))

# evaluate every tokenizer/vectorizer/classifier configuration
clfs = rank_clf_methods(tokenizers, vectorizers, classifiers,
                        X_train, y_train, X_test, y_test)

mortality_salience tweets in train: 366
non-mortality_salience tweets in train: 366
mortality_salience tweets in test: 122
non-mortality_salience tweets in test: 122
################################################################################
tok: tt
vec: countvec
clf: logreg
liwc: False
binary: False
ngram: 1
training shape after vectorization: (732, 1904)
training shape after feature selection: (732, 18)
cv_scores: [ 0.7027027   0.89189189  0.77027027  0.72972973  0.7027027   0.72972973
  0.76388889  0.70833333  0.69444444  0.77777778]
cv_scores mean: 0.7471471471471471
testing shape after vectorization: (244, 1904)
testing shape after feature_selection: (244, 18)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    

cv_scores: [ 0.74324324  0.83783784  0.75675676  0.77027027  0.75675676  0.72972973
  0.80555556  0.75        0.72222222  0.81944444]
cv_scores mean: 0.7691816816816817
testing shape after vectorization: (244, 1904)
testing shape after feature_selection: (244, 18)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856    1
952242727025963008    1
952243123970699264    1
970012263993303040    0
952244857447198720    1
952244027981668352    1
952241167680155648    1
969997888397893632    0
952244798773125120    1
970011859146297344    0
969998524199784448    0
952244489329958912    1
952241731398963200    1
    

training shape after vectorization: (732, 1904)
training shape after feature selection: (732, 18)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 36)
cv_scores: [ 0.71621622  0.85135135  0.68918919  0.77027027  0.71621622  0.71621622
  0.77777778  0.69444444  0.73611111  0.81944444]
cv_scores mean: 0.7487237237237239
testing shape after vectorization: (244, 1904)
testing shape after feature_selection: (244, 18)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 1904)
training shape after feature selection: (732, 18)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 36)
cv_scores: [ 0.74324324  0.7972973   0.77027027  0.78378378  0.77027027  0.77027027
  0.76388889  0.70833333  0.70833333  0.80555556]
cv_scores mean: 0.7621246246246245
testing shape after vectorization: (244, 1904)
testing shape after feature_selection: (244, 18)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 1904)
training shape after feature selection: (732, 18)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 36)
cv_scores: [ 0.77027027  0.81081081  0.75675676  0.7972973   0.77027027  0.77027027
  0.77777778  0.72222222  0.69444444  0.81944444]
cv_scores mean: 0.7689564564564565
testing shape after vectorization: (244, 1904)
testing shape after feature_selection: (244, 18)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 300)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.64864865  0.75675676  0.7027027   0.74324324  0.77027027  0.74324324
  0.75        0.73611111  0.68055556  0.77777778]
cv_scores mean: 0.730930930930931
testing shape after vectorization: (244, 300)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856   

training shape after vectorization: (732, 300)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.66216216  0.75675676  0.71621622  0.71621622  0.71621622  0.7027027
  0.76388889  0.69444444  0.73611111  0.81944444]
cv_scores mean: 0.7284159159159159
testing shape after vectorization: (244, 300)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856   

training shape after vectorization: (732, 300)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.66216216  0.7972973   0.71621622  0.77027027  0.75675676  0.71621622
  0.76388889  0.66666667  0.68055556  0.81944444]
cv_scores mean: 0.7349474474474473
testing shape after vectorization: (244, 300)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 300)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.60810811  0.72972973  0.66216216  0.77027027  0.71621622  0.71621622
  0.73611111  0.625       0.65277778  0.80555556]
cv_scores mean: 0.7022147147147147
testing shape after vectorization: (244, 300)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 2466)
training shape after feature selection: (732, 30)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 48)
cv_scores: [ 0.83783784  0.89189189  0.78378378  0.86486486  0.82432432  0.81081081
  0.80555556  0.76388889  0.75        0.91666667]
cv_scores mean: 0.8249624624624625
testing shape after vectorization: (244, 2466)
testing shape after feature_selection: (244, 30)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 2466)
training shape after feature selection: (732, 30)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 48)
cv_scores: [ 0.85135135  0.91891892  0.7972973   0.82432432  0.86486486  0.7972973
  0.83333333  0.76388889  0.79166667  0.90277778]
cv_scores mean: 0.8345720720720721
testing shape after vectorization: (244, 2466)
testing shape after feature_selection: (244, 30)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
95224397351858585

training shape after vectorization: (732, 2466)
training shape after feature selection: (732, 30)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 48)
cv_scores: [ 0.78378378  0.83783784  0.77027027  0.85135135  0.83783784  0.74324324
  0.80555556  0.73611111  0.72222222  0.86111111]
cv_scores mean: 0.7949324324324325
testing shape after vectorization: (244, 2466)
testing shape after feature_selection: (244, 30)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 2466)
training shape after feature selection: (732, 30)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 48)
cv_scores: [ 0.81081081  0.85135135  0.74324324  0.82432432  0.86486486  0.7027027
  0.83333333  0.70833333  0.72222222  0.875     ]
cv_scores mean: 0.7936186186186187
testing shape after vectorization: (244, 2466)
testing shape after feature_selection: (244, 30)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
95224397351858585

ngram: 1
training shape after vectorization: (732, 439)
training shape after feature selection: (732, 3)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 21)
cv_scores: [ 0.72972973  0.71621622  0.66216216  0.77027027  0.74324324  0.74324324
  0.70833333  0.65277778  0.72222222  0.70833333]
cv_scores mean: 0.7156531531531531
testing shape after vectorization: (244, 439)
testing shape after feature_selection: (244, 3)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
95224397351

training shape after vectorization: (732, 439)
training shape after feature selection: (732, 3)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 21)
cv_scores: [ 0.71621622  0.71621622  0.67567568  0.74324324  0.75675676  0.67567568
  0.70833333  0.61111111  0.69444444  0.79166667]
cv_scores mean: 0.708933933933934
testing shape after vectorization: (244, 439)
testing shape after feature_selection: (244, 3)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856   

training shape after vectorization: (732, 439)
training shape after feature selection: (732, 3)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 21)
cv_scores: [ 0.72972973  0.7027027   0.64864865  0.77027027  0.71621622  0.68918919
  0.70833333  0.625       0.69444444  0.79166667]
cv_scores mean: 0.7076201201201202
testing shape after vectorization: (244, 439)
testing shape after feature_selection: (244, 3)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 439)
training shape after feature selection: (732, 3)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 21)
cv_scores: [ 0.66216216  0.68918919  0.62162162  0.7972973   0.68918919  0.67567568
  0.70833333  0.54166667  0.65277778  0.72222222]
cv_scores mean: 0.6760135135135135
testing shape after vectorization: (244, 439)
testing shape after feature_selection: (244, 3)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 1713)
training shape after feature selection: (732, 21)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 39)
cv_scores: [ 0.82432432  0.89189189  0.77027027  0.83783784  0.82432432  0.7972973
  0.81944444  0.73611111  0.72222222  0.83333333]
cv_scores mean: 0.8057057057057058
testing shape after vectorization: (244, 1713)
testing shape after feature_selection: (244, 21)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
95224397351858585

training shape after vectorization: (732, 1713)
training shape after feature selection: (732, 21)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 39)
cv_scores: [ 0.81081081  0.87837838  0.74324324  0.85135135  0.82432432  0.77027027
  0.80555556  0.75        0.70833333  0.84722222]
cv_scores mean: 0.798948948948949
testing shape after vectorization: (244, 1713)
testing shape after feature_selection: (244, 21)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
95224397351858585

training shape after vectorization: (732, 1713)
training shape after feature selection: (732, 21)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 39)
cv_scores: [ 0.75675676  0.83783784  0.7972973   0.81081081  0.7972973   0.78378378
  0.80555556  0.73611111  0.70833333  0.86111111]
cv_scores mean: 0.7894894894894895
testing shape after vectorization: (244, 1713)
testing shape after feature_selection: (244, 21)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 1713)
training shape after feature selection: (732, 21)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 39)
cv_scores: [ 0.78378378  0.83783784  0.78378378  0.85135135  0.7972973   0.75675676
  0.79166667  0.75        0.69444444  0.84722222]
cv_scores mean: 0.7894144144144144
testing shape after vectorization: (244, 1713)
testing shape after feature_selection: (244, 21)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
9522439735185858

training shape after vectorization: (732, 340)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.7027027   0.72972973  0.66216216  0.77027027  0.77027027  0.71621622
  0.76388889  0.72222222  0.73611111  0.79166667]
cv_scores mean: 0.7365240240240241
testing shape after vectorization: (244, 340)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 340)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.75675676  0.78378378  0.64864865  0.7972973   0.77027027  0.72972973
  0.76388889  0.70833333  0.70833333  0.81944444]
cv_scores mean: 0.7486486486486487
testing shape after vectorization: (244, 340)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 340)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.66216216  0.71621622  0.67567568  0.78378378  0.77027027  0.71621622
  0.75        0.69444444  0.70833333  0.86111111]
cv_scores mean: 0.7338213213213212
testing shape after vectorization: (244, 340)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856  

training shape after vectorization: (732, 340)
training shape after feature selection: (732, 7)
X.shape: (732, 18)
X: [[0 0 1 ..., 0 1 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]]
training shape after LIWC features added: (732, 25)
cv_scores: [ 0.62162162  0.68918919  0.63513514  0.77027027  0.75675676  0.7027027
  0.72222222  0.61111111  0.65277778  0.79166667]
cv_scores mean: 0.6953453453453454
testing shape after vectorization: (244, 340)
testing shape after feature_selection: (244, 7)
y_test: id
970004318572204033    0
952243493287575553    1
820353527054618624    0
969998694165565440    0
820349256791793664    0
952243756249497600    1
820349864001212416    0
952243004324003845    1
970010662368718848    0
969999048185856000    0
820351039228542976    0
952241362463727616    1
952242684520886272    1
952242358749339648    1
970003413793521664    0
820349443387949057    0
970011124505587712    0
952243973518585856   

In [6]:
# Show the configuration that is ranked first in clfs, i.e. the top
# classification method in terms of score. clfs is iterated below as
# (eval_score, info) pairs, so clfs[0][1] is that entry's info dict.
top_clf = clfs[0][1]
print(top_clf)

# For every ranked configuration: rebuild its vectorizer/classifier, train on
# the labelled Hawaii tweets only, and predict mortality_salience on the US
# tweets. For each run we keep the number of positive predictions together
# with the evaluation score the configuration obtained earlier.
us_clfs = []
for eval_score, info in clfs:
    # NOTE(review): the None argument presumably stands for the (unknown) test
    # labels of the US tweets -- confirm against analyze()'s signature.
    v, clf, y_us_pred, score, vocab_weights = \
        analyze(vectorizers[info['vec']](tokenizers[info['tok']],
                                         info['ngram'], info['binary']),
                classifiers[info['clf']], hi_df, hi_df['mortality_salience'],
                us_df, None, do_liwc=info['liwc'])#, ms_test=True)
    print("y_us_pred: {}".format(y_us_pred))
    print("total us tweets: {}".format(len(y_us_pred)))
    # Summing the predictions counts the tweets predicted as positive.
    ms_count = np.sum(y_us_pred)
    # The count is taken over the US-tweet predictions, so the message names
    # the US set (it previously said "hawaii", which is the training set).
    print("predicted mortality_salience in us: {}".format(ms_count))
    # Entry layout relied on by the sorting cells: [count, eval_score, details]
    us_clfs.append([ms_count, eval_score, { 'info': info,
                                            'vectorizer': v,
                                            'classifier': clf,
                                            'y_pred': y_us_pred,
                                            'eval_score': eval_score,
                                            'count': ms_count,
                                            'vocab_weights': vocab_weights }])

{'id': 'pos_countvec_logreg_liwc-True_binary-True_ngram-1', 'score': 0.74837478801582824, 'vocab_weights': [('VBZ_is', 0.066009787114163165), ('NN_hawaii', 0.086882415773714958), ('JJ_real', 0.090620400368584667), ('NN_everyone', 0.097785201298334179), ('JJ_ballistic', 0.098726857391658457), ('DT_all', 0.10621920447925723), ('PRP_we', 0.18407143223052258), ('RB_just', 0.20796172039655209), ('VBD_got', 0.21729562292510352), ('NN_threat', 0.27079149352628051), ('JJ_safe', 0.29440384074143044), ('NNS_sirens', 0.31508083649014856), ('WP_what', 0.32797840538376316), ('NN_wtf', 0.33051636961481712), ('DT_this', 0.4942601231867732), ('NN_missile', 0.78522861221732776)], 'tok': 'pos', 'clf': 'logreg', 'vec': 'countvec', 'liwc': True, 'binary': True, 'ngram': 1}
training shape after vectorization: (976, 2983)
training shape after feature selection: (976, 42)
X.shape: (976, 18)
X: [[1 0 1 ..., 1 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 .

training shape after vectorization: (976, 2983)
training shape after feature selection: (976, 42)
cv_scores: [ 0.78571429  0.70408163  0.85714286  0.83673469  0.83673469  0.81632653
  0.81632653  0.78571429  0.75        0.86458333]
cv_scores mean: 0.8053358843537415
testing shape after vectorization: (7584, 2983)
testing shape after feature_selection: (7584, 42)
1067
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 1067
training shape after vectorization: (976, 2051)
training shape after feature selection: (976, 30)
X.shape: (976, 18)
X: [[1 0 1 ..., 1 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
training shape after LIWC features added: (976, 48)
cv_scores: [ 0.76530612  0.74489796  0.82653061  0.80612245  0.76530612  0.80612245
  0.78571429  0.78571429  0.85416667  0.89583333]
cv_scores mean: 0.8035714285714286
testing shape after vectorization: (7584, 2051)
testing shape af

training shape after vectorization: (976, 2051)
training shape after feature selection: (976, 30)
X.shape: (976, 18)
X: [[1 0 1 ..., 1 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
training shape after LIWC features added: (976, 48)
cv_scores: [ 0.76530612  0.70408163  0.7755102   0.82653061  0.75510204  0.84693878
  0.84693878  0.70408163  0.77083333  0.875     ]
cv_scores mean: 0.78703231292517
testing shape after vectorization: (7584, 2051)
testing shape after feature_selection: (7584, 30)
X.shape: (7584, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
testing shape after LIWC features added: (7584, 48)
1082
y_us_pred: [0 0 0 ..., 0 0 1]
total us tweets: 7584
predicted mortality_salience in hawaii: 1082
training shape after vectorization: (976, 2983)
training shape after feature selection: (976, 44)
cv_scores: [ 0.80612245  0.70408

cv_scores: [ 0.7755102   0.70408163  0.76530612  0.82653061  0.76530612  0.85714286
  0.7755102   0.81632653  0.82291667  0.875     ]
cv_scores mean: 0.7983630952380952
testing shape after vectorization: (7584, 435)
testing shape after feature_selection: (7584, 13)
X.shape: (7584, 18)
X: [[ 11.11   0.     0.   ...,   0.     0.     0.  ]
 [  0.     5.26   0.   ...,   0.     0.     0.  ]
 [  0.     0.     4.55 ...,   0.     0.     0.  ]
 ..., 
 [  4.55   0.     4.55 ...,   0.     0.     0.  ]
 [ 19.05   0.     0.   ...,   0.     0.     0.  ]
 [ 11.11   0.     0.   ...,   0.     0.     0.  ]]
testing shape after LIWC features added: (7584, 31)
933
y_us_pred: [0 0 0 ..., 0 0 1]
total us tweets: 7584
predicted mortality_salience in hawaii: 933
training shape after vectorization: (976, 2301)
training shape after feature selection: (976, 28)
cv_scores: [ 0.71428571  0.70408163  0.7755102   0.78571429  0.73469388  0.7244898
  0.75510204  0.69387755  0.78125     0.86458333]
cv_scores mean: 0.75

training shape after vectorization: (976, 2301)
training shape after feature selection: (976, 28)
cv_scores: [ 0.74489796  0.76530612  0.75510204  0.7755102   0.69387755  0.75510204
  0.75510204  0.67346939  0.75        0.875     ]
cv_scores mean: 0.7543367346938774
testing shape after vectorization: (7584, 2301)
testing shape after feature_selection: (7584, 28)
931
y_us_pred: [0 0 1 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 931
training shape after vectorization: (976, 552)
training shape after feature selection: (976, 10)
X.shape: (976, 18)
X: [[ 16.67   0.    16.67 ...,  16.67   0.     0.  ]
 [  0.     0.    14.29 ...,   0.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 ..., 
 [  0.     0.     0.   ...,   5.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 [ 22.22   0.     0.   ...,   0.     0.     0.  ]]
training shape after LIWC features added: (976, 28)
cv_scores: [ 0.81632653  0.65306122  0.76530612  0.75510

training shape after vectorization: (976, 2301)
training shape after feature selection: (976, 28)
cv_scores: [ 0.71428571  0.70408163  0.74489796  0.7755102   0.74489796  0.74489796
  0.75510204  0.69387755  0.8125      0.85416667]
cv_scores mean: 0.754421768707483
testing shape after vectorization: (7584, 2301)
testing shape after feature_selection: (7584, 28)
1068
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 1068
training shape after vectorization: (976, 435)
training shape after feature selection: (976, 13)
X.shape: (976, 18)
X: [[ 16.67   0.    16.67 ...,  16.67   0.     0.  ]
 [  0.     0.    14.29 ...,   0.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 ..., 
 [  0.     0.     0.   ...,   5.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 [ 22.22   0.     0.   ...,   0.     0.     0.  ]]
training shape after LIWC features added: (976, 31)
cv_scores: [ 0.73469388  0.62244898  0.7244898   0.7959

training shape after vectorization: (976, 401)
training shape after feature selection: (976, 11)
X.shape: (976, 18)
X: [[1 0 1 ..., 1 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
training shape after LIWC features added: (976, 29)
cv_scores: [ 0.73469388  0.64285714  0.71428571  0.75510204  0.66326531  0.74489796
  0.7755102   0.67346939  0.71875     0.78125   ]
cv_scores mean: 0.7204081632653061
testing shape after vectorization: (7584, 401)
testing shape after feature_selection: (7584, 11)
X.shape: (7584, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
testing shape after LIWC features added: (7584, 29)
880
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 880
training shape after vectorization: (976, 552)
training shape after feature selection: (976, 8)
X.shape: (976, 18)
X: [[1 0 1 ..., 1

training shape after vectorization: (976, 401)
training shape after feature selection: (976, 12)
cv_scores: [ 0.68367347  0.60204082  0.71428571  0.74489796  0.71428571  0.71428571
  0.7244898   0.67346939  0.75        0.80208333]
cv_scores mean: 0.7123511904761904
testing shape after vectorization: (7584, 401)
testing shape after feature_selection: (7584, 12)
830
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 830
training shape after vectorization: (976, 401)
training shape after feature selection: (976, 11)
cv_scores: [ 0.68367347  0.60204082  0.71428571  0.74489796  0.71428571  0.70408163
  0.7244898   0.66326531  0.75        0.80208333]
cv_scores mean: 0.7103103741496597
testing shape after vectorization: (7584, 401)
testing shape after feature_selection: (7584, 11)
840
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 840
training shape after vectorization: (976, 552)
training shape after featur

testing shape after vectorization: (7584, 552)
testing shape after feature_selection: (7584, 8)
772
y_us_pred: [0 1 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in hawaii: 772
training shape after vectorization: (976, 401)
training shape after feature selection: (976, 11)
X.shape: (976, 18)
X: [[1 0 1 ..., 1 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
training shape after LIWC features added: (976, 29)
cv_scores: [ 0.75510204  0.6122449   0.70408163  0.69387755  0.66326531  0.78571429
  0.75510204  0.67346939  0.67708333  0.71875   ]
cv_scores mean: 0.7038690476190476
testing shape after vectorization: (7584, 401)
testing shape after feature_selection: (7584, 11)
X.shape: (7584, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
testing shape after LIWC features added: (7584, 29)
1285
y_us_pred: [0 1 0 ..., 0 0 1]
t

In [7]:
# Order the US prediction runs from the fewest to the most positive
# predictions; element 0 of every us_clfs entry is that count.
us_clfs_by_count = sorted(us_clfs, key=lambda run: run[0])

# Unpack the run with the lowest count: [count, eval_score, details].
lc_count, lc_score, lc_details = us_clfs_by_count[0]
lc_info = lc_details['info']
lc_pred = lc_details['y_pred']
print(lc_count, lc_score, lc_info)

# Attach that run's predictions to the US tweets and export the positives.
us_df['mortality_salience'] = lc_pred
lc_positive_rows = us_df['mortality_salience'] == 1
us_df[lc_positive_rows].to_csv('data/us_tweets-predicted_mortality_salience-lowcount.csv')

48 0.619789227166 {'id': 'tt_tfidfvec_nb_mult_liwc-False_binary-True_ngram-1', 'score': 0.61978922716627638, 'vocab_weights': [('sirens', 0.073817342805066119), ('safe', 0.080824836444753342), ('hawaii', 0.13651558923538695), ('real', 0.15638596578254021), ('got', 0.22582891326568416), ('wtf', 0.31107812142606106), ('missile', 1.6373633983606282)], 'tok': 'tt', 'clf': 'nb_mult', 'vec': 'tfidfvec', 'liwc': False, 'binary': True, 'ngram': 1}


In [8]:
# Order the US prediction runs from the highest to the lowest evaluation
# score; element 1 of every us_clfs entry is that score.
us_clfs_by_score = sorted(us_clfs, key=lambda run: run[1], reverse=True)

# Unpack the best-scoring run: [count, eval_score, details].
hs_count, hs_score, hs_details = us_clfs_by_score[0]
hs_info = hs_details['info']
hs_pred = hs_details['y_pred']
print(hs_count, hs_score, hs_info)

# Attach that run's predictions to the US tweets and export the positives.
us_df['mortality_salience'] = hs_pred
hs_positive_rows = us_df['mortality_salience'] == 1
us_df[hs_positive_rows].to_csv('data/us_tweets-predicted_mortality_salience-hiscore.csv')
# NOTE: a per-run listing of counts in score order was sketched here but never
# enabled; to print one, loop over us_clfs_by_score and read entry[2]['count'].


1336 0.748374788016 {'id': 'pos_countvec_logreg_liwc-True_binary-True_ngram-1', 'score': 0.74837478801582824, 'vocab_weights': [('VBZ_is', 0.066009787114163165), ('NN_hawaii', 0.086882415773714958), ('JJ_real', 0.090620400368584667), ('NN_everyone', 0.097785201298334179), ('JJ_ballistic', 0.098726857391658457), ('DT_all', 0.10621920447925723), ('PRP_we', 0.18407143223052258), ('RB_just', 0.20796172039655209), ('VBD_got', 0.21729562292510352), ('NN_threat', 0.27079149352628051), ('JJ_safe', 0.29440384074143044), ('NNS_sirens', 0.31508083649014856), ('WP_what', 0.32797840538376316), ('NN_wtf', 0.33051636961481712), ('DT_this', 0.4942601231867732), ('NN_missile', 0.78522861221732776)], 'tok': 'pos', 'clf': 'logreg', 'vec': 'countvec', 'liwc': True, 'binary': True, 'ngram': 1}


In [16]:
# Assemble the mortality-salience training set (labelled Texas data + labelled
# Hawaii crisis data as one group, an equally sized control sample labelled 0
# as the other), split it into train/test, and rank the classification methods.

# One-off preparation that produced the labelled Texas file (kept for reference):
#   print(len(tex_df))
#   tex_df = tex_df[tex_df['Last Statement'].notnull()]
#   tex_df.to_csv(tex_labelled_file)
# NOTE: manually label mortality_salience, set id and text fields
tex_labelled_file = "data/offenders-LIWC2015-clean_and_labelled.csv"

# NOTE(review): this cell no longer needs deepcopy; the import is kept only in
# case later cells rely on the name. It belongs in the imports cell at the top.
from copy import deepcopy

# Columns shared by every frame concatenated below: text, label, LIWC fields.
# ALL_FIELDS is a flat list of column names, so a plain list copy is sufficient.
fields = ['text', 'mortality_salience'] + list(ALL_FIELDS)

# open up labelled texas data, formatted for merging
tex_df = read_csv(tex_labelled_file, parse_dates=['created_at'])[fields]
print("labelled texas data: {}".format(len(tex_df)))

# merge labelled texas data with labelled hawaii crisis data
merged_ms_df = pd.concat([tex_df, ms_df[fields]])
print("merged mortality texas and hi data: {}".format(len(merged_ms_df)))
print("mortality_salience in merged: {}".format(len(merged_ms_df[merged_ms_df['mortality_salience'] == 1])))
print("non-mortality_salience in merged: {}".format(len(merged_ms_df[merged_ms_df['mortality_salience'] == 0])))

# get control data: draw from two separate days so that, together, the control
# sample has exactly as many rows as merged_ms_df. The second draw takes the
# remainder, so an odd-sized merged set no longer ends up one control row short
# (for an even size both draws are len(merged_ms_df)//2, as before).
n_ctrl_first = len(merged_ms_df) // 2
n_ctrl_second = len(merged_ms_df) - n_ctrl_first
ctrl1_df = hmc_df[(hmc_df['created_at'] >= datetime(2017, 1, 14, 0, 0, 0)) &
                  (hmc_df['created_at'] <= datetime(2017, 1, 14, 23, 59, 59))].sample(n_ctrl_first, random_state=18)
ctrl2_df = hmc_df[(hmc_df['created_at'] >= datetime(2018, 3, 3, 0, 0, 0)) &
                  (hmc_df['created_at'] <= datetime(2018, 3, 3, 23, 59, 59))].sample(n_ctrl_second, random_state=18)
ctrl_df = pd.concat([ctrl1_df, ctrl2_df]).sort_values(by=['created_at'])[fields]
#ctrl_df = ctrl_df[(ctrl_df['WC'] >= tex_mean)]
# every control row is labelled 0
ctrl_df['mortality_salience'] = 0
print("labelled control data: {}".format(len(ctrl_df)))

# full training frame = merged labelled data + control data; also saved to disk
ms_train_df = pd.concat([merged_ms_df, ctrl_df])
print("labelled training data: {}".format(len(ms_train_df)))
ms_train_df.to_csv('data/mortality_salience-train.csv')
print("mortality_salience in training: {}".format(len(ms_train_df[ms_train_df['mortality_salience'] == 1])))
print("non-mortality_salience in training: {}".format(len(ms_train_df[ms_train_df['mortality_salience'] == 0])))

# train test split, stratified on the label with a fixed seed.
# NOTE(review): X_train/X_test are the full frames and therefore still contain
# the mortality_salience column; presumably rank_clf_methods only reads the
# text and LIWC columns -- confirm, otherwise the label leaks into the features.
X_train, X_test, y_train, y_test = train_test_split(ms_train_df, ms_train_df['mortality_salience'],
                                                    stratify=ms_train_df['mortality_salience'],
                                                    random_state=69)
print("mortality_salience in X_train: {}".format(len(X_train[X_train['mortality_salience'] == 1])))
print("non-mortality_salience in X_train: {}".format(len(X_train[X_train['mortality_salience'] == 0])))
print("mortality_salience in X_test: {}".format(len(X_test[X_test['mortality_salience'] == 1])))
print("non-mortality_salience in X_test: {}".format(len(X_test[X_test['mortality_salience'] == 0])))

# rank every tokenizer/vectorizer/classifier combination on this split
clfs = rank_clf_methods(tokenizers, vectorizers, classifiers, X_train, y_train,
                        X_test, y_test)

labelled texas data: 424
merged mortality texas and hi data: 912
mortality_salience in merged: 912
non-mortality_salience in merged: 0
labelled control data: 912
labelled training data: 1824
mortality_salience in training: 912
non-mortality_salience in training: 912
mortality_salience in X_train: 684
non-mortality_salience in X_train: 684
mortality_salience in X_test: 228
non-mortality_salience in X_test: 228
################################################################################
tok: tt
vec: countvec
clf: logreg
liwc: False
binary: False
ngram: 1
training shape after vectorization: (1368, 4507)
training shape after feature selection: (1368, 40)
cv_scores: [ 0.84782609  0.81884058  0.87681159  0.8115942   0.81617647  0.86764706
  0.80147059  0.81617647  0.77941176  0.82352941]
cv_scores mean: 0.8259484228473999
testing shape after vectorization: (456, 4507)
testing shape after feature_selection: (456, 40)
y_test: id
236                   1
261                   1
9697874368374

training shape after vectorization: (1368, 4507)
training shape after feature selection: (1368, 36)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 54)
cv_scores: [ 0.87681159  0.8115942   0.83333333  0.8115942   0.77205882  0.85294118
  0.77205882  0.81617647  0.80147059  0.82352941]
cv_scores mean: 0.817156862745098
testing shape after vectorization: (456, 4507)
testing shape after feature_selection: (456, 36)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
9697894786497

training shape after vectorization: (1368, 4507)
training shape after feature selection: (1368, 36)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 54)
cv_scores: [ 0.84782609  0.76086957  0.85507246  0.78985507  0.80882353  0.86764706
  0.80882353  0.80882353  0.79411765  0.80882353]
cv_scores mean: 0.8150682011935209
testing shape after vectorization: (456, 4507)
testing shape after feature_selection: (456, 36)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 4507)
training shape after feature selection: (1368, 36)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 54)
cv_scores: [ 0.80434783  0.72463768  0.77536232  0.76811594  0.76470588  0.83088235
  0.75        0.70588235  0.73529412  0.80147059]
cv_scores mean: 0.766069906223359
testing shape after vectorization: (456, 4507)
testing shape after feature_selection: (456, 36)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
9697894786497

training shape after vectorization: (1368, 4507)
training shape after feature selection: (1368, 36)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 54)
cv_scores: [ 0.79710145  0.76086957  0.79710145  0.7826087   0.74264706  0.83823529
  0.76470588  0.80147059  0.77941176  0.76470588]
cv_scores mean: 0.7828857630008526
testing shape after vectorization: (456, 4507)
testing shape after feature_selection: (456, 36)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1034)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.8115942   0.75362319  0.74637681  0.77536232  0.73529412  0.77205882
  0.77941176  0.76470588  0.76470588  0.76470588]
cv_scores mean: 0.7667838874680306
testing shape after vectorization: (456, 1034)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1034)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.84782609  0.75362319  0.76086957  0.76086957  0.75735294  0.76470588
  0.79411765  0.77941176  0.75        0.80882353]
cv_scores mean: 0.7777600170502984
testing shape after vectorization: (456, 1034)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1034)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.79710145  0.68115942  0.74637681  0.75362319  0.75        0.77205882
  0.72794118  0.69117647  0.71323529  0.75      ]
cv_scores mean: 0.7382672634271099
testing shape after vectorization: (456, 1034)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1034)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.79710145  0.71014493  0.72463768  0.76086957  0.69852941  0.73529412
  0.72058824  0.73529412  0.67647059  0.71323529]
cv_scores mean: 0.7272165387894287
testing shape after vectorization: (456, 1034)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 6194)
training shape after feature selection: (1368, 52)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 70)
cv_scores: [ 0.88405797  0.8115942   0.83333333  0.87681159  0.81617647  0.875
  0.83823529  0.84558824  0.86029412  0.90441176]
cv_scores mean: 0.8545502983802218
testing shape after vectorization: (456, 6194)
testing shape after feature_selection: (456, 52)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
96978947864979046

training shape after vectorization: (1368, 6194)
training shape after feature selection: (1368, 52)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 70)
cv_scores: [ 0.86956522  0.8115942   0.84057971  0.87681159  0.83823529  0.85294118
  0.82352941  0.83823529  0.86029412  0.90441176]
cv_scores mean: 0.851619778346121
testing shape after vectorization: (456, 6194)
testing shape after feature_selection: (456, 52)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
9697894786497

training shape after vectorization: (1368, 6194)
training shape after feature selection: (1368, 52)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 70)
cv_scores: [ 0.80434783  0.73188406  0.75362319  0.80434783  0.78676471  0.82352941
  0.73529412  0.72794118  0.72794118  0.77941176]
cv_scores mean: 0.7675085251491901
testing shape after vectorization: (456, 6194)
testing shape after feature_selection: (456, 52)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 6194)
training shape after feature selection: (1368, 52)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 70)
cv_scores: [ 0.81884058  0.74637681  0.84057971  0.84057971  0.79411765  0.82352941
  0.80147059  0.80147059  0.83088235  0.80882353]
cv_scores mean: 0.8106670929241263
testing shape after vectorization: (456, 6194)
testing shape after feature_selection: (456, 52)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1375)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.84782609  0.74637681  0.76811594  0.84782609  0.76470588  0.82352941
  0.75735294  0.77205882  0.76470588  0.80882353]
cv_scores mean: 0.7901321398124467
testing shape after vectorization: (456, 1375)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1375)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.86956522  0.7826087   0.79710145  0.82608696  0.78676471  0.80882353
  0.77941176  0.81617647  0.80147059  0.83823529]
cv_scores mean: 0.8106244671781757
testing shape after vectorization: (456, 1375)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1375)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.79710145  0.71014493  0.73913043  0.75362319  0.79411765  0.79411765
  0.71323529  0.71323529  0.74264706  0.78676471]
cv_scores mean: 0.7544117647058824
testing shape after vectorization: (456, 1375)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1375)
training shape after feature selection: (1368, 11)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 29)
cv_scores: [ 0.78985507  0.66666667  0.7173913   0.7826087   0.69117647  0.71323529
  0.66911765  0.70588235  0.68382353  0.71323529]
cv_scores mean: 0.7132992327365728
testing shape after vectorization: (456, 1375)
testing shape after feature_selection: (456, 11)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 3793)
training shape after feature selection: (1368, 40)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 58)
cv_scores: [ 0.88405797  0.83333333  0.84782609  0.83333333  0.80882353  0.875
  0.82352941  0.83088235  0.83088235  0.82352941]
cv_scores mean: 0.8391197783461211
testing shape after vectorization: (456, 3793)
testing shape after feature_selection: (456, 40)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
96978947864979046

training shape after vectorization: (1368, 3793)
training shape after feature selection: (1368, 40)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 58)
cv_scores: [ 0.86956522  0.79710145  0.84057971  0.84057971  0.82352941  0.88235294
  0.80147059  0.80882353  0.82352941  0.77941176]
cv_scores mean: 0.8266943734015346
testing shape after vectorization: (456, 3793)
testing shape after feature_selection: (456, 40)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 3793)
training shape after feature selection: (1368, 40)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 58)
cv_scores: [ 0.81884058  0.74637681  0.76086957  0.7826087   0.78676471  0.83823529
  0.76470588  0.71323529  0.75735294  0.80147059]
cv_scores mean: 0.7770460358056266
testing shape after vectorization: (456, 3793)
testing shape after feature_selection: (456, 40)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 3793)
training shape after feature selection: (1368, 40)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 58)
cv_scores: [ 0.83333333  0.78985507  0.84782609  0.79710145  0.76470588  0.86764706
  0.78676471  0.80882353  0.78676471  0.80147059]
cv_scores mean: 0.808429241261722
testing shape after vectorization: (456, 3793)
testing shape after feature_selection: (456, 40)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
9697894786497

training shape after vectorization: (1368, 1006)
training shape after feature selection: (1368, 13)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 31)
cv_scores: [ 0.84782609  0.76811594  0.76086957  0.82608696  0.74264706  0.78676471
  0.77941176  0.75735294  0.76470588  0.80882353]
cv_scores mean: 0.7842604433077578
testing shape after vectorization: (456, 1006)
testing shape after feature_selection: (456, 13)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1006)
training shape after feature selection: (1368, 13)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 31)
cv_scores: [ 0.86231884  0.76811594  0.78985507  0.80434783  0.76470588  0.83088235
  0.79411765  0.78676471  0.77941176  0.82352941]
cv_scores mean: 0.8004049445865304
testing shape after vectorization: (456, 1006)
testing shape after feature_selection: (456, 13)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1006)
training shape after feature selection: (1368, 13)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 31)
cv_scores: [ 0.8115942   0.73188406  0.73913043  0.76086957  0.75735294  0.77205882
  0.75        0.69117647  0.69852941  0.75735294]
cv_scores mean: 0.7469948849104859
testing shape after vectorization: (456, 1006)
testing shape after feature_selection: (456, 13)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

training shape after vectorization: (1368, 1006)
training shape after feature selection: (1368, 13)
X.shape: (1368, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [1 0 1 ..., 1 1 1]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
training shape after LIWC features added: (1368, 31)
cv_scores: [ 0.7826087   0.70289855  0.70289855  0.76811594  0.68382353  0.75735294
  0.72058824  0.75        0.70588235  0.73529412]
cv_scores mean: 0.7309462915601024
testing shape after vectorization: (456, 1006)
testing shape after feature_selection: (456, 13)
y_test: id
236                   1
261                   1
969787436837437442    0
952245036913065984    1
970004060177747968    0
969871974083190784    0
820195274572304385    0
128                   1
969995458146807808    0
969774081900883968    0
952241476636884992    1
952242795510665216    1
969785785208602625    0
820375725609844736    0
325                   1
970048716122943488    0
85                    1
969789478649

In [18]:
# Show the configuration held in the first entry of clfs (expected to be the
# best-scoring one -- assumes clfs was ordered by score in an earlier cell).
top_clf = clfs[0][1]
print(top_clf)

# Run every configuration in clfs against the US tweets: train on the merged
# mortality-salience frame (Texas death row statements + HI crisis tweets +
# control), predict a mortality_salience label for each US tweet, and keep
# the earlier evaluation score next to the number of positive predictions so
# the runs can later be ranked by either value.
# NOTE(review): classifiers[...] is indexed by name on every pass; if it holds
# shared estimator objects that analyze() fits in place, the 'classifier'
# stored below could be refit by a later iteration -- confirm.
ms_clfs = []
for eval_score, info in clfs:
    # The vectorizer is built from the factory, tokenizer, n-gram setting and
    # binary flag named in this configuration. The sixth argument is None
    # (presumably because the US tweets carry no labels -- confirm against
    # analyze()). An `ms_test=True` keyword was tried here and left disabled.
    v, clf, y_us_pred, score, vocab_weights = analyze(
        vectorizers[info['vec']](tokenizers[info['tok']],
                                 info['ngram'],
                                 info['binary']),
        classifiers[info['clf']],
        ms_train_df,
        ms_train_df['mortality_salience'],
        us_df,
        None,
        do_liwc=info['liwc'])

    print("y_us_pred: {}".format(y_us_pred))
    print("total us tweets: {}".format(len(y_us_pred)))

    # Summing the predictions gives the number of tweets flagged.
    ms_count = np.sum(y_us_pred)
    print("predicted mortality_salience in US: {}".format(ms_count))

    # Record layout: [positive count, evaluation score, details]; the two
    # leading slots are what the later cells sort on.
    ms_clfs.append([
        ms_count,
        eval_score,
        dict(info=info,
             vectorizer=v,
             classifier=clf,
             y_pred=y_us_pred,
             eval_score=eval_score,
             count=ms_count,
             vocab_weights=vocab_weights)])

{'id': 'tt_countvec_logreg_liwc-True_binary-True_ngram-1', 'score': 0.82151315789473678, 'vocab_weights': [('happening', 0.039561016299611514), ('know', 0.053859818719145837), ('thank', 0.058945390206538634), ('goodbye', 0.083680337650619357), ('going', 0.11197164540868451), ('tell', 0.1146033050420473), ('s', 0.12473534412692705), ('did', 0.12663396924233322), ('missle', 0.13050034939511518), ('just', 0.13911792376950285), ('hawaii', 0.15554901777848207), ('ready', 0.16887303742654342), ('alert', 0.17895087160414916), ('t', 0.1934369811900146), ('got', 0.19748545065836154), ('yes', 0.21657855593937531), ('sorry', 0.21964220929049486), ('threat', 0.22432380571986182), ('warden', 0.22661944762886255), ('real', 0.23864018837863923), ('lord', 0.29698240699705686), ('die', 0.32553934562074721), ('m', 0.32843081635193982), ('say', 0.36805175132477153), ('sirens', 0.40004119874428956), ('fuck', 0.4075804658808741), ('safe', 0.46790281173851539), ('family', 0.53845432907905089), ('love', 0.54

cv_scores: [ 0.94021739  0.92934783  0.91208791  0.87912088  0.81868132  0.75824176
  0.80769231  0.76923077  0.78571429  0.84065934]
cv_scores mean: 0.8440993788819876
testing shape after vectorization: (7584, 7296)
testing shape after feature_selection: (7584, 64)
X.shape: (7584, 18)
X: [[ 11.11   0.     0.   ...,   0.     0.     0.  ]
 [  0.     5.26   0.   ...,   0.     0.     0.  ]
 [  0.     0.     4.55 ...,   0.     0.     0.  ]
 ..., 
 [  4.55   0.     4.55 ...,   0.     0.     0.  ]
 [ 19.05   0.     0.   ...,   0.     0.     0.  ]
 [ 11.11   0.     0.   ...,   0.     0.     0.  ]]
testing shape after LIWC features added: (7584, 82)
1037
y_us_pred: [0 1 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1037
training shape after vectorization: (1824, 7296)
training shape after feature selection: (1824, 64)
X.shape: (1824, 18)
X: [[ 14.56   0.     7.59 ...,   8.86   4.43   1.27]
 [ 10.87   0.     6.52 ...,   2.17   0.     0.  ]
 [ 15.38   0.     7.69 ...,  

cv_scores: [ 0.91304348  0.96195652  0.94505495  0.8956044   0.84615385  0.71978022
  0.8021978   0.78571429  0.7967033   0.86263736]
cv_scores mean: 0.8528846153846154
testing shape after vectorization: (7584, 4401)
testing shape after feature_selection: (7584, 57)
X.shape: (7584, 18)
X: [[ 11.11   0.     0.   ...,   0.     0.     0.  ]
 [  0.     5.26   0.   ...,   0.     0.     0.  ]
 [  0.     0.     4.55 ...,   0.     0.     0.  ]
 ..., 
 [  4.55   0.     4.55 ...,   0.     0.     0.  ]
 [ 19.05   0.     0.   ...,   0.     0.     0.  ]
 [ 11.11   0.     0.   ...,   0.     0.     0.  ]]
testing shape after LIWC features added: (7584, 75)
901
y_us_pred: [0 0 0 ..., 0 0 1]
total us tweets: 7584
predicted mortality_salience in US: 901
training shape after vectorization: (1824, 5268)
training shape after feature selection: (1824, 55)
cv_scores: [ 0.95652174  0.96195652  0.97802198  0.92857143  0.78021978  0.63736264
  0.6978022   0.68681319  0.68131868  0.74725275]
cv_scores mean: 0.80

testing shape after vectorization: (7584, 4401)
testing shape after feature_selection: (7584, 53)
X.shape: (7584, 18)
X: [[1 0 0 ..., 0 0 0]
 [0 1 0 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 ..., 
 [1 0 1 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]
 [1 0 0 ..., 0 0 0]]
testing shape after LIWC features added: (7584, 71)
675
y_us_pred: [0 1 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 675
training shape after vectorization: (1824, 7296)
training shape after feature selection: (1824, 64)
cv_scores: [ 0.88586957  0.91847826  0.92307692  0.87362637  0.79120879  0.73076923
  0.78021978  0.74725275  0.73626374  0.84065934]
cv_scores mean: 0.8227424749163881
testing shape after vectorization: (7584, 7296)
testing shape after feature_selection: (7584, 64)
1013
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1013
training shape after vectorization: (1824, 5268)
training shape after feature selection: (1824, 55)
X.shape: (1824, 18)
X: [[1 0 1 ..., 1 

training shape after vectorization: (1824, 7296)
training shape after feature selection: (1824, 64)
X.shape: (1824, 18)
X: [[ 14.56   0.     7.59 ...,   8.86   4.43   1.27]
 [ 10.87   0.     6.52 ...,   2.17   0.     0.  ]
 [ 15.38   0.     7.69 ...,   7.69   7.69   0.  ]
 ..., 
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]
 [  0.     0.     0.   ...,   0.     0.     0.  ]]
training shape after LIWC features added: (1824, 82)
cv_scores: [ 0.91847826  0.92391304  0.9010989   0.91758242  0.6978022   0.57692308
  0.59340659  0.59340659  0.65384615  0.66483516]
cv_scores mean: 0.7441292403248926
testing shape after vectorization: (7584, 7296)
testing shape after feature_selection: (7584, 64)
X.shape: (7584, 18)
X: [[ 11.11   0.     0.   ...,   0.     0.     0.  ]
 [  0.     5.26   0.   ...,   0.     0.     0.  ]
 [  0.     0.     4.55 ...,   0.     0.     0.  ]
 ..., 
 [  4.55   0.     4.55 ...,   0.     0.     0.  ]
 [ 19.05   0.     0

testing shape after vectorization: (7584, 5268)
testing shape after feature_selection: (7584, 55)
1020
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1020
training shape after vectorization: (1824, 4401)
training shape after feature selection: (1824, 53)
cv_scores: [ 0.94565217  0.90217391  0.8956044   0.90659341  0.72527473  0.64835165
  0.73076923  0.66483516  0.67032967  0.71978022]
cv_scores mean: 0.7809364548494984
testing shape after vectorization: (7584, 4401)
testing shape after feature_selection: (7584, 53)
1103
y_us_pred: [0 0 0 ..., 0 1 1]
total us tweets: 7584
predicted mortality_salience in US: 1103
training shape after vectorization: (1824, 1308)
training shape after feature selection: (1824, 26)
cv_scores: [ 0.92391304  0.91847826  0.91758242  0.84615385  0.76373626  0.66483516
  0.76923077  0.68681319  0.73076923  0.79120879]
cv_scores mean: 0.8012720974677496
testing shape after vectorization: (7584, 1308)
testing shape after fe

1107
y_us_pred: [0 0 1 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1107
training shape after vectorization: (1824, 1249)
training shape after feature selection: (1824, 25)
cv_scores: [ 0.91304348  0.96195652  0.92857143  0.85714286  0.75824176  0.64285714
  0.71428571  0.6978022   0.71428571  0.73076923]
cv_scores mean: 0.7918956043956044
testing shape after vectorization: (7584, 1249)
testing shape after feature_selection: (7584, 25)
479
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 479
training shape after vectorization: (1824, 1705)
training shape after feature selection: (1824, 20)
cv_scores: [ 0.85869565  0.91847826  0.92307692  0.80769231  0.75274725  0.71978022
  0.73626374  0.71978022  0.73076923  0.73076923]
cv_scores mean: 0.78980530339226
testing shape after vectorization: (7584, 1705)
testing shape after feature_selection: (7584, 20)
1366
y_us_pred: [0 0 0 ..., 1 0 0]
total us tweets: 7584
predicted mortali

training shape after vectorization: (1824, 1249)
training shape after feature selection: (1824, 26)
cv_scores: [ 0.92934783  0.96195652  0.93406593  0.87362637  0.75824176  0.64285714
  0.70879121  0.7032967   0.71978022  0.73076923]
cv_scores mean: 0.7962732919254658
testing shape after vectorization: (7584, 1249)
testing shape after feature_selection: (7584, 26)
452
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 452
training shape after vectorization: (1824, 1705)
training shape after feature selection: (1824, 17)
cv_scores: [ 0.79347826  0.85326087  0.79120879  0.75274725  0.70879121  0.68131868
  0.70879121  0.68681319  0.71428571  0.74725275]
cv_scores mean: 0.7437947921643573
testing shape after vectorization: (7584, 1705)
testing shape after feature_selection: (7584, 17)
1076
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1076
training shape after vectorization: (1824, 1249)
training shape after fe

training shape after vectorization: (1824, 1308)
training shape after feature selection: (1824, 26)
cv_scores: [ 0.89673913  0.88043478  0.88461538  0.76923077  0.70879121  0.60989011
  0.69230769  0.65934066  0.64285714  0.6978022 ]
cv_scores mean: 0.7442009077878644
testing shape after vectorization: (7584, 1308)
testing shape after feature_selection: (7584, 26)
401
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 401
training shape after vectorization: (1824, 1705)
training shape after feature selection: (1824, 20)
cv_scores: [ 0.82065217  0.83152174  0.76373626  0.74725275  0.6978022   0.64835165
  0.66483516  0.64285714  0.71428571  0.73626374]
cv_scores mean: 0.7267558528428094
testing shape after vectorization: (7584, 1705)
testing shape after feature_selection: (7584, 20)
1344
y_us_pred: [0 0 0 ..., 0 0 0]
total us tweets: 7584
predicted mortality_salience in US: 1344
training shape after vectorization: (1824, 1249)
training shape after fe

In [20]:
# Rank the runs by how many US tweets they flagged, largest count first.
# Each ms_clfs entry is [count, eval_score, details], so slot 0 is the key.
ms_clfs_by_count = sorted(ms_clfs, key=lambda run: run[0], reverse=True)

# Pull the headline numbers and predictions out of the top-ranked run.
hc_count, hc_score = ms_clfs_by_count[0][:2]
hc_info = ms_clfs_by_count[0][2]['info']
hc_pred = ms_clfs_by_count[0][2]['y_pred']
print(hc_count, hc_score, hc_info)

# Attach that run's labels to us_df (the column is written in place) and
# export only the tweets predicted as mortality-salient.
us_df['mortality_salience'] = hc_pred
us_df.loc[us_df['mortality_salience'] == 1].to_csv(
    'data/ms_tweets-predicted_mortality_salience-hicount.csv')

2103 0.666055835937 {'id': 'pos_countvec_nb_mult_liwc-False_binary-False_ngram-1', 'score': 0.66605583593687423, 'vocab_weights': [('NN_lord', 0.0043734021248437475), ('RB_not', 0.0050007734981160545), ('NN_alert', 0.011268975379428687), ('RB_just', 0.029945832125124083), ('NN_hawaii', 0.030304083549766308), ('PRP_it', 0.031541456413591802), ('NN_anything', 0.033589239121869383), ('RB_here', 0.036359161643111412), ('PRP_me', 0.045201540886076086), ('CC_and', 0.045818556755077018), ('NN_fuck', 0.048498206816498808), ('IN_on', 0.048629890891974586), ('DT_no', 0.051333210294108471), ('TO_to', 0.064765223545754008), ('JJ_ballistic', 0.071154383569841856), ('VBP_am', 0.07299148929584473), ('VBP_have', 0.099252171130173261), ('NN_i', 0.10761939084454281), ('VBZ_is', 0.12677064990704504), ('JJ_i', 0.13780788204998629), ('PRP_we', 0.13849926643534435), ('VBD_got', 0.16506407610174278), ('NN_missle', 0.16690750073278074), ('NN_family', 0.24710756390006267), ('NN_god', 0.26044321031895795), ('JJ

In [21]:
# Rank the runs by their evaluation score, best first.
# Each ms_clfs entry is [count, eval_score, details], so slot 1 is the key.
ms_clfs_by_score = sorted(ms_clfs, key=lambda run: run[1], reverse=True)

# Pull the headline numbers and predictions out of the top-ranked run.
hs_count, hs_score = ms_clfs_by_score[0][:2]
hs_info = ms_clfs_by_score[0][2]['info']
hs_pred = ms_clfs_by_score[0][2]['y_pred']
print(hs_count, hs_score, hs_info)

# Replace us_df's mortality_salience column (written in place) with this
# run's labels and export only the tweets predicted as mortality-salient.
us_df['mortality_salience'] = hs_pred
us_df.loc[us_df['mortality_salience'] == 1].to_csv(
    'data/ms_tweets-predicted_mortality_salience-hiscore.csv')

806 0.821513157895 {'id': 'tt_countvec_logreg_liwc-True_binary-True_ngram-1', 'score': 0.82151315789473678, 'vocab_weights': [('happening', 0.039561016299611514), ('know', 0.053859818719145837), ('thank', 0.058945390206538634), ('goodbye', 0.083680337650619357), ('going', 0.11197164540868451), ('tell', 0.1146033050420473), ('s', 0.12473534412692705), ('did', 0.12663396924233322), ('missle', 0.13050034939511518), ('just', 0.13911792376950285), ('hawaii', 0.15554901777848207), ('ready', 0.16887303742654342), ('alert', 0.17895087160414916), ('t', 0.1934369811900146), ('got', 0.19748545065836154), ('yes', 0.21657855593937531), ('sorry', 0.21964220929049486), ('threat', 0.22432380571986182), ('warden', 0.22661944762886255), ('real', 0.23864018837863923), ('lord', 0.29698240699705686), ('die', 0.32553934562074721), ('m', 0.32843081635193982), ('say', 0.36805175132477153), ('sirens', 0.40004119874428956), ('fuck', 0.4075804658808741), ('safe', 0.46790281173851539), ('family', 0.53845432907905

### Manual verification of predicted results
Next, manually verify the predicted `mortality_salience` field in "ms_tweets-predicted_mortality_salience-hiscore.csv" (written to the `data/` directory by the previous cell) and correct any labels as necessary. The verified tweets will be included in the results used for statistical analysis. Save the corrected file as "ms_tweets-predicted_mortality_salience-hiscore-verified.csv".