## EIOS articles

In [1]:
# 79 characters:
# -----------------------------------------------------------------------------
# 72 characters (docstrings or comments):
# ----------------------------------------------------------------------

In [2]:
import pandas as pd
pd.options.display.max_colwidth = 1000
import numpy as np
import altair as alt
alt.renderers.set_embed_options(theme='dark')
# alt.data_transformers.enable('json')
alt.data_transformers.disable_max_rows()
import re
import random
import pickle
import datetime
import os
from os.path import isfile
from lxml import etree
import json
from langdetect import detect, detect_langs
import gensim
import nltk
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
# nltk.download('wordnet')
# nltk.download('punkt')
from gensim.models.phrases import Phrases, Phraser
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec 
from gensim.models.ldamodel import LdaModel
from gensim.matutils import Sparse2Corpus
from textblob import TextBlob
# e.g. in conda console: python -m textblob.download_corpora lite
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler, ADASYN
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score,
                             roc_curve, precision_recall_curve, accuracy_score,
                             precision_score, recall_score, f1_score, matthews_corrcoef,
                             balanced_accuracy_score)
from imblearn.metrics import geometric_mean_score, make_index_balanced_accuracy



In [3]:
# Parameters
# Pipeline: one boolean per stage of the notebook. The cells visible
# further down are guarded by "if <flag>:", presumably all stages are.
get_id_url = False
clean_eios_id_url = False
load_signals_label = False
compare_signals_eios = False
label_eios = False
find_signals_to_remove = False
explore_mismatch = False
sample_eios = False
tokenize_eios = False
train_trigrams = False
compute_tfidf = False
compute_w2v = False
find_empty_w2v = False
topic_modeling = False
sentiment_analysis = False
plot_sentiment = False
perform_tsne = False
plot_tsne = False
build_train_test_sets = False
train_classification_models = False
compute_scores = False
plot_write_scores = True
# NOTE(review): "ouput" is misspelled; the name is kept unchanged because
# other cells may refer to it.
overview_ouput = True

# Options
load_w2v_from_bin = False

output_dir = 'out'
# Take the time stamp only once, so that the message log and the warning
# log of one run always share the same prefix (two separate calls to
# now() could fall on different seconds).
log_timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
log_path = output_dir + '/log/log-' + log_timestamp + '-message.txt'
warning_path = output_dir + '/log/log-' + log_timestamp + '-warning.txt'

keep_boards = ['Priority Diseases Global', 'Priority Sources']
eios_data_path = {'json':'../data/eios_articles/priority_boards_en_201711_201908/json',
                  'xml':('../data/eios_articles/'
                         + 'en_onlyfor Stephane 2018 July to August')}
eios_data_type = 'json'
eiosid_to_remove_manually = [42941746]
filter_text_n_letters = 30 # articles have to have at least filter_text_n_letters
                           # latin letters
filter_word_length = 1 # tokens have to be strictly longer than filter_word_length
read_eios_from_date = pd.Timestamp(2017, 11, 1, 0)
read_eios_to_date = pd.Timestamp(2019, 8, 31, 23)
nosignal_sample_frac = 0.1
nosignal_sample_seed = 42

ngram_range_tfidf = (1,1)
limit_load_w2v = None # None to load all embeddings (~3M), 500000 considered OK

vectorization_methods = ['tfidf', 'tfidf_dr', 'w2v']
standardizing = ['no_st','stand']
n_components_dim_reduction = 300
n_topics = 20
chunksize_topics = 10000
upsampling_methods = ['no_us','duplicate','adasyn']
classification_methods = ['complement_naive_bayes','logistic_regression',
                          'random_forest','multilayer_perceptron','svm_rbf']
max_iter_lr = 10000
randomforest_n_estimators = 100
max_iter_mlp = 500
mlp_lsize = (100,)

scores_list = ['accuracy','precision','recall','specificity','f1','mcc','ba',
               'geom_mean','iba_gm']
n_thresholds = 1000
alpha_iba = 0.1
recall_target = 0.9

In [4]:
# Generic functions
def file_name_date(from_date, to_date):
    """Return the tag '<from>_<to>' used in file names.

    Both dates are formatted as YYYYMMDDHH (year, month, day, hour).
    """
    stamp_format = '%Y%m%d%H'
    return '_'.join(date.strftime(stamp_format)
                    for date in (from_date, to_date))

def write_log(log_p, message):
    """Append message, prefixed by the current date and time, to log_p.

    Nothing is written when log_p is None. Always returns None.
    """
    if log_p is None:
        return None
    with open(log_p, 'a+') as log_file:
        time_stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_file.write(time_stamp + ' ' + message + '\n')
    return None

def promed_id(url):
    """Return the ProMED ID found at the end of a promedmail.org URL.

    The ID is whatever follows the last '.' of the last '/'-separated
    part of the URL. None is returned when the URL does not start with
    'promedmail.org' or consists of that domain only.
    """
    parts = url.split('/')
    if parts[0] != 'promedmail.org' or url == 'promedmail.org':
        return None
    return parts[-1].split('.')[-1]

def match_urls(url, url_list):
    """Return 1 if url matches an entry of url_list, otherwise 0.

    The default rule is a case-insensitive exact match. It is
    overridden for three domains:
    - promedmail.org: never matches when only the domain is given;
      otherwise URLs are matched by ProMED ID, as they come as
      promedmail.org/direct.php?id=xxxxxxxx.yyyyyyy,
      promedmail.org/post/yyyyyyy or
      promedmail.org/post/xxxxxxxx.yyyyyyy;
    - info.gov.hk: whatever follows the first '?' is ignored;
    - foodnews.com: only the first four '/'-separated parts count.
    """
    domain = url.split('/')[0]

    # Default rule
    lowercase_urls = {candidate.lower() for candidate in url_list}
    is_match = int(url.lower() in lowercase_urls)

    # ProMED
    if url == 'promedmail.org':
        is_match = 0
    url_promed_id = promed_id(url)
    if url_promed_id is not None:
        known_promed_ids = {promed_id(candidate) for candidate in url_list}
        known_promed_ids.discard(None)
        is_match = int(url_promed_id in known_promed_ids)

    # info.gov.hk
    if domain == 'info.gov.hk':
        without_query = {candidate.split('?')[0].lower()
                         for candidate in url_list}
        is_match = int(url.split('?')[0].lower() in without_query)

    # foodnews.com
    if domain == 'foodnews.com':
        leading_parts = {'/'.join(candidate.split('/')[0:4]).lower()
                         for candidate in url_list}
        is_match = int('/'.join(url.split('/')[0:4]).lower()
                       in leading_parts)

    return is_match

In [5]:
# NLP functions
def preprocess_text(text, removedots, codedigits, removenumbers, lowercase,
                    removeaccent, lemm, stem, filter_w):
    """Split text into sentences and each sentence into cleaned tokens.

    Returns a list with one list of tokens per sentence (possibly
    empty), or an empty list when text is None.

    Every token is first stripped of all characters matched by the
    clean-up patterns below. Tokens that are not strictly longer than
    filter_w or that are gensim stop words are dropped. N.B. stop
    words are always removed, in gensim they include numbers written
    in letters: "one", "two", "twenty", ...

    The remaining tokens go through the optional steps, in this order:
    removedots     remove all "."
    codedigits     replace every digit by "#"
    removenumbers  drop tokens made of digits only
    lowercase      convert to lower case
    removeaccent   remove accents (gensim.utils.deaccent)
    lemm           lemmatize as a verb (WordNet)
    stem           stem (English Snowball stemmer)
    Tokens that end up not strictly longer than filter_w are dropped.
    """
    sentence_list = []
    if text is None:
        return sentence_list

    # Raw strings avoid the invalid escape sequences ('\.', '\´', ...)
    # of the previous literals; the patterns themselves are unchanged.
    # NOTE(review): the range "A-z" also covers the ASCII characters
    # [ \ ] ^ _ ` located between "Z" and "a" (^ and ` are removed by
    # the second pattern, the others are kept) -- confirm whether this
    # is intended before narrowing it to "A-Za-z".
    unwanted_chars = re.compile(r'[^A-zÀ-ÿ0-9\.]')
    accent_marks = re.compile(r'[´`\^]')
    digits = re.compile(r'[0-9]')
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    # Build the lemmatizer and stemmer once instead of once per token
    lemmatizer = WordNetLemmatizer() if lemm else None
    stemmer = SnowballStemmer('english') if stem else None

    sentence_tokens = (nltk.tokenize.punkt.PunktSentenceTokenizer()
                       .sentences_from_text(text))
    for sentence in sentence_tokens:
        tokenized_sentence = []
        for token in nltk.word_tokenize(sentence):
            token = unwanted_chars.sub('', token)
            token = accent_marks.sub('', token)
            # If there are no other characters than ".", set token to ""
            if not token.replace('.', ''):
                token = ''
            if len(token) <= filter_w or token.lower() in stop_words:
                continue
            if removedots:
                token = token.replace('.', '')
            if codedigits:
                token = digits.sub('#', token)
            if removenumbers and not digits.sub('', token):
                token = ''
            if lowercase:
                token = token.lower()
            if removeaccent:
                token = gensim.utils.deaccent(token)
            if lemm:
                token = lemmatizer.lemmatize(token, pos='v')
            if stem:
                token = stemmer.stem(token)
            if len(token) > filter_w:
                tokenized_sentence.append(token)
        sentence_list.append(tokenized_sentence)
    return sentence_list

def word_embeddings_mean(wv, token_list):
    """Return the mean embedding of the tokens known to wv.

    The embedding of a token is looked up in wv.vectors_norm at the
    position wv.vocab[token].index; tokens missing from wv.vocab are
    ignored. The mean is returned as a float32 array. If no embeddings
    are found, e.g. because the text is not in English, returns None.

    N.B. mean weighted by tf-idf might be better than simple mean, see:
    https://doi.org/10.1016/j.patrec.2016.06.012
    http://nadbordrozd.github.io/blog/2016/05/20/...
    ...text-classification-with-word2vec/

    NOTE(review): wv is presumably a gensim KeyedVectors object whose
    normalized vectors have already been computed -- confirm in the
    calling cell.
    """
    found = [wv.vectors_norm[wv.vocab[token].index]
             for token in token_list
             if token in wv.vocab]
    if not found:
        return None
    return np.array(found, dtype='float32').mean(axis=0)

In [6]:
# EIOS functions
def _process_eios_field(ct, raw, filter_w):
    """Convert one raw EIOS field into the content type ct."""
    if ct == 'url_pp':
        # For URLs: perform the same operations as for URLs in signals
        # list: remove strings used there for splitting as well as
        # leading and trailing characters (special or punctuation).
        # NOTE(review): the substrings are used as a regex, so their "."
        # matches any character, and they are removed wherever they
        # occur in the URL; left unchanged to stay consistent with the
        # processing of the signals list.
        noninfo_substr = ['http://','https://','www.','www2.','wwwnc.']
        strip_lead_trail = ' /,;.?!<>\r\n'
        content = re.sub('|'.join(noninfo_substr), '', raw)
        return content.strip().strip(strip_lead_trail).strip()
    if ct == 'tokens_simple_pp':
        return preprocess_text(raw, removedots=False, codedigits=True,
                               removenumbers=False, lowercase=False,
                               removeaccent=False, lemm=False, stem=False,
                               filter_w=filter_w)
    if ct == 'tokens_full_pp':
        return preprocess_text(raw, removedots=True, codedigits=False,
                               removenumbers=True, lowercase=True,
                               removeaccent=True, lemm=True, stem=True,
                               filter_w=filter_w)
    if ct == 'date':
        # tz_convert() requires a time zone in the raw date
        return pd.Timestamp(raw).tz_convert('UTC')
    return raw


def _eios_removal_reason(article, warning_text, boards_present, filter_n_l,
                         log_p):
    """Return why a JSON article should be removed, or 'no' to keep it."""
    # A missing (None) full text is handled like an empty one
    full_text = article['fullText'] or ''

    # Articles that don't have full text or at least some latin letters
    if len(re.sub(r'[^A-zÀ-ÿ]', '', full_text)) < filter_n_l:
        write_log(log_p, warning_text + ' as it has no full text content'
                  + ' or less than ' + str(filter_n_l)
                  + ' latin letters!')
        return 'alphabet'

    # Articles that don't belong to at least one of the desired boards
    if boards_present is not None:
        boards = [board['title'] for board in article['relatedBoards']]
        if not any(kb in boards for kb in boards_present):
            write_log(log_p, warning_text
                      + " as it doesn't belong to any of "
                      + 'the desired boards!')
            return 'boards'

    # Articles that are not in English
    try:
        language = detect(re.sub(r'[^A-zÀ-ÿ0-9\ \.\;\,\!\?]', '',
                                 full_text))
    except Exception as e:
        # Best effort: a failed detection is logged and the article is
        # flagged like a non-English one.
        write_log(log_p, warning_text + ': langdetect: ' + str(e))
        language = 'unknown'
    if language != 'en':
        write_log(log_p, warning_text
                  + ' as it is apparently not in English!')
        return 'language'

    return 'no'


def load_process_eios(eios_file, content_type, content_sample, boards_present,
                      filter_w, filter_n_l, log_p):
    """Load and process the desired content from one EIOS file.

    The format is given by the file extension, either JSON or XML.

    Parameters
    eios_file       path of the file
    content_type    list of contents to extract, which become the
                    columns of the result. JSON: 'id', 'title',
                    'url_pp', 'url_full', 'tokens_simple_pp',
                    'tokens_full_pp', 'date'. XML: 'url_pp',
                    'tokens_simple_pp', 'tokens_full_pp', 'date'.
    content_sample  JSON only. None to process all articles, which get
                    an additional column 'remove' equal to 'no',
                    'alphabet' (fewer than filter_n_l latin letters),
                    'boards' (in none of boards_present) or 'language'
                    (not detected as English). Otherwise a data frame
                    with columns 'id' and 'signal': only the articles
                    whose ID appears in it are processed and they get
                    the corresponding 'signal' as additional column.
    boards_present  list of board titles, None to skip the board check
    filter_w        tokens have to be strictly longer than filter_w
    filter_n_l      minimum number of latin letters of the full text
    log_p           path of the warning log, None for no log

    Returns a data frame with one row per processed article.

    Raises ValueError for an unknown content type or file format, or
    if an article ID appears more than once in content_sample.
    """
    rows = []

    file_format = eios_file.split('.')[-1]
    if file_format == 'json':
        known_content = {'id':'eiosId', 'title':'title', 'url_pp':'originalUrl',
                         'url_full':'originalUrl', 'tokens_simple_pp':'fullText',
                         'tokens_full_pp':'fullText', 'date':'fetchDate'}
        if any(ct not in known_content for ct in content_type):
            raise ValueError("load_process_eios: I don't know how to extract "
                             + ', '.join(content_type) + ', I only know '
                             + ', '.join(known_content.keys()) + '.')
        with open(eios_file, encoding='utf8') as ef:
            file_content = json.load(ef)

        for iart, article in enumerate(file_content):
            if content_sample is None:
                # The sample of EIOS articles isn't given: keep all
                # articles and flag those that should be removed.
                warning_text = ('load_process_eios: In file ' + eios_file + ':\n'
                                + 'Removing the article with index ' + str(iart)
                                + ' (eiosId: ' + str(article['eiosId'])
                                + ', date: ' + article['fetchDate'] + ')')
                extra_content = {
                    'remove': _eios_removal_reason(article, warning_text,
                                                   boards_present,
                                                   filter_n_l, log_p)
                }
            else:
                article_id = article[known_content['id']]
                article_signal = content_sample.signal.loc[
                    content_sample.id == article_id
                ]
                if article_signal.shape[0] > 1:
                    raise ValueError('load_process_eios: In file ' + eios_file
                                     + ': More than one article found with'
                                     + ' the article ID ' + str(article_id)
                                     + '!')
                if article_signal.shape[0] == 0:
                    # Article not part of the sample
                    continue
                extra_content = {'signal': int(article_signal.iloc[0])}

            tmp_content = {}
            for ct in content_type:
                tmp_content[ct] = _process_eios_field(
                    ct, article[known_content[ct]], filter_w)
            tmp_content.update(extra_content)
            rows.append(tmp_content)

    elif file_format == 'xml':
        # N.B. The attribute prefix "emm:" is defined in the XML file as
        # "{http://emm.jrc.it}" via 'xmlns:emm="http://emm.jrc.it"' and we
        # have to explicitly replace it (or at least I didn't find how to
        # do it automatically).
        # content_sample, boards_present, filter_n_l and log_p are not
        # used for XML files.
        known_content = {'url_pp':'link', 'tokens_simple_pp':'{http://emm.jrc.it}text',
                         'tokens_full_pp':'{http://emm.jrc.it}text', 'date':'pubDate'}
        if any(ct not in known_content for ct in content_type):
            raise ValueError("load_process_eios: I don't know how to extract "
                             + ', '.join(content_type) + ', I only know '
                             + ', '.join(known_content.keys()) + '.')

        # Read EIOS data
        tree = etree.parse(eios_file)
        root = tree.getroot()

        for item in root[0].iter('item'):
            tmp_content = {}
            for ct in content_type:
                # There should be only one URL, text, date, etc., for
                # each article, but I couldn't find them without
                # iterating on item. None if the element is missing.
                content = None
                for co in item.iter(known_content[ct]):
                    content = _process_eios_field(ct, co.text, filter_w)
                tmp_content[ct] = content
            # One row per article, once all its contents are collected
            rows.append(tmp_content)

    else:
        raise ValueError('load_process_eios: '
                         + "I don't know how to deal with EIOS file format: "
                         + file_format + '.')

    # Build the data frame in one go: DataFrame.append() copied the whole
    # frame for each article and no longer exists in pandas >= 2.0.
    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame(columns = content_type)

def get_eios(content_type, content_sample, data_source_type, time_from, time_to,
             data_path, boards_present, filter_w, filter_n_l, verbose, log_p):
    """Load and process all EIOS files between time_from and time_to.

    It can deal with two source types, which actually contain
    different information:
    - data_source_type = 'json': one file per day, for every day from
      the day of time_from to the day of time_to (included);
    - data_source_type = 'xml': one file per hour of articles, from
      0min 0s of time_from to 59min 59s of time_to (included).
    The files are looked for below data_path[data_source_type]; files
    that are not found are reported in the log log_p and skipped.

    Calls load_process_eios() on each file to load and process the
    desired content; content_type, content_sample, boards_present,
    filter_w, filter_n_l and log_p are passed on to it unchanged.
    If verbose is True, start time, index of each file and end time
    are printed.

    Returns a single data frame with the content of all files.

    Raises ValueError if data_source_type is neither 'json' nor 'xml'.
    """
    file_path = data_path[data_source_type]
    if data_source_type == 'json':
        # Iterate on whole days: without normalizing, the day of time_to
        # was missed when its hour was earlier than the hour of time_from.
        days = pd.date_range(pd.Timestamp(time_from).normalize(),
                             pd.Timestamp(time_to).normalize(), freq = 'D')
        candidate_files = [
            file_path + '/' + day.strftime('%Y-%m') + '/'
            + 'priority_boards_en_' + day.strftime('%Y-%m-%d') + '.json'
            for day in days
        ]

    elif data_source_type == 'xml':
        candidate_files = []
        for timestamp in pd.date_range(time_from, time_to, freq = 'H'):
            day_string = timestamp.strftime('%Y-%m-%d')
            hour_string = '%02d' % timestamp.hour
            candidate_files.append(
                file_path + '/' + timestamp.strftime('%Y%m') + '/' + 'Finder_'
                + day_string + ' ' + hour_string + '_00_00 UTC_'
                + day_string + ' ' + hour_string + '_59_59 UTC_en.xml')

    else:
        raise ValueError("get_eios: I don't know how to find data of type "
                         + data_source_type + '!')

    file_list = []
    for file_name in candidate_files:
        if isfile(file_name):
            file_list.append(file_name)
        else:
            write_log(log_p, 'get_eios: EIOS file ' + file_name + ' not found!')

    if verbose:
        print('Find content:',
              datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
              end = ' ')
    file_frames = []
    for ifile, file in enumerate(file_list):
        if verbose:
            print(ifile, end = ' ')
        file_frames.append(
            load_process_eios(file, content_type, content_sample, boards_present,
                              filter_w, filter_n_l, log_p))
    if verbose:
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    # Concatenate once at the end: DataFrame.append() copied all the data
    # for each file and no longer exists in pandas >= 2.0.
    file_frames = [frame for frame in file_frames if not frame.empty]
    if file_frames:
        eios_data = pd.concat(file_frames, ignore_index=True, sort=False)
    else:
        eios_data = pd.DataFrame(columns = content_type)

    return eios_data

In [7]:
# File paths
# Create the folders the notebook writes to. The log folder is needed as
# well: log_path and warning_path point into output_dir + '/log' and
# write_log() cannot create missing folders.
for _folder in ('pickles', output_dir, output_dir + '/log',
                output_dir + '/sentiment', output_dir + '/tsne'):
    os.makedirs(_folder, exist_ok=True)

# Tags shared by the file names, so that files computed with different
# parameters don't overwrite each other:
# period read, sampling of the no-signal articles, embeddings loaded.
_date_tag = file_name_date(read_eios_from_date, read_eios_to_date)
_sample_tag = str(nosignal_sample_frac) + '_' + str(nosignal_sample_seed)
_w2v_tag = str(limit_load_w2v)
_sample_date_tag = _sample_tag + '-' + _date_tag
_sample_w2v_date_tag = _sample_tag + '-' + _w2v_tag + '-' + _date_tag

eios_id_url_file = 'pickles/eios_id_url-' + _date_tag + '.pickle'
eios_id_url_cleaned_file = ('pickles/eios_id_url_cleaned-' + _date_tag
                            + '.pickle')
signals_data_file = 'pickles/signals_data.pickle'
ebola_alerts_data_file = 'pickles/ebola_alerts_data.pickle'
signals_media_links_file = 'pickles/signals_media_links.pickle'
signals_domains_no_url_in_eios_file = (
    'pickles/signals_domains_no_url_in_eios-' + _date_tag + '.pickle'
)
eios_labels_file = 'pickles/eios_labels-' + _date_tag + '.pickle'
sample_eios_labels_file = ('pickles/sample_eios_labels-' + _sample_date_tag
                           + '.pickle')
eios_tokens_file = 'pickles/eios_tokens-' + _sample_date_tag + '.pickle'
bigram_simple_pp_file = ('pickles/bigram_simple_pp-' + _sample_date_tag
                         + '.pickle')
trigram_simple_pp_file = ('pickles/trigram_simple_pp-' + _sample_date_tag
                          + '.pickle')
bigram_full_pp_file = ('pickles/bigram_full_pp-' + _sample_date_tag
                       + '.pickle')
trigram_full_pp_file = ('pickles/trigram_full_pp-' + _sample_date_tag
                        + '.pickle')
eios_tfidf_file = 'pickles/eios_tfidf-' + _sample_date_tag + '.pickle'
eios_tfidf_dictionary_file = ('pickles/eios_tfidf_dictionary-'
                              + _sample_date_tag + '.pickle')
w2v_pickle_file = 'pickles/w2v_' + _w2v_tag + '.pickle'
eios_w2v_file = ('pickles/eios_w2v-' + _w2v_tag + '-' + _sample_date_tag
                 + '.pickle')
w2v_examples_file = output_dir + '/w2v_examples-' + _w2v_tag + '.txt'
topics_file = 'pickles/topics-' + _sample_date_tag + '.pickle'
topics_text_file = output_dir + '/topics-' + _sample_date_tag + '.txt'
sentiment_file = 'pickles/sentiment-' + _sample_date_tag + '.pickle'
# No file extension for the plot files
sentiment_plot_file = (output_dir + '/sentiment/sentiment_plot-'
                       + _sample_date_tag)
tsne_results_file = ('pickles/tsne_results-' + _sample_w2v_date_tag
                     + '.pickle')
vec_tag = {'tfidf':'tfidf', 'tfidf_dr':'tfidf_dr',
           'w2v': 'w2v_' + _w2v_tag}
tsne_plot_files = {}
for vc_met in ['tfidf','w2v']:
    tsne_plot_files[vc_met] = (output_dir + '/tsne/tsne_signal_plot-'
                               + _sample_tag + '-' + vec_tag[vc_met] + '-'
                               + _date_tag)
trainset_file = 'pickles/trainset-' + _sample_w2v_date_tag + '.pickle'
testset_file = 'pickles/testset-' + _sample_w2v_date_tag + '.pickle'
trained_models_file = ('pickles/trained_models-' + _sample_w2v_date_tag
                       + '.pickle')
scores_file = 'pickles/scores-' + _sample_w2v_date_tag + '.pickle'

In [8]:
# Get EIOS article IDs and URLs
# Reads ID, preprocessed URL and date of all articles of the period and
# saves them to eios_id_url_file.
if get_id_url:
    write_log(log_path, 'Start getting EIOS IDs, URLs and dates')
    # log_p is None: no warning is logged for the files not found and for
    # the articles flagged for removal.
    eios_id_url = get_eios(['id', 'url_pp', 'date'], None,
                           eios_data_type, read_eios_from_date, read_eios_to_date,
                           eios_data_path, keep_boards, filter_word_length,
                           filter_text_n_letters, True, None)
    write_log(log_path, 'Getting EIOS IDs, URLs and dates done')
    # The "with" block closes the file once the data are written
    with open(eios_id_url_file, 'wb') as pickle_file:
        pickle.dump(eios_id_url, pickle_file)
    del eios_id_url

In [9]:
# Find and remove duplicates, keeping the oldest one; remove articles manually
# Reads eios_id_url (from eios_id_url_file if not in memory) and saves the
# result to eios_id_url_cleaned_file.
if clean_eios_id_url:
    if 'eios_id_url' not in globals():
        with open(eios_id_url_file, 'rb') as pickle_file:
            eios_id_url = pickle.load(pickle_file)
    write_log(log_path, 'Number of EIOS URL duplicates = '
              + str(len(eios_id_url.url_pp)-len(eios_id_url.url_pp.unique())))

    # For each URL, keep only the articles having its earliest date
    date_min = eios_id_url.groupby('url_pp').date.transform('min')
    eios_id_url_cleaned = (eios_id_url[eios_id_url.date == date_min]
                           .reset_index(drop=True))

    write_log(log_path, 'Number of EIOS URL duplicates after cleaning by date = '
              + str(len(eios_id_url_cleaned.url_pp)
                    - len(eios_id_url_cleaned.url_pp.unique())))

    # Remove duplicate rows: articles with the same URL and the same date,
    # of which the first one is kept
    eios_id_url_cleaned = (eios_id_url_cleaned
                           .drop_duplicates(subset=['url_pp','date'])
                           .reset_index(drop=True))

    # Manually remove some articles
    eios_id_url_cleaned = (
        eios_id_url_cleaned[~eios_id_url_cleaned.id.isin(eiosid_to_remove_manually)]
        .reset_index(drop=True))

    # Column access with [] as "remove" could be mistaken for a method
    removal = eios_id_url_cleaned['remove']
    write_log(log_path, 'After cleaning EIOS URLs, of overall '
              + str(eios_id_url_cleaned.shape[0]) + ' articles, '
              + str((removal != 'no').sum())
              + ' were flagged for removal ('
              + str((removal == 'alphabet').sum())
              + ' because of alphabet, '
              + str((removal == 'boards').sum())
              + ' because of boards, '
              + str((removal == 'language').sum())
              + ' because of language)')
    with open(eios_id_url_cleaned_file, 'wb') as pickle_file:
        pickle.dump(eios_id_url_cleaned, pickle_file)
    del eios_id_url, eios_id_url_cleaned

In [10]:
# Load signals and Ebola alerts, label if matches EIOS URL
if load_signals_label:
    if 'eios_id_url_cleaned' not in globals():
        # Load the *cleaned* table (the original read the uncleaned
        # eios_id_url_file here, unlike every other cell that restores
        # eios_id_url_cleaned).
        with open(eios_id_url_cleaned_file, 'rb') as in_file:
            eios_id_url_cleaned = pickle.load(in_file)

    with open(signals_data_file, 'rb') as in_file:
        signals_data = pickle.load(in_file)
    with open(ebola_alerts_data_file, 'rb') as in_file:
        ebola_alerts_data = pickle.load(in_file)

    # Stack both sources; pd.concat replaces the deprecated DataFrame.append
    signals_media_links = (pd.concat([signals_data, ebola_alerts_data])
                           .reset_index(drop=True))
    signals_media_links = signals_media_links.drop_duplicates()
    # Flag each signal URL with the result of matching it against the EIOS URLs
    signals_media_links['in_eios'] = [match_urls(url, eios_id_url_cleaned.url_pp)
                                      for url in signals_media_links.url]
    display(signals_media_links.head())
    with open(signals_media_links_file, 'wb') as out_file:
        pickle.dump(signals_media_links, out_file)

    # TODO: clean disease and country; by duplicates, keep oldest
    # (signals_media_links.loc[signals_media_links.url.duplicated(keep=False)]
    #  .sort_values(by='url'))
    # np.sort(signals_media_links.disease.unique())
    # np.sort(signals_media_links.country.unique())
    unique_signals_urls = np.sort(signals_media_links.url.unique())
    write_log(log_path, 'There are ' + str(len(unique_signals_urls))
              + ' unique signal URLs for overall ' + str(signals_media_links.shape[0])
              + ' signals (before cleaning disease and country)')
    del eios_id_url_cleaned

In [11]:
# Compare URLs in signal list and in EIOS
if compare_signals_eios:
    if 'eios_id_url_cleaned' not in globals():
        with open(eios_id_url_cleaned_file, 'rb') as in_file:
            eios_id_url_cleaned = pickle.load(in_file)
    if 'signals_media_links' not in globals():
        with open(signals_media_links_file, 'rb') as in_file:
            signals_media_links = pickle.load(in_file)

    # Only consider signals dated between (first EIOS date + 7 days) and the
    # last EIOS date
    signals_media_links_timerange = (
        signals_media_links.loc[
            (signals_media_links.date >= read_eios_from_date + datetime.timedelta(days=7))
            & (signals_media_links.date <= read_eios_to_date),
            :]
    )
    write_log(log_path, 'Number of signals between first date'
              + ' plus 7 days and last date'
              + ' (date is signal date) = '
              + str(signals_media_links_timerange.shape[0]))
    write_log(log_path,
              'Of those, number that could *not* be matched to EIOS = '
              + str(signals_media_links_timerange.shape[0]
                    - sum(signals_media_links_timerange.in_eios)))

    # Domain = everything before the first '/' of the URL.
    # Computed once per signal and reused below.
    timerange_domains = [url.split('/')[0]
                         for url in signals_media_links_timerange.url]
    signals_domains_timerange = np.sort(np.unique(timerange_domains))
    eios_domains = np.sort(np.unique(
        [url.split('/')[0] for url in eios_id_url_cleaned.url_pp]
    ))

    # Set lookup instead of scanning the whole domain array for every signal
    eios_domain_set = set(eios_domains.tolist())
    domain_in_eios = np.array(
        [dom in eios_domain_set for dom in timerange_domains], dtype=bool)
    url_not_matched = (signals_media_links_timerange.in_eios == 0).values

    signals_domains_not_in_eios = signals_media_links_timerange.loc[~domain_in_eios]
    signals_domains_no_url_in_eios = signals_media_links_timerange.loc[
        domain_in_eios & url_not_matched
    ]
    write_log(log_path,
              'Number of signals which domain is *not* in EIOS = '
              + str(signals_domains_not_in_eios.shape[0]))
    write_log(log_path,
              'Number of signals which domain *is* in EIOS'
              + ' but URLs could *not* be matched = '
              + str(signals_domains_no_url_in_eios.shape[0]))
    write_log(log_path, 'Domains that have signals but are not in EIOS: '
              + ', '.join(np.sort(np.unique(
                  [url.split('/')[0] for url in signals_domains_not_in_eios.url]))))
    display(signals_domains_no_url_in_eios.head())

    with open(signals_domains_no_url_in_eios_file, 'wb') as out_file:
        pickle.dump(signals_domains_no_url_in_eios, out_file)

    del eios_id_url_cleaned

In [12]:
# Labels for articles: 1 if URL in signal list, 0 else
if label_eios:
    if 'eios_id_url_cleaned' not in globals():
        with open(eios_id_url_cleaned_file, 'rb') as in_file:
            eios_id_url_cleaned = pickle.load(in_file)
    # unique_signals_urls is otherwise only created by the signal-loading
    # cell; rebuild it from the saved signals so this cell can run on its own.
    if 'unique_signals_urls' not in globals():
        if 'signals_media_links' not in globals():
            with open(signals_media_links_file, 'rb') as in_file:
                signals_media_links = pickle.load(in_file)
        unique_signals_urls = np.sort(signals_media_links.url.unique())

    eios_labels = eios_id_url_cleaned.copy()
    # One label per article: the result of matching its URL against the
    # signal URLs
    eios_labels['signal'] = [match_urls(url, unique_signals_urls)
                             for url in eios_labels.url_pp]
    display(eios_labels.head())
    with open(eios_labels_file, 'wb') as out_file:
        pickle.dump(eios_labels, out_file)

In [13]:
# Signals flagged for removal
if find_signals_to_remove:
    if 'eios_labels' not in globals():
        with open(eios_labels_file, 'rb') as in_file:
            eios_labels = pickle.load(in_file)
    # Articles matched to a signal that nevertheless carry a removal flag
    eios_labels_signals_remove = (eios_labels.loc[(eios_labels.signal == 1)
                                                 & (eios_labels.remove != 'no')]
                                  .sort_values(by=['remove','url_pp']))
    display(eios_labels_signals_remove)
    removal_reason = eios_labels_signals_remove.remove
    write_log(log_path, 'From '
              + str(sum(eios_labels.signal)) + ' matched signal URLs, '
              + str(eios_labels_signals_remove.shape[0])
              + ' were flagged for removal ('
              + str((removal_reason == 'alphabet').sum())
              + ' because of alphabet, '
              + str((removal_reason == 'boards').sum())
              + ' because of boards, '
              + str((removal_reason == 'language').sum())
              + ' because of language)... But I\'ll keep those not in the desired boards.')
    del eios_labels

In [14]:
# Explore mismatches: Signal URLs not found in the EIOS dataset
# Builds one long text report (url_mismatch_log) and writes it to the log.
if explore_mismatch:
    if 'eios_labels' not in globals():
        with open(eios_labels_file, 'rb') as in_file:
            eios_labels = pickle.load(in_file)
    if 'signals_domains_no_url_in_eios' not in globals():
        with open(signals_domains_no_url_in_eios_file, 'rb') as in_file:
            signals_domains_no_url_in_eios = pickle.load(in_file)

    # domain = part before the first '/', subdirectory = URL without its last
    # path component
    signals_domains_no_url_in_eios = signals_domains_no_url_in_eios.sort_values(by='url')
    signals_domains_no_url_in_eios['domain'] = [url.split('/')[0]
                                                for url in signals_domains_no_url_in_eios.url]
    signals_domains_no_url_in_eios['subdirectory'] = [
        '/'.join(url.split('/')[0:-1]) for url in signals_domains_no_url_in_eios.url
    ]
    # Name the count column explicitly: the default name produced by
    # value_counts().to_frame() differs between pandas versions.
    domain_count = (signals_domains_no_url_in_eios.domain
                    .value_counts(dropna=False).to_frame(name='domain'))
    eios_labels['domain'] = [url.split('/')[0] for url in eios_labels.url_pp]
    eios_labels['subdirectory'] = [
        '/'.join(url.split('/')[0:-1]) for url in eios_labels.url_pp
    ]
    eios_labels_subdirectories = np.sort(eios_labels.subdirectory.unique())

    url_mismatch_log = 'Explore mismatches: Signal URLs not found in the EIOS dataset\n\n'
    url_mismatch_log += ('='*80 + '\n')
    url_mismatch_log += ('Number of signal URLs not in EIOS: '
                         + str(signals_domains_no_url_in_eios.shape[0]) + '\n')
    url_mismatch_log += ('Number of corresponding domains: ' + str(len(domain_count)) + '\n')
    url_mismatch_log += (', '.join([str(dom) + ': ' + str(domain_count.loc[dom].domain)
                                    for dom in domain_count.index]) + '\n')
    url_mismatch_log += ('='*80 + '\n')

    # Inspect 10 selected domains (were those with the most signal URLs
    # before improving the matching function)
    # Value = number of leading '/'-separated components that form the
    # "generic subdirectory" of that domain.
    depth_generic_subdirectory = {
        'promedmail.org':1,'reliefweb.int':2,'moh.gov.sa':5,'g1.globo.com':2,
        'paho.org':2,'reuters.com':2,'timesofindia.indiatimes.com':2,
        'info.gov.hk':3,'angop.ao':5,'foodsafetynews.com':1
    }
    # Unmatched-signal count per selected domain; 0 when a selected domain no
    # longer has any unmatched signal (instead of failing with a KeyError).
    selected_domain_counts = {
        dom: int(domain_count.domain.get(dom, 0))
        for dom in depth_generic_subdirectory
    }
    n_urls_selected_domains = sum(selected_domain_counts.values())
    url_mismatch_log += ('Selected domains: '
                          + ', '.join(depth_generic_subdirectory) + '\n')
    url_mismatch_log += ('Number of signal URLs in those domains: '
                         + str(n_urls_selected_domains)
                         + '\n\n')

    # Result of the manual inspection; ['None'] marks a domain without any
    # retained URL.
    manually_retained_urls = {
        'promedmail.org':['promedmail.org/direct.php?id=20171107.5426321',
                          'promedmail.org/direct.php?id=20171107.5428288',
                          'promedmail.org/direct.php?id=20171107.5429761',
                          'promedmail.org/direct.php?id=20171108.5429481',
                          'promedmail.org/direct.php?id=20171109.5431669',
                          'promedmail.org/direct.php?id=20180828.5989333',
                          'promedmail.org/post/20171109.5433077',
                          'promedmail.org/post/5828155',
                          'promedmail.org/post/6312986'],
        'reliefweb.int':['reliefweb.int/report/kenya/quarantine-tana-delta-after-rift-valley-fever-outbreak',
                         'reliefweb.int/report/palau/dengue-3-outbreak-palau-december-2018-may-2019-report-date-may-28-2019',
                         'reliefweb.int/report/sudan/acute-watery-diarrhoea-dengue-fever-reported-after-rains-sudan-s-blue-nile-red-sea',
                         'reliefweb.int/report/sudan/red-sea-hospital-reports-new-watery-diarrhoea-cases'],
        'moh.gov.sa':['None'],'g1.globo.com':['None'],
        'paho.org':['None'],
        'reuters.com':['reuters.com/article/us-france-babymilk-victims/lactalis-victims-group-says-10-more-babies-have-salmonella-idUSKBN1FF239',
                       'reuters.com/article/us-health-birdflu-japan/japan-reports-first-suspected-bird-flu-case-in-poultry-this-winter-idUSKBN1EZ0R3?feedType=RSS&feedName=healthNews',
                       'reuters.com/article/us-kenya-cholera/cholera-cases-rise-in-kenyas-capital-top-hospital-says-idUSKCN1RS0Y8',
                       'reuters.com/article/us-mideast-crisis-syria-hama/syrian-state-media-says-rebels-shell-village-with-gas-injuring-21-idUSKCN1R40SB',
                       'reuters.com/article/us-nigeria-health-cholera/cholera-outbreak-kills-12-in-northeast-nigeria-idUSKCN1IO11E',
                       'reuters.com/article/us-uganda-congo-refugees/diarrhea-kills-26-congolese-refugees-in-uganda-infects-hundreds-u-n-idUSKCN1G61UI',
                       'reuters.com/article/us-venezuela-malaria/venezuelans-suffer-as-malaria-outbreak-spreads-in-drug-short-nation-idUSKBN1DO1ES'],
        'timesofindia.indiatimes.com':['timesofindia.indiatimes.com/city/bengaluru/three-nipah-cases-suspected-from-bengaluru/articleshow/64374584.cms',
                                       'timesofindia.indiatimes.com/city/kozhikode/west-nile-fever-case-confirmed-in-malappuram/articleshow/68378544.cms',
                                       'timesofindia.indiatimes.com/city/kozhikode/west-nile-fever-case-confirmed-in-malappuram/articleshow/68378544.cms',
                                       'timesofindia.indiatimes.com/city/mumbai/eight-deaths-of-children-in-2-months-aes-suspected/articleshow/70388048.cms',
                                       'timesofindia.indiatimes.com/city/puducherry/another-patient-suspected-of-nipah-infection-admitted-at-jipmer/articleshow/69849226.cms',
                                       'timesofindia.indiatimes.com/city/puducherry/patient-with-suspected-nipah-infection-still-critical/articleshow/69763366.cms'],
        'info.gov.hk':['info.gov.hk/gia/general/201801/16/P2018011600785.htm',
                       'info.gov.hk/gia/general/201801/25/P2018012500863.htm?fontSize=1',
                       'info.gov.hk/gia/general/201802/14/P2018021400759.htm',
                       'info.gov.hk/gia/general/201907/22/P2019072200459.htm?fontSize=1'],
        'angop.ao':['angop.ao/angola/en_us/noticias/sociedade/2018/3/15/Cuanza-Norte-with-dozens-suspected-cases-dengue-fever,7b19294e-388d-4f5b-a7c9-bb17e70ff466.html'],
        'foodsafetynews.com':['foodsafetynews.com/2018/06/salmonella-outbreaks-from-raw-frozen-chicken-not-related/#.WxfJAO6FNhF',
                              'foodsafetynews.com/2018/08/sweden-investigates-nationwide-e-coli-outbreak-source-unknown',
                              'foodsafetynews.com/2018/10/a-dozen-cases-and-two-dead-in-listeria-outbreak-swiss-officials-suspect-food',
                              'foodsafetynews.com/2018/12/denmark-probes-pork-link-in-salmonella-outbreak-strain-is-antibiotic-resistant',
                              'foodsafetynews.com/2019/03/united-nations-food-aid-linked-to-3-deaths-262-poisonings-in-one-week']
    }
    for dom in depth_generic_subdirectory.keys():
        depth = depth_generic_subdirectory[dom]
        url_mismatch_log += ('Number of signal URLs in ' + dom + ' = '
                             + str(selected_domain_counts[dom]) + '\n')
        sig_not_in_eios = signals_domains_no_url_in_eios.loc[
            (signals_domains_no_url_in_eios.domain == dom).values
        ]
        url_mismatch_log += ('    ' + '\n    '.join(sig_not_in_eios.url) + '\n')
        # Truncate every subdirectory to the generic depth of this domain
        subdir = ['/'.join(sb.split('/')[0:depth])
                  for sb in sig_not_in_eios.subdirectory]
        # A set keeps the membership tests below cheap
        subdir_eios = {'/'.join(sb1.split('/')[0:depth])
                       for sb1 in eios_labels_subdirectories}
        url_mismatch_log += ('\nGeneric subdirectories have level '
                             + str(depth-1)
                             + ':' + '\n')
        url_mismatch_log += ('    ' + '\n    '.join(np.unique(subdir)) + '\n\n')
        url_mismatch_log += ('Of those, found in the EIOS dataset:\n')
        url_mismatch_log += ('    '
                             + '\n    '.join(np.unique([sb for sb in subdir
                                                        if sb in subdir_eios]))
                             + '\n\n')
        url_mismatch_log += (
            'Number of URLs with corresponding subdirectory found in the EIOS dataset = '
            + str(sum([sb in subdir_eios for sb in subdir]))
            + '\n'
        )
        url_mismatch_log += (
            'After manual inspection, URLs that are valid (no 404 error, even'
            + ' after removing manually possible tags, e.g.'
            + ' "?utm_medium=email&utm_source=user"),'
            + ' have the subdirectory in the EIOS dataset,'
            + ' are in English and are not PDFs:\n'
        )
        url_mismatch_log += ('    '
                             + '\n    '.join(manually_retained_urls[dom])
                             + '\n\n')
        if manually_retained_urls[dom] != ['None']:
            url_mismatch_log += (
                'These ' + str(len(manually_retained_urls[dom]))
                + ' URLs were presumably not categorized in the desired boards or as English...?\n'
            )
        # Domain-specific remarks from the manual inspection
        if dom == 'reuters.com':
            url_mismatch_log += (
                'There are close matches for two:\n'
                + 'reuters.com/article/us-france-babymilk-victims/lactalis-victims-group-says-10-more-babies-have-salmonella-idUSKBN1FF239'
                + ' has a similar article later in EIOS'
                + ' (same subdir "us-france-babymilk-victims")\n'
                + 'reuters.com/article/us-venezuela-malaria/venezuelans-suffer-as-malaria-outbreak-spreads-in-drug-short-nation-idUSKBN1DO1ES'
                + ' has a different subdirectory ("venezuela-malaria") and a different "id" in EIOS,'
                + ' but the redirects to this one.\n')
            url_mismatch_log += (
                'OK that the first doesn\'t match, for the second one'
                + ' one could write a Reuters-specific matching but not clear'
                + ' it wouldn\'t have side effects.\n')
        elif dom == 'paho.org':
            url_mismatch_log += (
                'All 4 are PDFs... but one could maybe parse'
                + ' the URLs for "ItemId" and other tags and find the page in'
                + ' EIOS that links to the PDF.\n')
        url_mismatch_log += ('='*80 + '\n')

    url_mismatch_log += ('Conclusion: After inspecting 10 selected domains:\n')
    url_mismatch_log += (
        str(sum([len(manually_retained_urls[dom])
                 for dom in manually_retained_urls.keys()
                 if manually_retained_urls[dom] != ['None']]))
        + ' (1 URL is counted twice) out of '
        + str(n_urls_selected_domains)
        + ' "should" have been found in the EIOS dataset, but'
        + ' were presumably either not catagorized in the desired boards or as English.'
        + ' One could maybe find a way to get the 4 paho.org'
        + ' URLs by parsing for tags such as "ItemId". Other than that, the matching'
        + ' function works well (no URL found that should have been matched).\n\n')

    write_log(log_path, url_mismatch_log)
    del eios_labels

In [15]:
# Get all articles that are signals, keep nosignal_sample_frac articles that are not signals
if sample_eios:
    if 'eios_labels' not in globals():
        with open(eios_labels_file, 'rb') as in_file:
            eios_labels = pickle.load(in_file)
    # Signals are kept even when flagged 'boards'; non-signals only when
    # they carry no removal flag at all.
    eios_id_signal_keep = eios_labels.loc[(eios_labels.signal==1)
                                          & (eios_labels.remove.isin(['no','boards']))]
    eios_id_nosignal_keep = eios_labels.loc[(eios_labels.signal==0)
                                            & (eios_labels.remove=='no')]
    # Seeded random subsample of the non-signal articles
    frac_eios_id_nosignal = eios_id_nosignal_keep.sample(frac=nosignal_sample_frac,
                                                    random_state=nosignal_sample_seed)
    # pd.concat replaces the deprecated DataFrame.append (same row order,
    # original index labels kept)
    sample_eios_labels = pd.concat([eios_id_signal_keep, frac_eios_id_nosignal])
    with open(sample_eios_labels_file, 'wb') as out_file:
        pickle.dump(sample_eios_labels, out_file)
    del eios_labels, sample_eios_labels

In [16]:
# Load tokenized EIOS articles
if tokenize_eios:
    if 'sample_eios_labels' not in globals():
        with open(sample_eios_labels_file, 'rb') as in_file:
            sample_eios_labels = pickle.load(in_file)
    write_log(log_path, 'Start tokenizing sample EIOS')
    # Request the id and both token variants for the sampled articles
    eios_tokens = get_eios(['id', 'tokens_simple_pp', 'tokens_full_pp'],
                           sample_eios_labels,
                           eios_data_type, read_eios_from_date, read_eios_to_date,
                           eios_data_path, keep_boards, filter_word_length,
                           filter_text_n_letters, True, None)
    write_log(log_path, 'Tokenizing sample EIOS done')
    with open(eios_tokens_file, 'wb') as out_file:
        pickle.dump(eios_tokens, out_file)
    del sample_eios_labels, eios_tokens

In [17]:
# Find relevant bigrams and trigrams in both lists of tokenized texts
if train_trigrams:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)

    # Examples:
    # trigram_simple_pp[bigram_simple_pp[['human','immunodeficiency','virus']]]
    # > ['human_immunodeficiency_virus']
    # trigram_simple_pp[bigram_simple_pp[['human','immunodeficiency','apple']]]
    # > ['human_immunodeficiency', 'apple']

    # Full preprocessing
    # Each text is a list of sentences; the phrase models are trained on the
    # flat list of sentences, which is built once and reused for both passes.
    sentences_full_pp = [sentence for text in eios_tokens.tokens_full_pp
                         for sentence in text]
    phrases_full_pp = Phrases(sentences_full_pp)
    bigram_full_pp = Phraser(phrases_full_pp)
    # Second pass on bigram-merged sentences yields the trigram model
    phrases_full_pp_bi = Phrases([bigram_full_pp[sentence]
                                  for sentence in sentences_full_pp])
    trigram_full_pp = Phraser(phrases_full_pp_bi)
    bigram_full_pp.save(bigram_full_pp_file)
    trigram_full_pp.save(trigram_full_pp_file)

    # Simple preprocessing
    sentences_simple_pp = [sentence for text in eios_tokens.tokens_simple_pp
                           for sentence in text]
    phrases_simple_pp = Phrases(sentences_simple_pp)
    bigram_simple_pp = Phraser(phrases_simple_pp)
    phrases_simple_pp_bi = Phrases([bigram_simple_pp[sentence]
                                    for sentence in sentences_simple_pp])
    trigram_simple_pp = Phraser(phrases_simple_pp_bi)
    bigram_simple_pp.save(bigram_simple_pp_file)
    trigram_simple_pp.save(trigram_simple_pp_file)
    del (eios_tokens, phrases_full_pp, phrases_simple_pp, bigram_full_pp,
         bigram_simple_pp, phrases_full_pp_bi, phrases_simple_pp_bi, trigram_full_pp,
         trigram_simple_pp, sentences_full_pp, sentences_simple_pp)

In [18]:
# Vectorization: Bag-Of-Words with tf-idf
if compute_tfidf:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    # The saved models are Phraser objects, so load them through the Phraser
    # classmethod (no throwaway Phrases() instance). Each model is checked
    # separately so a missing trigram model is also restored.
    if 'bigram_full_pp' not in globals():
        bigram_full_pp = Phraser.load(bigram_full_pp_file)
    if 'trigram_full_pp' not in globals():
        trigram_full_pp = Phraser.load(trigram_full_pp_file)

    # One flat token list per text (sentences concatenated), with bigrams and
    # then trigrams merged
    text_list_full_pp = []
    for text in eios_tokens.tokens_full_pp:
        sentence_conc = [token for sentence in text for token in sentence]
        text_list_full_pp.append(trigram_full_pp[bigram_full_pp[sentence_conc]])

    tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range_tfidf)
    # The vectorizer expects strings, so tokens are re-joined with spaces
    eios_tfidf = tfidf_vectorizer.fit_transform(
        [' '.join(text) for text in text_list_full_pp])
    eios_tfidf_dictionary = tfidf_vectorizer.vocabulary_
    with open(eios_tfidf_file, 'wb') as out_file:
        pickle.dump(eios_tfidf, out_file)
    with open(eios_tfidf_dictionary_file, 'wb') as out_file:
        pickle.dump(eios_tfidf_dictionary, out_file)
    del (bigram_full_pp, trigram_full_pp, eios_tokens, text_list_full_pp,
         eios_tfidf, eios_tfidf_dictionary)

In [19]:
# Vectorization: Word2Vec
if load_w2v_from_bin:
    # Read the pretrained vectors once from the binary file, normalize them
    # in place and cache the result as a pickle for faster reloading.
    w2v = gensim.models.KeyedVectors.load_word2vec_format(
        '../data/embeddings/GoogleNews-vectors-negative300.bin',
        binary=True, limit=limit_load_w2v)
    w2v.init_sims(replace=True)
    with open(w2v_pickle_file, 'wb') as out_file:
        pickle.dump(w2v, out_file)

if compute_w2v:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    # The saved models are Phraser objects, so load them through the Phraser
    # classmethod; each model is checked separately.
    if 'bigram_simple_pp' not in globals():
        bigram_simple_pp = Phraser.load(bigram_simple_pp_file)
    if 'trigram_simple_pp' not in globals():
        trigram_simple_pp = Phraser.load(trigram_simple_pp_file)

    write_log(log_path, 'Start concatenate sentences and build w2v bigrams')
    # One flat token list per text, with bigrams and then trigrams merged
    text_list_simple_pp = []
    for text in eios_tokens.tokens_simple_pp:
        sentence_conc = [token for sentence in text for token in sentence]
        text_list_simple_pp.append(trigram_simple_pp[bigram_simple_pp[sentence_conc]])
    # Free memory before the embeddings are loaded
    del eios_tokens, bigram_simple_pp, trigram_simple_pp
    write_log(log_path, 'Concatenate sentences and build w2v bigrams done')

    write_log(log_path, 'Start loading w2v from pickle')
    # If the vectors were just read from the binary file they are already in
    # memory
    if not load_w2v_from_bin:
        with open(w2v_pickle_file, 'rb') as in_file:
            w2v = pickle.load(in_file)
    write_log(log_path, 'Loading w2v from pickle done')

    # Give examples of how w2v works
    w2v_examples_string = ''
    for word in ['Ebola','HIV','influenza','H#N#']:
        w2v_examples_string += ('> w2v.vectors_norm[w2v.vocab[\''
                                + word + '\'].index]' + '\n')
        w2v_examples_string += (
            '> ['
            + ', '.join(
                [str(coord)
                 for coord in w2v.vectors_norm[w2v.vocab[word].index][0:10]])
            + ' ... ]'
        )
        w2v_examples_string += '\n\n'
        w2v_examples_string += '> w2v.most_similar(\'' + word + '\')' + '\n'
        w2v_examples_string += '> '+'\n'.join(
            [str(msw) for msw in w2v.most_similar(word)])
        w2v_examples_string += '\n\n'
    with open(w2v_examples_file,'w+') as out_file:
        out_file.write(w2v_examples_string)

    write_log(log_path, 'Start w2v processing')
    # One embedding per text; later cells treat None entries as "no embedding"
    eios_w2v = [word_embeddings_mean(w2v, text) for text in text_list_simple_pp]
    write_log(log_path, 'w2v processing done')
    with open(eios_w2v_file, 'wb') as out_file:
        pickle.dump(eios_w2v, out_file)
    del w2v, eios_w2v, text_list_simple_pp

In [20]:
# Empty w2v
if find_empty_w2v:
    # eios_w2v holds one entry per row of eios_tokens (the sampled articles),
    # so positions of empty embeddings must be looked up positionally in
    # eios_tokens. The original used them as index labels of the full
    # eios_labels table, which is a different set of rows.
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    if 'eios_w2v' not in globals():
        with open(eios_w2v_file, 'rb') as in_file:
            eios_w2v = pickle.load(in_file)
    i_empty = [i for i, embedding in enumerate(eios_w2v) if embedding is None]
    eios_labels_empty_w2v = eios_tokens.iloc[i_empty]
    # Token columns are dropped for display only, to keep the output readable
    display(eios_labels_empty_w2v.drop(
        columns=['tokens_simple_pp', 'tokens_full_pp'], errors='ignore'))
    write_log(log_path, 'There are ' + str(eios_labels_empty_w2v.shape[0])
              + ' articles in the sample without embeddings, of them '
              + str(sum(eios_labels_empty_w2v.signal)) + ' is/are signal/s')
    del eios_tokens, eios_w2v

In [21]:
# Topic modeling
# Fits one LDA model on the tf-idf rows of non-signal articles and one on the
# rows of signal articles, then saves the topics as text and as a pickle.
if topic_modeling:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    if 'eios_tfidf' not in globals():
        with open(eios_tfidf_file, 'rb') as in_file:
            eios_tfidf = pickle.load(in_file)
    if 'eios_tfidf_dictionary' not in globals():
        with open(eios_tfidf_dictionary_file, 'rb') as in_file:
            eios_tfidf_dictionary = pickle.load(in_file)

    write_log(log_path, 'Start topic modeling')
    # The vectorizer vocabulary maps word -> column id; gensim needs id -> word
    gensim_dict_tfidf = dict((i, word) for word, i in eios_tfidf_dictionary.items())
    topics = {}
    topics_string = ''
    for is_signal in [0,1]:
        # Row positions of the articles with this label
        doc_rows = np.flatnonzero((eios_tokens.signal == is_signal).values)
        topics[is_signal] = LdaModel(
            # The tf-idf matrix has one document per ROW, whereas
            # Sparse2Corpus assumes documents in columns by default; without
            # documents_columns=False the model is fit on the transpose.
            corpus=Sparse2Corpus(eios_tfidf[doc_rows], documents_columns=False),
            id2word=gensim_dict_tfidf, num_topics=n_topics, random_state=42, update_every=1,
            chunksize=chunksize_topics, passes=10, alpha='auto', per_word_topics=True
        )
        topics_string += ('Topics for is_signal = '+str(is_signal)+'\n'
                          + '\n'.join([str(top) for top in topics[is_signal].print_topics()])
                          + '\n')
    with open(topics_text_file,'w+') as out_file:
        out_file.write(topics_string)

    write_log(log_path, 'Topic modeling done')
    with open(topics_file, 'wb') as out_file:
        pickle.dump(topics, out_file)
    del eios_tokens, eios_tfidf, eios_tfidf_dictionary

In [22]:
# Sentiment analysis
# N.B. It would be better to keep the punctuation and "I"
if sentiment_analysis:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)

    write_log(log_path, 'Start sentiment analysis')
    # Iterate over the texts directly (the original mixed index labels with
    # positional .iloc access) and collect the scores in lists; the frame is
    # built once instead of with one DataFrame.append per article.
    polarities = []
    subjectivities = []
    for text in eios_tokens.tokens_simple_pp:
        # Each text is a list of sentences; rejoin all tokens into one string
        sentence_conc = [token for sentence in text for token in sentence]
        text_sentiment = TextBlob(' '.join(sentence_conc)).sentiment
        polarities.append(text_sentiment.polarity)
        subjectivities.append(text_sentiment.subjectivity)
    sentiment = pd.DataFrame(
        {'polarity': polarities,
         'subjectivity': subjectivities,
         'signal': eios_tokens.signal.values},
        columns=['polarity','subjectivity','signal'])
    write_log(log_path, 'Sentiment analysis done')
    with open(sentiment_file, 'wb') as out_file:
        pickle.dump(sentiment, out_file)
    del eios_tokens

In [23]:
# Plot sentiment
if plot_sentiment:
    if 'sentiment' not in globals():
        with open(sentiment_file, 'rb') as in_file:
            sentiment = pickle.load(in_file)

    # Two layers: small, faint dots for non-signals and default-sized dots
    # for signals drawn on top
    sentiment_plot = (
        alt.Chart(sentiment.loc[sentiment.signal==0])
        .mark_circle(size=5,opacity=0.2).encode(
            x='subjectivity:Q',
            y='polarity:Q',
            color='signal:N'
        )
        +
        alt.Chart(sentiment.loc[sentiment.signal==1],
                  title = 'sentiment')
        .mark_circle().encode(
            x='subjectivity:Q',
            y='polarity:Q',
            color='signal:N'
        )
    ).interactive()
    sentiment_plot.save(sentiment_plot_file + '.html')

    # Box plots
    # One chart per sentiment measure, split by signal label
    for sent_type in ['polarity','subjectivity']:
        sentiment_boxplot = (alt.Chart(sentiment)
                             .mark_boxplot(outliers=True).encode(
                                 x=sent_type+':Q',
                                 y=alt.Y('signal:N', scale=alt.Scale(rangeStep=75))
                             )).interactive()
        sentiment_boxplot.save(sentiment_plot_file + '-box-' + sent_type + '.html')

In [24]:
# t-SNE of vectorizations
if perform_tsne:
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    if 'eios_tfidf' not in globals():
        with open(eios_tfidf_file, 'rb') as in_file:
            eios_tfidf = pickle.load(in_file)
    if 'eios_w2v' not in globals():
        with open(eios_w2v_file, 'rb') as in_file:
            eios_w2v = pickle.load(in_file)

    eios_vectorizations = {'tfidf': eios_tfidf, 'w2v': eios_w2v}
    tsne_results = {}
    for vc_met in ['tfidf','w2v']:
        data_set = eios_vectorizations[vc_met]
        labels = eios_tokens.signal
        if vc_met == 'w2v':
            # Texts without any embedding are stored as None: drop them and
            # their labels
            i_notempty = [i for i, embedding in enumerate(data_set)
                          if embedding is not None]
            data_set = [data_set[i] for i in i_notempty]
            labels = labels.iloc[i_notempty].reset_index(drop=True)

        # Reduce number of tf-idf dimensions first, so data set is manageable
        # for t-SNE
        if vc_met == 'tfidf':
            # Only log the reduction step when it actually runs (the original
            # also logged "Start dim reduction" for w2v, where nothing is
            # reduced)
            write_log(log_path, 'Start dim reduction / ' + vc_met)
            # Seeded so that reruns give the same projection
            dim_reduction_model = TruncatedSVD(n_components=n_components_dim_reduction,
                                               random_state=42)
            data_set = dim_reduction_model.fit_transform(data_set)
            write_log(log_path, 'Dim reduction done / ' + vc_met)

        write_log(log_path, 'Start t-SNE / ' + vc_met)
        tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300,
                    random_state=42)
        tsne_results_fit = tsne.fit_transform(data_set)
        write_log(log_path, 't-SNE done / ' + vc_met)
        tsne_results_df = pd.DataFrame({'dim1':tsne_results_fit[:,0],
                                        'dim2':tsne_results_fit[:,1],'signal':labels})
        tsne_results[vc_met] = tsne_results_df
    del eios_vectorizations, eios_tokens, eios_tfidf, eios_w2v
    with open(tsne_results_file, 'wb') as out_file:
        pickle.dump(tsne_results, out_file)

In [25]:
# Plot t-SNE
if plot_tsne:
    if 'tsne_results' not in globals():
        with open(tsne_results_file, 'rb') as in_file:
            tsne_results = pickle.load(in_file)

    # One HTML chart per vectorization method
    for vc_met in ['tfidf','w2v']:
        tsne_results_df = tsne_results[vc_met]

        # Non-signals as small, faint dots ...
        tsne_plot_nosignal = (alt.Chart(tsne_results_df.loc[tsne_results_df.signal==0])
                              .mark_circle(size=5,opacity=0.2).encode(
                                  x='dim1',
                                  y='dim2',
                                  color='signal:N'
                              ))
        # ... and signals layered on top with the default mark size
        tsne_plot_signal = (alt.Chart(tsne_results_df.loc[tsne_results_df.signal==1],
                                      title = 'dimension reduction of ' + vc_met + ' (t-SNE)')
                            .mark_circle().encode(
                                x='dim1',
                                y='dim2',
                                color='signal:N'
                            ))
        (tsne_plot_nosignal + tsne_plot_signal).save(tsne_plot_files[vc_met] + '.html')

    del tsne_results

In [26]:
# Build training and test sets
# For every vectorization method: split 80/20, upsample the training part
# only, optionally reduce dimensions ('tfidf_dr') and optionally standardize.
# Results are stored as
#     train_set[vc_met][us_met][st] = {'X': ..., 'y': ...}
#     test_set[vc_met][us_met][st]  = {'X': ..., 'y': ...}
# and pickled to trainset_file / testset_file.
if build_train_test_sets:
    # Reload inputs only when they are not already in memory.
    if 'eios_tokens' not in globals():
        with open(eios_tokens_file, 'rb') as in_file:
            eios_tokens = pickle.load(in_file)
    if 'eios_tfidf' not in globals():
        with open(eios_tfidf_file, 'rb') as in_file:
            eios_tfidf = pickle.load(in_file)
    if 'eios_w2v' not in globals():
        with open(eios_w2v_file, 'rb') as in_file:
            eios_w2v = pickle.load(in_file)

    write_log(log_path, 'Start building train and test sets')
    # 'tfidf_dr' starts from the same tf-idf matrix; the reduction is fitted
    # further below, after upsampling.
    eios_vectorizations = {'tfidf': eios_tfidf, 'tfidf_dr': eios_tfidf,
                           'w2v': eios_w2v}
    train_set = {}
    test_set = {}
    for vc_met in vectorization_methods:
        data_set = eios_vectorizations[vc_met]
        labels = eios_tokens.signal
        if vc_met == 'w2v':
            # w2v: Remove empty data (and the matching labels)
            i_notempty = [i for i in range(len(data_set))
                          if data_set[i] is not None]
            data_set = [data_set[i] for i in i_notempty]
            labels = labels.iloc[i_notempty].reset_index(drop=True)

        X_train, X_test, y_train, y_test = train_test_split(
            data_set, labels, test_size=0.2, random_state=10)
        if sum(y_test) == 0:
            # The scores computed later are meaningless without positives.
            raise ValueError('No positive labels in ' + vc_met + ' test set... '
                             + 'Try another random_state.')
        train_set[vc_met] = {}
        test_set[vc_met] = {}
        # Balance dataset...
        for us_met in upsampling_methods:
            train_set[vc_met][us_met] = {}
            test_set[vc_met][us_met] = {}
            write_log(log_path, 'Upsampling: ' + vc_met + ' / ' + us_met)
            if us_met == 'no_us':
                # ... not
                X_train_us, y_train_us = X_train, y_train
            elif us_met == 'duplicate':
                # ... by duplicating signals
                ros = RandomOverSampler(random_state=42)
                X_train_us, y_train_us = ros.fit_resample(X_train, y_train)
            elif us_met == 'adasyn':
                # ... by generating synthetic signals
                # NOTE(review): no random_state is set here, unlike the other
                # samplers/models, so this step is not reproducible.
                X_train_us, y_train_us = ADASYN().fit_resample(X_train, y_train)
            else:
                # Without this guard the data of the previous iteration would
                # be silently reused under the wrong key.
                raise ValueError('Unknown upsampling method ' + us_met)

            # tfidf_dr = tfidf with dimension reduction, after upsampling
            # also called latent semantic analysis
            if vc_met == 'tfidf_dr':
                dim_reduction_model = (TruncatedSVD(n_components=n_components_dim_reduction,
                                                    random_state=42)
                                       .fit(X_train_us))
                X_train_us_dr = dim_reduction_model.transform(X_train_us)
                X_test_dr = dim_reduction_model.transform(X_test)
            else:
                X_train_us_dr, X_test_dr = X_train_us, X_test

            # Standardize, or not
            for st in standardizing:
                if st == 'stand':
                    write_log(log_path, 'Standardize: ' + vc_met + ' / ' + us_met)
                    # No centering for sparse data
                    with_mean = vc_met != 'tfidf'
                    # The scaler is fitted on the training data only and then
                    # applied to both parts.
                    scaler = preprocessing.StandardScaler(
                        with_mean=with_mean).fit(X_train_us_dr)
                    X_train_us_dr_st = scaler.transform(X_train_us_dr)
                    X_test_us_dr_st = scaler.transform(X_test_dr)
                else:
                    X_train_us_dr_st = X_train_us_dr
                    X_test_us_dr_st = X_test_dr
                train_set[vc_met][us_met][st] = {'X': X_train_us_dr_st, 'y': y_train_us}
                test_set[vc_met][us_met][st] = {'X': X_test_us_dr_st, 'y': y_test}

    write_log(log_path, 'Building train and test sets done')
    del eios_tokens, eios_vectorizations, eios_tfidf, eios_w2v
    with open(trainset_file, 'wb') as out_file:
        pickle.dump(train_set, out_file)
    with open(testset_file, 'wb') as out_file:
        pickle.dump(test_set, out_file)

In [27]:
# Classification: Train
# Fit one classifier per (classifier, vectorization, upsampling,
# standardization) combination and store it in
#     trained_models[cl_met][vc_met][us_met][st]
# Combinations that cannot be trained are stored as None.
if train_classification_models:
    if 'train_set' not in globals():
        with open(trainset_file, 'rb') as in_file:
            train_set = pickle.load(in_file)

    trained_models = {}
    for cl_met in classification_methods:
        trained_models[cl_met] = {}
        for vc_met in vectorization_methods:
            trained_models[cl_met][vc_met] = {}
            for us_met in upsampling_methods:
                trained_models[cl_met][vc_met][us_met] = {}
                for st in standardizing:
                    write_log(log_path, 'Start training ' + cl_met
                              + ' / ' + vc_met + ' / ' + us_met
                              + ' / ' + st)

                    if (cl_met == 'complement_naive_bayes'
                            and vc_met in ('w2v', 'tfidf_dr')):
                        # Complement Naive Bayes works only with positive features, here:
                        # only 'tfidf', both standardized (it's not centered) and not, i.e.
                        # exclude 'tfidf_dr' and 'w2v'.
                        trained_models[cl_met][vc_met][us_met][st] = None
                        continue

                    if cl_met == 'complement_naive_bayes':
                        clf = ComplementNB()
                    elif cl_met == 'logistic_regression':
                        clf = LogisticRegression(solver='lbfgs', penalty='l2',
                                                 max_iter=max_iter_lr,
                                                 random_state=42)
                    elif cl_met == 'random_forest':
                        # NOTE(review): no random_state here, unlike the other
                        # classifiers, so forests differ between runs.
                        clf = RandomForestClassifier(n_estimators=randomforest_n_estimators)
                    elif cl_met == 'multilayer_perceptron':
                        clf = MLPClassifier(hidden_layer_sizes=mlp_lsize,
                                            max_iter=max_iter_mlp,
                                            shuffle=True, random_state=42)
                    elif cl_met == 'svm_rbf':
                        # probability=True is required for predict_proba().
                        clf = SVC(kernel='rbf', gamma='scale', random_state=42,
                                  probability=True)
                    else:
                        raise ValueError('Unknown classification method ' + cl_met)

                    X_train = train_set[vc_met][us_met][st]['X']
                    y_train = train_set[vc_met][us_met][st]['y']
                    clf.fit(X_train, y_train)
                    trained_models[cl_met][vc_met][us_met][st] = clf

    write_log(log_path, 'Training done')
    del train_set
    with open(trained_models_file, 'wb') as out_file:
        pickle.dump(trained_models, out_file)

In [28]:
# Classification: Test, i.e. compute scores
# For every trained approach (classifier / vectorization / upsampling /
# standardization) this cell computes
#   * every score in scores_list for a grid of decision thresholds,
#   * the best value of each score with its threshold and confusion matrix,
#   * the threshold-free scores 'auc' and 'rel_p_gap',
#   * all scores at the largest threshold whose recall is the smallest one
#     still >= recall_target,
#   * the predicted probabilities next to the true labels,
# and pickles the four resulting data frames to scores_file.
if compute_scores:
    if 'trained_models' not in globals():
        with open(trained_models_file, 'rb') as in_file:
            trained_models = pickle.load(in_file)
    if 'test_set' not in globals():
        with open(testset_file, 'rb') as in_file:
            test_set = pickle.load(in_file)

    ## DEBUG:
#     cl_met, vc_met, us_met = 'logistic_regression', 'w2v', 'adasyn'
#     cl_met, vc_met, us_met, st = 'complement_naive_bayes', 'tfidf', 'adasyn', 'no_st'
    approach_columns = ['classifier', 'vectorization', 'upsampling', 'standardize']
    scores_columns = approach_columns + ['score_type', 'threshold', 'score_value']
    scores_cm_columns = scores_columns + ['confusion_matrix']
    proba_signal_columns = approach_columns + ['probability', 'signal']
    cm_columns = ['pred0', 'pred1']
    cm_index = ['label0', 'label1']

    # score name -> callable(y_true, y_pred). The iba_gm scorer is built
    # lazily so that alpha_iba is only needed when that score is requested.
    score_functions = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': lambda y_true, y_pred: recall_score(y_true, y_pred, pos_label=1),
        'specificity': lambda y_true, y_pred: recall_score(y_true, y_pred, pos_label=0),
        'f1': f1_score,
        'mcc': matthews_corrcoef,
        'ba': balanced_accuracy_score,
        'geom_mean': geometric_mean_score,
        'iba_gm': lambda y_true, y_pred: make_index_balanced_accuracy(
            alpha=alpha_iba, squared=True)(geometric_mean_score)(y_true, y_pred),
    }

    def _concat_or_empty(frames, columns):
        """Concatenate frames with a fresh index; empty frame if there are none."""
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame(columns=columns)

    # Rows/frames are collected in plain lists and turned into data frames
    # once: growing a DataFrame row by row is quadratic, and
    # DataFrame.append no longer exists in pandas >= 2.0.
    scores_thresholds_frames = []
    scores_max_rows = []
    scores_max_cms = []
    scores_fixed_recall_frames = []
    proba_signal_frames = []

    write_log(log_path, 'Start testing')
    for cl_met in classification_methods:
        for vc_met in vectorization_methods:
            for us_met in upsampling_methods:
                for st in standardizing:
                    clf = trained_models[cl_met][vc_met][us_met][st]
                    if clf is None:
                        # Combination was skipped during training.
                        continue

                    write_log(log_path, 'Start scoring ' + cl_met
                              + ' / ' + vc_met + ' / ' + us_met
                              + ' / ' + st)

                    approach = {'classifier': cl_met, 'vectorization': vc_met,
                                'upsampling': us_met, 'standardize': st}

                    X_test = test_set[vc_met][us_met][st]['X']
                    y_test = test_set[vc_met][us_met][st]['y']
                    # Probability of the positive class (signal)
                    probabilities = clf.predict_proba(X_test)[:, 1]

                    # Set the thresholds list
                    th_list = [th/n_thresholds for th in range(n_thresholds+1)]
                    # Use roc_curve() and precision_recall_curve() to refine them,
                    # to catch rapid variations in the scores
                    th_list = (th_list
                               + list(roc_curve(y_test, probabilities)[2])
                               + list(precision_recall_curve(y_test, probabilities)[2]))
                    # Remove artifacts of threshold > 1
                    th_list = [th for th in th_list if th <= 1]
                    # np.unique() returns the sorted unique values
                    th_list = np.unique(th_list)

                    # Scores for varying thresholds, find optima
                    threshold_rows = []
                    for sc in scores_list:
                        if sc not in score_functions:
                            raise ValueError('Unknown score ' + sc + '!\n')
                        score_function = score_functions[sc]

                        # Lower bound of the score range (mcc lies in [-1, 1])
                        score_max = -1 if sc == 'mcc' else 0
                        th_max = None
                        y_pred_th_max = None

                        for th in th_list:
                            y_pred_th = (probabilities >= th).astype(int)
                            score = score_function(y_test, y_pred_th)

                            # '>=': among ties the largest threshold wins
                            if score >= score_max:
                                score_max = score
                                th_max = th
                                y_pred_th_max = y_pred_th

                            threshold_rows.append(dict(
                                approach, score_type=sc, threshold=th,
                                score_value=score))

                        cm_max_df = pd.DataFrame(confusion_matrix(y_test, y_pred_th_max),
                                                 columns=cm_columns, index=cm_index)
                        scores_max_rows.append(dict(
                            approach, score_type=sc, threshold=th_max,
                            score_value=score_max))
                        scores_max_cms.append(cm_max_df)

                    scores_thresholds = pd.DataFrame(threshold_rows,
                                                     columns=scores_columns)

                    # Threshold-free scores: no threshold, no confusion matrix
                    auc = roc_auc_score(y_test, probabilities)
                    scores_max_rows.append(dict(
                        approach, score_type='auc', threshold=None,
                        score_value=auc))
                    scores_max_cms.append(None)

                    proba_signal_df = pd.DataFrame({
                        'classifier': cl_met, 'vectorization': vc_met,
                        'upsampling': us_met, 'standardize': st,
                        'probability': probabilities,
                        'signal': y_test
                    })
                    # Relative probability gap: difference of the mean predicted
                    # probabilities of signals and non-signals, divided by the
                    # mean of their standard deviations.
                    proba_pos = proba_signal_df.loc[proba_signal_df.signal == 1,
                                                    'probability']
                    proba_neg = proba_signal_df.loc[proba_signal_df.signal == 0,
                                                    'probability']
                    rel_p_gap = (2 * (np.mean(proba_pos) - np.mean(proba_neg))
                                 / (np.std(proba_pos) + np.std(proba_neg)))
                    scores_max_rows.append(dict(
                        approach, score_type='rel_p_gap', threshold=None,
                        score_value=rel_p_gap))
                    scores_max_cms.append(None)

                    # Find scores for threshold such that recall=recall_target
                    recall_rows = scores_thresholds[
                        scores_thresholds.score_type == 'recall']
                    recall_above_target = recall_rows.score_value[
                        recall_rows.score_value >= recall_target]
                    if recall_above_target.empty:
                        raise ValueError(
                            'No threshold with recall >= recall_target for '
                            + cl_met + ' / ' + vc_met + ' / ' + us_met
                            + ' / ' + st)
                    # If there are many thresholds found, take the largest, that would
                    # lead to a smaller recall (presumably closer to target) and e.g.
                    # higher precision or specificity
                    th_fixed_recall = recall_rows.threshold[
                        recall_rows.score_value == recall_above_target.min()
                    ].max()
                    # All score types evaluated at that threshold
                    scores_fixed_recall = scores_thresholds[
                        scores_thresholds.threshold == th_fixed_recall].copy()
                    y_pred_fixed_recall = (
                        probabilities >= float(th_fixed_recall)).astype(int)
                    cm_fr_df = pd.DataFrame(confusion_matrix(y_test, y_pred_fixed_recall),
                                            columns=cm_columns, index=cm_index)
                    # The same confusion matrix applies to every row
                    scores_fixed_recall['confusion_matrix'] = (
                        [cm_fr_df] * len(scores_fixed_recall))

                    # Collect the results of this approach
                    scores_thresholds_frames.append(scores_thresholds)
                    scores_fixed_recall_frames.append(scores_fixed_recall)
                    proba_signal_frames.append(proba_signal_df)

    # Assemble the overall data frames
    scores_thresholds_all = _concat_or_empty(scores_thresholds_frames,
                                             scores_columns)
    scores_thresholds_cm_max_all = pd.DataFrame(scores_max_rows,
                                                columns=scores_columns)
    # Assigned as a list so that each cell holds one DataFrame (or None)
    scores_thresholds_cm_max_all['confusion_matrix'] = scores_max_cms
    scores_fixed_recall_all = _concat_or_empty(scores_fixed_recall_frames,
                                               scores_cm_columns)
    proba_signal_df_all = _concat_or_empty(proba_signal_frames,
                                           proba_signal_columns)

    # Save scores
    scores_dict = {'scores_thresholds': scores_thresholds_all,
                   'scores_max': scores_thresholds_cm_max_all,
                   'scores_recall': scores_fixed_recall_all,
                   'proba_signal': proba_signal_df_all}
    with open(scores_file, 'wb') as out_file:
        pickle.dump(scores_dict, out_file)

    write_log(log_path, 'testing done')
    del (trained_models, test_set, scores_dict, scores_thresholds_all,
         scores_thresholds_cm_max_all, scores_fixed_recall_all,
         proba_signal_df_all)

In [29]:
# Plot and write classification scores
# For every approach (classifier / vectorization / upsampling /
# standardization) write into its own directory:
#   roc_curve.html, pr_curve.html, scores_thresholds(_log).html,
#   proba_signal(_log).html, scores_fixed_recall.txt, scores_max_auc_rpg.txt
if plot_write_scores:
    if 'scores_dict' not in globals():
        with open(scores_file, 'rb') as in_file:
            scores_dict = pickle.load(in_file)
    # Always take the frames from scores_dict, so they are defined (and
    # consistent with it) even when scores_dict was already in memory.
    scores_thresholds_all = scores_dict['scores_thresholds']
    scores_thresholds_cm_max_all = scores_dict['scores_max']
    scores_fixed_recall_all = scores_dict['scores_recall']
    proba_signal_df_all = scores_dict['proba_signal']

    def _select_approach(scores_df, cl_met, vc_met, us_met, st):
        """Return the rows of scores_df that belong to one approach."""
        return scores_df.loc[
            (scores_df.classifier == cl_met)
            & (scores_df.vectorization == vc_met)
            & (scores_df.upsampling == us_met)
            & (scores_df.standardize == st)
        ]

    def _format_confusion_matrix(cmat):
        """Render a confusion-matrix DataFrame on one line; None if there is none."""
        if not isinstance(cmat, pd.DataFrame):
            # Rows without confusion matrix hold None (or NaN)
            return None
        return ' / '.join([rn+cn+' '+str(cmat.loc[rn, cn])
                           for rn in cmat.index for cn in cmat.columns])

    ## DEBUG:
#     cl_met, vc_met, us_met, st = 'complement_naive_bayes', 'tfidf', 'adasyn', 'no_st'
#     cl_met, vc_met, us_met, st = 'logistic_regression', 'tfidf_dr', 'duplicate', 'no_st'

    for cl_met in classification_methods:
        for vc_met in vectorization_methods:
            for us_met in upsampling_methods:
                for st in standardizing:
                    if (cl_met == 'complement_naive_bayes'
                            and vc_met in ('w2v', 'tfidf_dr')):
                        # These combinations are never trained (stored as None)
                        continue

                    scores_thresholds = _select_approach(
                        scores_thresholds_all, cl_met, vc_met, us_met, st)
                    scores_thresholds_cm_max = _select_approach(
                        scores_thresholds_cm_max_all, cl_met, vc_met, us_met, st)
                    scores_fixed_recall = _select_approach(
                        scores_fixed_recall_all, cl_met, vc_met, us_met, st)
                    proba_signal_df = _select_approach(
                        proba_signal_df_all, cl_met, vc_met, us_met, st)
                    # Every score type is stored for the same thresholds in the
                    # same order, so the 'recall' rows provide the threshold axis.
                    th_list = list(scores_thresholds.loc[
                        scores_thresholds.score_type == 'recall'].threshold)

                    approach_dir = (output_dir + '/scores_individual_approaches/'
                                    + cl_met + '-' + vec_tag[vc_met] + '-'
                                    + us_met + '-' + st
                                    + '_' + str(nosignal_sample_frac) + '_'
                                    + str(nosignal_sample_seed) + '-'
                                    + str(limit_load_w2v) + '-'
                                    + file_name_date(read_eios_from_date, read_eios_to_date))
                    os.makedirs(approach_dir, exist_ok=True)

                    # ROC curve
                    fpr = [1-spec
                           for spec in scores_thresholds.loc[
                               scores_thresholds.score_type == 'specificity'].score_value]
                    recall = list(scores_thresholds.loc[
                        scores_thresholds.score_type == 'recall'].score_value)
                    roc_curve_df = pd.DataFrame({
                        'fpr': fpr,
                        'recall': recall,
                        'threshold': th_list})
                    roc_curve_plot = (
                        alt.Chart(roc_curve_df).mark_line().encode(
                            x=alt.X('fpr:Q', scale=alt.Scale(domain=(0, 1))),
                            y=alt.Y('recall:Q', scale=alt.Scale(domain=(0, 1)))
                        )
                        +
                        alt.Chart(roc_curve_df).mark_circle(size=20, opacity=0.2).encode(
                            x=alt.X('fpr:Q', scale=alt.Scale(domain=(0, 1))),
                            y=alt.Y('recall:Q', scale=alt.Scale(domain=(0, 1))),
                            tooltip=['fpr', 'recall', 'threshold']
                        )
                    ).interactive()
                    roc_curve_plot.save(approach_dir + '/roc_curve.html')

                    # Precision-Recall curve
                    precision = list(scores_thresholds.loc[
                        scores_thresholds.score_type == 'precision'].score_value)
                    pr_curve_df = pd.DataFrame({
                        'precision': precision,
                        'recall': recall,
                        'threshold': th_list})
                    pr_curve_plot = (
                        alt.Chart(pr_curve_df).mark_line().encode(
                            x=alt.X('recall:Q', scale=alt.Scale(domain=(0, 1))),
                            y=alt.Y('precision:Q', scale=alt.Scale(domain=(0, 1)))
                        )
                        +
                        alt.Chart(pr_curve_df).mark_circle(size=20, opacity=0.2).encode(
                            x=alt.X('recall:Q', scale=alt.Scale(domain=(0, 1))),
                            y=alt.Y('precision:Q', scale=alt.Scale(domain=(0, 1))),
                            tooltip=['recall', 'precision', 'threshold']
                        )
                    ).interactive()
                    pr_curve_plot.save(approach_dir + '/pr_curve.html')

                    # All scores vs. threshold
                    scores_thresholds_plot = (
                        alt.Chart(scores_thresholds).mark_line().encode(
                            x='threshold:Q',
                            y='score_value:Q',
                            color='score_type:N'
                        )
                        +
                        alt.Chart(scores_thresholds).mark_circle(size=20, opacity=0.2).encode(
                            x='threshold:Q',
                            y='score_value:Q',
                            color='score_type:N',
                            tooltip=['score_type', 'score_value', 'threshold']
                        )
                    ).interactive()
                    scores_thresholds_plot.save(approach_dir + '/scores_thresholds.html')
                    # Same plot on a log axis; threshold 0 cannot be shown there
                    scores_thresholds_positive = scores_thresholds.loc[
                        scores_thresholds.threshold > 0]
                    scores_thresholds_plot_log = (
                        alt.Chart(scores_thresholds_positive)
                        .mark_line().encode(
                            x=alt.X('threshold:Q', scale=alt.Scale(type='log')),
                            y='score_value:Q',
                            color='score_type:N'
                        )
                        +
                        alt.Chart(scores_thresholds_positive)
                        .mark_circle(size=20, opacity=0.2)
                        .encode(
                            x=alt.X('threshold:Q', scale=alt.Scale(type='log')),
                            y='score_value:Q',
                            color='score_type:N',
                            tooltip=['score_type', 'score_value', 'threshold']
                        )
                    ).interactive()
                    scores_thresholds_plot_log.save(
                        approach_dir + '/scores_thresholds_log.html')

                    # Probability vs. signal
                    # NOTE(review): Scale(rangeStep=...) is only accepted by older
                    # Altair releases -- confirm against the installed version.
                    proba_signal_plot = (alt.Chart(proba_signal_df)
                                         .mark_boxplot(outliers=True).encode(
                                             x='probability:Q',
                                             y=alt.Y('signal:N',
                                                     scale=alt.Scale(rangeStep=75))
                                         )).interactive()
                    proba_signal_plot.save(approach_dir + '/proba_signal.html')
                    proba_signal_plot_log = (alt.Chart(proba_signal_df)
                                             .mark_boxplot(outliers=True).encode(
                                                 x=alt.X('probability:Q',
                                                         scale=alt.Scale(type='log')),
                                                 y=alt.Y('signal:N',
                                                         scale=alt.Scale(rangeStep=75))
                                             )).interactive()
                    proba_signal_plot_log.save(approach_dir + '/proba_signal_log.html')

                    # Scores and confusion matrix at fixed recall
                    scores_fixed_recall_print = scores_fixed_recall.copy()
                    scores_fixed_recall_print['confusion_matrix'] = [
                        _format_confusion_matrix(cmat)
                        for cmat in scores_fixed_recall.confusion_matrix
                    ]
                    with open(approach_dir+'/scores_fixed_recall.txt', 'w') as out_file:
                        out_file.write(scores_fixed_recall_print.to_string())

                    # Max scores, AUC, relative prob gap
                    scores_thresholds_cm_max_print = scores_thresholds_cm_max.copy()
                    scores_thresholds_cm_max_print['confusion_matrix'] = [
                        _format_confusion_matrix(cmat)
                        for cmat in scores_thresholds_cm_max.confusion_matrix
                    ]
                    with open(approach_dir+'/scores_max_auc_rpg.txt', 'w') as out_file:
                        out_file.write(scores_thresholds_cm_max_print.to_string())

    del (scores_dict, scores_thresholds_all,
         scores_thresholds_cm_max_all, scores_fixed_recall_all,
         proba_signal_df_all)

  "Defaulting to nominal.".format(typ))


In [30]:
# Overview
# Rank all approaches by each score, once for the scores at fixed recall
# and once for the best scores / AUC / relative probability gap; save the
# ranking plots and write the best approach per score to a text file.
if overview_ouput:
    if 'scores_dict' not in globals():
        with open(scores_file, 'rb') as in_file:
            scores_dict = pickle.load(in_file)
    # Always take the frames from scores_dict, so they are defined (and
    # consistent with it) even when scores_dict was already in memory.
    scores_thresholds_cm_max_all = scores_dict['scores_max']
    scores_fixed_recall_all = scores_dict['scores_recall']

    overview_dir = (output_dir + '/scores_overview/'
                    + str(nosignal_sample_frac) + '_'
                    + str(nosignal_sample_seed) + '-'
                    + str(limit_load_w2v) + '-'
                    + file_name_date(read_eios_from_date, read_eios_to_date))
    os.makedirs(overview_dir, exist_ok=True)

    approach_columns = ['classifier', 'vectorization', 'upsampling', 'standardize']
    best_columns = ['score_type', 'score_value', 'approach']
    overview_types = ['fixed_recall', 'max_auc_rpg']
    # Scores that are not ranked per overview type: they are either trivial
    # there (recall at fixed recall, max recall/specificity) or not
    # available (auc and rel_p_gap exist only among the max scores).
    excluded_scores = {'fixed_recall': ['recall', 'auc', 'rel_p_gap'],
                       'max_auc_rpg': ['recall', 'specificity']}

    def _abbreviate(label):
        """Initials of the '_'-separated parts, e.g. 'no_us' -> 'nu'."""
        return ''.join([part[0] for part in label.split('_')])

    def _format_confusion_matrix(cmat):
        """Render a confusion-matrix DataFrame on one line; None if there is none."""
        if not isinstance(cmat, pd.DataFrame):
            # Rows without confusion matrix hold None (or NaN)
            return None
        return ' / '.join([rn+cn+' '+str(cmat.loc[rn, cn])
                           for rn in cmat.index for cn in cmat.columns])

    # Best approach per score, collected as plain rows and turned into data
    # frames once (DataFrame.append no longer exists in pandas >= 2.0).
    best_rows = {overview_type: [] for overview_type in overview_types}
    best_cms = {overview_type: [] for overview_type in overview_types}

    for sc in scores_list+['auc', 'rel_p_gap']:
        # Lower bound of the score range (mcc lies in [-1, 1])
        min_score = -1 if sc == 'mcc' else 0

        for overview_type in overview_types:
            # Exclude trivial max scores
            if sc in excluded_scores[overview_type]:
                continue

            if overview_type == 'fixed_recall':
                # Scores at fixed recall
                scores_df = scores_fixed_recall_all
                rank_plot_title = 'approaches ranked by ' + sc + ' (fixed recall)'
            else:
                # Max scores, AUC, relative prob gap
                scores_df = scores_thresholds_cm_max_all
                if sc in ['auc', 'rel_p_gap']:
                    rank_plot_title = 'approaches ranked by ' + sc
                else:
                    rank_plot_title = 'approaches ranked by best ' + sc

            # Rank the approaches, best first
            rank_plot_df = (scores_df[scores_df.score_type == sc]
                            .sort_values(by='score_value', ascending=False))
            if sc == 'rel_p_gap':
                # rel_p_gap has no lower bound; only drop undefined values.
                # pd.isna() also works on object columns, np.isnan() does not.
                keep = ~pd.isna(rank_plot_df.score_value)
            else:
                keep = rank_plot_df.score_value > min_score
            rank_plot_df = rank_plot_df[keep].copy()

            if rank_plot_df.empty:
                # Nothing to rank, and no best approach to report
                write_log(log_path, 'Overview: no scores to rank for ' + sc
                          + ' (' + overview_type + ')')
                continue

            approach_labels = rank_plot_df[approach_columns].values.tolist()
            # Short label for the plot axis, full label for the text output
            rank_plot_df['approach'] = [
                '-'.join([_abbreviate(label) for label in labels])
                for labels in approach_labels
            ]
            rank_plot_df['approach_full'] = [
                '-'.join(labels) for labels in approach_labels
            ]

            # Plot ranking of approaches
            rank_plot = (
                alt.Chart(rank_plot_df.loc[:, ['approach', 'score_value']],
                          title=rank_plot_title)
                .mark_bar()
                .encode(
                    # sort=None keeps the row order, i.e. the ranking
                    x=alt.X('approach:N', sort=None),
                    y=alt.Y('score_value:Q', axis=alt.Axis(title=sc)),
                    tooltip=['approach', 'score_value']
                ).interactive()
            )
            rank_plot.save(overview_dir+'/ranked_approaches_'+overview_type+'_'+sc+'.html')
            # NOTE(review): Scale(rangeStep=...) is only accepted by older
            # Altair releases -- confirm against the installed version.
            rank_plot_top10 = (
                alt.Chart(rank_plot_df.loc[:, ['approach', 'score_value']].iloc[0:10, :],
                          title='top 10 ' + rank_plot_title)
                .mark_bar()
                .encode(
                    x=alt.X('approach:N', sort=None, scale=alt.Scale(rangeStep=50)),
                    y=alt.Y('score_value:Q', axis=alt.Axis(title=sc)),
                    tooltip=['approach', 'score_value']
                ).interactive()
            )
            rank_plot_top10.save(overview_dir+'/ranked_approaches_'
                                 + overview_type+'_'+sc+'_top10.html')

            # Get best approaches
            best_rows[overview_type].append({
                'score_type': sc,
                'score_value': rank_plot_df.score_value.iloc[0],
                'approach': rank_plot_df.approach_full.iloc[0]
            })
            best_cms[overview_type].append(rank_plot_df.confusion_matrix.iloc[0])

    best_approaches = {}
    for overview_type in overview_types:
        best_approaches[overview_type] = pd.DataFrame(best_rows[overview_type],
                                                      columns=best_columns)
        # Assigned as a list so that each cell holds one DataFrame (or None)
        best_approaches[overview_type]['confusion_matrix'] = best_cms[overview_type]

    # Print best approaches
    for overview_type in overview_types:
        best_approaches_print = best_approaches[overview_type].copy()
        best_approaches_print['confusion_matrix'] = [
            _format_confusion_matrix(cmat)
            for cmat in best_approaches[overview_type].confusion_matrix
        ]
        with open(overview_dir+'/best_approaches_'+overview_type+'.txt', 'w') as out_file:
            out_file.write(best_approaches_print.to_string())