In [1]:
# coding=utf-8
from Decorators import memoize_to_disk
from load_data import load_process_essays, extract_features

from featurevectorizer import FeatureVectorizer
from featureextractionfunctions import *
from CrossValidation import cross_validation
from wordtagginghelper import *
from IterableFP import flatten
from results_procesor import ResultsProcessor, __MICRO_F1__
# Classifiers
from sklearn.linear_model import LogisticRegression

from window_based_tagger_config import get_config
from tag_frequency import get_tag_freq, regular_tag
from joblib import Parallel, delayed
# END Classifiers

import Settings
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger()

# Create persister (mongo client) - fail fast if mongo service not initialized
processor = ResultsProcessor()

# not hashed as don't affect persistence of feature processing
SPARSE_WD_FEATS     = True

MIN_FEAT_FREQ       = 5        # 5 best so far
CV_FOLDS            = 5

MIN_TAG_FREQ        = 5
LOOK_BACK           = 0     # how many sentences to look back when predicting tags
# end not hashed

# construct unique key using settings for pickling

settings = Settings.Settings()
folder =                            settings.data_directory + "CoralBleaching/BrattData/EBA1415_Merged/"
processed_essay_filename_prefix =   settings.data_directory + "CoralBleaching/BrattData/Pickled/essays_proc_pickled_"
features_filename_prefix =          settings.data_directory + "CoralBleaching/BrattData/Pickled/feats_pickled_"

out_metrics_file     =              settings.data_directory + "CoralBleaching/Results/metrics.txt"

config = get_config(folder)

""" FEATURE EXTRACTION """
config["window_size"] = 11
offset = (config["window_size"] - 1) / 2

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [6]:
config

{'folder': '/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/',
 'include_normal': False,
 'include_vague': True,
 'lower_case': True,
 'min_df': 2,
 'min_sentence_length': 3,
 'remove_infrequent': False,
 'remove_punctuation': False,
 'remove_stop_words': False,
 'replace_nums': True,
 'spelling_correct': True,
 'stem': False,
 'window_size': 11}

In [2]:
unigram_bow_window = fact_extract_bow_ngram_features(offset, 1)

unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
biigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
trigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)

extractors = [unigram_bow_window,
              unigram_window_stemmed,
              biigram_window_stemmed,
              trigram_window_stemmed,
              extract_brown_cluster,
              extract_dependency_relation
]

feat_config = dict(config.items() + [("extractors", extractors)])

""" LOAD DATA """
mem_process_essays = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)(load_process_essays)
tagged_essays = mem_process_essays( **config )
logger.info("Essays loaded")
# most params below exist ONLY for the purposes of the hashing to and from disk
mem_extract_features = memoize_to_disk(filename_prefix=features_filename_prefix)(extract_features)
essay_feats = mem_extract_features(tagged_essays, **feat_config)
logger.info("Features loaded")

('Pickle Key:', 'folder_/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/_include_normal_False_include_vague_True_lower_case_True_min_df_2_min_sentence_length_3_remove_infrequent_False_remove_punctuation_False_remove_stop_words_False_replace_nums_True_spelling_correct_True_stem_False_window_size_11')
('Pickle Key:', 'extractors_fn_bow_ngram_feat[ngram_size:1 offset:5]_fn_pos_wd_feats_stemmed[offset:5]_fn_pos_ngram_feat_stemmed[ngram_size:2 offset:5]_fn_pos_ngram_feat_stemmed[ngram_size:3 offset:5]_extract_brown_cluster_extract_dependency_relation_folder_/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/BrattData/EBA1415_Merged/_include_normal_False_include_vague_True_lower_case_True_min_df_2_min_sentence_length_3_remove_infrequent_False_remove_punctuation_False_remove_stop_words_False_replace_nums_True_spelling_correct_True_stem_False_window_size_11')


In [3]:
""" DEFINE TAGS """

_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
regular_tags = list(set((t for t in flatten(lst_all_tags) if t[0].isdigit())))

""" works best with all the pair-wise causal relation codes """
wd_train_tags = regular_tags
wd_test_tags  = regular_tags

In [4]:
""" CLASSIFIERS """
""" Log Reg + Log Reg is best!!! """
fn_create_wd_cls   = lambda: LogisticRegression() # C=1, dual = False seems optimal
wd_algo   = str(fn_create_wd_cls())
print "Classifier:", wd_algo

folds = cross_validation(essay_feats, CV_FOLDS)

Classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)


In [7]:
def train_tagger(fold, essays_TD, essays_VD, wd_test_tags, wd_train_tags,
                 dual, C, penalty, fit_intercept, multi_class):

    # TD and VD are lists of Essay objects. The sentences are lists
    # of featureextractortransformer.Word objects

    """ Data Partitioning and Training """
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)
    
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X, vd_X = feature_transformer.fit_transform(td_feats), feature_transformer.transform(vd_feats)

    wd_td_ys = get_wordlevel_powerset_ys(td_tags, wd_train_tags)
    wd_vd_ys = get_wordlevel_powerset_ys(vd_tags, wd_train_tags)

    wd_td_ys_by_code = get_by_code_from_powerset_predictions(wd_td_ys, wd_test_tags)
    wd_vd_ys_by_code = get_by_code_from_powerset_predictions(wd_vd_ys, wd_test_tags)

    """ TRAIN Tagger """

    solver = 'liblinear'
    if multi_class == 'multinomial':
        solver = "lbfgs"
    model = LogisticRegression(dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept, multi_class=multi_class, solver=solver)
    if fold == 0:
        print(model)

    model.fit(td_X, wd_td_ys)

    wd_td_pred = model.predict(td_X)
    wd_vd_pred = model.predict(vd_X)

    """ TEST Tagger """
    td_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_td_pred, wd_test_tags)
    vd_wd_predictions_by_code = get_by_code_from_powerset_predictions(wd_vd_pred, wd_test_tags)

    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_by_code, wd_vd_ys_by_code

multi_class = 'multinomial'
dual = False
penalty = 'l1'
fit_intercept = True 
C = 100.0

fold = 0

hyper_opt_params = {}

# Gather metrics per fold
cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag = defaultdict(list), defaultdict(list)
cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag = defaultdict(list), defaultdict(list)

""" This doesn't run in parallel ! """
essays_TD, essays_VD = folds[0]

result = train_tagger(fold, essays_VD, essays_VD, wd_test_tags, wd_train_tags,
                      dual=dual, C=C, penalty=penalty, fit_intercept=fit_intercept, multi_class=multi_class)

td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag = result
merge_dictionaries(wd_td_ys_bytag, cv_wd_td_ys_by_tag)
merge_dictionaries(wd_vd_ys_bytag, cv_wd_vd_ys_by_tag)
merge_dictionaries(td_wd_predictions_by_code, cv_wd_td_predictions_by_tag)
merge_dictionaries(vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag)

# print results for each code
""" Persist Results to Mongo DB """

SUFFIX = "_WINDOW_CLASSIFIER_LBL_POWERSET_MULTICLASS_HYPER_PARAM_TUNING"
CB_TAGGING_TD, CB_TAGGING_VD = "CB_TAGGING_TD" + SUFFIX, "CB_TAGGING_VD" + SUFFIX
parameters = dict(config)
parameters["extractors"] = map(lambda fn: fn.func_name, extractors)
parameters["min_feat_freq"] = MIN_FEAT_FREQ
parameters.update(hyper_opt_params)

wd_td_objectid = processor.persist_results(CB_TAGGING_TD, cv_wd_td_ys_by_tag, cv_wd_td_predictions_by_tag, parameters, wd_algo)
wd_vd_objectid = processor.persist_results(CB_TAGGING_VD, cv_wd_vd_ys_by_tag, cv_wd_vd_predictions_by_tag, parameters, wd_algo)

    # This outputs 0's for MEAN CONCEPT CODES as we aren't including those in the outputs
avg_f1 = float(processor.get_metric(CB_TAGGING_VD, wd_vd_objectid, __MICRO_F1__)["f1_score"])
avg_f1

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          penalty='l1', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0)


NameError: name 'avg_f1_f1' is not defined

In [8]:
avg_f1


0.9935685729184774

In [10]:
#vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag
td_wd_predictions_by_code.keys()

['11', '13', '12', '14', '50', '1', '3', '2', '5', '4', '7', '6', '5b']

In [11]:
vd_wd_predictions_by_code.keys()

['11', '13', '12', '14', '50', '1', '3', '2', '5', '4', '7', '6', '5b']

In [12]:
map(len, td_wd_predictions_by_code.values())

[36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545]

In [13]:
map(len, vd_wd_predictions_by_code.values())

[36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545,
 36545]

In [15]:
np.mean(td_wd_predictions_by_code["50"])

0.080093035983034619

In [16]:
np.mean(vd_wd_predictions_by_code["50"])

0.080093035983034619

In [17]:
td_wd_predictions_by_code["50"] == vd_wd_predictions_by_code["50"]

True