In [1]:
import logging

# Classifiers
from sklearn.linear_model import LogisticRegression

import Settings
from CrossValidation import cross_validation
from Decorators import memoize_to_disk
from featureextractionfunctions import *
from featurevectorizer import FeatureVectorizer
from load_data import load_process_essays, extract_features
from window_based_tagger_config import get_config
from wordtagginghelper import *

In [14]:
# --- Run configuration: tunable constants for this notebook ---

# Feature vectorisation; both values are handed to FeatureVectorizer
# inside train_tagger.
SPARSE_WD_FEATS = True  # forwarded as `sparse=`
MIN_FEAT_FREQ = 5       # forwarded as `min_feature_freq=`; 5 best so far

# Number of folds requested from cross_validation.
CV_FOLDS = 5

# Not referenced by the cells visible in this notebook.
MIN_TAG_FREQ = 5
LOOK_BACK = 0  # how many sentences to look back when predicting tags

# Written into config["window_size"]; the per-side window offset is derived from it.
WINDOW_SIZE = 9

In [4]:
# Project settings object. Paths below are built by plain string concatenation,
# so settings.data_directory must already end with a path separator.
settings = Settings.Settings()

root_folder = settings.data_directory + "CoralBleaching/Thesis_Dataset/"
folder = root_folder + "Training/"

# Filename prefixes for pickled artefacts, both located under root_folder + "Pickled/".
pickled_folder = root_folder + "Pickled/"
processed_essay_filename_prefix = pickled_folder + "essays_proc_pickled_"
features_filename_prefix = pickled_folder + "feats_pickled_"

# Tagger configuration for the training folder; later unpacked as keyword
# arguments into load_process_essays and (with extractors added) extract_features.
config = get_config(folder)

Results Dir: /Users/simon.hughes/Google Drive/Phd/Results/
Data Dir:    /Users/simon.hughes/Google Drive/Phd/Data/
Root Dir:    /Users/simon.hughes/GitHub/NlpResearch/
Public Data: /Users/simon.hughes/GitHub/NlpResearch/Data/PublicDatasets/


In [7]:
# """ Load Essays """
# memoize_to_disk(...) returns a decorator; it is applied by hand here so the
# imported load_process_essays itself is left unwrapped.
essay_cache = memoize_to_disk(filename_prefix=processed_essay_filename_prefix)
mem_process_essays = essay_cache(load_process_essays)
tagged_essays = mem_process_essays(**config)
# """ End load Essays """

Pickle Key: folder_/Users/simon.hughes/Google Drive/Phd/Data/CoralBleaching/Thesis_Dataset/Training/_include_normal_False_include_vague_True_lower_case_True_min_df_2_min_sentence_length_3_remove_infrequent_False_remove_punctuation_False_remove_stop_words_False_replace_nums_True_spelling_correct_True_stem_False_window_size_9


In [15]:
# The window width comes from the notebook constant. `offset` is the number of
# positions on each side of the centre word (e.g. a window of 9 gives 4).
config["window_size"] = WINDOW_SIZE
offset = (config["window_size"] - 1) // 2

# Feature extractor instances, each built for the same window offset.
unigram_bow_window = fact_extract_bow_ngram_features(offset, 1)
unigram_window_stemmed = fact_extract_positional_word_features_stemmed(offset)
bigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 2)
trigram_window_stemmed = fact_extract_ngram_features_stemmed(offset, 3)
pos_tag_window = fact_extract_positional_POS_features(offset)

# optimal feats from the tuning
feat_extractors = [
    unigram_window_stemmed,
    bigram_window_stemmed,
    pos_tag_window,
    unigram_bow_window,
    trigram_window_stemmed,
]
# Copy of config with the extractors added; config itself gains no new key.
feat_config = dict(config, extractors=feat_extractors)

# --- Load features ---
essay_feats = extract_features(tagged_essays, **feat_config)

# --- Define tags ---
_, lst_all_tags = flatten_to_wordlevel_feat_tags(essay_feats)
# Keep only the anaphor tag (matched case-insensitively, ignoring surrounding
# whitespace). Sorting the de-duplicated tags gives the same order on every
# run, whereas iterating a set of strings can differ between interpreter runs.
regular_tags = sorted(
    {t for t in flatten(lst_all_tags) if t.lower().strip() == "anaphor"}
)

# Train and evaluate on the same tag list (both names refer to one list object).
wd_train_tags = regular_tags
wd_test_tags = regular_tags

# --- Classifiers ---
fn_create_wd_cls = lambda: LogisticRegression()  # C=1, dual = False seems optimal
# String description of the classifier configuration produced by the factory.
wd_algo = str(fn_create_wd_cls())

In [16]:
def train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags, fn_create_cls=None):
    """Train one word-level classifier per tag on a fold and predict on both splits.

    Args:
        essays_TD: training essays for this fold. A list of Essay objects whose
            sentences are lists of featureextractortransformer.Word objects.
        essays_VD: validation essays for this fold, same structure as essays_TD.
        wd_test_tags: tags for which predictions are produced.
        wd_train_tags: tags for which classifiers are trained and for which the
            per-tag label vectors are built.
        fn_create_cls: optional zero-argument factory returning a fresh,
            unfitted classifier. Defaults to the notebook-level
            ``fn_create_wd_cls`` so the trained model matches the one
            described by ``wd_algo``.

    Returns:
        A 4-tuple ``(td_wd_predictions_by_code, vd_wd_predictions_by_code,
        wd_td_ys_bytag, wd_vd_ys_bytag)``: training predictions, validation
        predictions, training labels and validation labels.
    """
    if fn_create_cls is None:
        # Resolved at call time so edits to the factory cell take effect here.
        fn_create_cls = fn_create_wd_cls

    # --- Data partitioning ---
    td_feats, td_tags = flatten_to_wordlevel_feat_tags(essays_TD)
    vd_feats, vd_tags = flatten_to_wordlevel_feat_tags(essays_VD)

    # Fit the vectorizer on the training split only, then reuse it for validation.
    feature_transformer = FeatureVectorizer(min_feature_freq=MIN_FEAT_FREQ, sparse=SPARSE_WD_FEATS)
    td_X = feature_transformer.fit_transform(td_feats)
    vd_X = feature_transformer.transform(vd_feats)

    wd_td_ys_bytag = get_wordlevel_ys_by_code(td_tags, wd_train_tags)
    # NOTE(review): validation labels are built from wd_train_tags while the
    # predictions below use wd_test_tags; harmless while both lists are the
    # same, but confirm this is intended if they ever diverge.
    wd_vd_ys_bytag = get_wordlevel_ys_by_code(vd_tags, wd_train_tags)

    # --- Train tagger ---
    tag2word_classifier = train_classifier_per_code(td_X, wd_td_ys_bytag, fn_create_cls,
                                                    wd_train_tags, verbose=False)

    # --- Test tagger ---
    td_wd_predictions_by_code = test_classifier_per_code(td_X, tag2word_classifier, wd_test_tags)
    vd_wd_predictions_by_code = test_classifier_per_code(vd_X, tag2word_classifier, wd_test_tags)
    return td_wd_predictions_by_code, vd_wd_predictions_by_code, wd_td_ys_bytag, wd_vd_ys_bytag

In [20]:
# Split the featurised essays using CV_FOLDS; each element of `folds` is later
# unpacked as an (essays_TD, essays_VD) pair by the cross-validation loop.
folds = cross_validation(essay_feats, CV_FOLDS)

In [21]:
%%time

# Accumulators across folds, keyed by tag: gold labels and predictions for the
# training (td) and validation (vd) splits.
cv_wd_td_ys_by_tag = defaultdict(list)
cv_wd_td_predictions_by_tag = defaultdict(list)
cv_wd_vd_ys_by_tag = defaultdict(list)
cv_wd_vd_predictions_by_tag = defaultdict(list)

# One (validation essays, validation predictions) entry per fold.
essays_preds = []
for essays_TD, essays_VD in folds:
    result = train_tagger(essays_TD, essays_VD, wd_test_tags, wd_train_tags)
    (td_wd_predictions_by_code,
     vd_wd_predictions_by_code,
     wd_td_ys_bytag,
     wd_vd_ys_bytag) = result

    essays_preds.append((essays_VD, vd_wd_predictions_by_code))

    # Fold results into the cross-validation accumulators (same order as before).
    for fold_dict, cv_dict in (
        (wd_td_ys_bytag, cv_wd_td_ys_by_tag),
        (wd_vd_ys_bytag, cv_wd_vd_ys_by_tag),
        (td_wd_predictions_by_code, cv_wd_td_predictions_by_tag),
        (vd_wd_predictions_by_code, cv_wd_vd_predictions_by_tag),
    ):
        merge_dictionaries(fold_dict, cv_dict)

CPU times: user 1min 36s, sys: 3.89 s, total: 1min 40s
Wall time: 1min 41s


In [22]:
# Sanity check: one (validation essays, predictions) entry is appended per fold.
len(essays_preds)

5

## TODO: train with the optimal hyperparameters, then assign the predicted tags to the essays