In [1]:
%cd /home/slav/ai/claim-rank
!export PYTHONPATH=.
%load_ext autoreload
%autoreload 2

/home/slav/ai/claim-rank


In [2]:
import numpy as np
np.random.seed(42) # ! before importing keras!

In [3]:
from src.data.debates import read_debates, Debate

In [168]:
from src.features.feature_sets import get_experimential_pipeline, get_full_pipeline

In [169]:
from keras.layers import Input, Embedding, Conv1D, Dense
from keras.models import Model, clone_model
from keras.layers import Input, Dense, Dropout
from keras import optimizers
from src.stats.rank_metrics import average_precision, precision_at_n
from sklearn.metrics import average_precision_score

In [170]:
def get_claim_scores(sentences):
    return np.array([np.array([1 if s.label>0 else 0] +
           [int(l) for l in s.labels]) for s in sentences])
    

In [173]:
debates = [Debate.FIRST, Debate.VP, Debate.SECOND, Debate.THIRD]
debates_size = len(debates)
debate_sentences = np.array([read_debates(debate) for debate in debates])
debate_results = [get_claim_scores(debate_sentence) for debate_sentence in debate_sentences]
pipeline = get_full_pipeline()
debate_features = np.array([pipeline.fit_transform(debate_sentence) for debate_sentence in debate_sentences])

['sent', 'tokens_num', 'text_len', 'ner', 'pos', 'pf_match_score', 'pf_match_with_person', 'w2v_dist', 'w2v_dist_prev', 'w2v_dist_next', 'w2v_vector', 'tense', 'lexicons', 'speaker', '(laugh', '(crosstalk', '(applause', '(laughter', 'participant', 'opponent']


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  dist = 1.0 - uv / np.sqrt(uu * vv)


In [174]:
debate_results = np.array([get_claim_scores(debate_sentence) for debate_sentence in debate_sentences])

In [175]:
def create_baseline_model(in_count, out_count):
    input_layer = Input(shape=(in_count,))
    x = Dense(100, kernel_initializer='normal', activation='relu')(input_layer)
    
    outputs = list(map(lambda _: Dense(1, kernel_initializer='normal', activation='sigmoid')(x), range(out_count)))

    model = Model(inputs=[input_layer], outputs=outputs)
                         
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model;

In [176]:
def getFeatures(train_indices, val_indices, test_indices):
    train_sentences = np.concatenate(debate_sentences[train_indices])
    val_sentences = np.concatenate(debate_sentences[val_indices])
    test_sentences = np.concatenate(debate_sentences[test_indices])
    
    pipeline = get_experimential_pipeline(train_sentences)
    
    X_train = np.array([pipeline.fit_transform(debate_sentences) for debate_sentences in train_sentences])
    X_val = np.array([pipeline.fit_transform(debate_sentences) for debate_sentences in val_sentences])
    X_test = np.array([pipeline.fit_transform(debate_sentences) for debate_sentences in test_sentences])
    
    return X_train, X_val, X_test

In [177]:
def getDataByIndices(train_indices, val_indices, test_indices, train_targets, test_targets):
    X_train = np.concatenate(debate_features[train_indices])
    X_val = np.concatenate(debate_features[val_indices])
    X_test = np.concatenate(debate_features[test_indices])
    
    y_train = list(np.vstack(debate_results[train_indices])[:, train_targets].T)
    y_val = list(np.vstack(debate_results[val_indices])[:, test_targets].T)
    y_test = list(np.vstack(debate_results[test_indices])[:, test_targets].T)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [178]:
def run_single_model(X_train, X_val, X_test, train_target, val_targets, test_targets, iters = 20, epochs=5):
    model = create_baseline_model(X_train.shape[1], 1);
    best_models = [{'model': None, 'av_p': 0., 'iter': -1} for _ in test_targets]

    for ite in range(iters):
        
        model.fit(X_train, y=[train_target], epochs=epochs, verbose=0, batch_size=550)

        print(str(epochs * (ite + 1)) + ' ', end='')
        
        predicted_val = model.predict(X_val)[:, 0]
        
        for j in range(len(test_targets)):
            av_p = average_precision_score(val_targets[j], predicted_val)
            if (av_p > best_models[j]['av_p']):
                best_models[j] = {'model': clone_model(model), 'av_p': av_p, 'iter': ite}
    
    print()
    
    for j in range(len(test_targets)):
        best_model = best_models[j]
        best_model['av_p_test'] = average_precision_score(test_targets[j], best_model['model'].predict(X_test)[:, 0])
    
    return best_models
    

In [179]:
def run_multiple_model(X_train, X_val, X_test, train_targets, val_targets, test_targets, log_indexes, iters = 20, epochs=5):
    model = create_baseline_model(X_train.shape[1], len(train_targets));
    best_models = [{'model': None, 'av_p': 0., 'iter': -1} for _ in log_indexes]

    for ite in range(iters):
        model.fit(X_train, y=train_targets, epochs=epochs, verbose=0, batch_size=550)
    
        print(str(epochs * (ite + 1)) + ' ', end='')
        
        predicted_val = np.array(model.predict(X_val))
        
        for j, i in enumerate(log_indexes):
            av_p = average_precision_score(val_targets[j], predicted_val[i][:, 0])
            if (av_p > best_models[j]['av_p']):
                best_models[j] = {'model': clone_model(model), 'av_p': av_p, 'iter': ite}
    
    print()
    
    for j, i in enumerate(log_indexes):
        best_model = best_models[j]
        best_model['av_p_test'] = average_precision_score(test_targets[j], best_model['model'].predict(X_test)[i][:, 0])
    
    return best_models
    

In [164]:
X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices([0,1], [2], [3], [0], [0, 5])
run_single_model(X_train, X_val, X_test, y_train[0], y_val, y_test)

5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 


[{'av_p': 0.35230997567875699,
  'av_p_test': 0.14887425218375649,
  'iter': 19,
  'model': <keras.engine.training.Model at 0x7f76192d59b0>},
 {'av_p': 0.18705704589224104,
  'av_p_test': 0.059156327850934727,
  'iter': 19,
  'model': <keras.engine.training.Model at 0x7f76192a9f28>}]

In [165]:
X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices([0,1], [2], [3], [5], [0, 5])
run_single_model(X_train, X_val, X_test, y_train[0], y_val, y_test)

5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 


[{'av_p': 0.33423660607236566,
  'av_p_test': 0.22624190646654346,
  'iter': 19,
  'model': <keras.engine.training.Model at 0x7f761886b4a8>},
 {'av_p': 0.17578478031487371,
  'av_p_test': 0.067522342071457048,
  'iter': 18,
  'model': <keras.engine.training.Model at 0x7f76188be8d0>}]

In [91]:
X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices([0,1], [2], [3], [0, 5], [0, 5])
run_multiple_model(X_train, X_val, X_test, y_train, y_val, y_test, [0, 1])

5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 
85
0.39236876125
0.13823944287
40
0.159588772507
0.0547967679175


[{'av_p': 0.39236876125049991,
  'iter': 16,
  'model': <keras.engine.training.Model at 0x7f7662157fd0>},
 {'av_p': 0.15958877250737052,
  'iter': 7,
  'model': <keras.engine.training.Model at 0x7f7662427d30>}]

In [166]:
X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices([0,1], [2], [3], range(9), [0, 5])
run_multiple_model(X_train, X_val, X_test, y_train, y_val, y_test, [0, 5])

5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 


[{'av_p': 0.3418613844605623,
  'av_p_test': 0.15924850465430482,
  'iter': 19,
  'model': <keras.engine.training.Model at 0x7f76147fee48>},
 {'av_p': 0.18030329842282497,
  'av_p_test': 0.077014888074159363,
  'iter': 19,
  'model': <keras.engine.training.Model at 0x7f761469c828>}]

In [180]:
crossvalidation_indices = [[[0, 1], [2], [3]], [[0, 1], [3], [2]], [[0, 2], [3], [1]], [[1, 2], [3], [0]]]

In [181]:
def crossValidateSingle(train_targets, test_targets):
    models_sums = [0 for _ in test_targets]
    for indices in crossvalidation_indices:
        train_indices, val_indices, test_indices = indices
        X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices(
            train_indices, val_indices, test_indices, train_targets, test_targets)
        models = run_single_model(X_train, X_val, X_test, y_train[0], y_val, y_test)
        print(models)
        models_sums = [prev + curr['av_p_test'] for prev, curr in zip(models_sums, models)]
        
    
    return np.array(models_sums) / len(crossvalidation_indices)
    

In [158]:
crossValidateSingle([0, 5], [0, 5])

5 
[{'model': <keras.engine.training.Model object at 0x7f761c4518d0>, 'av_p': 0.25204736511935877, 'av_p_test': 0.17592987085240852, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761c1bbf28>, 'av_p': 0.12547488325175818, 'av_p_test': 0.058695693492578402, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761c0f2e80>, 'av_p': 0.2165409665232208, 'av_p_test': 0.12894253453652849, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761bdfeb00>, 'av_p': 0.067845092432430532, 'av_p_test': 0.097572666232368335, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761bd0ee48>, 'av_p': 0.2620117103439894, 'av_p_test': 0.13350956688007348, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761ba78ba8>, 'av_p': 0.099194279104324143, 'av_p_test': 0.049455539323674066, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761b98af60>, 'av_p': 0.26203684049085502, 'av_p_test': 0.14134092482703192, 'iter'

array([ 0.14493072,  0.0660977 ])

In [182]:
def crossValidateMultiple(train_targets, test_targets, log_indexes):
    models_sums = [0 for _ in test_targets]
    for indices in crossvalidation_indices:
        train_indices, val_indices, test_indices = indices
        X_train, X_val, X_test, y_train, y_val, y_test = getDataByIndices(
            train_indices, val_indices, test_indices, train_targets, test_targets)
        models = run_multiple_model(X_train, X_val, X_test, y_train, y_val, y_test, log_indexes)
        print(models)
        models_sums = [prev + curr['av_p_test'] for prev, curr in zip(models_sums, models)]
        
    
    return np.array(models_sums) / len(crossvalidation_indices)

In [162]:
crossValidateMultiple([0, 5], [0, 5], [0, 1])

5 
[{'model': <keras.engine.training.Model object at 0x7f761b5d3d68>, 'av_p': 0.12439200322704333, 'av_p_test': 0.2396622781793504, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761b0f3fd0>, 'av_p': 0.052681825570640169, 'av_p_test': 0.061406264374609595, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761afbaa58>, 'av_p': 0.28823080112630528, 'av_p_test': 0.11098749939125524, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761aa76fd0>, 'av_p': 0.072348512250391628, 'av_p_test': 0.089579202607556974, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761a9b4908>, 'av_p': 0.23989739691974807, 'av_p_test': 0.15543881318035924, 'iter': 0}, {'model': <keras.engine.training.Model object at 0x7f761a444cc0>, 'av_p': 0.080403028236606139, 'av_p_test': 0.097147880835494638, 'iter': 0}]
5 
[{'model': <keras.engine.training.Model object at 0x7f761a364828>, 'av_p': 0.19779758436527672, 'av_p_test': 0.16634786877562685, 'ite

array([ 0.16810911,  0.07545276])

In [183]:
crossValidateSingle([0], [0, 5])
crossValidateSingle([5], [0, 5])
crossValidateMultiple([0, 5], [0, 5], [0, 1])
crossValidateMultiple(range(9), [0, 5], [0,5])

5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 
[{'model': <keras.engine.training.Model object at 0x7f7641318d30>, 'av_p': 0.36214022741250707, 'av_p_test': 0.18947550275303224, 'iter': 17}, {'model': <keras.engine.training.Model object at 0x7f76411efa90>, 'av_p': 0.20954873837173996, 'av_p_test': 0.067317011458806572, 'iter': 5}]
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 
[{'model': <keras.engine.training.Model object at 0x7f76748e9390>, 'av_p': 0.36918571892346225, 'av_p_test': 0.15225166876864432, 'iter': 16}, {'model': <keras.engine.training.Model object at 0x7f769b67f978>, 'av_p': 0.16178218313855802, 'av_p_test': 0.09469362948168128, 'iter': 7}]
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90 95 100 
[{'model': <keras.engine.training.Model object at 0x7f76129f24e0>, 'av_p': 0.39540673003767746, 'av_p_test': 0.1826135036541158, 'iter': 10}, {'model': <keras.engine.training.Model object at 0x7f76749d7a58>, 'av_p': 0.20051750744988847, 'av_p_test'

array([ 0.16447835,  0.06341451])