In [1]:
import numpy as np
import sklearn.model_selection as sk
from whatwhy import QUESTION_WORDS
from whatwhy.text_processing.helper_methods import get_df_from_file
from whatwhy.resource_manager import get_glove_wiki_gigaword_50_model
%run "/home/stevengt/Documents/code/whatwhy/notebooks/Tweet CSV Consolidation.ipynb"

In [2]:
gensim_model = get_glove_wiki_gigaword_50_model()

INFO:whatwhy.resource_manager.gensim:Downloading glove-wiki-gigaword-50 model
INFO:gensim.scripts.glove2word2vec:converting 400000 vectors from /home/stevengt/anaconda3/envs/whatwhy/lib/python3.6/site-packages/whatwhy/resources/gensim/glove.6B.50d.txt to /home/stevengt/anaconda3/envs/whatwhy/lib/python3.6/site-packages/whatwhy/resources/gensim/glove.6B.50d.word2vec.txt
INFO:gensim.models.utils_any2vec:loading projection weights from /home/stevengt/anaconda3/envs/whatwhy/lib/python3.6/site-packages/whatwhy/resources/gensim/glove.6B.50d.word2vec.txt
INFO:gensim.models.utils_any2vec:loaded (400000, 50) matrix from /home/stevengt/anaconda3/envs/whatwhy/lib/python3.6/site-packages/whatwhy/resources/gensim/glove.6B.50d.word2vec.txt


In [5]:
batch_dir = "/home/stevengt/Documents/code/whatwhy-data/News-Articles/all-the-news/tokens"
target_csv_name = "/home/stevengt/Documents/code/whatwhy-data/News-Articles/all-the-news/wh_phrases.csv"

df = get_df_from_file(target_csv_name)

for question_type in QUESTION_WORDS:
    token_col = question_type + " tokens"
    df[token_col] = df[token_col].apply(get_text_as_list)

In [6]:
all_what_tokens = df["what tokens"].tolist()[:5]
all_why_tokens = df["why tokens"].tolist()[:5]
all_tokens = all_what_tokens + all_why_tokens

In [7]:
max_num_what_tokens = get_max_num_tokens(all_what_tokens)
max_num_why_tokens = get_max_num_tokens(all_why_tokens)
print('Max number of what tokens: %d' % max_num_what_tokens)
print('Max number of why tokens: %d' % max_num_why_tokens)

Max number of what tokens: 4
Max number of why tokens: 8
Longest what word has 10 letters
Longest why word has 9 letters


In [8]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = sk.train_test_split(all_what_tokens, all_why_tokens, test_size=0.33, random_state = 42)

X_train = embed_lists_of_tokens(gensim_model, X_train_raw)
X_test = embed_lists_of_tokens(gensim_model, X_test_raw)

y_train = get_one_hot_encodings_from_tokens_lists(gensim_model, y_train_raw)
y_test = get_one_hot_encodings_from_tokens_lists(gensim_model, y_test_raw)

print(f"X_train.shape={X_train.shape}")
print(f"X_test.shape={X_test.shape}")
print(f"y_train.shape={y_train.shape}")
print(f"y_test.shape={y_test.shape}")


X_train.shape=(3, 4, 50)
X_test.shape=(2, 4, 50)
y_train.shape=(3, 4, 400000)
y_test.shape=(2, 8, 400000)


In [12]:
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dense, TimeDistributed, Activation
from tensorflow.keras.optimizers import Adam

def define_model(gensim_model, max_num_what_tokens, max_num_why_tokens):
    vocabulary_size = get_num_words_in_vocab(gensim_model)
    hidden_size = gensim_model.vector_size
    input_shape = (max_num_what_tokens, hidden_size)
    output_shape = (max_num_why_tokens, vocabulary_size)
    use_dropout=False    
    
    model = Sequential()
    model.add(Input(shape=input_shape))
#     model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=True))
    if use_dropout:
        model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(vocabulary_size)))
    model.add(Activation('softmax'))

    optimizer = Adam()
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])

    print(model.summary())
    return model

In [13]:
model=define_model(gensim_model, max_num_what_tokens, max_num_why_tokens)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 4, 50)             20200     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 4, 400000)         20400000  
_________________________________________________________________
activation_1 (Activation)    (None, 4, 400000)         0         
Total params: 20,420,200
Trainable params: 20,420,200
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.fit(x=X_train, y=y_train, epochs=1, batch_size=16)
# save the model
# model.save('model.h5')

In [43]:
what_tokens = [
    ["hello", "world", "my", "name", "is", "steven"],
    ["this", "is", "not", "some", "drill"],
    ["this", "is", "only", "the", "test"]
]
why_tokens = [
    ["this", "is", "some", "sample", "sentence"],
    ["i", "learn", "with", "examples"],
    ["we", "must", "figure", "out", "some", "solution"]
]

max_num_what_tokens = get_max_num_tokens(what_tokens)
max_num_why_tokens = get_max_num_tokens(why_tokens)

max_num_what_characters = get_max_token_length(what_tokens)
max_num_why_characters = get_max_token_length(why_tokens)

# what_vecs = embed_lists_of_tokens(gensim_model, what_tokens)
why_one_hot = get_flattened_one_hot_encodings_from_tokens_lists(why_tokens, max_num_why_tokens, max_num_why_characters)

In [10]:
input_shape = (max_num_what_tokens, gensim_model.vector_size)
output_length = y_train.shape[1]
model = define_model(input_shape, output_length)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 4, 50)]           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 4, 128)            58880     
_________________________________________________________________
dense (Dense)                (None, 4, 1872)           241488    
Total params: 300,368
Trainable params: 300,368
Non

In [None]:
model.fit( X_train,
           y_train,
           batch_size=3,
           epochs=10)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
