In [263]:
import json
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas.io.json import json_normalize
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', 200)

In [264]:
def read_jsonl_list(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            line_data = json.loads(line.rstrip('\n|\r'))
            data.append(line_data)
    print(f'Loaded {len(data)} records from {input_path}')
    return data

In [265]:
dl = read_jsonl_list('../data/work/data_boolean_la_only.jsonl')
df = pd.DataFrame(dl)
df.columns

Loaded 4433 records from ../data/work/data_boolean_la_only.jsonl


Index(['question_text', 'boolean_answer', 'long_answer_html', 'example_id'], dtype='object')

In [266]:
random.shuffle(dl)
dl_join = [{'qja': x['question_text'] + ' [SEP] ' + x['long_answer_html'], 'bool': x['boolean_answer']} for x in dl]

In [267]:
df = pd.DataFrame(dl_join)
df

Unnamed: 0,qja,bool
0,"is the focal length of a convex lens negative [SEP] <P> For a thin lens in air , the focal length is the distance from the center of the lens to the principal foci ( or focal points ) of the lens ...",NO
1,have there been amendments proposed to change the electoral college [SEP] <P> The Twelfth Amendment ( Amendment XII ) to the United States Constitution provides the procedure for electing the Pres...,YES
2,"is there going to be a now you see me 3 [SEP] <P> Now You See Me is a series of heist thriller film written by Ed Solomon , Boaz Yakin , and Edward Ricourt . They focus on the actions of a team of...",YES
3,"a cutoff meander can become an oxbow lake [SEP] <P> An oxbow lake is a U shaped body of water that forms when a wide meander from the main stem of a river is cut off , creating a free - standing b...",YES
4,"did the movie philadelphia win any academy awards [SEP] <P> Hanks won the Academy Award for Best Actor for his role as Andrew Beckett in the film , while the song `` Streets of Philadelphia '' by ...",YES
...,...,...
4428,is it illegal to talk on the phone while driving [SEP] <Table> <Tr> <Th> State </Th> <Th> Total handheld device ban applied to : </Th> <Th> Any cell phone use by driver prohibited if : </Th> <Th> ...,NONE
4429,is the federal court the same as the supreme court [SEP] <P> The federal courts are composed of three levels of courts . The Supreme Court of the United States is the court of last resort . It is ...,NO
4430,"are two members of little big town married to each other [SEP] <P> Karen Fairchild and Jimi Westbrook married on May 31 , 2006 , although their marriage was not disclosed until two months later . ...",YES
4431,"will there be a new season of the strain [SEP] <P> On August 6 , 2014 , FX renewed The Strain for a 13 - episode second season which premiered on July 12 , 2015 . On August 7 , 2015 , FX renewed T...",NO


In [268]:
df['question_type'] = df['qja'].apply(lambda x: x.split(' ')[0])
df

Unnamed: 0,qja,bool,question_type
0,"is the focal length of a convex lens negative [SEP] <P> For a thin lens in air , the focal length is the distance from the center of the lens to the principal foci ( or focal points ) of the lens ...",NO,is
1,have there been amendments proposed to change the electoral college [SEP] <P> The Twelfth Amendment ( Amendment XII ) to the United States Constitution provides the procedure for electing the Pres...,YES,have
2,"is there going to be a now you see me 3 [SEP] <P> Now You See Me is a series of heist thriller film written by Ed Solomon , Boaz Yakin , and Edward Ricourt . They focus on the actions of a team of...",YES,is
3,"a cutoff meander can become an oxbow lake [SEP] <P> An oxbow lake is a U shaped body of water that forms when a wide meander from the main stem of a river is cut off , creating a free - standing b...",YES,a
4,"did the movie philadelphia win any academy awards [SEP] <P> Hanks won the Academy Award for Best Actor for his role as Andrew Beckett in the film , while the song `` Streets of Philadelphia '' by ...",YES,did
...,...,...,...
4428,is it illegal to talk on the phone while driving [SEP] <Table> <Tr> <Th> State </Th> <Th> Total handheld device ban applied to : </Th> <Th> Any cell phone use by driver prohibited if : </Th> <Th> ...,NONE,is
4429,is the federal court the same as the supreme court [SEP] <P> The federal courts are composed of three levels of courts . The Supreme Court of the United States is the court of last resort . It is ...,NO,is
4430,"are two members of little big town married to each other [SEP] <P> Karen Fairchild and Jimi Westbrook married on May 31 , 2006 , although their marriage was not disclosed until two months later . ...",YES,are
4431,"will there be a new season of the strain [SEP] <P> On August 6 , 2014 , FX renewed The Strain for a 13 - episode second season which premiered on July 12 , 2015 . On August 7 , 2015 , FX renewed T...",NO,will


In [270]:
# df['qja'] = df['qja'].str.replace('[^\w\s\<\>\/]','') # drop punctuation signs; keep html tags
# df

In [271]:
df['question_type'].value_counts().head(8)

is      1705
can      490
do       457
does     449
are      281
did      189
has      133
was      130
Name: question_type, dtype: int64

In [272]:
df.groupby('bool').count()

Unnamed: 0_level_0,qja,question_type
bool,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,1439,1439
NONE,635,635
YES,2359,2359


In [273]:
np.unique(df[['bool']].values, return_counts=True)

(array(['NO', 'NONE', 'YES'], dtype=object), array([1439,  635, 2359]))

In [274]:
df['bool'].value_counts().apply(lambda x: x / df['bool'].count())

YES     0.532145
NO      0.324611
NONE    0.143244
Name: bool, dtype: float64

In [275]:
df.columns
# x = df[['qja']] # x = df[['question_text', 'long_answer_html']]
# y = df['bool']
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# y_train.value_counts().apply(lambda x: x / y_train.count())
# y_test.value_counts().apply(lambda x: x / y_test.count())

Index(['qja', 'bool', 'question_type'], dtype='object')

In [276]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [277]:
Y = df['bool']
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(Y, num_classes=3);Y

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [278]:
import tensorflow as tf
max_features = 10000
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)

In [279]:
lq = list(df['qja'])

In [280]:
tok.fit_on_texts(lq)
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1 # 49896

49895


In [281]:
X = tok.texts_to_sequences(lq)

In [282]:
# sorted(list(enumerate([len(x) for x in qja_m])), key=lambda x: x[1], reverse=True)

In [283]:
maxlen = max([len(x) for x in X]); maxlen

22946

In [284]:
pd.DataFrame([len(x) for x in X]).describe()

Unnamed: 0,0
count,4433.0
mean,424.673133
std,1380.48194
min,13.0
25%,68.0
50%,101.0
75%,157.0
max,22946.0


In [285]:
maxlen = 1500 # 1500

In [286]:
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=maxlen); X

array([[   0,    0,    0, ...,    3, 6330,   18],
       [   0,    0,    0, ...,  150,   98,   18],
       [   0,    0,    0, ...,  387, 2086,   18],
       ...,
       [   0,    0,    0, ...,   12, 5141,   18],
       [   0,    0,    0, ...,  423, 1187,   18],
       [   0,    0,    0, ...,    1,    2,   97]], dtype=int32)

In [287]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=4321)

In [288]:
embedding_dim = 500 # 500

In [289]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size,    # embedding input
                           output_dim=embedding_dim, # embedding output
                           input_length=maxlen),     # maximum length of an input sequence
  tf.keras.layers.Flatten(),                         # flatten layer
  tf.keras.layers.Dense(30, activation=tf.nn.relu),  # 30
  tf.keras.layers.Dense(20, activation=tf.nn.relu),  # 200
  tf.keras.layers.Dense(9, activation=tf.nn.relu),   # 100
  tf.keras.layers.Dense(3, activation=tf.nn.softmax) # ouput layer a Dense layer with 3 probabilities
])                                                   # activation function which is the softmax function 

In [290]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [291]:
model.summary()

Model: "sequential_40"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_40 (Embedding)     (None, 1500, 500)         24948000  
_________________________________________________________________
flatten_40 (Flatten)         (None, 750000)            0         
_________________________________________________________________
dense_160 (Dense)            (None, 30)                22500030  
_________________________________________________________________
dense_161 (Dense)            (None, 20)                620       
_________________________________________________________________
dense_162 (Dense)            (None, 9)                 189       
_________________________________________________________________
dense_163 (Dense)            (None, 3)                 30        
Total params: 47,448,869
Trainable params: 47,448,869
Non-trainable params: 0
_________________________________________

In [292]:
model.fit(np.array(x_train), np.array(y_train), epochs=7)

Train on 3546 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x1a221aa210>

In [293]:
print(model.metrics_names)
model.evaluate(np.array(x_test), np.array(y_test), verbose=0)

['loss', 'accuracy']


[1.188759573148364, 0.6489966]