# CONV1D Model Tryout - 1

In [None]:
from __future__ import print_function, division
from builtins import range

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from sklearn.metrics import roc_auc_score

In [None]:
import tensorflow as tf

In [None]:
# Hyper-parameters shared by the preprocessing, model and training cells.

# --- text preprocessing ---
MAX_VOCAB_SIZE = 40000       # handed to Tokenizer(num_words=...)
MAX_SEQUENCE_LENGTH = 1400   # length every sequence is padded/truncated to
EMBEDDING_DIM = 300          # picks glove.6B.<dim>d.txt and the embedding width

# --- training ---
EPOCHS = 10
BATCH_SIZE = 128
VALIDATION_SPLIT = 0.1       # fraction of the training data used for validation

In [None]:
# Load the pre-trained GloVe vectors into a dict: word -> float32 vector.
print('Loading word vectors...')
word2vec = {}
glove_path = '../input/glove6b/glove.6B.%sd.txt' % EMBEDDING_DIM
# The GloVe files are UTF-8 and contain non-ASCII tokens; state the encoding
# explicitly so reading does not depend on the platform's default codec.
with open(glove_path, encoding='utf-8') as f:
  # the file is just a space-separated text file in the format:
  # word vec[0] vec[1] vec[2] ...
  for line in f:
    values = line.split()
    if not values:
      # tolerate a blank (e.g. trailing) line instead of failing on values[0]
      continue
    word = values[0]
    vec = np.asarray(values[1:], dtype='float32')
    word2vec[word] = vec
print('Found %s word vectors.' % len(word2vec))

In [None]:
# Load the CommonLit readability training set into a DataFrame.
train = pd.read_csv("../input/commonlitreadabilityprize/train.csv")

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns. Shown as the cell output.
train.describe(include='all')

In [None]:
# How often each distinct value occurs in the url_legal column
# (value_counts skips missing values by default).
train.url_legal.value_counts()

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Regression labels as a NumPy array.
target = train["target"].values
# Raw excerpt strings; a missing excerpt is replaced by the literal
# placeholder so the tokenizer always receives a string.
sentences = train["excerpt"].fillna("DUMMY_VALUE").values

In [None]:

# Convert the sentences (strings) into lists of integer word indices.
# The tokenizer is fitted on the training excerpts here and reused unchanged
# for the test excerpts later in the notebook.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Longest and shortest tokenized excerpt, measured in tokens.
seq_lengths = [len(seq) for seq in sequences]
print("max sequence length:", max(seq_lengths))
print("min sequence length:", min(seq_lengths))


In [None]:
# Sorted token counts; the element at the midpoint is reported as the median
# (for an even count this is the upper of the two middle values).
s = sorted(map(len, sequences))
print("median sequence length:", s[len(s) // 2])

In [None]:
# Largest word index that actually occurs in the tokenized training data.
print("max word index:", max(idx for seq in sequences for idx in seq))

In [None]:
# get word -> integer mapping
word2idx = tokenizer.word_index
print('Found %s unique tokens.' % len(word2idx))

In [None]:
# Pad (or truncate) every sequence to MAX_SEQUENCE_LENGTH so that we get an
# N x T integer matrix. No padding/truncating arguments are passed, so the
# pad_sequences defaults apply (documented as 'pre' for both, pad value 0).
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', data.shape)


In [None]:
# Number of entries in the tokenizer's word -> index mapping (cell output).
len(word2idx)

In [None]:
# Build the (num_words x EMBEDDING_DIM) weight matrix for the Embedding layer.
print('Filling pre-trained embeddings...')
# One row per usable word index, plus one extra for index 0; capped at the
# vocabulary limit given to the tokenizer.
num_words = min(MAX_VOCAB_SIZE, len(word2idx) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, row in word2idx.items():
  if row >= MAX_VOCAB_SIZE:
    # index lies beyond the vocabulary limit: no row for it
    continue
  pretrained = word2vec.get(token)
  if pretrained is None:
    # word has no GloVe vector: its row stays all zeros
    continue
  embedding_matrix[row] = pretrained

In [None]:

# Load the pre-trained word embeddings into an Embedding layer.
# trainable=False keeps the embeddings fixed, so they are not updated
# during training.
# NOTE(review): `weights=` and `input_length=` are Keras 2-style arguments;
# confirm they are still accepted by the installed Keras version.
embedding_layer = Embedding(
  num_words,
  EMBEDDING_DIM,
  weights=[embedding_matrix],
  input_length=MAX_SEQUENCE_LENGTH,
  trainable=False
)

# # Model

In [None]:
print('Building model...')
# 1D convnet regressor: embeddings -> three conv stages -> global max pooling
# -> dense head with a single linear output.
input_ = Input(shape=(MAX_SEQUENCE_LENGTH,))
x = embedding_layer(input_)
# first two stages: convolution (kernel 3) followed by max pooling (pool 3)
for n_filters in (512, 256):
  x = Conv1D(n_filters, 3, activation='relu')(x)
  x = MaxPooling1D(3)(x)
# the last conv stage is collapsed over the sequence axis by global max pooling
x = Conv1D(128, 3, activation='relu')(x)
x = GlobalMaxPooling1D()(x)
x = Dense(128, activation='tanh')(x)
# one linear unit: the model predicts a single real-valued target
output = Dense(1, activation='linear')(x)

model = Model(input_, output)
# optimise MSE and additionally report RMSE during training
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
model.compile(loss='mse', optimizer='adam', metrics=[rmse_metric])

In [None]:
print('Training model...')
# Fit on the padded training matrix against the target values.
# validation_split asks Keras to hold out a VALIDATION_SPLIT fraction of the
# samples for validation (per the Keras docs, the last samples, taken before
# any shuffling).
# `r` is the returned History object; its .history dict feeds the plots that
# follow.
r = model.fit(
  data,
  target,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=VALIDATION_SPLIT
)


In [None]:
# Training vs. validation loss per epoch.
for curve in ('loss', 'val_loss'):
  plt.plot(r.history[curve], label=curve)
plt.legend()
plt.show()

In [None]:
# Training vs. validation RMSE per epoch (this is a regression model, so
# there is no accuracy to plot).
for history_key, legend_label in (
  ('root_mean_squared_error', 'rmse'),
  ('val_root_mean_squared_error', 'Val_rmse'),
):
  plt.plot(r.history[history_key], label=legend_label)
plt.legend()
plt.show()

In [None]:
# Load the competition test set into a DataFrame.
test = pd.read_csv("../input/commonlitreadabilityprize/test.csv")

In [None]:
# Peek at the first test row (cell output).
test.head(1)

In [None]:
# Test excerpts, with the same missing-value placeholder as the training data.
test_sent = test["excerpt"].fillna("DUMMY_VALUE").values
# Convert the sentences (strings) into integers using the tokenizer that was
# fitted on the training excerpts; it is not re-fitted here.
test_tokens = tokenizer.texts_to_sequences(test_sent)

In [None]:
# Pad (or truncate) the test sequences to MAX_SEQUENCE_LENGTH, the same
# length used for the training matrix, so that we get an N x T matrix.
test_data = pad_sequences(test_tokens, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', test_data.shape)

In [None]:
# Predict the target for every test excerpt.
# NOTE(review): the model ends in Dense(1), so p is presumably of shape
# (n_samples, 1) — confirm before relying on its shape.
p = model.predict(test_data)

In [None]:
# Start from the provided sample submission file and fill in predictions next.
submission = pd.read_csv("../input/commonlitreadabilityprize/sample_submission.csv")

In [None]:
# Overwrite the target column with the model's predictions.
# NOTE(review): assumes the rows of sample_submission.csv are in the same
# order as test.csv — confirm.
submission['target'] = p

In [None]:
# Display the filled-in submission frame (cell output).
submission

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv('submission.csv', index=False)